]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/dev/mxge/if_mxge.c
MFV r320905: Import upstream fix for CVE-2017-11103.
[FreeBSD/FreeBSD.git] / sys / dev / mxge / if_mxge.c
1 /******************************************************************************
2
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 #include <sys/zlib.h>
50
51 #include <net/if.h>
52 #include <net/if_var.h>
53 #include <net/if_arp.h>
54 #include <net/ethernet.h>
55 #include <net/if_dl.h>
56 #include <net/if_media.h>
57
58 #include <net/bpf.h>
59
60 #include <net/if_types.h>
61 #include <net/if_vlan_var.h>
62
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/ip6.h>
67 #include <netinet/tcp.h>
68 #include <netinet/tcp_lro.h>
69 #include <netinet6/ip6_var.h>
70
71 #include <machine/bus.h>
72 #include <machine/in_cksum.h>
73 #include <machine/resource.h>
74 #include <sys/bus.h>
75 #include <sys/rman.h>
76 #include <sys/smp.h>
77
78 #include <dev/pci/pcireg.h>
79 #include <dev/pci/pcivar.h>
80 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
81
82 #include <vm/vm.h>              /* for pmap_mapdev() */
83 #include <vm/pmap.h>
84
85 #if defined(__i386) || defined(__amd64)
86 #include <machine/specialreg.h>
87 #endif
88
89 #include <dev/mxge/mxge_mcp.h>
90 #include <dev/mxge/mcp_gen_header.h>
91 /*#define MXGE_FAKE_IFP*/
92 #include <dev/mxge/if_mxge_var.h>
93 #ifdef IFNET_BUF_RING
94 #include <sys/buf_ring.h>
95 #endif
96
97 #include "opt_inet.h"
98 #include "opt_inet6.h"
99
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* poke ECRC on upstream Nvidia bridges */
static int mxge_force_firmware = 0;	/* 0 = probe, 1 = force aligned fw, 2 = force unaligned fw */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay (usecs) */
static int mxge_deassert_wait = 1;	/* wait for IRQ line deassert before re-arming */
static int mxge_flow_control = 1;	/* enable 802.3x pause frames */
static int mxge_verbose = 0;		/* chatty probe/attach diagnostics */
static int mxge_ticks;			/* watchdog callout period; set at runtime */
static int mxge_max_slices = 1;		/* max RSS slices (rx/tx ring pairs) */
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;	/* force promiscuous mode */
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;		/* tx throttle; nonzero also forces fw choice */
/* Firmware image names: "eth" = aligned-completion fw, "ethp" = unaligned
 * workaround fw, "rss" variants support multiple slices. */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

/* newbus device interface entry points */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);	/* firmware(9) for the eth_z8e images */
MODULE_DEPEND(mxge, zlib, 1, 1, 1);	/* firmware images are zlib-compressed */

/* forward declarations used before their definitions below */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
154
155 static int
156 mxge_probe(device_t dev)
157 {
158         int rev;
159
160
161         if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
162             ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
163              (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
164                 rev = pci_get_revid(dev);
165                 switch (rev) {
166                 case MXGE_PCI_REV_Z8E:
167                         device_set_desc(dev, "Myri10G-PCIE-8A");
168                         break;
169                 case MXGE_PCI_REV_Z8ES:
170                         device_set_desc(dev, "Myri10G-PCIE-8B");
171                         break;
172                 default:
173                         device_set_desc(dev, "Myri10G-PCIE-8??");
174                         device_printf(dev, "Unrecognized rev %d NIC\n",
175                                       rev);
176                         break;  
177                 }
178                 return 0;
179         }
180         return ENXIO;
181 }
182
183 static void
184 mxge_enable_wc(mxge_softc_t *sc)
185 {
186 #if defined(__i386) || defined(__amd64)
187         vm_offset_t len;
188         int err;
189
190         sc->wc = 1;
191         len = rman_get_size(sc->mem_res);
192         err = pmap_change_attr((vm_offset_t) sc->sram,
193                                len, PAT_WRITE_COMBINING);
194         if (err != 0) {
195                 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
196                               err);
197                 sc->wc = 0;
198         }
199 #endif          
200 }
201
202
203 /* callback to get our DMA address */
204 static void
205 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
206                          int error)
207 {
208         if (error == 0) {
209                 *(bus_addr_t *) arg = segs->ds_addr;
210         }
211 }
212
213 static int
214 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
215                    bus_size_t alignment)
216 {
217         int err;
218         device_t dev = sc->dev;
219         bus_size_t boundary, maxsegsize;
220
221         if (bytes > 4096 && alignment == 4096) {
222                 boundary = 0;
223                 maxsegsize = bytes;
224         } else {
225                 boundary = 4096;
226                 maxsegsize = 4096;
227         }
228
229         /* allocate DMAable memory tags */
230         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
231                                  alignment,             /* alignment */
232                                  boundary,              /* boundary */
233                                  BUS_SPACE_MAXADDR,     /* low */
234                                  BUS_SPACE_MAXADDR,     /* high */
235                                  NULL, NULL,            /* filter */
236                                  bytes,                 /* maxsize */
237                                  1,                     /* num segs */
238                                  maxsegsize,            /* maxsegsize */
239                                  BUS_DMA_COHERENT,      /* flags */
240                                  NULL, NULL,            /* lock */
241                                  &dma->dmat);           /* tag */
242         if (err != 0) {
243                 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
244                 return err;
245         }
246
247         /* allocate DMAable memory & map */
248         err = bus_dmamem_alloc(dma->dmat, &dma->addr,
249                                (BUS_DMA_WAITOK | BUS_DMA_COHERENT
250                                 | BUS_DMA_ZERO),  &dma->map);
251         if (err != 0) {
252                 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
253                 goto abort_with_dmat;
254         }
255
256         /* load the memory */
257         err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
258                               mxge_dmamap_callback,
259                               (void *)&dma->bus_addr, 0);
260         if (err != 0) {
261                 device_printf(dev, "couldn't load map (err = %d)\n", err);
262                 goto abort_with_mem;
263         }
264         return 0;
265
266 abort_with_mem:
267         bus_dmamem_free(dma->dmat, dma->addr, dma->map);
268 abort_with_dmat:
269         (void)bus_dma_tag_destroy(dma->dmat);
270         return err;
271 }
272
273
/*
 * Release a region created by mxge_dma_alloc().  Order matters:
 * unload the map, free the memory, then destroy the tag.
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
281
282 /*
283  * The eeprom strings on the lanaiX have the format
284  * SN=x\0
285  * MAC=x:x:x:x:x:x\0
286  * PC=text\0
287  */
288
289 static int
290 mxge_parse_strings(mxge_softc_t *sc)
291 {
292         char *ptr;
293         int i, found_mac, found_sn2;
294         char *endptr;
295
296         ptr = sc->eeprom_strings;
297         found_mac = 0;
298         found_sn2 = 0;
299         while (*ptr != '\0') {
300                 if (strncmp(ptr, "MAC=", 4) == 0) {
301                         ptr += 4;
302                         for (i = 0;;) {
303                                 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
304                                 if (endptr - ptr != 2)
305                                         goto abort;
306                                 ptr = endptr;
307                                 if (++i == 6)
308                                         break;
309                                 if (*ptr++ != ':')
310                                         goto abort;
311                         }
312                         found_mac = 1;
313                 } else if (strncmp(ptr, "PC=", 3) == 0) {
314                         ptr += 3;
315                         strlcpy(sc->product_code_string, ptr,
316                             sizeof(sc->product_code_string));
317                 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
318                         ptr += 3;
319                         strlcpy(sc->serial_number_string, ptr,
320                             sizeof(sc->serial_number_string));
321                 } else if (strncmp(ptr, "SN2=", 4) == 0) {
322                         /* SN2 takes precedence over SN */
323                         ptr += 4;
324                         found_sn2 = 1;
325                         strlcpy(sc->serial_number_string, ptr,
326                             sizeof(sc->serial_number_string));
327                 }
328                 while (*ptr++ != '\0') {}
329         }
330
331         if (found_mac)
332                 return 0;
333
334  abort:
335         device_printf(sc->dev, "failed to parse eeprom_strings\n");
336
337         return ENXIO;
338 }
339
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Try to enable ECRC generation on an upstream Nvidia (CK804/MCP55)
 * PCIe bridge so that DMA completions to this NIC arrive aligned.
 * Because FreeBSD (at the time) could not access extended (>0xff)
 * config space through the normal API, the chipset's memory-mapped
 * config window is mapped by hand and the ECRC bit poked directly.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* the bridge of interest is the grandparent of the NIC */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	/* only Nvidia bridges (vendor 0x10de) are handled */
	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* offset of this function's config space within the window */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	/* 0x178 is the extended config register holding the ECRC bit
	 * (0x40) on these Nvidia bridges */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
/* Non-x86 stub: the Nvidia chipset hack only makes sense on x86/amd64. */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif
473
474
/*
 * Benchmark host<->NIC DMA throughput (read, write, read+write) via
 * firmware commands, storing the results (in MB/s) in sc->read_dma,
 * sc->write_dma and sc->read_write_dma.  With test_type ==
 * MXGEFW_CMD_UNALIGNED_TEST the firmware instead aborts on the first
 * unaligned completion, and failures are not logged.
 */
static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	/* read test: length * 0x10000 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	/* transfers * bytes * 2 (for 0.5us ticks -> MB/s) / ticks */
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	/* write test: length * 0x1 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	/* read+write test: length * 0x10001; each transfer moves 2x bytes */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
535
536 /*
537  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
538  * when the PCI-E Completion packets are aligned on an 8-byte
539  * boundary.  Some PCI-E chip sets always align Completion packets; on
540  * the ones that do not, the alignment can be enforced by enabling
541  * ECRC generation (if supported).
542  *
543  * When PCI-E Completion packets are not aligned, it is actually more
544  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
545  *
546  * If the driver can neither enable ECRC nor verify that it has
547  * already been enabled, then it must use a firmware image which works
548  * around unaligned completion packets (ethp_z8e.dat), and it should
549  * also ensure that it never gives the device a Read-DMA which is
550  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
551  * enabled, then the driver should use the aligned (eth_z8e.dat)
552  * firmware image, and set tx_boundary to 4KB.
553  */
554
555 static int
556 mxge_firmware_probe(mxge_softc_t *sc)
557 {
558         device_t dev = sc->dev;
559         int reg, status;
560         uint16_t pectl;
561
562         sc->tx_boundary = 4096;
563         /*
564          * Verify the max read request size was set to 4KB
565          * before trying the test with 4KB.
566          */
567         if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
568                 pectl = pci_read_config(dev, reg + 0x8, 2);
569                 if ((pectl & (5 << 12)) != (5 << 12)) {
570                         device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
571                                       pectl);
572                         sc->tx_boundary = 2048;
573                 }
574         }
575
576         /*
577          * load the optimized firmware (which assumes aligned PCIe
578          * completions) in order to see if it works on this host.
579          */
580         sc->fw_name = mxge_fw_aligned;
581         status = mxge_load_firmware(sc, 1);
582         if (status != 0) {
583                 return status;
584         }
585
586         /*
587          * Enable ECRC if possible
588          */
589         mxge_enable_nvidia_ecrc(sc);
590
591         /*
592          * Run a DMA test which watches for unaligned completions and
593          * aborts on the first one seen.  Not required on Z8ES or newer.
594          */
595         if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
596                 return 0;
597         status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
598         if (status == 0)
599                 return 0; /* keep the aligned firmware */
600
601         if (status != E2BIG)
602                 device_printf(dev, "DMA test failed: %d\n", status);
603         if (status == ENOSYS)
604                 device_printf(dev, "Falling back to ethp! "
605                               "Please install up to date fw\n");
606         return status;
607 }
608
609 static int
610 mxge_select_firmware(mxge_softc_t *sc)
611 {
612         int aligned = 0;
613         int force_firmware = mxge_force_firmware;
614
615         if (sc->throttle)
616                 force_firmware = sc->throttle;
617
618         if (force_firmware != 0) {
619                 if (force_firmware == 1)
620                         aligned = 1;
621                 else
622                         aligned = 0;
623                 if (mxge_verbose)
624                         device_printf(sc->dev,
625                                       "Assuming %s completions (forced)\n",
626                                       aligned ? "aligned" : "unaligned");
627                 goto abort;
628         }
629
630         /* if the PCIe link width is 4 or less, we can use the aligned
631            firmware and skip any checks */
632         if (sc->link_width != 0 && sc->link_width <= 4) {
633                 device_printf(sc->dev,
634                               "PCIe x%d Link, expect reduced performance\n",
635                               sc->link_width);
636                 aligned = 1;
637                 goto abort;
638         }
639
640         if (0 == mxge_firmware_probe(sc))
641                 return 0;
642
643 abort:
644         if (aligned) {
645                 sc->fw_name = mxge_fw_aligned;
646                 sc->tx_boundary = 4096;
647         } else {
648                 sc->fw_name = mxge_fw_unaligned;
649                 sc->tx_boundary = 2048;
650         }
651         return (mxge_load_firmware(sc, 0));
652 }
653
654 static int
655 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
656 {
657
658
659         if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
660                 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
661                               be32toh(hdr->mcp_type));
662                 return EIO;
663         }
664
665         /* save firmware version for sysctl */
666         strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
667         if (mxge_verbose)
668                 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
669
670         sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
671                &sc->fw_ver_minor, &sc->fw_ver_tiny);
672
673         if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
674               && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
675                 device_printf(sc->dev, "Found firmware version %s\n",
676                               sc->fw_version);
677                 device_printf(sc->dev, "Driver needs %d.%d\n",
678                               MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
679                 return EINVAL;
680         }
681         return 0;
682
683 }
684
685 static void *
686 z_alloc(void *nil, u_int items, u_int size)
687 {
688         void *ptr;
689
690         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
691         return ptr;
692 }
693
/* zlib free callback: release memory obtained via z_alloc(). */
static void
z_free(void *nil, void *ptr)
{
	free(ptr, M_TEMP);
}
699
700
701 static int
702 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
703 {
704         z_stream zs;
705         char *inflate_buffer;
706         const struct firmware *fw;
707         const mcp_gen_header_t *hdr;
708         unsigned hdr_offset;
709         int status;
710         unsigned int i;
711         char dummy;
712         size_t fw_len;
713
714         fw = firmware_get(sc->fw_name);
715         if (fw == NULL) {
716                 device_printf(sc->dev, "Could not find firmware image %s\n",
717                               sc->fw_name);
718                 return ENOENT;
719         }
720
721
722
723         /* setup zlib and decompress f/w */
724         bzero(&zs, sizeof (zs));
725         zs.zalloc = z_alloc;
726         zs.zfree = z_free;
727         status = inflateInit(&zs);
728         if (status != Z_OK) {
729                 status = EIO;
730                 goto abort_with_fw;
731         }
732
733         /* the uncompressed size is stored as the firmware version,
734            which would otherwise go unused */
735         fw_len = (size_t) fw->version;
736         inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
737         if (inflate_buffer == NULL)
738                 goto abort_with_zs;
739         zs.avail_in = fw->datasize;
740         zs.next_in = __DECONST(char *, fw->data);
741         zs.avail_out = fw_len;
742         zs.next_out = inflate_buffer;
743         status = inflate(&zs, Z_FINISH);
744         if (status != Z_STREAM_END) {
745                 device_printf(sc->dev, "zlib %d\n", status);
746                 status = EIO;
747                 goto abort_with_buffer;
748         }
749
750         /* check id */
751         hdr_offset = htobe32(*(const uint32_t *)
752                              (inflate_buffer + MCP_HEADER_PTR_OFFSET));
753         if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
754                 device_printf(sc->dev, "Bad firmware file");
755                 status = EIO;
756                 goto abort_with_buffer;
757         }
758         hdr = (const void*)(inflate_buffer + hdr_offset);
759
760         status = mxge_validate_firmware(sc, hdr);
761         if (status != 0)
762                 goto abort_with_buffer;
763
764         /* Copy the inflated firmware to NIC SRAM. */
765         for (i = 0; i < fw_len; i += 256) {
766                 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
767                               inflate_buffer + i,
768                               min(256U, (unsigned)(fw_len - i)));
769                 wmb();
770                 dummy = *sc->sram;
771                 wmb();
772         }
773
774         *limit = fw_len;
775         status = 0;
776 abort_with_buffer:
777         free(inflate_buffer, M_TEMP);
778 abort_with_zs:
779         inflateEnd(&zs);
780 abort_with_fw:
781         firmware_put(fw, FIRMWARE_UNLOAD);
782         return status;
783 }
784
785 /*
786  * Enable or disable periodic RDMAs from the host to make certain
787  * chipsets resend dropped PCIe messages
788  */
789
/*
 * Enable or disable the firmware's periodic dummy RDMA reads (a
 * workaround for chipsets that drop PCIe messages).  Issues a boot
 * command through the MXGEFW_BOOT_DUMMY_RDMA doorbell and polls the
 * confirmation word (up to ~21ms) for the firmware's 0xffffffff ack.
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align buf to an 8-byte boundary within buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	/* post the command, then poll for the firmware's ack */
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
841
842 static int
843 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
844 {
845         mcp_cmd_t *buf;
846         char buf_bytes[sizeof(*buf) + 8];
847         volatile mcp_cmd_response_t *response = sc->cmd;
848         volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
849         uint32_t dma_low, dma_high;
850         int err, sleep_total = 0;
851
852         /* ensure buf is aligned to 8 bytes */
853         buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
854
855         buf->data0 = htobe32(data->data0);
856         buf->data1 = htobe32(data->data1);
857         buf->data2 = htobe32(data->data2);
858         buf->cmd = htobe32(cmd);
859         dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
860         dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
861
862         buf->response_addr.low = htobe32(dma_low);
863         buf->response_addr.high = htobe32(dma_high);
864         mtx_lock(&sc->cmd_mtx);
865         response->result = 0xffffffff;
866         wmb();
867         mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
868
869         /* wait up to 20ms */
870         err = EAGAIN;
871         for (sleep_total = 0; sleep_total <  20; sleep_total++) {
872                 bus_dmamap_sync(sc->cmd_dma.dmat,
873                                 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
874                 wmb();
875                 switch (be32toh(response->result)) {
876                 case 0:
877                         data->data0 = be32toh(response->data);
878                         err = 0;
879                         break;
880                 case 0xffffffff:
881                         DELAY(1000);
882                         break;
883                 case MXGEFW_CMD_UNKNOWN:
884                         err = ENOSYS;
885                         break;
886                 case MXGEFW_CMD_ERROR_UNALIGNED:
887                         err = E2BIG;
888                         break;
889                 case MXGEFW_CMD_ERROR_BUSY:
890                         err = EBUSY;
891                         break;
892                 case MXGEFW_CMD_ERROR_I2C_ABSENT:
893                         err = ENXIO;
894                         break;
895                 default:
896                         device_printf(sc->dev,
897                                       "mxge: command %d "
898                                       "failed, result = %d\n",
899                                       cmd, be32toh(response->result));
900                         err = ENXIO;
901                         break;
902                 }
903                 if (err != EAGAIN)
904                         break;
905         }
906         if (err == EAGAIN)
907                 device_printf(sc->dev, "mxge: command %d timed out"
908                               "result = %d\n",
909                               cmd, be32toh(response->result));
910         mtx_unlock(&sc->cmd_mtx);
911         return err;
912 }
913
/*
 * Validate the firmware already running on the NIC so the driver can
 * adopt it instead of loading its own image.  Reads the firmware
 * header out of NIC SRAM via bus_space, validates it on the host, and
 * flags a known rx-filter bug in adopted firmware 1.4.4 - 1.4.11.
 *
 * Returns 0 on success, EIO for a bad header offset, ENOMEM if the
 * temporary header buffer can't be allocated, or the status from
 * mxge_validate_firmware().
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
        struct mcp_gen_header *hdr;
        const size_t bytes = sizeof (struct mcp_gen_header);
        size_t hdr_offset;
        int status;

        /* find running firmware header */
        /* NOTE: htobe32 is used as a pure byte swap here to convert the
         * big-endian pointer the NIC stores at MCP_HEADER_PTR_OFFSET */
        hdr_offset = htobe32(*(volatile uint32_t *)
                             (sc->sram + MCP_HEADER_PTR_OFFSET));

        /* reject misaligned offsets or headers extending past SRAM */
        if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
                device_printf(sc->dev,
                              "Running firmware has bad header offset (%d)\n",
                              (int)hdr_offset);
                return EIO;
        }

        /* copy header of running firmware from SRAM to host memory to
         * validate firmware */
        hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
        if (hdr == NULL) {
                device_printf(sc->dev, "could not malloc firmware hdr\n");
                return ENOMEM;
        }
        bus_space_read_region_1(rman_get_bustag(sc->mem_res),
                                rman_get_bushandle(sc->mem_res),
                                hdr_offset, (char *)hdr, bytes);
        status = mxge_validate_firmware(sc, hdr);
        free(hdr, M_DEVBUF);

        /*
         * check to see if adopted firmware has bug where adopting
         * it will cause broadcasts to be filtered unless the NIC
         * is kept in ALLMULTI mode
         */
        if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
            sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
                sc->adopted_rx_filter_bug = 1;
                device_printf(sc->dev, "Adopting fw %d.%d.%d: "
                              "working around rx filter bug\n",
                              sc->fw_ver_major, sc->fw_ver_minor,
                              sc->fw_ver_tiny);
        }

        return status;
}
962
963
/*
 * Load firmware onto the NIC, or (if the load fails and 'adopt' is
 * set) fall back to adopting the firmware already running there.
 * After a successful load, hands control to the new image via the
 * bootstrap MCP at MXGEFW_BOOT_HANDOFF and polls a DMA'ed
 * confirmation word for up to ~200ms.
 *
 * Returns 0 on success, ENXIO if the handoff never confirms, or an
 * error from the load/adopt path.  On the adopt fallback, forces the
 * unaligned (2048-byte tx boundary) firmware configuration.
 */
static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
        volatile uint32_t *confirm;
        volatile char *submit;
        char buf_bytes[72];
        uint32_t *buf, size, dma_low, dma_high;
        int status, i;

        /* align the handoff scratch buffer to an 8-byte boundary */
        buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

        size = sc->sram_size;
        status = mxge_load_firmware_helper(sc, &size);
        if (status) {
                if (!adopt)
                        return status;
                /* Try to use the currently running firmware, if
                   it is new enough */
                status = mxge_adopt_running_firmware(sc);
                if (status) {
                        device_printf(sc->dev,
                                      "failed to adopt running firmware\n");
                        return status;
                }
                device_printf(sc->dev,
                              "Successfully adopted running firmware\n");
                if (sc->tx_boundary == 4096) {
                        device_printf(sc->dev,
                                "Using firmware currently running on NIC"
                                 ".  For optimal\n");
                        device_printf(sc->dev,
                                 "performance consider loading optimized "
                                 "firmware\n");
                }
                /* adopted firmware is only safe with the 2KB boundary */
                sc->fw_name = mxge_fw_unaligned;
                sc->tx_boundary = 2048;
                return 0;
        }
        /* clear confirmation addr */
        confirm = (volatile uint32_t *)sc->cmd;
        *confirm = 0;
        wmb();
        /* send a reload command to the bootstrap MCP, and wait for the
           response in the confirmation address.  The firmware should
           write a -1 there to indicate it is alive and well
        */

        dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
        dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

        buf[0] = htobe32(dma_high);     /* confirm addr MSW */
        buf[1] = htobe32(dma_low);      /* confirm addr LSW */
        buf[2] = htobe32(0xffffffff);   /* confirm data */

        /* FIX: All newest firmware should un-protect the bottom of
           the sram before handoff. However, the very first interfaces
           do not. Therefore the handoff copy must skip the first 8 bytes
        */
                                        /* where the code starts*/
        buf[3] = htobe32(MXGE_FW_OFFSET + 8);
        buf[4] = htobe32(size - 8);     /* length of code */
        buf[5] = htobe32(8);            /* where to copy to */
        buf[6] = htobe32(0);            /* where to jump to */

        submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
        mxge_pio_copy(submit, buf, 64);
        wmb();
        DELAY(1000);
        wmb();
        i = 0;
        /* poll up to 20 x 10ms for the firmware's -1 confirmation */
        while (*confirm != 0xffffffff && i < 20) {
                DELAY(1000*10);
                i++;
                bus_dmamap_sync(sc->cmd_dma.dmat,
                                sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
        }
        if (*confirm != 0xffffffff) {
                device_printf(sc->dev,"handoff failed (%p = 0x%x)",
                        confirm, *confirm);

                return ENXIO;
        }
        return 0;
}
1048
1049 static int
1050 mxge_update_mac_address(mxge_softc_t *sc)
1051 {
1052         mxge_cmd_t cmd;
1053         uint8_t *addr = sc->mac_addr;
1054         int status;
1055
1056         
1057         cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1058                      | (addr[2] << 8) | addr[3]);
1059
1060         cmd.data1 = ((addr[4] << 8) | (addr[5]));
1061
1062         status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1063         return status;
1064 }
1065
1066 static int
1067 mxge_change_pause(mxge_softc_t *sc, int pause)
1068 {       
1069         mxge_cmd_t cmd;
1070         int status;
1071
1072         if (pause)
1073                 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1074                                        &cmd);
1075         else
1076                 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1077                                        &cmd);
1078
1079         if (status) {
1080                 device_printf(sc->dev, "Failed to set flow control mode\n");
1081                 return ENXIO;
1082         }
1083         sc->pause = pause;
1084         return 0;
1085 }
1086
1087 static void
1088 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1089 {       
1090         mxge_cmd_t cmd;
1091         int status;
1092
1093         if (mxge_always_promisc)
1094                 promisc = 1;
1095
1096         if (promisc)
1097                 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1098                                        &cmd);
1099         else
1100                 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1101                                        &cmd);
1102
1103         if (status) {
1104                 device_printf(sc->dev, "Failed to set promisc mode\n");
1105         }
1106 }
1107
1108 static void
1109 mxge_set_multicast_list(mxge_softc_t *sc)
1110 {
1111         mxge_cmd_t cmd;
1112         struct ifmultiaddr *ifma;
1113         struct ifnet *ifp = sc->ifp;
1114         int err;
1115
1116         /* This firmware is known to not support multicast */
1117         if (!sc->fw_multicast_support)
1118                 return;
1119
1120         /* Disable multicast filtering while we play with the lists*/
1121         err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1122         if (err != 0) {
1123                 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1124                        " error status: %d\n", err);
1125                 return;
1126         }
1127         
1128         if (sc->adopted_rx_filter_bug)
1129                 return;
1130         
1131         if (ifp->if_flags & IFF_ALLMULTI)
1132                 /* request to disable multicast filtering, so quit here */
1133                 return;
1134
1135         /* Flush all the filters */
1136
1137         err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1138         if (err != 0) {
1139                 device_printf(sc->dev,
1140                               "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1141                               ", error status: %d\n", err);
1142                 return;
1143         }
1144
1145         /* Walk the multicast list, and add each address */
1146
1147         if_maddr_rlock(ifp);
1148         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1149                 if (ifma->ifma_addr->sa_family != AF_LINK)
1150                         continue;
1151                 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1152                       &cmd.data0, 4);
1153                 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1154                       &cmd.data1, 2);
1155                 cmd.data0 = htonl(cmd.data0);
1156                 cmd.data1 = htonl(cmd.data1);
1157                 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1158                 if (err != 0) {
1159                         device_printf(sc->dev, "Failed "
1160                                "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1161                                "%d\t", err);
1162                         /* abort, leaving multicast filtering off */
1163                         if_maddr_runlock(ifp);
1164                         return;
1165                 }
1166         }
1167         if_maddr_runlock(ifp);
1168         /* Enable multicast filtering */
1169         err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1170         if (err != 0) {
1171                 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1172                        ", error status: %d\n", err);
1173         }
1174 }
1175
1176 static int
1177 mxge_max_mtu(mxge_softc_t *sc)
1178 {
1179         mxge_cmd_t cmd;
1180         int status;
1181
1182         if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1183                 return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1184
1185         /* try to set nbufs to see if it we can
1186            use virtually contiguous jumbos */
1187         cmd.data0 = 0;
1188         status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1189                                &cmd);
1190         if (status == 0)
1191                 return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1192
1193         /* otherwise, we're limited to MJUMPAGESIZE */
1194         return MJUMPAGESIZE - MXGEFW_PAD;
1195 }
1196
/*
 * Reset the NIC and re-establish all driver/firmware shared state:
 * interrupt queues, coalescing/IRQ-ack/deassert offsets, per-slice
 * counters, MAC address, promiscuity, pause, multicast filters, and
 * (optionally) the transmit throttle.  If interrupts_setup is set,
 * the per-slice interrupt queue DMA addresses are re-registered.
 * Returns 0 on success or a non-zero error/ENXIO on failure.
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
        struct mxge_slice_state *ss;
        mxge_rx_done_t *rx_done;
        volatile uint32_t *irq_claim;
        mxge_cmd_t cmd;
        int slice, status;

        /* try to send a reset command to the card to see if it
           is alive */
        memset(&cmd, 0, sizeof (cmd));
        status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
        if (status != 0) {
                device_printf(sc->dev, "failed reset\n");
                return ENXIO;
        }

        mxge_dummy_rdma(sc, 1);


        /* set the intrq size */
        cmd.data0 = sc->rx_ring_size;
        status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

        /*
         * Even though we already know how many slices are supported
         * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
         * has magic side effects, and must be called after a reset.
         * It must be called prior to calling any RSS related cmds,
         * including assigning an interrupt queue for anything but
         * slice 0.  It must also be called *after*
         * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
         * the firmware to compute offsets.
         */
        
        if (sc->num_slices > 1) {
                /* ask the maximum number of slices it supports */
                status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
                                           &cmd);
                if (status != 0) {
                        device_printf(sc->dev,
                                      "failed to get number of slices\n");
                        return status;
                }
                /*
                 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
                 * to setting up the interrupt queue DMA
                 */
                cmd.data0 = sc->num_slices;
                cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
                cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
                status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
                                           &cmd);
                if (status != 0) {
                        device_printf(sc->dev,
                                      "failed to set number of slices\n");
                        return status;
                }
        }


        if (interrupts_setup) {
                /* Now exchange information about interrupts  */
                for (slice = 0; slice < sc->num_slices; slice++) {
                        /* give each slice's interrupt queue to the NIC */
                        rx_done = &sc->ss[slice].rx_done;
                        memset(rx_done->entry, 0, sc->rx_ring_size);
                        cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
                        cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
                        cmd.data2 = slice;
                        status |= mxge_send_cmd(sc,
                                                MXGEFW_CMD_SET_INTRQ_DMA,
                                                &cmd);
                }
        }

        /* look up the SRAM offsets of the coalescing delay, IRQ ack,
           and IRQ deassert registers; errors are OR'ed into status */
        status |= mxge_send_cmd(sc,
                                MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
        

        sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

        status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
        irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


        status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
                                &cmd);
        sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
        if (status != 0) {
                device_printf(sc->dev, "failed set interrupt parameters\n");
                return status;
        }
        

        *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

        
        /* run a DMA benchmark */
        (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

        for (slice = 0; slice < sc->num_slices; slice++) {
                ss = &sc->ss[slice];

                /* each slice claims a pair of IRQ-claim registers */
                ss->irq_claim = irq_claim + (2 * slice);
                /* reset mcp/driver shared state back to 0 */
                ss->rx_done.idx = 0;
                ss->rx_done.cnt = 0;
                ss->tx.req = 0;
                ss->tx.done = 0;
                ss->tx.pkt_done = 0;
                ss->tx.queue_active = 0;
                ss->tx.activate = 0;
                ss->tx.deactivate = 0;
                ss->tx.wake = 0;
                ss->tx.defrag = 0;
                ss->tx.stall = 0;
                ss->rx_big.cnt = 0;
                ss->rx_small.cnt = 0;
                ss->lc.lro_bad_csum = 0;
                ss->lc.lro_queued = 0;
                ss->lc.lro_flushed = 0;
                if (ss->fw_stats != NULL) {
                        bzero(ss->fw_stats, sizeof *ss->fw_stats);
                }
        }
        sc->rdma_tags_available = 15;
        /* restore the NIC's view of the host configuration */
        status = mxge_update_mac_address(sc);
        mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
        mxge_change_pause(sc, sc->pause);
        mxge_set_multicast_list(sc);
        if (sc->throttle) {
                cmd.data0 = sc->throttle;
                if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
                                  &cmd)) {
                        device_printf(sc->dev,
                                      "can't enable throttle\n");
                }
        }
        return status;
}
1340
1341 static int
1342 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1343 {
1344         mxge_cmd_t cmd;
1345         mxge_softc_t *sc;
1346         int err;
1347         unsigned int throttle;
1348
1349         sc = arg1;
1350         throttle = sc->throttle;
1351         err = sysctl_handle_int(oidp, &throttle, arg2, req);
1352         if (err != 0) {
1353                 return err;
1354         }
1355
1356         if (throttle == sc->throttle)
1357                 return 0;
1358
1359         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1360                 return EINVAL;
1361         
1362         mtx_lock(&sc->driver_mtx);
1363         cmd.data0 = throttle;
1364         err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1365         if (err == 0)
1366                 sc->throttle = throttle;
1367         mtx_unlock(&sc->driver_mtx);    
1368         return err;
1369 }
1370
1371 static int
1372 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1373 {
1374         mxge_softc_t *sc;
1375         unsigned int intr_coal_delay;
1376         int err;
1377
1378         sc = arg1;
1379         intr_coal_delay = sc->intr_coal_delay;
1380         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1381         if (err != 0) {
1382                 return err;
1383         }
1384         if (intr_coal_delay == sc->intr_coal_delay)
1385                 return 0;
1386
1387         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1388                 return EINVAL;
1389
1390         mtx_lock(&sc->driver_mtx);
1391         *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1392         sc->intr_coal_delay = intr_coal_delay;
1393         
1394         mtx_unlock(&sc->driver_mtx);
1395         return err;
1396 }
1397
1398 static int
1399 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1400 {
1401         mxge_softc_t *sc;
1402         unsigned int enabled;
1403         int err;
1404
1405         sc = arg1;
1406         enabled = sc->pause;
1407         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1408         if (err != 0) {
1409                 return err;
1410         }
1411         if (enabled == sc->pause)
1412                 return 0;
1413
1414         mtx_lock(&sc->driver_mtx);
1415         err = mxge_change_pause(sc, enabled);
1416         mtx_unlock(&sc->driver_mtx);
1417         return err;
1418 }
1419
1420 static int
1421 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1422 {
1423         int err;
1424
1425         if (arg1 == NULL)
1426                 return EFAULT;
1427         arg2 = be32toh(*(int *)arg1);
1428         arg1 = NULL;
1429         err = sysctl_handle_int(oidp, arg1, arg2, req);
1430
1431         return err;
1432 }
1433
1434 static void
1435 mxge_rem_sysctls(mxge_softc_t *sc)
1436 {
1437         struct mxge_slice_state *ss;
1438         int slice;
1439
1440         if (sc->slice_sysctl_tree == NULL)
1441                 return;
1442
1443         for (slice = 0; slice < sc->num_slices; slice++) {
1444                 ss = &sc->ss[slice];
1445                 if (ss == NULL || ss->sysctl_tree == NULL)
1446                         continue;
1447                 sysctl_ctx_free(&ss->sysctl_ctx);
1448                 ss->sysctl_tree = NULL;
1449         }
1450         sysctl_ctx_free(&sc->slice_sysctl_ctx);
1451         sc->slice_sysctl_tree = NULL;
1452 }
1453
1454 static void
1455 mxge_add_sysctls(mxge_softc_t *sc)
1456 {
1457         struct sysctl_ctx_list *ctx;
1458         struct sysctl_oid_list *children;
1459         mcp_irq_data_t *fw;
1460         struct mxge_slice_state *ss;
1461         int slice;
1462         char slice_num[8];
1463
1464         ctx = device_get_sysctl_ctx(sc->dev);
1465         children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1466         fw = sc->ss[0].fw_stats;
1467
1468         /* random information */
1469         SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1470                        "firmware_version",
1471                        CTLFLAG_RD, sc->fw_version,
1472                        0, "firmware version");
1473         SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1474                        "serial_number",
1475                        CTLFLAG_RD, sc->serial_number_string,
1476                        0, "serial number");
1477         SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1478                        "product_code",
1479                        CTLFLAG_RD, sc->product_code_string,
1480                        0, "product_code");
1481         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1482                        "pcie_link_width",
1483                        CTLFLAG_RD, &sc->link_width,
1484                        0, "tx_boundary");
1485         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1486                        "tx_boundary",
1487                        CTLFLAG_RD, &sc->tx_boundary,
1488                        0, "tx_boundary");
1489         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1490                        "write_combine",
1491                        CTLFLAG_RD, &sc->wc,
1492                        0, "write combining PIO?");
1493         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1494                        "read_dma_MBs",
1495                        CTLFLAG_RD, &sc->read_dma,
1496                        0, "DMA Read speed in MB/s");
1497         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1498                        "write_dma_MBs",
1499                        CTLFLAG_RD, &sc->write_dma,
1500                        0, "DMA Write speed in MB/s");
1501         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1502                        "read_write_dma_MBs",
1503                        CTLFLAG_RD, &sc->read_write_dma,
1504                        0, "DMA concurrent Read/Write speed in MB/s");
1505         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1506                        "watchdog_resets",
1507                        CTLFLAG_RD, &sc->watchdog_resets,
1508                        0, "Number of times NIC was reset");
1509
1510
1511         /* performance related tunables */
1512         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513                         "intr_coal_delay",
1514                         CTLTYPE_INT|CTLFLAG_RW, sc,
1515                         0, mxge_change_intr_coal,
1516                         "I", "interrupt coalescing delay in usecs");
1517
1518         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1519                         "throttle",
1520                         CTLTYPE_INT|CTLFLAG_RW, sc,
1521                         0, mxge_change_throttle,
1522                         "I", "transmit throttling");
1523
1524         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1525                         "flow_control_enabled",
1526                         CTLTYPE_INT|CTLFLAG_RW, sc,
1527                         0, mxge_change_flow_control,
1528                         "I", "interrupt coalescing delay in usecs");
1529
1530         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531                        "deassert_wait",
1532                        CTLFLAG_RW, &mxge_deassert_wait,
1533                        0, "Wait for IRQ line to go low in ihandler");
1534
1535         /* stats block from firmware is in network byte order.
1536            Need to swap it */
1537         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538                         "link_up",
1539                         CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1540                         0, mxge_handle_be32,
1541                         "I", "link up");
1542         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543                         "rdma_tags_available",
1544                         CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1545                         0, mxge_handle_be32,
1546                         "I", "rdma_tags_available");
1547         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548                         "dropped_bad_crc32",
1549                         CTLTYPE_INT|CTLFLAG_RD,
1550                         &fw->dropped_bad_crc32,
1551                         0, mxge_handle_be32,
1552                         "I", "dropped_bad_crc32");
1553         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554                         "dropped_bad_phy",
1555                         CTLTYPE_INT|CTLFLAG_RD,
1556                         &fw->dropped_bad_phy,
1557                         0, mxge_handle_be32,
1558                         "I", "dropped_bad_phy");
1559         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560                         "dropped_link_error_or_filtered",
1561                         CTLTYPE_INT|CTLFLAG_RD,
1562                         &fw->dropped_link_error_or_filtered,
1563                         0, mxge_handle_be32,
1564                         "I", "dropped_link_error_or_filtered");
1565         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566                         "dropped_link_overflow",
1567                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1568                         0, mxge_handle_be32,
1569                         "I", "dropped_link_overflow");
1570         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571                         "dropped_multicast_filtered",
1572                         CTLTYPE_INT|CTLFLAG_RD,
1573                         &fw->dropped_multicast_filtered,
1574                         0, mxge_handle_be32,
1575                         "I", "dropped_multicast_filtered");
1576         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577                         "dropped_no_big_buffer",
1578                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1579                         0, mxge_handle_be32,
1580                         "I", "dropped_no_big_buffer");
1581         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582                         "dropped_no_small_buffer",
1583                         CTLTYPE_INT|CTLFLAG_RD,
1584                         &fw->dropped_no_small_buffer,
1585                         0, mxge_handle_be32,
1586                         "I", "dropped_no_small_buffer");
1587         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588                         "dropped_overrun",
1589                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1590                         0, mxge_handle_be32,
1591                         "I", "dropped_overrun");
1592         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593                         "dropped_pause",
1594                         CTLTYPE_INT|CTLFLAG_RD,
1595                         &fw->dropped_pause,
1596                         0, mxge_handle_be32,
1597                         "I", "dropped_pause");
1598         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1599                         "dropped_runt",
1600                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1601                         0, mxge_handle_be32,
1602                         "I", "dropped_runt");
1603
1604         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605                         "dropped_unicast_filtered",
1606                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1607                         0, mxge_handle_be32,
1608                         "I", "dropped_unicast_filtered");
1609
1610         /* verbose printing? */
1611         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1612                        "verbose",
1613                        CTLFLAG_RW, &mxge_verbose,
1614                        0, "verbose printing");
1615
1616         /* add counters exported for debugging from all slices */
1617         sysctl_ctx_init(&sc->slice_sysctl_ctx);
1618         sc->slice_sysctl_tree =
1619                 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1620                                 "slice", CTLFLAG_RD, 0, "");
1621
1622         for (slice = 0; slice < sc->num_slices; slice++) {
1623                 ss = &sc->ss[slice];
1624                 sysctl_ctx_init(&ss->sysctl_ctx);
1625                 ctx = &ss->sysctl_ctx;
1626                 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1627                 sprintf(slice_num, "%d", slice);
1628                 ss->sysctl_tree =
1629                         SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1630                                         CTLFLAG_RD, 0, "");
1631                 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1632                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633                                "rx_small_cnt",
1634                                CTLFLAG_RD, &ss->rx_small.cnt,
1635                                0, "rx_small_cnt");
1636                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637                                "rx_big_cnt",
1638                                CTLFLAG_RD, &ss->rx_big.cnt,
1639                                0, "rx_small_cnt");
1640                 SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1641                                "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1642                                0, "number of lro merge queues flushed");
1643
1644                 SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1645                                "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1646                                0, "number of bad csums preventing LRO");
1647
1648                 SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1649                                "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1650                                0, "number of frames appended to lro merge"
1651                                "queues");
1652
1653 #ifndef IFNET_BUF_RING
1654                 /* only transmit from slice 0 for now */
1655                 if (slice > 0)
1656                         continue;
1657 #endif
1658                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659                                "tx_req",
1660                                CTLFLAG_RD, &ss->tx.req,
1661                                0, "tx_req");
1662
1663                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664                                "tx_done",
1665                                CTLFLAG_RD, &ss->tx.done,
1666                                0, "tx_done");
1667                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668                                "tx_pkt_done",
1669                                CTLFLAG_RD, &ss->tx.pkt_done,
1670                                0, "tx_done");
1671                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672                                "tx_stall",
1673                                CTLFLAG_RD, &ss->tx.stall,
1674                                0, "tx_stall");
1675                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676                                "tx_wake",
1677                                CTLFLAG_RD, &ss->tx.wake,
1678                                0, "tx_wake");
1679                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680                                "tx_defrag",
1681                                CTLFLAG_RD, &ss->tx.defrag,
1682                                0, "tx_defrag");
1683                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684                                "tx_queue_active",
1685                                CTLFLAG_RD, &ss->tx.queue_active,
1686                                0, "tx_queue_active");
1687                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688                                "tx_activate",
1689                                CTLFLAG_RD, &ss->tx.activate,
1690                                0, "tx_activate");
1691                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692                                "tx_deactivate",
1693                                CTLFLAG_RD, &ss->tx.deactivate,
1694                                0, "tx_deactivate");
1695         }
1696 }
1697
1698 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1699    backwards one at a time and handle ring wraps */
1700
static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	/*
	 * Copy descriptors into the NIC ring from the last one backwards
	 * toward the second; the first descriptor (src[0]) is deliberately
	 * NOT written here — the caller (mxge_submit_req) writes it last
	 * so the NIC does not see a partially-written chain.
	 */
	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		/* mask handles the ring wrap */
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();	/* keep each PIO descriptor write ordered */
	}
}
1715
1716 /*
1717  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1718  * at most 32 bytes at a time, so as to avoid involving the software
1719  * pio handler in the nic.   We re-write the first segment's flags
1720  * to mark them valid only after writing the entire chain
1721  */
1722
static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;
	
	idx = tx->req & tx->mask;

	/*
	 * Stash and clear the first descriptor's flags so the NIC treats
	 * the whole chain as invalid until the final 32-bit rewrite below
	 * makes it visible atomically.
	 */
	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		/* no ring wrap: burst two 16-byte descriptors (32 bytes)
		   at a time to avoid the NIC's software pio handler */
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request (or the odd trailing one) */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints+=3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints+=3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	wmb();
}
1771
/*
 * Parse the Ethernet/IP(v6)/TCP headers of 'm' into 'pi' so the encap
 * routines know the L3 offset, header length, and (for TSO) the TCP
 * header location.  Headers that are not contiguous in the first mbuf
 * are copied into the slice scratch buffer.  Returns 0 on success or
 * EINVAL for unsupported frames.
 */
static int
mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
    struct mxge_pkt_info *pi)
{
	struct ether_vlan_header *eh;
	uint16_t etype;
	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
#if IFCAP_TSO6 && defined(INET6)
	int nxt;
#endif

	/* account for an 802.1Q tag when locating the L3 header */
	eh = mtod(m, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		pi->ip_off = ETHER_HDR_LEN;
	}

	switch (etype) {
	case ETHERTYPE_IP:
		/*
		 * ensure ip header is in first mbuf, copy it to a
		 * scratch buffer if not
		 */
		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
		pi->ip6 = NULL;
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
			    ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->ip_hlen = pi->ip->ip_hl << 2;
		if (!tso)
			return 0;

		/* for TSO the TCP header must be contiguous as well */
		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
		break;
#if IFCAP_TSO6 && defined(INET6)
	case ETHERTYPE_IPV6:
		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
			    ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		/* walk the extension-header chain to the upper protocol */
		nxt = 0;
		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
		pi->ip_hlen -= pi->ip_off;
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return EINVAL;

		if (!tso)
			return 0;

		/* firmware limits the total TSO IPv6 header length */
		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
			return EINVAL;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
		break;
#endif
	default:
		return EINVAL;
	}
	return 0;
}
1851
1852 #if IFCAP_TSO4
1853
/*
 * Enqueue a TSO packet: walk the busdma segment list, slicing the
 * payload into mss-sized pieces and emitting one send descriptor per
 * piece.  The header region (tracked by a negative cum_len) is emitted
 * first with TSO_HDR flags; the firmware replicates it per segment.
 * The mapped mbuf is consumed: freed on the drop path, otherwise
 * reclaimed later by tx-done processing.
 */
static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;	/* rate-limits the max_desc warning below */

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));

	/* TSO implies checksum offload on this hardware */
	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
		/*
		 * If packet has full TCP csum, replace it with pseudo hdr
		 * sum that the NIC expects, otherwise the NIC will emit
		 * packets with bad TCP checksums.
		 */
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		if (pi->ip6) {
#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
			sum = in6_cksum_pseudo(pi->ip6,
			    m->m_pkthdr.len - cksum_offset,
			    IPPROTO_TCP, 0);
#endif
		} else {
#ifdef INET
			m->m_pkthdr.csum_flags |= CSUM_TCP;
			sum = in_pseudo(pi->ip->ip_src.s_addr,
			    pi->ip->ip_dst.s_addr,
			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
				    cksum_offset)));
#endif
		}
		/* write the pseudo-header sum into the TCP checksum field */
		m_copyback(m, offsetof(struct tcphdr, th_sum) +
		    cksum_offset, sizeof(sum), (caddr_t)&sum);
	}
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;

	
	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	if (pi->ip6) {
		/*
		 * for IPv6 TSO, the "checksum offset" is re-purposed
		 * to store the TCP header len
		 */
		cksum_offset = (pi->tcp->th_off << 2);
	}

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one ore more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped =  htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* retroactively patch the RDMA count of the run
			 * that started rdma_count requests ago */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/* branch-free: reset to -1 on a cut, bump
				 * when the chop is mid-descriptor */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }
			
			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* IPv4: count cksum_offset down until the TCP
			 * checksum start falls inside a descriptor */
			if (cksum_offset != 0 && !pi->ip6) {
				if (__predict_false(cksum_offset > seglen))
					cksum_offset -= seglen;
				else
					cksum_offset = 0;
			}
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	/* backfill the RDMA count of the final run of requests */
	(req-rdma_count)->rdma_count = rdma_count;

	/* walk backwards marking TSO_LAST on the tail of the final cut */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	/* flag the slot holding the last descriptor for tx-done reclaim */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	ss->oerrors++;
	if (!once) {
		printf("tx->max_desc exceeded via TSO!\n");
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}
2037
2038 #endif /* IFCAP_TSO4 */
2039
2040 #ifdef MXGE_NEW_VLAN_API
2041 /*
2042  * We reproduce the software vlan tag insertion from
2043  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2044  * vlan tag insertion. We need to advertise this in order to have the
2045  * vlan interface respect our csum offload flags.
2046  */
2047 static struct mbuf *
2048 mxge_vlan_tag_insert(struct mbuf *m)
2049 {
2050         struct ether_vlan_header *evl;
2051
2052         M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2053         if (__predict_false(m == NULL))
2054                 return NULL;
2055         if (m->m_len < sizeof(*evl)) {
2056                 m = m_pullup(m, sizeof(*evl));
2057                 if (__predict_false(m == NULL))
2058                         return NULL;
2059         }
2060         /*
2061          * Transform the Ethernet header into an Ethernet header
2062          * with 802.1Q encapsulation.
2063          */
2064         evl = mtod(m, struct ether_vlan_header *);
2065         bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2066               (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2067         evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2068         evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2069         m->m_flags &= ~M_VLANTAG;
2070         return m;
2071 }
2072 #endif /* MXGE_NEW_VLAN_API */
2073
/*
 * Transmit one mbuf chain on slice 'ss': parse headers when offload is
 * requested, DMA-map the chain (defragmenting once on EFBIG), then
 * build and submit the send-descriptor list.  'm' is consumed on every
 * path: handed to the NIC on success, freed on failure.
 */
static void
mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
{
	struct mxge_pkt_info pi = {0,0,0,0};
	mxge_softc_t *sc;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
	uint8_t flags, cksum_offset;


	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

#ifdef MXGE_NEW_VLAN_API
	/* software-insert the vlan tag so csum offload flags survive */
	if (m->m_flags & M_VLANTAG) {
		m = mxge_vlan_tag_insert(m);
		if (__predict_false(m == NULL))
			goto drop_without_m;
	}
#endif
	if (m->m_pkthdr.csum_flags &
	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		if (mxge_parse_tx(ss, m, &pi))
			goto drop;
	}

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		ss->tx.defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;

#if IFCAP_TSO4
	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(ss, m, cnt, &pi);
		return;
	}
#endif

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags &
	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		cksum_offset = pi.ip_off + pi.ip_hlen;
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	/* firmware takes a fast path for short frames */
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* count cksum_offset down until the checksum start lands
		   inside a descriptor */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		/* point the extra descriptor at the preallocated zero pad */
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	/* non-TSO: the first request carries the full RDMA count */
	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the slot holding the last descriptor for tx-done reclaim */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	m_freem(m);
drop_without_m:
	ss->oerrors++;
	return;
}
2239
2240 #ifdef IFNET_BUF_RING
2241 static void
2242 mxge_qflush(struct ifnet *ifp)
2243 {
2244         mxge_softc_t *sc = ifp->if_softc;
2245         mxge_tx_ring_t *tx;
2246         struct mbuf *m;
2247         int slice;
2248
2249         for (slice = 0; slice < sc->num_slices; slice++) {
2250                 tx = &sc->ss[slice].tx;
2251                 mtx_lock(&tx->mtx);
2252                 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2253                         m_freem(m);
2254                 mtx_unlock(&tx->mtx);
2255         }
2256         if_qflush(ifp);
2257 }
2258
2259 static inline void
2260 mxge_start_locked(struct mxge_slice_state *ss)
2261 {
2262         mxge_softc_t *sc;
2263         struct mbuf *m;
2264         struct ifnet *ifp;
2265         mxge_tx_ring_t *tx;
2266
2267         sc = ss->sc;
2268         ifp = sc->ifp;
2269         tx = &ss->tx;
2270
2271         while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2272                 m = drbr_dequeue(ifp, tx->br);
2273                 if (m == NULL) {
2274                         return;
2275                 }
2276                 /* let BPF see it */
2277                 BPF_MTAP(ifp, m);
2278
2279                 /* give it to the nic */
2280                 mxge_encap(ss, m);
2281         }
2282         /* ran out of transmit slots */
2283         if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2284             && (!drbr_empty(ifp, tx->br))) {
2285                 ss->if_drv_flags |= IFF_DRV_OACTIVE;
2286                 tx->stall++;
2287         }
2288 }
2289
2290 static int
2291 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2292 {
2293         mxge_softc_t *sc;
2294         struct ifnet *ifp;
2295         mxge_tx_ring_t *tx;
2296         int err;
2297
2298         sc = ss->sc;
2299         ifp = sc->ifp;
2300         tx = &ss->tx;
2301
2302         if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2303             IFF_DRV_RUNNING) {
2304                 err = drbr_enqueue(ifp, tx->br, m);
2305                 return (err);
2306         }
2307
2308         if (!drbr_needs_enqueue(ifp, tx->br) &&
2309             ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2310                 /* let BPF see it */
2311                 BPF_MTAP(ifp, m);
2312                 /* give it to the nic */
2313                 mxge_encap(ss, m);
2314         } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315                 return (err);
2316         }
2317         if (!drbr_empty(ifp, tx->br))
2318                 mxge_start_locked(ss);
2319         return (0);
2320 }
2321
2322 static int
2323 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2324 {
2325         mxge_softc_t *sc = ifp->if_softc;
2326         struct mxge_slice_state *ss;
2327         mxge_tx_ring_t *tx;
2328         int err = 0;
2329         int slice;
2330
2331         slice = m->m_pkthdr.flowid;
2332         slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2333
2334         ss = &sc->ss[slice];
2335         tx = &ss->tx;
2336
2337         if (mtx_trylock(&tx->mtx)) {
2338                 err = mxge_transmit_locked(ss, m);
2339                 mtx_unlock(&tx->mtx);
2340         } else {
2341                 err = drbr_enqueue(ifp, tx->br, m);
2342         }
2343
2344         return (err);
2345 }
2346
2347 #else
2348
2349 static inline void
2350 mxge_start_locked(struct mxge_slice_state *ss)
2351 {
2352         mxge_softc_t *sc;
2353         struct mbuf *m;
2354         struct ifnet *ifp;
2355         mxge_tx_ring_t *tx;
2356
2357         sc = ss->sc;
2358         ifp = sc->ifp;
2359         tx = &ss->tx;
2360         while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2361                 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2362                 if (m == NULL) {
2363                         return;
2364                 }
2365                 /* let BPF see it */
2366                 BPF_MTAP(ifp, m);
2367
2368                 /* give it to the nic */
2369                 mxge_encap(ss, m);
2370         }
2371         /* ran out of transmit slots */
2372         if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2373                 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2374                 tx->stall++;
2375         }
2376 }
2377 #endif
2378 static void
2379 mxge_start(struct ifnet *ifp)
2380 {
2381         mxge_softc_t *sc = ifp->if_softc;
2382         struct mxge_slice_state *ss;
2383
2384         /* only use the first slice for now */
2385         ss = &sc->ss[0];
2386         mtx_lock(&ss->tx.mtx);
2387         mxge_start_locked(ss);
2388         mtx_unlock(&ss->tx.mtx);                
2389 }
2390
2391 /*
2392  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2393  * at most 32 bytes at a time, so as to avoid involving the software
2394  * pio handler in the nic.   We re-write the first segment's low
2395  * DMA address to mark it valid only after we write the entire chunk
2396  * in a burst
2397  */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/*
	 * Poison the first descriptor's low address so the NIC treats
	 * the 8-entry chunk as invalid until the final rewrite below.
	 */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* two 4-descriptor bursts keep each PIO write within 32 bytes */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	src->addr_low = low;	/* restore the host shadow copy */
	dst->addr_low = low;	/* make the whole chunk valid to the NIC */
	wmb();
}
2414
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot
 * 'idx' and record it in the shadow ring.  Descriptors are handed to
 * the NIC in groups of 8: every 8th slot triggers a burst submit of
 * the preceding group — even on allocation failure, in which case the
 * existing shadow entry for the slot is re-posted.  Returns 0 or an
 * errno.
 */
static int
mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_small;
	int cnt, err;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	/* split the 64-bit bus address into the firmware's two words */
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	/* submit a full group of 8 descriptors on every 8th slot */
	if ((idx & 7) == 7)
		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
	return err;
}
2447
2448 static int
2449 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2450 {
2451         bus_dma_segment_t seg[3];
2452         struct mbuf *m;
2453         mxge_rx_ring_t *rx = &ss->rx_big;
2454         int cnt, err, i;
2455
2456         m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2457         if (m == NULL) {
2458                 rx->alloc_fail++;
2459                 err = ENOBUFS;
2460                 goto done;
2461         }
2462         m->m_len = rx->mlen;
2463         err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2464                                       seg, &cnt, BUS_DMA_NOWAIT);
2465         if (err != 0) {
2466                 m_free(m);
2467                 goto done;
2468         }
2469         rx->info[idx].m = m;
2470         rx->shadow[idx].addr_low =
2471                 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2472         rx->shadow[idx].addr_high =
2473                 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2474
2475 #if MXGE_VIRT_JUMBOS
2476         for (i = 1; i < cnt; i++) {
2477                 rx->shadow[idx + i].addr_low =
2478                         htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2479                 rx->shadow[idx + i].addr_high =
2480                         htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2481        }
2482 #endif
2483
2484 done:
2485        for (i = 0; i < rx->nbufs; i++) {
2486                 if ((idx & 7) == 7) {
2487                         mxge_submit_8rx(&rx->lanai[idx - 7],
2488                                         &rx->shadow[idx - 7]);
2489                 }
2490                 idx++;
2491         }
2492         return err;
2493 }
2494
2495 #ifdef INET6
2496
/*
 * Compute a one's-complement sum over "len" bytes of 16-bit words
 * and fold it to 16 bits.  "len" is consumed two bytes at a time,
 * matching how the firmware checksums the frame.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t sum = 0;

	for (; len > 0; len -= 2)
		sum += *raw++;
	/* fold carries back into the low 16 bits (twice) */
	sum = (sum >> 16) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return ((uint16_t)sum);
}
2513
/*
 * Validate the hardware's partial checksum against an IPv6 TCP/UDP
 * packet.  Returns 0 iff the transport checksum verifies; non-zero
 * means the stack must not trust the hardware checksum.
 */
static inline uint16_t
mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
{
        uint32_t partial;
        int nxt, cksum_offset;
        struct ip6_hdr *ip6 = p;
        uint16_t c;

        nxt = ip6->ip6_nxt;
        cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
        if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
                /* walk any extension headers to find the transport
                 * protocol and the offset where its data begins */
                cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
                                           IPPROTO_IPV6, &nxt);
                if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
                        return (1);
        }

        /*
         * IPv6 headers do not contain a checksum, and hence
         * do not checksum to zero, so they don't "fall out"
         * of the partial checksum calculation like IPv4
         * headers do.  We need to fix the partial checksum by
         * subtracting the checksum of the IPv6 header.
         */

        partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
                                    ETHER_HDR_LEN);
        /* one's-complement subtract "partial", then fold carries */
        csum += ~partial;
        csum +=  (csum < ~partial);
        csum = (csum >> 16) + (csum & 0xFFFF);
        csum = (csum >> 16) + (csum & 0xFFFF);
        /* c == 0 iff the TCP/UDP checksum is valid */
        c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
                             csum);
        c ^= 0xffff;
        return (c);
}
2550 #endif /* INET6 */
2551 /*
2552  *  Myri10GE hardware checksums are not valid if the sender
2553  *  padded the frame with non-zero padding.  This is because
2554  *  the firmware just does a simple 16-bit 1s complement
2555  *  checksum across the entire frame, excluding the first 14
2556  *  bytes.  It is best to simply to check the checksum and
2557  *  tell the stack about it only if the checksum is good
2558  */
2559
/*
 * Returns 0 iff the hardware's partial checksum "csum" validates the
 * received TCP/UDP packet in "m"; any non-zero return means the
 * checksum must not be reported to the stack.
 */
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
        struct ether_header *eh;
#ifdef INET
        struct ip *ip;
#endif
#if defined(INET) || defined(INET6)
        int cap = m->m_pkthdr.rcvif->if_capenable;
#endif
        uint16_t c, etype;


        eh = mtod(m, struct ether_header *);
        etype = ntohs(eh->ether_type);
        switch (etype) {
#ifdef INET
        case ETHERTYPE_IP:
                if ((cap & IFCAP_RXCSUM) == 0)
                        return (1);
                ip = (struct ip *)(eh + 1);
                /* only TCP and UDP payloads can be validated */
                if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
                        return (1);
                /* fold the pseudo-header into the hardware's partial
                 * checksum; the result is 0 iff the packet is good */
                c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
                              htonl(ntohs(csum) + ntohs(ip->ip_len) -
                                    (ip->ip_hl << 2) + ip->ip_p));
                c ^= 0xffff;
                break;
#endif
#ifdef INET6
        case ETHERTYPE_IPV6:
                if ((cap & IFCAP_RXCSUM_IPV6) == 0)
                        return (1);
                c = mxge_rx_csum6((eh + 1), m, csum);
                break;
#endif
        default:
                /* not an IP frame; cannot validate */
                c = 1;
        }
        return (c);
}
2601
/*
 * Strip an 802.1q encapsulation header from the mbuf in place,
 * record the VLAN tag on the mbuf, and adjust the hardware's
 * partial checksum (*csum) so it no longer covers the 4 removed
 * encapsulation bytes.
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
        struct ether_vlan_header *evl;
        struct ether_header *eh;
        uint32_t partial;

        evl = mtod(m, struct ether_vlan_header *);
        eh = mtod(m, struct ether_header *);

        /*
         * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
         * after what the firmware thought was the end of the ethernet
         * header.
         */

        /* put checksum into host byte order */
        *csum = ntohs(*csum);
        /* the 4 bytes being removed, as a one's-complement quantity */
        partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
        (*csum) += ~partial;
        (*csum) +=  ((*csum) < ~partial);
        /* fold carries back into the low 16 bits (twice) */
        (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
        (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

        /* restore checksum to network byte order;
           later consumers expect this */
        *csum = htons(*csum);

        /* save the tag */
#ifdef MXGE_NEW_VLAN_API        
        m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
#else
        {
                struct m_tag *mtag;
                mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
                                   M_NOWAIT);
                if (mtag == NULL)
                        return;
                VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
                m_tag_prepend(m, mtag);
        }

#endif
        m->m_flags |= M_VLANTAG;

        /*
         * Remove the 802.1q header by copying the Ethernet
         * addresses over it and adjusting the beginning of
         * the data in the mbuf.  The encapsulated Ethernet
         * type field is already in place.
         */
        bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
              ETHER_HDR_LEN - ETHER_TYPE_LEN);
        m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2657
2658
/*
 * Process one received frame from the big-buffer ring: replace the
 * ring buffer, fix up the mbuf, strip any VLAN tag, validate the
 * hardware checksum and hand the frame to LRO or up the stack.
 */
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
                 uint32_t csum, int lro)
{
        mxge_softc_t *sc;
        struct ifnet *ifp;
        struct mbuf *m;
        struct ether_header *eh;
        mxge_rx_ring_t *rx;
        bus_dmamap_t old_map;
        int idx;

        sc = ss->sc;
        ifp = sc->ifp;
        rx = &ss->rx_big;
        idx = rx->cnt & rx->mask;
        /* one big buffer may occupy nbufs ring slots */
        rx->cnt += rx->nbufs;
        /* save a pointer to the received mbuf */
        m = rx->info[idx].m;
        /* try to replace the received mbuf */
        if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
                /* drop the frame -- the old mbuf is re-cycled */
                if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
                return;
        }

        /* unmap the received buffer */
        old_map = rx->info[idx].map;
        bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
        bus_dmamap_unload(rx->dmat, old_map);

        /* swap the bus_dmamap_t's */
        rx->info[idx].map = rx->extra_map;
        rx->extra_map = old_map;

        /* mcp implicitly skips 1st 2 bytes so that packet is properly
         * aligned */
        m->m_data += MXGEFW_PAD;

        m->m_pkthdr.rcvif = ifp;
        m->m_len = m->m_pkthdr.len = len;
        ss->ipackets++;
        eh = mtod(m, struct ether_header *);
        if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
                /* also adjusts csum for the removed 4 bytes */
                mxge_vlan_tag_remove(m, &csum);
        }
        /* flowid only valid if RSS hashing is enabled */
        if (sc->num_slices > 1) {
                m->m_pkthdr.flowid = (ss - sc->ss);
                M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
        }
        /* if the checksum is valid, mark it in the mbuf header */
        if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
            (0 == mxge_rx_csum(m, csum))) {
                /* Tell the stack that the  checksum is good */
                m->m_pkthdr.csum_data = 0xffff;
                m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
                        CSUM_DATA_VALID;

#if defined(INET) || defined (INET6)
                /* NOTE(review): the small-ring path passes "csum" to
                 * tcp_lro_rx; this path passes 0 -- confirm the
                 * asymmetry is intentional */
                if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
                        return;
#endif
        }
        /* pass the frame up the stack */
        (*ifp->if_input)(ifp, m);
}
2726
/*
 * Process one received frame from the small-buffer ring: replace the
 * ring buffer, fix up the mbuf, strip any VLAN tag, validate the
 * hardware checksum and hand the frame to LRO or up the stack.
 */
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
                   uint32_t csum, int lro)
{
        mxge_softc_t *sc;
        struct ifnet *ifp;
        struct ether_header *eh;
        struct mbuf *m;
        mxge_rx_ring_t *rx;
        bus_dmamap_t old_map;
        int idx;

        sc = ss->sc;
        ifp = sc->ifp;
        rx = &ss->rx_small;
        idx = rx->cnt & rx->mask;
        rx->cnt++;
        /* save a pointer to the received mbuf */
        m = rx->info[idx].m;
        /* try to replace the received mbuf */
        if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
                /* drop the frame -- the old mbuf is re-cycled */
                if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
                return;
        }

        /* unmap the received buffer */
        old_map = rx->info[idx].map;
        bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
        bus_dmamap_unload(rx->dmat, old_map);

        /* swap the bus_dmamap_t's */
        rx->info[idx].map = rx->extra_map;
        rx->extra_map = old_map;

        /* mcp implicitly skips 1st 2 bytes so that packet is properly
         * aligned */
        m->m_data += MXGEFW_PAD;

        m->m_pkthdr.rcvif = ifp;
        m->m_len = m->m_pkthdr.len = len;
        ss->ipackets++;
        eh = mtod(m, struct ether_header *);
        if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
                /* also adjusts csum for the removed 4 bytes */
                mxge_vlan_tag_remove(m, &csum);
        }
        /* flowid only valid if RSS hashing is enabled */
        if (sc->num_slices > 1) {
                m->m_pkthdr.flowid = (ss - sc->ss);
                M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
        }
        /* if the checksum is valid, mark it in the mbuf header */
        if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
            (0 == mxge_rx_csum(m, csum))) {
                /* Tell the stack that the  checksum is good */
                m->m_pkthdr.csum_data = 0xffff;
                m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
                        CSUM_DATA_VALID;

#if defined(INET) || defined (INET6)
                if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
                        return;
#endif
        }
        /* pass the frame up the stack */
        (*ifp->if_input)(ifp, m);
}
2794
2795 static inline void
2796 mxge_clean_rx_done(struct mxge_slice_state *ss)
2797 {
2798         mxge_rx_done_t *rx_done = &ss->rx_done;
2799         int limit = 0;
2800         uint16_t length;
2801         uint16_t checksum;
2802         int lro;
2803
2804         lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2805         while (rx_done->entry[rx_done->idx].length != 0) {
2806                 length = ntohs(rx_done->entry[rx_done->idx].length);
2807                 rx_done->entry[rx_done->idx].length = 0;
2808                 checksum = rx_done->entry[rx_done->idx].checksum;
2809                 if (length <= (MHLEN - MXGEFW_PAD))
2810                         mxge_rx_done_small(ss, length, checksum, lro);
2811                 else
2812                         mxge_rx_done_big(ss, length, checksum, lro);
2813                 rx_done->cnt++;
2814                 rx_done->idx = rx_done->cnt & rx_done->mask;
2815
2816                 /* limit potential for livelock */
2817                 if (__predict_false(++limit > rx_done->mask / 2))
2818                         break;
2819         }
2820 #if defined(INET)  || defined (INET6)
2821         tcp_lro_flush_all(&ss->lc);
2822 #endif
2823 }
2824
2825
2826 static inline void
2827 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2828 {
2829         struct ifnet *ifp;
2830         mxge_tx_ring_t *tx;
2831         struct mbuf *m;
2832         bus_dmamap_t map;
2833         int idx;
2834         int *flags;
2835
2836         tx = &ss->tx;
2837         ifp = ss->sc->ifp;
2838         while (tx->pkt_done != mcp_idx) {
2839                 idx = tx->done & tx->mask;
2840                 tx->done++;
2841                 m = tx->info[idx].m;
2842                 /* mbuf and DMA map only attached to the first
2843                    segment per-mbuf */
2844                 if (m != NULL) {
2845                         ss->obytes += m->m_pkthdr.len;
2846                         if (m->m_flags & M_MCAST)
2847                                 ss->omcasts++;
2848                         ss->opackets++;
2849                         tx->info[idx].m = NULL;
2850                         map = tx->info[idx].map;
2851                         bus_dmamap_unload(tx->dmat, map);
2852                         m_freem(m);
2853                 }
2854                 if (tx->info[idx].flag) {
2855                         tx->info[idx].flag = 0;
2856                         tx->pkt_done++;
2857                 }
2858         }
2859         
2860         /* If we have space, clear IFF_OACTIVE to tell the stack that
2861            its OK to send packets */
2862 #ifdef IFNET_BUF_RING
2863         flags = &ss->if_drv_flags;
2864 #else
2865         flags = &ifp->if_drv_flags;
2866 #endif
2867         mtx_lock(&ss->tx.mtx);
2868         if ((*flags) & IFF_DRV_OACTIVE &&
2869             tx->req - tx->done < (tx->mask + 1)/4) {
2870                 *(flags) &= ~IFF_DRV_OACTIVE;
2871                 ss->tx.wake++;
2872                 mxge_start_locked(ss);
2873         }
2874 #ifdef IFNET_BUF_RING
2875         if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2876                 /* let the NIC stop polling this queue, since there
2877                  * are no more transmits pending */
2878                 if (tx->req == tx->done) {
2879                         *tx->send_stop = 1;
2880                         tx->queue_active = 0;
2881                         tx->deactivate++;
2882                         wmb();
2883                 }
2884         }
2885 #endif
2886         mtx_unlock(&ss->tx.mtx);
2887
2888 }
2889
/* Media decoded from the XFP module's 10GbE compliance byte: one
 * bit per standard (bitmask), mapped to an ifmedia type (flag 0
 * means FreeBSD has no matching ifmedia word). */
static struct mxge_media_type mxge_xfp_media_types[] =
{
        {IFM_10G_CX4,   0x7f,           "10GBASE-CX4 (module)"},
        {IFM_10G_SR,    (1 << 7),       "10GBASE-SR"},
        {IFM_10G_LR,    (1 << 6),       "10GBASE-LR"},
        {0,             (1 << 5),       "10GBASE-ER"},
        {IFM_10G_LRM,   (1 << 4),       "10GBASE-LRM"},
        {0,             (1 << 3),       "10GBASE-SW"},
        {0,             (1 << 2),       "10GBASE-LW"},
        {0,             (1 << 1),       "10GBASE-EW"},
        {0,             (1 << 0),       "Reserved"}
};
/* Media decoded from byte 3 of the SFP+ module EEPROM; the first
 * entry (bitmask 0) is the default when no compliance bit is set. */
static struct mxge_media_type mxge_sfp_media_types[] =
{
        {IFM_10G_TWINAX,      0,        "10GBASE-Twinax"},
        {0,             (1 << 7),       "Reserved"},
        {IFM_10G_LRM,   (1 << 6),       "10GBASE-LRM"},
        {IFM_10G_LR,    (1 << 5),       "10GBASE-LR"},
        {IFM_10G_SR,    (1 << 4),       "10GBASE-SR"},
        {IFM_10G_TWINAX,(1 << 0),       "10GBASE-Twinax"}
};
2911
2912 static void
2913 mxge_media_set(mxge_softc_t *sc, int media_type)
2914 {
2915
2916         
2917         ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2918                     0, NULL);
2919         ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2920         sc->current_media = media_type;
2921         sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2922 }
2923
2924 static void
2925 mxge_media_init(mxge_softc_t *sc)
2926 {
2927         char *ptr;
2928         int i;
2929
2930         ifmedia_removeall(&sc->media);
2931         mxge_media_set(sc, IFM_AUTO);
2932
2933         /*
2934          * parse the product code to deterimine the interface type
2935          * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2936          * after the 3rd dash in the driver's cached copy of the
2937          * EEPROM's product code string.
2938          */
2939         ptr = sc->product_code_string;
2940         if (ptr == NULL) {
2941                 device_printf(sc->dev, "Missing product code\n");
2942                 return;
2943         }
2944
2945         for (i = 0; i < 3; i++, ptr++) {
2946                 ptr = strchr(ptr, '-');
2947                 if (ptr == NULL) {
2948                         device_printf(sc->dev,
2949                                       "only %d dashes in PC?!?\n", i);
2950                         return;
2951                 }
2952         }
2953         if (*ptr == 'C' || *(ptr +1) == 'C') {
2954                 /* -C is CX4 */
2955                 sc->connector = MXGE_CX4;
2956                 mxge_media_set(sc, IFM_10G_CX4);
2957         } else if (*ptr == 'Q') {
2958                 /* -Q is Quad Ribbon Fiber */
2959                 sc->connector = MXGE_QRF;
2960                 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2961                 /* FreeBSD has no media type for Quad ribbon fiber */
2962         } else if (*ptr == 'R') {
2963                 /* -R is XFP */
2964                 sc->connector = MXGE_XFP;
2965         } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2966                 /* -S or -2S is SFP+ */
2967                 sc->connector = MXGE_SFP;
2968         } else {
2969                 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2970         }
2971 }
2972
2973 /*
2974  * Determine the media type for a NIC.  Some XFPs will identify
2975  * themselves only when their link is up, so this is initiated via a
2976  * link up interrupt.  However, this can potentially take up to
2977  * several milliseconds, so it is run via the watchdog routine, rather
2978  * than in the interrupt handler itself.
2979  */
static void
mxge_media_probe(mxge_softc_t *sc)
{
        mxge_cmd_t cmd;
        char *cage_type;

        struct mxge_media_type *mxge_media_types = NULL;
        int i, err, ms, mxge_media_type_entries;
        uint32_t byte;

        sc->need_media_probe = 0;

        /* pick the decode table and EEPROM byte for the cage type
         * recorded by mxge_media_init() */
        if (sc->connector == MXGE_XFP) {
                /* -R is XFP */
                mxge_media_types = mxge_xfp_media_types;
                mxge_media_type_entries =
                        nitems(mxge_xfp_media_types);
                byte = MXGE_XFP_COMPLIANCE_BYTE;
                cage_type = "XFP";
        } else  if (sc->connector == MXGE_SFP) {
                /* -S or -2S is SFP+ */
                mxge_media_types = mxge_sfp_media_types;
                mxge_media_type_entries =
                        nitems(mxge_sfp_media_types);
                cage_type = "SFP+";
                byte = 3;
        } else {
                /* nothing to do; media type cannot change */
                return;
        }

        /*
         * At this point we know the NIC has an XFP cage, so now we
         * try to determine what is in the cage by using the
         * firmware's XFP I2C commands to read the XFP 10GbE compliance
         * register.  We read just one byte, which may take over
         * a millisecond
         */

        /* ask the firmware to start the (slow) I2C read */
        cmd.data0 = 0;   /* just fetch 1 byte, not all 256 */
        cmd.data1 = byte;
        err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
        if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
                device_printf(sc->dev, "failed to read XFP\n");
        }
        if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
                device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
        }
        if (err != MXGEFW_CMD_OK) {
                return;
        }

        /* now we wait for the data to be cached */
        cmd.data0 = byte;
        err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
        /* poll up to 50ms for the byte to become available */
        for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
                DELAY(1000);
                cmd.data0 = byte;
                err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
        }
        if (err != MXGEFW_CMD_OK) {
                device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
                              cage_type, err, ms);
                return;
        }
                
        /* entry 0 is matched by exact value (the SFP+ default /
         * XFP CX4 pattern); the rest are matched bit-wise */
        if (cmd.data0 == mxge_media_types[0].bitmask) {
                if (mxge_verbose)
                        device_printf(sc->dev, "%s:%s\n", cage_type,
                                      mxge_media_types[0].name);
                if (sc->current_media != mxge_media_types[0].flag) {
                        mxge_media_init(sc);
                        mxge_media_set(sc, mxge_media_types[0].flag);
                }
                return;
        }
        for (i = 1; i < mxge_media_type_entries; i++) {
                if (cmd.data0 & mxge_media_types[i].bitmask) {
                        if (mxge_verbose)
                                device_printf(sc->dev, "%s:%s\n",
                                              cage_type,
                                              mxge_media_types[i].name);

                        if (sc->current_media != mxge_media_types[i].flag) {
                                mxge_media_init(sc);
                                mxge_media_set(sc, mxge_media_types[i].flag);
                        }
                        return;
                }
        }
        if (mxge_verbose)
                device_printf(sc->dev, "%s media 0x%x unknown\n",
                              cage_type, cmd.data0);

        return;
}
3076
/*
 * Interrupt handler (shared by legacy INTx, MSI and per-slice MSI-X).
 * Drains transmit completions and received frames for the slice,
 * processes firmware link/error status (first slice only) and
 * re-arms the NIC by writing irq_claim.
 */
static void
mxge_intr(void *arg)
{
        struct mxge_slice_state *ss = arg;
        mxge_softc_t *sc = ss->sc;
        mcp_irq_data_t *stats = ss->fw_stats;
        mxge_tx_ring_t *tx = &ss->tx;
        mxge_rx_done_t *rx_done = &ss->rx_done;
        uint32_t send_done_count;
        uint8_t valid;


#ifndef IFNET_BUF_RING
        /* an interrupt on a non-zero slice is implicitly valid
           since MSI-X irqs are not shared */
        if (ss != sc->ss) {
                mxge_clean_rx_done(ss);
                *ss->irq_claim = be32toh(3);
                return;
        }
#endif

        /* make sure the DMA has finished */
        if (!stats->valid) {
                return;
        }
        valid = stats->valid;

        if (sc->legacy_irq) {
                /* lower legacy IRQ  */
                *sc->irq_deassert = 0;
                if (!mxge_deassert_wait)
                        /* don't wait for conf. that irq is low */
                        stats->valid = 0;
        } else {
                stats->valid = 0;
        }

        /* loop while waiting for legacy irq deassertion */
        do {
                /* check for transmit completes and receives */
                send_done_count = be32toh(stats->send_done_count);
                while ((send_done_count != tx->pkt_done) ||
                       (rx_done->entry[rx_done->idx].length != 0)) {
                        if (send_done_count != tx->pkt_done)
                                mxge_tx_done(ss, (int)send_done_count);
                        mxge_clean_rx_done(ss);
                        send_done_count = be32toh(stats->send_done_count);
                }
                if (sc->legacy_irq && mxge_deassert_wait)
                        wmb();
        } while (*((volatile uint8_t *) &stats->valid));

        /* fw link & error stats meaningful only on the first slice */
        if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
                if (sc->link_state != stats->link_up) {
                        sc->link_state = stats->link_up;
                        if (sc->link_state) {
                                if_link_state_change(sc->ifp, LINK_STATE_UP);
                                if (mxge_verbose)
                                        device_printf(sc->dev, "link up\n");
                        } else {
                                if_link_state_change(sc->ifp, LINK_STATE_DOWN);
                                if (mxge_verbose)
                                        device_printf(sc->dev, "link down\n");
                        }
                        /* a link change may mean a new module; probe
                         * from the watchdog, not in interrupt context */
                        sc->need_media_probe = 1;
                }
                if (sc->rdma_tags_available !=
                    be32toh(stats->rdma_tags_available)) {
                        sc->rdma_tags_available =
                                be32toh(stats->rdma_tags_available);
                        device_printf(sc->dev, "RDMA timed out! %d tags "
                                      "left\n", sc->rdma_tags_available);
                }

                if (stats->link_down) {
                        sc->down_cnt += stats->link_down;
                        sc->link_state = 0;
                        if_link_state_change(sc->ifp, LINK_STATE_DOWN);
                }
        }

        /* check to see if we have rx token to pass back */
        if (valid & 0x1)
            *ss->irq_claim = be32toh(3);
        *(ss->irq_claim + 1) = be32toh(3);
}
3165
3166 static void
3167 mxge_init(void *arg)
3168 {
3169         mxge_softc_t *sc = arg;
3170         struct ifnet *ifp = sc->ifp;
3171
3172
3173         mtx_lock(&sc->driver_mtx);
3174         if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3175                 (void) mxge_open(sc);
3176         mtx_unlock(&sc->driver_mtx);
3177 }
3178
3179
3180
3181 static void
3182 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3183 {
3184         int i;
3185
3186 #if defined(INET) || defined(INET6)
3187         tcp_lro_free(&ss->lc);
3188 #endif
3189         for (i = 0; i <= ss->rx_big.mask; i++) {
3190                 if (ss->rx_big.info[i].m == NULL)
3191                         continue;
3192                 bus_dmamap_unload(ss->rx_big.dmat,
3193                                   ss->rx_big.info[i].map);
3194                 m_freem(ss->rx_big.info[i].m);
3195                 ss->rx_big.info[i].m = NULL;
3196         }
3197
3198         for (i = 0; i <= ss->rx_small.mask; i++) {
3199                 if (ss->rx_small.info[i].m == NULL)
3200                         continue;
3201                 bus_dmamap_unload(ss->rx_small.dmat,
3202                                   ss->rx_small.info[i].map);
3203                 m_freem(ss->rx_small.info[i].m);
3204                 ss->rx_small.info[i].m = NULL;
3205         }
3206
3207         /* transmit ring used only on the first slice */
3208         if (ss->tx.info == NULL)
3209                 return;
3210
3211         for (i = 0; i <= ss->tx.mask; i++) {
3212                 ss->tx.info[i].flag = 0;
3213                 if (ss->tx.info[i].m == NULL)
3214                         continue;
3215                 bus_dmamap_unload(ss->tx.dmat,
3216                                   ss->tx.info[i].map);
3217                 m_freem(ss->tx.info[i].m);
3218                 ss->tx.info[i].m = NULL;
3219         }
3220 }
3221
3222 static void
3223 mxge_free_mbufs(mxge_softc_t *sc)
3224 {
3225         int slice;
3226
3227         for (slice = 0; slice < sc->num_slices; slice++)
3228                 mxge_free_slice_mbufs(&sc->ss[slice]);
3229 }
3230
/*
 * Free all per-slice ring resources: the rx completion ring's DMA
 * block, the tx request copy block and busdma segment list, the rx
 * shadow rings, and the per-slot host info arrays together with
 * their busdma maps and tags.  Every pointer is tested before use
 * and cleared afterwards, so this is safe to call on a slice whose
 * allocation (mxge_alloc_slice_rings) failed partway through.
 */
static void
mxge_free_slice_rings(struct mxge_slice_state *ss)
{
	int i;


	if (ss->rx_done.entry != NULL)
		mxge_dma_free(&ss->rx_done.dma);
	ss->rx_done.entry = NULL;

	if (ss->tx.req_bytes != NULL)
		free(ss->tx.req_bytes, M_DEVBUF);
	ss->tx.req_bytes = NULL;

	if (ss->tx.seg_list != NULL)
		free(ss->tx.seg_list, M_DEVBUF);
	ss->tx.seg_list = NULL;

	if (ss->rx_small.shadow != NULL)
		free(ss->rx_small.shadow, M_DEVBUF);
	ss->rx_small.shadow = NULL;

	if (ss->rx_big.shadow != NULL)
		free(ss->rx_big.shadow, M_DEVBUF);
	ss->rx_big.shadow = NULL;

	/* destroy all per-slot dmamaps before destroying their tag */
	if (ss->tx.info != NULL) {
		if (ss->tx.dmat != NULL) {
			for (i = 0; i <= ss->tx.mask; i++) {
				bus_dmamap_destroy(ss->tx.dmat,
						   ss->tx.info[i].map);
			}
			bus_dma_tag_destroy(ss->tx.dmat);
		}
		free(ss->tx.info, M_DEVBUF);
	}
	ss->tx.info = NULL;

	if (ss->rx_small.info != NULL) {
		if (ss->rx_small.dmat != NULL) {
			for (i = 0; i <= ss->rx_small.mask; i++) {
				bus_dmamap_destroy(ss->rx_small.dmat,
						   ss->rx_small.info[i].map);
			}
			/* the extra map is not part of the info array */
			bus_dmamap_destroy(ss->rx_small.dmat,
					   ss->rx_small.extra_map);
			bus_dma_tag_destroy(ss->rx_small.dmat);
		}
		free(ss->rx_small.info, M_DEVBUF);
	}
	ss->rx_small.info = NULL;

	if (ss->rx_big.info != NULL) {
		if (ss->rx_big.dmat != NULL) {
			for (i = 0; i <= ss->rx_big.mask; i++) {
				bus_dmamap_destroy(ss->rx_big.dmat,
						   ss->rx_big.info[i].map);
			}
			bus_dmamap_destroy(ss->rx_big.dmat,
					   ss->rx_big.extra_map);
			bus_dma_tag_destroy(ss->rx_big.dmat);
		}
		free(ss->rx_big.info, M_DEVBUF);
	}
	ss->rx_big.info = NULL;
}
3297
3298 static void
3299 mxge_free_rings(mxge_softc_t *sc)
3300 {
3301         int slice;
3302
3303         for (slice = 0; slice < sc->num_slices; slice++)
3304                 mxge_free_slice_rings(&sc->ss[slice]);
3305 }
3306
/*
 * Allocate all per-slice ring state: the rx shadow and host-info
 * rings, busdma tags and per-slot maps for the small and big rx
 * rings, and -- for slices that transmit -- the tx request copy
 * block, busdma segment list, host-info ring, tag and per-slot maps.
 *
 * rx_ring_entries and tx_ring_entries must be powers of two, since
 * they are turned into index masks below.
 *
 * Returns 0 on success or a busdma error code.  Nothing allocated
 * here is freed on failure; the caller (mxge_alloc_rings) unwinds
 * via mxge_free_rings(), which tolerates partial allocation.
 */
static int
mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
		       int tx_ring_entries)
{
	mxge_softc_t *sc = ss->sc;
	size_t bytes;
	int err, i;

	/* allocate per-slice receive resources */

	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	/* the completion ring covers both rx rings, hence 2x entries */
	ss->rx_done.mask = (2 * rx_ring_entries) - 1;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the rx host info rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the rx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		return err;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
#if MXGE_VIRT_JUMBOS
				 4096,			/* boundary */
#else
				 0,			/* boundary */
#endif
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 3*4096,		/* maxsize */
#if MXGE_VIRT_JUMBOS
				 3,			/* num segs */
				 4096,			/* maxsegsize*/
#else
				 1,			/* num segs */
				 MJUM9BYTES,		/* maxsegsize*/
#endif
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		return err;
	}
	/* one dmamap per rx ring slot, plus one spare per ring */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		err = bus_dmamap_create(ss->rx_small.dmat, 0,
					&ss->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_small.dmat, 0,
				&ss->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
		return err;
	}

	for (i = 0; i <= ss->rx_big.mask; i++) {
		err = bus_dmamap_create(ss->rx_big.dmat, 0,
					&ss->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_big.dmat, 0,
				&ss->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
		return err;
	}

	/* now allocate TX resources */

#ifndef IFNET_BUF_RING
	/* only use a single TX ring for now */
	if (ss != ss->sc->ss)
		return 0;
#endif

	ss->tx.mask = tx_ring_entries - 1;
	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);


	/* allocate the tx request copy block */
	bytes = 8 +
		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	/* ensure req_list entries are aligned to 8 bytes */
	ss->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
	ss->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);

	/* allocate the tx host info ring */
	bytes = tx_ring_entries * sizeof (*ss->tx.info);
	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);

	/* allocate the tx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx_boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 ss->tx.max_desc - 2,	/* num segs */
				 sc->tx_boundary,	/* maxsegsz */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		return err;
	}

	/* now use these tags to setup dmamaps for each slot
	   in the ring */
	for (i = 0; i <= ss->tx.mask; i++) {
		err = bus_dmamap_create(ss->tx.dmat, 0,
					&ss->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
				      err);
			return err;
		}
	}
	return 0;

}
3476
/*
 * Ask the firmware for its send ring size, size the interface send
 * queue to match, and allocate ring state for every slice.  On any
 * failure all (possibly partial) ring allocations are released via
 * mxge_free_rings() and the error is returned.
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int err, slice;

	/* get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
		goto abort;
	}

	/* firmware reports sizes in bytes; convert to entry counts */
	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_READY(&sc->ifp->if_snd);

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_alloc_slice_rings(&sc->ss[slice],
					     rx_ring_entries,
					     tx_ring_entries);
		if (err != 0)
			goto abort;
	}
	return 0;

abort:
	mxge_free_rings(sc);
	return err;

}
3513
3514
3515 static void
3516 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3517 {
3518         int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3519
3520         if (bufsize < MCLBYTES) {
3521                 /* easy, everything fits in a single buffer */
3522                 *big_buf_size = MCLBYTES;
3523                 *cl_size = MCLBYTES;
3524                 *nbufs = 1;
3525                 return;
3526         }
3527
3528         if (bufsize < MJUMPAGESIZE) {
3529                 /* still easy, everything still fits in a single buffer */
3530                 *big_buf_size = MJUMPAGESIZE;
3531                 *cl_size = MJUMPAGESIZE;
3532                 *nbufs = 1;
3533                 return;
3534         }
3535 #if MXGE_VIRT_JUMBOS
3536         /* now we need to use virtually contiguous buffers */
3537         *cl_size = MJUM9BYTES;
3538         *big_buf_size = 4096;
3539         *nbufs = mtu / 4096 + 1;
3540         /* needs to be a power of two, so round up */
3541         if (*nbufs == 3)
3542                 *nbufs = 4;
3543 #else
3544         *cl_size = MJUM9BYTES;
3545         *big_buf_size = MJUM9BYTES;
3546         *nbufs = 1;
3547 #endif
3548 }
3549
/*
 * Bring one slice to a runnable state: query the firmware for the
 * NIC SRAM ("lanai") addresses of this slice's send and receive
 * rings, then pre-fill both receive rings with mbufs.
 *
 * Returns 0 on success, EIO if any firmware query fails, or ENOMEM
 * if a receive ring cannot be fully stocked.
 */
static int
mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
{
	mxge_softc_t *sc;
	mxge_cmd_t cmd;
	bus_dmamap_t map;
	int err, i, slice;


	sc = ss->sc;
	slice = ss - sc->ss;

#if defined(INET) || defined(INET6)
	(void)tcp_lro_init(&ss->lc);
#endif
	ss->lc.ifp = sc->ifp;

	/* get the lanai pointers to the send and receive rings */

	err = 0;
#ifndef IFNET_BUF_RING
	/* We currently only send from the first slice */
	if (slice == 0) {
#endif
		cmd.data0 = slice;
		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
		ss->tx.lanai =
			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
		ss->tx.send_go = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
		ss->tx.send_stop = (volatile uint32_t *)
		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
#ifndef IFNET_BUF_RING
	}
#endif
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc,
			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	ss->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	ss->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		map = ss->rx_small.info[i].map;
		err = mxge_get_buf_small(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, ss->rx_small.mask + 1);
			return ENOMEM;
		}
	}
	/* reset all big-ring shadow addresses to an invalid value
	   before refilling -- presumably so the first fill is always
	   pushed to the NIC; TODO confirm against mxge_get_buf_big */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	ss->rx_big.nbufs = nbufs;
	ss->rx_big.cl_size = cl_size;
	/* mlen: the largest frame the current MTU can produce */
	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
	/* big buffers are posted in groups of nbufs per frame */
	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
		map = ss->rx_big.info[i].map;
		err = mxge_get_buf_big(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, ss->rx_big.mask + 1);
			return ENOMEM;
		}
	}
	return 0;
}
3630
3631 static int
3632 mxge_open(mxge_softc_t *sc)
3633 {
3634         mxge_cmd_t cmd;
3635         int err, big_bytes, nbufs, slice, cl_size, i;
3636         bus_addr_t bus;
3637         volatile uint8_t *itable;
3638         struct mxge_slice_state *ss;
3639
3640         /* Copy the MAC address in case it was overridden */
3641         bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3642
3643         err = mxge_reset(sc, 1);
3644         if (err != 0) {
3645                 device_printf(sc->dev, "failed to reset\n");
3646                 return EIO;
3647         }
3648
3649         if (sc->num_slices > 1) {
3650                 /* setup the indirection table */
3651                 cmd.data0 = sc->num_slices;
3652                 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3653                                     &cmd);
3654
3655                 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3656                                      &cmd);
3657                 if (err != 0) {
3658                         device_printf(sc->dev,
3659                                       "failed to setup rss tables\n");
3660                         return err;
3661                 }
3662
3663                 /* just enable an identity mapping */
3664                 itable = sc->sram + cmd.data0;
3665                 for (i = 0; i < sc->num_slices; i++)
3666                         itable[i] = (uint8_t)i;
3667
3668                 cmd.data0 = 1;
3669                 cmd.data1 = mxge_rss_hash_type;
3670                 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3671                 if (err != 0) {
3672                         device_printf(sc->dev, "failed to enable slices\n");
3673                         return err;
3674                 }
3675         }
3676
3677
3678         mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3679
3680         cmd.data0 = nbufs;
3681         err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3682                             &cmd);
3683         /* error is only meaningful if we're trying to set
3684            MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3685         if (err && nbufs > 1) {
3686                 device_printf(sc->dev,
3687                               "Failed to set alway-use-n to %d\n",
3688                               nbufs);
3689                 return EIO;
3690         }
3691         /* Give the firmware the mtu and the big and small buffer
3692            sizes.  The firmware wants the big buf size to be a power
3693            of two. Luckily, FreeBSD's clusters are powers of two */
3694         cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3695         err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3696         cmd.data0 = MHLEN - MXGEFW_PAD;
3697         err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3698                              &cmd);
3699         cmd.data0 = big_bytes;
3700         err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3701
3702         if (err != 0) {
3703                 device_printf(sc->dev, "failed to setup params\n");
3704                 goto abort;
3705         }
3706
3707         /* Now give him the pointer to the stats block */
3708         for (slice = 0;
3709 #ifdef IFNET_BUF_RING
3710              slice < sc->num_slices;
3711 #else
3712              slice < 1;
3713 #endif
3714              slice++) {
3715                 ss = &sc->ss[slice];
3716                 cmd.data0 =
3717                         MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3718                 cmd.data1 =
3719                         MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3720                 cmd.data2 = sizeof(struct mcp_irq_data);
3721                 cmd.data2 |= (slice << 16);
3722                 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3723         }
3724
3725         if (err != 0) {
3726                 bus = sc->ss->fw_stats_dma.bus_addr;
3727                 bus += offsetof(struct mcp_irq_data, send_done_count);
3728                 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3729                 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3730                 err = mxge_send_cmd(sc,
3731                                     MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3732                                     &cmd);
3733                 /* Firmware cannot support multicast without STATS_DMA_V2 */
3734                 sc->fw_multicast_support = 0;
3735         } else {
3736                 sc->fw_multicast_support = 1;
3737         }
3738
3739         if (err != 0) {
3740                 device_printf(sc->dev, "failed to setup params\n");
3741                 goto abort;
3742         }
3743
3744         for (slice = 0; slice < sc->num_slices; slice++) {
3745                 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3746                 if (err != 0) {
3747                         device_printf(sc->dev, "couldn't open slice %d\n",
3748                                       slice);
3749                         goto abort;
3750                 }
3751         }
3752
3753         /* Finally, start the firmware running */
3754         err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3755         if (err) {
3756                 device_printf(sc->dev, "Couldn't bring up link\n");
3757                 goto abort;
3758         }
3759 #ifdef IFNET_BUF_RING
3760         for (slice = 0; slice < sc->num_slices; slice++) {
3761                 ss = &sc->ss[slice];
3762                 ss->if_drv_flags |= IFF_DRV_RUNNING;
3763                 ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3764         }
3765 #endif
3766         sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3767         sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3768
3769         return 0;
3770
3771
3772 abort:
3773         mxge_free_mbufs(sc);
3774
3775         return err;
3776 }
3777
/*
 * Stop the interface.  Clears IFF_DRV_RUNNING and, unless "down" is
 * nonzero (set when the NIC is already down, e.g. from the watchdog
 * reset path), asks the firmware to bring the link down and waits
 * for the confirming "down" interrupt to bump sc->down_cnt.  All
 * ring mbufs are then freed.  Always returns 0.
 */
static int
mxge_close(mxge_softc_t *sc, int down)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;
#ifdef IFNET_BUF_RING
	struct mxge_slice_state *ss;
	int slice;
#endif

#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
	}
#endif
	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	if (!down) {
		old_down_cnt = sc->down_cnt;
		wmb();
		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
		if (err) {
			device_printf(sc->dev,
				      "Couldn't bring down link\n");
		}
		if (old_down_cnt == sc->down_cnt) {
			/* wait for down irq */
			DELAY(10 * sc->intr_coal_delay);
		}
		wmb();
		if (old_down_cnt == sc->down_cnt) {
			device_printf(sc->dev, "never got down irq\n");
		}
	}
	mxge_free_mbufs(sc);

	return 0;
}
3816
/*
 * Record the negotiated PCIe link width, program a 4KB maximum read
 * request size, and enable bus mastering.  Called at attach time and
 * again after a watchdog reset restores PCI config space.
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* reg + 0x12 is the PCIe Link Status register;
		   bits 9:4 hold the negotiated link width */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		if (sc->pectl == 0) {
			/* reg + 0x8 is the PCIe Device Control register;
			   bits 14:12 select max read request size, and
			   the encoding 5 means 4096 bytes */
			pectl = pci_read_config(dev, reg + 0x8, 2);
			pectl = (pectl & ~0x7000) | (5 << 12);
			pci_write_config(dev, reg + 0x8, pectl, 2);
			sc->pectl = pectl;
		} else {
			/* restore saved pectl after watchdog reset */
			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
		}
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
}
3843
/*
 * Read the NIC's reboot status register through the windowed read
 * mechanism of the vendor-specific PCI capability.  Returns the
 * register value, or (uint32_t)-1 if the capability is not found.
 */
static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
		device_printf(sc->dev,
			      "could not find vendor specific offset\n");
		return (uint32_t)-1;
	}
	/* enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* tell NIC which register to read */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	return (pci_read_config(dev, vs + 0x14, 4));
}
3862
/*
 * Attempt to recover a hung NIC.  If PCI config space shows the NIC
 * rebooted (busmaster bit cleared), quiesce transmit by taking every
 * TX lock, close the interface, restore PCI config space, reload the
 * firmware, and reopen.  If the NIC did not reboot, just report that
 * and rearm the tick callout.  Called from the watchdog taskqueue
 * with the driver mutex held.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	struct pci_devinfo *dinfo;
	struct mxge_slice_state *ss;
	int err, running, s, num_tx_slices = 1;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
		if (running) {

			/*
			 * quiesce NIC so that TX routines will not try to
			 * xmit after restoration of BAR
			 */

			/* Mark the link as down */
			if (sc->link_state) {
				sc->link_state = 0;
				if_link_state_change(sc->ifp,
						     LINK_STATE_DOWN);
			}
#ifdef IFNET_BUF_RING
			num_tx_slices = sc->num_slices;
#endif
			/* grab all TX locks to ensure no tx  */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
				mtx_lock(&ss->tx.mtx);
			}
			mxge_close(sc, 1);
		}
		/* restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		/* reload f/w */
		err = mxge_load_firmware(sc, 0);
		if (err) {
			device_printf(sc->dev,
				      "Unable to re-load f/w\n");
		}
		if (running) {
			if (!err)
				err = mxge_open(sc);
			/* release all TX locks */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
#ifdef IFNET_BUF_RING
				mxge_start_locked(ss);
#endif
				mtx_unlock(&ss->tx.mtx);
			}
		}
		sc->watchdog_resets++;
	} else {
		device_printf(sc->dev,
			      "NIC did not reboot, not resetting\n");
		err = 0;
	}
	if (err) {
		device_printf(sc->dev, "watchdog reset failed\n");
	} else {
		/* recovery succeeded; cancel a pending detach marker
		   and restart the periodic tick */
		if (sc->dying == 2)
			sc->dying = 0;
		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
}
3964
3965 static void
3966 mxge_watchdog_task(void *arg, int pending)
3967 {
3968         mxge_softc_t *sc = arg;
3969
3970
3971         mtx_lock(&sc->driver_mtx);
3972         mxge_watchdog_reset(sc);
3973         mtx_unlock(&sc->driver_mtx);
3974 }
3975
3976 static void
3977 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3978 {
3979         tx = &sc->ss[slice].tx;
3980         device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3981         device_printf(sc->dev,
3982                       "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3983                       tx->req, tx->done, tx->queue_active);
3984         device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3985                               tx->activate, tx->deactivate);
3986         device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3987                       tx->pkt_done,
3988                       be32toh(sc->ss->fw_stats->send_done_count));
3989 }
3990
/*
 * Periodic transmit-hang check, called from mxge_tick().  A ring is
 * suspect when it has outstanding requests (req != done) and made no
 * completion progress since the last check.  If the firmware's pause
 * counter also did not move, the ring is declared stuck and the
 * watchdog reset task is queued (returns ENXIO); if pause frames
 * were received, the stall is blamed on flow control and only a
 * warning is printed.  Returns 0 when no reset was scheduled.
 */
static int
mxge_watchdog(mxge_softc_t *sc)
{
	mxge_tx_ring_t *tx;
	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
	int i, err = 0;

	/* see if we have outstanding transmits, which
	   have been pending for more than mxge_ticks */
	for (i = 0;
#ifdef IFNET_BUF_RING
	     (i < sc->num_slices) && (err == 0);
#else
	     (i < 1) && (err == 0);
#endif
	     i++) {
		tx = &sc->ss[i].tx;
		if (tx->req != tx->done &&
		    tx->watchdog_req != tx->watchdog_done &&
		    tx->done == tx->watchdog_done) {
			/* check for pause blocking before resetting */
			if (tx->watchdog_rx_pause == rx_pause) {
				mxge_warn_stuck(sc, tx, i);
				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
				return (ENXIO);
			}
			else
				device_printf(sc->dev, "Flow control blocking "
					      "xmits, check link partner\n");
		}

		/* remember this check's state for the next pass */
		tx->watchdog_req = tx->req;
		tx->watchdog_done = tx->done;
		tx->watchdog_rx_pause = rx_pause;
	}

	if (sc->need_media_probe)
		mxge_media_probe(sc);
	return (err);
}
4031
/*
 * if_get_counter method: return interface statistics summed across
 * all slices.  Counters not tracked per-slice fall through to the
 * generic ifnet defaults.
 */
static uint64_t
mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
{
	struct mxge_softc *sc;
	uint64_t rv;

	sc = if_getsoftc(ifp);
	rv = 0;

	switch (cnt) {
	case IFCOUNTER_IPACKETS:
		for (int s = 0; s < sc->num_slices; s++)
			rv += sc->ss[s].ipackets;
		return (rv);
	case IFCOUNTER_OPACKETS:
		for (int s = 0; s < sc->num_slices; s++)
			rv += sc->ss[s].opackets;
		return (rv);
	case IFCOUNTER_OERRORS:
		for (int s = 0; s < sc->num_slices; s++)
			rv += sc->ss[s].oerrors;
		return (rv);
#ifdef IFNET_BUF_RING
	case IFCOUNTER_OBYTES:
		for (int s = 0; s < sc->num_slices; s++)
			rv += sc->ss[s].obytes;
		return (rv);
	case IFCOUNTER_OMCASTS:
		for (int s = 0; s < sc->num_slices; s++)
			rv += sc->ss[s].omcasts;
		return (rv);
	case IFCOUNTER_OQDROPS:
		/* drops recorded by the per-slice buf_ring */
		for (int s = 0; s < sc->num_slices; s++)
			rv += sc->ss[s].tx.br->br_drops;
		return (rv);
#endif
	default:
		return (if_get_counter_default(ifp, cnt));
	}
}
4072
4073 static void
4074 mxge_tick(void *arg)
4075 {
4076         mxge_softc_t *sc = arg;
4077         u_long pkts = 0;
4078         int err = 0;
4079         int running, ticks;
4080         uint16_t cmd;
4081
4082         ticks = mxge_ticks;
4083         running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4084         if (running) {
4085                 if (!sc->watchdog_countdown) {
4086                         err = mxge_watchdog(sc);
4087                         sc->watchdog_countdown = 4;
4088                 }
4089                 sc->watchdog_countdown--;
4090         }
4091         if (pkts == 0) {
4092                 /* ensure NIC did not suffer h/w fault while idle */
4093                 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);                
4094                 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4095                         sc->dying = 2;
4096                         taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4097                         err = ENXIO;
4098                 }
4099                 /* look less often if NIC is idle */
4100                 ticks *= 4;
4101         }
4102
4103         if (err == 0)
4104                 callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4105
4106 }
4107
/* Changing media is not supported; the link type is fixed. */
static int
mxge_media_change(struct ifnet *ifp)
{

	return (EINVAL);
}
4113
/*
 * Change the interface MTU.  If the interface is up, it is closed
 * and re-opened so the firmware resources can be re-sized for the
 * new MTU; if that fails, the previous MTU is restored and the
 * interface re-opened with it (best effort).
 */
static int
mxge_change_mtu(mxge_softc_t *sc, int mtu)
{
	struct ifnet *ifp = sc->ifp;
	int real_mtu, old_mtu;
	int err = 0;


	/* on-the-wire frame size includes Ethernet header + VLAN tag */
	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
		return EINVAL;
	mtx_lock(&sc->driver_mtx);
	old_mtu = ifp->if_mtu;
	ifp->if_mtu = mtu;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/* restart so rings/buffers match the new MTU */
		mxge_close(sc, 0);
		err = mxge_open(sc);
		if (err != 0) {
			/* re-open failed: roll back to the old MTU */
			ifp->if_mtu = old_mtu;
			mxge_close(sc, 0);
			(void) mxge_open(sc);
		}
	}
	mtx_unlock(&sc->driver_mtx);
	return err;
}
4140
4141 static void
4142 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4143 {
4144         mxge_softc_t *sc = ifp->if_softc;
4145         
4146
4147         if (sc == NULL)
4148                 return;
4149         ifmr->ifm_status = IFM_AVALID;
4150         ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4151         ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4152         ifmr->ifm_active |= sc->current_media;
4153 }
4154
4155 static int
4156 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4157 {
4158         mxge_softc_t *sc = ifp->if_softc;
4159         struct ifreq *ifr = (struct ifreq *)data;
4160         int err, mask;
4161
4162         err = 0;
4163         switch (command) {
4164         case SIOCSIFADDR:
4165         case SIOCGIFADDR:
4166                 err = ether_ioctl(ifp, command, data);
4167                 break;
4168
4169         case SIOCSIFMTU:
4170                 err = mxge_change_mtu(sc, ifr->ifr_mtu);
4171                 break;
4172
4173         case SIOCSIFFLAGS:
4174                 mtx_lock(&sc->driver_mtx);
4175                 if (sc->dying) {
4176                         mtx_unlock(&sc->driver_mtx);
4177                         return EINVAL;
4178                 }
4179                 if (ifp->if_flags & IFF_UP) {
4180                         if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4181                                 err = mxge_open(sc);
4182                         } else {
4183                                 /* take care of promis can allmulti
4184                                    flag chages */
4185                                 mxge_change_promisc(sc,
4186                                                     ifp->if_flags & IFF_PROMISC);
4187                                 mxge_set_multicast_list(sc);
4188                         }
4189                 } else {
4190                         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4191                                 mxge_close(sc, 0);
4192                         }
4193                 }
4194                 mtx_unlock(&sc->driver_mtx);
4195                 break;
4196
4197         case SIOCADDMULTI:
4198         case SIOCDELMULTI:
4199                 mtx_lock(&sc->driver_mtx);
4200                 mxge_set_multicast_list(sc);
4201                 mtx_unlock(&sc->driver_mtx);
4202                 break;
4203
4204         case SIOCSIFCAP:
4205                 mtx_lock(&sc->driver_mtx);
4206                 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4207                 if (mask & IFCAP_TXCSUM) {
4208                         if (IFCAP_TXCSUM & ifp->if_capenable) {
4209                                 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4210                                 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4211                         } else {
4212                                 ifp->if_capenable |= IFCAP_TXCSUM;
4213                                 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4214                         }
4215                 } else if (mask & IFCAP_RXCSUM) {
4216                         if (IFCAP_RXCSUM & ifp->if_capenable) {
4217                                 ifp->if_capenable &= ~IFCAP_RXCSUM;
4218                         } else {
4219                                 ifp->if_capenable |= IFCAP_RXCSUM;
4220                         }
4221                 }
4222                 if (mask & IFCAP_TSO4) {
4223                         if (IFCAP_TSO4 & ifp->if_capenable) {
4224                                 ifp->if_capenable &= ~IFCAP_TSO4;
4225                         } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4226                                 ifp->if_capenable |= IFCAP_TSO4;
4227                                 ifp->if_hwassist |= CSUM_TSO;
4228                         } else {
4229                                 printf("mxge requires tx checksum offload"
4230                                        " be enabled to use TSO\n");
4231                                 err = EINVAL;
4232                         }
4233                 }
4234 #if IFCAP_TSO6
4235                 if (mask & IFCAP_TXCSUM_IPV6) {
4236                         if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4237                                 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4238                                                        | IFCAP_TSO6);
4239                                 ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4240                                                       | CSUM_UDP);
4241                         } else {
4242                                 ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4243                                 ifp->if_hwassist |= (CSUM_TCP_IPV6
4244                                                      | CSUM_UDP_IPV6);
4245                         }
4246                 } else if (mask & IFCAP_RXCSUM_IPV6) {
4247                         if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4248                                 ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4249                         } else {
4250                                 ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4251                         }
4252                 }
4253                 if (mask & IFCAP_TSO6) {
4254                         if (IFCAP_TSO6 & ifp->if_capenable) {
4255                                 ifp->if_capenable &= ~IFCAP_TSO6;
4256                         } else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4257                                 ifp->if_capenable |= IFCAP_TSO6;
4258                                 ifp->if_hwassist |= CSUM_TSO;
4259                         } else {
4260                                 printf("mxge requires tx checksum offload"
4261                                        " be enabled to use TSO\n");
4262                                 err = EINVAL;
4263                         }
4264                 }
4265 #endif /*IFCAP_TSO6 */
4266
4267                 if (mask & IFCAP_LRO)
4268                         ifp->if_capenable ^= IFCAP_LRO;
4269                 if (mask & IFCAP_VLAN_HWTAGGING)
4270                         ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4271                 if (mask & IFCAP_VLAN_HWTSO)
4272                         ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4273
4274                 if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4275                     !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4276                         ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4277
4278                 mtx_unlock(&sc->driver_mtx);
4279                 VLAN_CAPABILITIES(ifp);
4280
4281                 break;
4282
4283         case SIOCGIFMEDIA:
4284                 mtx_lock(&sc->driver_mtx);
4285                 mxge_media_probe(sc);
4286                 mtx_unlock(&sc->driver_mtx);
4287                 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4288                                     &sc->media, command);
4289                 break;
4290
4291         default:
4292                 err = ENOTTY;
4293         }
4294         return err;
4295 }
4296
/*
 * Fetch the hw.mxge.* loader tunables into the driver's module-level
 * knobs, then clamp each to a sane range before copying the
 * per-device values (pause, throttle) into the softc.
 */
static void
mxge_fetch_tunables(mxge_softc_t *sc)
{

	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
			  &mxge_flow_control);
	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
			  &mxge_intr_coal_delay);
	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
			  &mxge_nvidia_ecrc_enable);
	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
			  &mxge_force_firmware);
	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
			  &mxge_deassert_wait);
	TUNABLE_INT_FETCH("hw.mxge.verbose",
			  &mxge_verbose);
	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
	/* both spellings feed the same variable; presumably one is a
	   legacy alias kept for compatibility -- TODO confirm */
	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);

	if (bootverbose)
		mxge_verbose = 1;
	/* out-of-range interrupt coalescing delays fall back to 30 */
	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
		mxge_intr_coal_delay = 30;
	if (mxge_ticks == 0)
		mxge_ticks = hz / 2;
	sc->pause = mxge_flow_control;
	/* invalid hash type selections fall back to src/dst port hashing */
	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
	}
	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
	    mxge_initial_mtu < ETHER_MIN_LEN)
		mxge_initial_mtu = ETHERMTU_JUMBO;

	/* clamp throttle into the supported window (0 = disabled) */
	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
		mxge_throttle = MXGE_MAX_THROTTLE;
	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
		mxge_throttle = MXGE_MIN_THROTTLE;
	sc->throttle = mxge_throttle;
}
4342
4343
/*
 * Release all per-slice state allocated by mxge_alloc_slices():
 * firmware stats DMA, the tx buf_ring and tx mutex, and the rx
 * completion queue DMA, then the slice array itself.  Safe to call
 * when nothing was allocated (sc->ss == NULL).
 */
static void
mxge_free_slices(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int i;


	if (sc->ss == NULL)
		return;

	for (i = 0; i < sc->num_slices; i++) {
		ss = &sc->ss[i];
		/* the tx mutex (and buf_ring) are initialized together
		   with fw_stats in mxge_alloc_slices(), so fw_stats
		   being set is the guard for tearing them down too */
		if (ss->fw_stats != NULL) {
			mxge_dma_free(&ss->fw_stats_dma);
			ss->fw_stats = NULL;
#ifdef IFNET_BUF_RING
			if (ss->tx.br != NULL) {
				drbr_free(ss->tx.br, M_DEVBUF);
				ss->tx.br = NULL;
			}
#endif
			mtx_destroy(&ss->tx.mtx);
		}
		if (ss->rx_done.entry != NULL) {
			mxge_dma_free(&ss->rx_done.dma);
			ss->rx_done.entry = NULL;
		}
	}
	free(sc->ss, M_DEVBUF);
	sc->ss = NULL;
}
4375
/*
 * Allocate the per-slice state array.  Every slice gets an rx
 * completion ("interrupt") queue DMA area; the first slice (or all
 * slices when IFNET_BUF_RING is enabled) additionally gets firmware
 * stats DMA, a tx mutex and a tx buf_ring.  On failure everything
 * allocated so far is released via mxge_free_slices().
 */
static int
mxge_alloc_slices(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct mxge_slice_state *ss;
	size_t bytes;
	int err, i, max_intr_slots;

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		return err;
	}
	sc->rx_ring_size = cmd.data0;
	/* two completion slots per rx descriptor; the same 2x factor
	   is used in mxge_slice_probe() -- TODO confirm rationale */
	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));

	bytes = sizeof (*sc->ss) * sc->num_slices;
	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (sc->ss == NULL)
		return (ENOMEM);
	for (i = 0; i < sc->num_slices; i++) {
		ss = &sc->ss[i];

		ss->sc = sc;

		/* allocate per-slice rx interrupt queues */

		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
		if (err != 0)
			goto abort;
		ss->rx_done.entry = ss->rx_done.dma.addr;
		bzero(ss->rx_done.entry, bytes);

		/*
		 * allocate the per-slice firmware stats; stats
		 * (including tx) are used only on the first
		 * slice for now
		 */
#ifndef IFNET_BUF_RING
		if (i > 0)
			continue;
#endif

		bytes = sizeof (*ss->fw_stats);
		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
				     sizeof (*ss->fw_stats), 64);
		if (err != 0)
			goto abort;
		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
#ifdef IFNET_BUF_RING
		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
					   &ss->tx.mtx);
#endif
	}

	return (0);

abort:
	mxge_free_slices(sc);
	return (ENOMEM);
}
4441
/*
 * Decide how many slices (queues) to use.  Defaults to one slice;
 * multiple slices are attempted only when enabled by the tunable, on
 * an SMP system, with at least two MSI-X vectors available.  The
 * RSS-capable firmware is loaded and queried for its limits; on any
 * failure the original firmware is restored and one slice is used.
 */
static void
mxge_slice_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *old_fw;
	int msix_cnt, status, max_intr_slots;

	sc->num_slices = 1;
	/*
	 *  don't enable multiple slices if they are not enabled,
	 *  or if this is not an SMP system
	 */

	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
		return;

	/* see how many MSI-X interrupts are available */
	msix_cnt = pci_msix_count(sc->dev);
	if (msix_cnt < 2)
		return;

	/* now load the slice aware firmware see what it supports */
	old_fw = sc->fw_name;
	if (old_fw == mxge_fw_aligned)
		sc->fw_name = mxge_fw_rss_aligned;
	else
		sc->fw_name = mxge_fw_rss_unaligned;
	status = mxge_load_firmware(sc, 0);
	if (status != 0) {
		device_printf(sc->dev, "Falling back to a single slice\n");
		return;
	}

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		goto abort_with_fw;
	}

	/* get rx ring size */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		goto abort_with_fw;
	}
	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));

	/* tell it the size of the interrupt queues */
	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
		goto abort_with_fw;
	}

	/* ask the maximum number of slices it supports */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
	if (status != 0) {
		device_printf(sc->dev,
			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
		goto abort_with_fw;
	}
	sc->num_slices = cmd.data0;
	/* can't use more slices than available MSI-X vectors */
	if (sc->num_slices > msix_cnt)
		sc->num_slices = msix_cnt;

	if (mxge_max_slices == -1) {
		/* cap to number of CPUs in system */
		if (sc->num_slices > mp_ncpus)
			sc->num_slices = mp_ncpus;
	} else {
		if (sc->num_slices > mxge_max_slices)
			sc->num_slices = mxge_max_slices;
	}
	/* make sure it is a power of two */
	while (sc->num_slices & (sc->num_slices - 1))
		sc->num_slices--;

	if (mxge_verbose)
		device_printf(sc->dev, "using %d slices\n",
			      sc->num_slices);

	return;

abort_with_fw:
	/* restore and reload the non-RSS firmware; stay single-slice */
	sc->fw_name = old_fw;
	(void) mxge_load_firmware(sc, 0);
}
4533
4534 static int
4535 mxge_add_msix_irqs(mxge_softc_t *sc)
4536 {
4537         size_t bytes;
4538         int count, err, i, rid;
4539
4540         rid = PCIR_BAR(2);
4541         sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4542                                                     &rid, RF_ACTIVE);
4543
4544         if (sc->msix_table_res == NULL) {
4545                 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4546                 return ENXIO;
4547         }
4548
4549         count = sc->num_slices;
4550         err = pci_alloc_msix(sc->dev, &count);
4551         if (err != 0) {
4552                 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4553                               "err = %d \n", sc->num_slices, err);
4554                 goto abort_with_msix_table;
4555         }
4556         if (count < sc->num_slices) {
4557                 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4558                               count, sc->num_slices);
4559                 device_printf(sc->dev,
4560                               "Try setting hw.mxge.max_slices to %d\n",
4561                               count);
4562                 err = ENOSPC;
4563                 goto abort_with_msix;
4564         }
4565         bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4566         sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4567         if (sc->msix_irq_res == NULL) {
4568                 err = ENOMEM;
4569                 goto abort_with_msix;
4570         }
4571
4572         for (i = 0; i < sc->num_slices; i++) {
4573                 rid = i + 1;
4574                 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4575                                                           SYS_RES_IRQ,
4576                                                           &rid, RF_ACTIVE);
4577                 if (sc->msix_irq_res[i] == NULL) {
4578                         device_printf(sc->dev, "couldn't allocate IRQ res"
4579                                       " for message %d\n", i);
4580                         err = ENXIO;
4581                         goto abort_with_res;
4582                 }
4583         }
4584
4585         bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4586         sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4587
4588         for (i = 0; i < sc->num_slices; i++) {
4589                 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4590                                      INTR_TYPE_NET | INTR_MPSAFE,
4591 #if __FreeBSD_version > 700030
4592                                      NULL,
4593 #endif
4594                                      mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4595                 if (err != 0) {
4596                         device_printf(sc->dev, "couldn't setup intr for "
4597                                       "message %d\n", i);
4598                         goto abort_with_intr;
4599                 }
4600                 bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4601                                   sc->msix_ih[i], "s%d", i);
4602         }
4603
4604         if (mxge_verbose) {
4605                 device_printf(sc->dev, "using %d msix IRQs:",
4606                               sc->num_slices);
4607                 for (i = 0; i < sc->num_slices; i++)
4608                         printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4609                 printf("\n");
4610         }
4611         return (0);
4612
4613 abort_with_intr:
4614         for (i = 0; i < sc->num_slices; i++) {
4615                 if (sc->msix_ih[i] != NULL) {
4616                         bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4617                                           sc->msix_ih[i]);
4618                         sc->msix_ih[i] = NULL;
4619                 }
4620         }
4621         free(sc->msix_ih, M_DEVBUF);
4622
4623
4624 abort_with_res:
4625         for (i = 0; i < sc->num_slices; i++) {
4626                 rid = i + 1;
4627                 if (sc->msix_irq_res[i] != NULL)
4628                         bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4629                                              sc->msix_irq_res[i]);
4630                 sc->msix_irq_res[i] = NULL;
4631         }
4632         free(sc->msix_irq_res, M_DEVBUF);
4633
4634
4635 abort_with_msix:
4636         pci_release_msi(sc->dev);
4637
4638 abort_with_msix_table:
4639         bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4640                              sc->msix_table_res);
4641
4642         return err;
4643 }
4644
/*
 * Allocate a single interrupt for one-slice operation: prefer MSI
 * (rid 1) when exactly one message is available and can be allocated,
 * otherwise fall back to a shared legacy INTx line (rid 0).
 */
static int
mxge_add_single_irq(mxge_softc_t *sc)
{
	int count, err, rid;

	count = pci_msi_count(sc->dev);
	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
		rid = 1;
	} else {
		rid = 0;
		sc->legacy_irq = 1;
	}
	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
					     RF_SHAREABLE | RF_ACTIVE);
	if (sc->irq_res == NULL) {
		device_printf(sc->dev, "could not alloc interrupt\n");
		return ENXIO;
	}
	if (mxge_verbose)
		device_printf(sc->dev, "using %s irq %jd\n",
			      sc->legacy_irq ? "INTx" : "MSI",
			      rman_get_start(sc->irq_res));
	err = bus_setup_intr(sc->dev, sc->irq_res,
			     INTR_TYPE_NET | INTR_MPSAFE,
#if __FreeBSD_version > 700030
			     NULL,
#endif
			     mxge_intr, &sc->ss[0], &sc->ih);
	if (err != 0) {
		/* unwind; the rid matches the one chosen above */
		bus_release_resource(sc->dev, SYS_RES_IRQ,
				     sc->legacy_irq ? 0 : 1, sc->irq_res);
		if (!sc->legacy_irq)
			pci_release_msi(sc->dev);
	}
	return err;
}
4681
4682 static void
4683 mxge_rem_msix_irqs(mxge_softc_t *sc)
4684 {
4685         int i, rid;
4686
4687         for (i = 0; i < sc->num_slices; i++) {
4688                 if (sc->msix_ih[i] != NULL) {
4689                         bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4690                                           sc->msix_ih[i]);
4691                         sc->msix_ih[i] = NULL;
4692                 }
4693         }
4694         free(sc->msix_ih, M_DEVBUF);
4695
4696         for (i = 0; i < sc->num_slices; i++) {
4697                 rid = i + 1;
4698                 if (sc->msix_irq_res[i] != NULL)
4699                         bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4700                                              sc->msix_irq_res[i]);
4701                 sc->msix_irq_res[i] = NULL;
4702         }
4703         free(sc->msix_irq_res, M_DEVBUF);
4704
4705         bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4706                              sc->msix_table_res);
4707
4708         pci_release_msi(sc->dev);
4709         return;
4710 }
4711
4712 static void
4713 mxge_rem_single_irq(mxge_softc_t *sc)
4714 {
4715         bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4716         bus_release_resource(sc->dev, SYS_RES_IRQ,
4717                              sc->legacy_irq ? 0 : 1, sc->irq_res);
4718         if (!sc->legacy_irq)
4719                 pci_release_msi(sc->dev);
4720 }
4721
4722 static void
4723 mxge_rem_irq(mxge_softc_t *sc)
4724 {
4725         if (sc->num_slices > 1)
4726                 mxge_rem_msix_irqs(sc);
4727         else
4728                 mxge_rem_single_irq(sc);
4729 }
4730
4731 static int
4732 mxge_add_irq(mxge_softc_t *sc)
4733 {
4734         int err;
4735
4736         if (sc->num_slices > 1)
4737                 err = mxge_add_msix_irqs(sc);
4738         else
4739                 err = mxge_add_single_irq(sc);
4740         
4741         if (0 && err == 0 && sc->num_slices > 1) {
4742                 mxge_rem_msix_irqs(sc);
4743                 err = mxge_add_msix_irqs(sc);
4744         }
4745         return err;
4746 }
4747
4748
4749 static int
4750 mxge_attach(device_t dev)
4751 {
4752         mxge_cmd_t cmd;
4753         mxge_softc_t *sc = device_get_softc(dev);
4754         struct ifnet *ifp;
4755         int err, rid;
4756
4757         sc->dev = dev;
4758         mxge_fetch_tunables(sc);
4759
4760         TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4761         sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4762                                   taskqueue_thread_enqueue, &sc->tq);
4763         if (sc->tq == NULL) {
4764                 err = ENOMEM;
4765                 goto abort_with_nothing;
4766         }
4767
4768         err = bus_dma_tag_create(bus_get_dma_tag(dev),  /* parent */
4769                                  1,                     /* alignment */
4770                                  0,                     /* boundary */
4771                                  BUS_SPACE_MAXADDR,     /* low */
4772                                  BUS_SPACE_MAXADDR,     /* high */
4773                                  NULL, NULL,            /* filter */
4774                                  65536 + 256,           /* maxsize */
4775                                  MXGE_MAX_SEND_DESC,    /* num segs */
4776                                  65536,                 /* maxsegsize */
4777                                  0,                     /* flags */
4778                                  NULL, NULL,            /* lock */
4779                                  &sc->parent_dmat);     /* tag */
4780
4781         if (err != 0) {
4782                 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4783                               err);
4784                 goto abort_with_tq;
4785         }
4786
4787         ifp = sc->ifp = if_alloc(IFT_ETHER);
4788         if (ifp == NULL) {
4789                 device_printf(dev, "can not if_alloc()\n");
4790                 err = ENOSPC;
4791                 goto abort_with_parent_dmat;
4792         }
4793         if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4794
4795         snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4796                  device_get_nameunit(dev));
4797         mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4798         snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4799                  "%s:drv", device_get_nameunit(dev));
4800         mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4801                  MTX_NETWORK_LOCK, MTX_DEF);
4802
4803         callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4804
4805         mxge_setup_cfg_space(sc);
4806         
4807         /* Map the board into the kernel */
4808         rid = PCIR_BARS;
4809         sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4810                                              RF_ACTIVE);
4811         if (sc->mem_res == NULL) {
4812                 device_printf(dev, "could not map memory\n");
4813                 err = ENXIO;
4814                 goto abort_with_lock;
4815         }
4816         sc->sram = rman_get_virtual(sc->mem_res);
4817         sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4818         if (sc->sram_size > rman_get_size(sc->mem_res)) {
4819                 device_printf(dev, "impossible memory region size %jd\n",
4820                               rman_get_size(sc->mem_res));
4821                 err = ENXIO;
4822                 goto abort_with_mem_res;
4823         }
4824
4825         /* make NULL terminated copy of the EEPROM strings section of
4826            lanai SRAM */
4827         bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4828         bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4829                                 rman_get_bushandle(sc->mem_res),
4830                                 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4831                                 sc->eeprom_strings,
4832                                 MXGE_EEPROM_STRINGS_SIZE - 2);
4833         err = mxge_parse_strings(sc);
4834         if (err != 0)
4835                 goto abort_with_mem_res;
4836
4837         /* Enable write combining for efficient use of PCIe bus */
4838         mxge_enable_wc(sc);
4839
4840         /* Allocate the out of band dma memory */
4841         err = mxge_dma_alloc(sc, &sc->cmd_dma,
4842                              sizeof (mxge_cmd_t), 64);
4843         if (err != 0)
4844                 goto abort_with_mem_res;
4845         sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4846         err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4847         if (err != 0)
4848                 goto abort_with_cmd_dma;
4849
4850         err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4851         if (err != 0)
4852                 goto abort_with_zeropad_dma;
4853
4854         /* select & load the firmware */
4855         err = mxge_select_firmware(sc);
4856         if (err != 0)
4857                 goto abort_with_dmabench;
4858         sc->intr_coal_delay = mxge_intr_coal_delay;
4859
4860         mxge_slice_probe(sc);
4861         err = mxge_alloc_slices(sc);
4862         if (err != 0)
4863                 goto abort_with_dmabench;
4864
4865         err = mxge_reset(sc, 0);
4866         if (err != 0)
4867                 goto abort_with_slices;
4868
4869         err = mxge_alloc_rings(sc);
4870         if (err != 0) {
4871                 device_printf(sc->dev, "failed to allocate rings\n");
4872                 goto abort_with_slices;
4873         }
4874
4875         err = mxge_add_irq(sc);
4876         if (err != 0) {
4877                 device_printf(sc->dev, "failed to add irq\n");
4878                 goto abort_with_rings;
4879         }
4880
4881         ifp->if_baudrate = IF_Gbps(10);
4882         ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4883                 IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4884                 IFCAP_RXCSUM_IPV6;
4885 #if defined(INET) || defined(INET6)
4886         ifp->if_capabilities |= IFCAP_LRO;
4887 #endif
4888
4889 #ifdef MXGE_NEW_VLAN_API
4890         ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4891
4892         /* Only FW 1.4.32 and newer can do TSO over vlans */
4893         if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4894             sc->fw_ver_tiny >= 32)
4895                 ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4896 #endif
4897         sc->max_mtu = mxge_max_mtu(sc);
4898         if (sc->max_mtu >= 9000)
4899                 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4900         else
4901                 device_printf(dev, "MTU limited to %d.  Install "
4902                               "latest firmware for 9000 byte jumbo support\n",
4903                               sc->max_mtu - ETHER_HDR_LEN);
4904         ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4905         ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4906         /* check to see if f/w supports TSO for IPv6 */
4907         if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4908                 if (CSUM_TCP_IPV6)
4909                         ifp->if_capabilities |= IFCAP_TSO6;
4910                 sc->max_tso6_hlen = min(cmd.data0,
4911                                         sizeof (sc->ss[0].scratch));
4912         }
4913         ifp->if_capenable = ifp->if_capabilities;
4914         if (sc->lro_cnt == 0)
4915                 ifp->if_capenable &= ~IFCAP_LRO;
4916         ifp->if_init = mxge_init;
4917         ifp->if_softc = sc;
4918         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4919         ifp->if_ioctl = mxge_ioctl;
4920         ifp->if_start = mxge_start;
4921         ifp->if_get_counter = mxge_get_counter;
4922         /* Initialise the ifmedia structure */
4923         ifmedia_init(&sc->media, 0, mxge_media_change,
4924                      mxge_media_status);
4925         mxge_media_init(sc);
4926         mxge_media_probe(sc);
4927         sc->dying = 0;
4928         ether_ifattach(ifp, sc->mac_addr);
4929         /* ether_ifattach sets mtu to ETHERMTU */
4930         if (mxge_initial_mtu != ETHERMTU)
4931                 mxge_change_mtu(sc, mxge_initial_mtu);
4932
4933         mxge_add_sysctls(sc);
4934 #ifdef IFNET_BUF_RING
4935         ifp->if_transmit = mxge_transmit;
4936         ifp->if_qflush = mxge_qflush;
4937 #endif
4938         taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4939                                 device_get_nameunit(sc->dev));
4940         callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4941         return 0;
4942
4943 abort_with_rings:
4944         mxge_free_rings(sc);
4945 abort_with_slices:
4946         mxge_free_slices(sc);
4947 abort_with_dmabench:
4948         mxge_dma_free(&sc->dmabench_dma);
4949 abort_with_zeropad_dma:
4950         mxge_dma_free(&sc->zeropad_dma);
4951 abort_with_cmd_dma:
4952         mxge_dma_free(&sc->cmd_dma);
4953 abort_with_mem_res:
4954         bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4955 abort_with_lock:
4956         pci_disable_busmaster(dev);
4957         mtx_destroy(&sc->cmd_mtx);
4958         mtx_destroy(&sc->driver_mtx);
4959         if_free(ifp);
4960 abort_with_parent_dmat:
4961         bus_dma_tag_destroy(sc->parent_dmat);
4962 abort_with_tq:
4963         if (sc->tq != NULL) {
4964                 taskqueue_drain(sc->tq, &sc->watchdog_task);
4965                 taskqueue_free(sc->tq);
4966                 sc->tq = NULL;
4967         }
4968 abort_with_nothing:
4969         return err;
4970 }
4971
/*
 * Device detach method: undo everything mxge_attach() set up, in
 * reverse order of acquisition.  Returns EBUSY (without touching any
 * state) while vlan(4) interfaces are still stacked on this device.
 */
static int
mxge_detach(device_t dev)
{
        mxge_softc_t *sc = device_get_softc(dev);

        /* Refuse to detach while vlan children are attached. */
        if (mxge_vlans_active(sc)) {
                device_printf(sc->dev,
                              "Detach vlans before removing module\n");
                return EBUSY;
        }
        /*
         * Mark the device dying and bring the interface down under the
         * driver lock so concurrent ioctls/ticks see a consistent state.
         */
        mtx_lock(&sc->driver_mtx);
        sc->dying = 1;
        if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
                mxge_close(sc, 0);
        mtx_unlock(&sc->driver_mtx);
        /* Unhook from the network stack before tearing down resources. */
        ether_ifdetach(sc->ifp);
        /* Drain any pending watchdog work, then destroy the taskqueue. */
        if (sc->tq != NULL) {
                taskqueue_drain(sc->tq, &sc->watchdog_task);
                taskqueue_free(sc->tq);
                sc->tq = NULL;
        }
        /* Wait for the tick callout to finish before freeing its state. */
        callout_drain(&sc->co_hdl);
        ifmedia_removeall(&sc->media);
        /* NOTE(review): presumably stops firmware-side dummy RDMA before
         * the DMA buffers below are released — confirm against
         * mxge_dummy_rdma(). */
        mxge_dummy_rdma(sc, 0);
        mxge_rem_sysctls(sc);
        mxge_rem_irq(sc);
        /* Free rings, slices and out-of-band DMA memory. */
        mxge_free_rings(sc);
        mxge_free_slices(sc);
        mxge_dma_free(&sc->dmabench_dma);
        mxge_dma_free(&sc->zeropad_dma);
        mxge_dma_free(&sc->cmd_dma);
        /* Release the BAR mapping and quiesce PCI bus mastering. */
        bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
        pci_disable_busmaster(dev);
        mtx_destroy(&sc->cmd_mtx);
        mtx_destroy(&sc->driver_mtx);
        if_free(sc->ifp);
        bus_dma_tag_destroy(sc->parent_dmat);
        return 0;
}
5011
5012 static int
5013 mxge_shutdown(device_t dev)
5014 {
5015         return 0;
5016 }
5017
5018 /*
5019   This file uses Myri10GE driver indentation.
5020
5021   Local Variables:
5022   c-file-style:"linux"
5023   tab-width:8
5024   End:
5025 */