]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/dev/mxge/if_mxge.c
Merge wpa_supplicant/hostapd 2.4.
[FreeBSD/FreeBSD.git] / sys / dev / mxge / if_mxge.c
1 /******************************************************************************
2
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49
50 #include <net/if.h>
51 #include <net/if_var.h>
52 #include <net/if_arp.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
56
57 #include <net/bpf.h>
58
59 #include <net/if_types.h>
60 #include <net/if_vlan_var.h>
61 #include <net/zlib.h>
62
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/ip6.h>
67 #include <netinet/tcp.h>
68 #include <netinet/tcp_lro.h>
69 #include <netinet6/ip6_var.h>
70
71 #include <machine/bus.h>
72 #include <machine/in_cksum.h>
73 #include <machine/resource.h>
74 #include <sys/bus.h>
75 #include <sys/rman.h>
76 #include <sys/smp.h>
77
78 #include <dev/pci/pcireg.h>
79 #include <dev/pci/pcivar.h>
80 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
81
82 #include <vm/vm.h>              /* for pmap_mapdev() */
83 #include <vm/pmap.h>
84
85 #if defined(__i386) || defined(__amd64)
86 #include <machine/specialreg.h>
87 #endif
88
89 #include <dev/mxge/mxge_mcp.h>
90 #include <dev/mxge/mcp_gen_header.h>
91 /*#define MXGE_FAKE_IFP*/
92 #include <dev/mxge/if_mxge_var.h>
93 #ifdef IFNET_BUF_RING
94 #include <sys/buf_ring.h>
95 #endif
96
97 #include "opt_inet.h"
98 #include "opt_inet6.h"
99
100 /* tunable params */
101 static int mxge_nvidia_ecrc_enable = 1;
102 static int mxge_force_firmware = 0;
103 static int mxge_intr_coal_delay = 30;
104 static int mxge_deassert_wait = 1;
105 static int mxge_flow_control = 1;
106 static int mxge_verbose = 0;
107 static int mxge_ticks;
108 static int mxge_max_slices = 1;
109 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
110 static int mxge_always_promisc = 0;
111 static int mxge_initial_mtu = ETHERMTU_JUMBO;
112 static int mxge_throttle = 0;
113 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
114 static char *mxge_fw_aligned = "mxge_eth_z8e";
115 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
116 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
117
118 static int mxge_probe(device_t dev);
119 static int mxge_attach(device_t dev);
120 static int mxge_detach(device_t dev);
121 static int mxge_shutdown(device_t dev);
122 static void mxge_intr(void *arg);
123
124 static device_method_t mxge_methods[] =
125 {
126   /* Device interface */
127   DEVMETHOD(device_probe, mxge_probe),
128   DEVMETHOD(device_attach, mxge_attach),
129   DEVMETHOD(device_detach, mxge_detach),
130   DEVMETHOD(device_shutdown, mxge_shutdown),
131
132   DEVMETHOD_END
133 };
134
135 static driver_t mxge_driver =
136 {
137   "mxge",
138   mxge_methods,
139   sizeof(mxge_softc_t),
140 };
141
142 static devclass_t mxge_devclass;
143
144 /* Declare ourselves to be a child of the PCI bus.*/
145 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
146 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
147 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
148
149 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
150 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
151 static int mxge_close(mxge_softc_t *sc, int down);
152 static int mxge_open(mxge_softc_t *sc);
153 static void mxge_tick(void *arg);
154
155 static int
156 mxge_probe(device_t dev)
157 {
158         int rev;
159
160
161         if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
162             ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
163              (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
164                 rev = pci_get_revid(dev);
165                 switch (rev) {
166                 case MXGE_PCI_REV_Z8E:
167                         device_set_desc(dev, "Myri10G-PCIE-8A");
168                         break;
169                 case MXGE_PCI_REV_Z8ES:
170                         device_set_desc(dev, "Myri10G-PCIE-8B");
171                         break;
172                 default:
173                         device_set_desc(dev, "Myri10G-PCIE-8??");
174                         device_printf(dev, "Unrecognized rev %d NIC\n",
175                                       rev);
176                         break;  
177                 }
178                 return 0;
179         }
180         return ENXIO;
181 }
182
183 static void
184 mxge_enable_wc(mxge_softc_t *sc)
185 {
186 #if defined(__i386) || defined(__amd64)
187         vm_offset_t len;
188         int err;
189
190         sc->wc = 1;
191         len = rman_get_size(sc->mem_res);
192         err = pmap_change_attr((vm_offset_t) sc->sram,
193                                len, PAT_WRITE_COMBINING);
194         if (err != 0) {
195                 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
196                               err);
197                 sc->wc = 0;
198         }
199 #endif          
200 }
201
202
203 /* callback to get our DMA address */
204 static void
205 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
206                          int error)
207 {
208         if (error == 0) {
209                 *(bus_addr_t *) arg = segs->ds_addr;
210         }
211 }
212
213 static int
214 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
215                    bus_size_t alignment)
216 {
217         int err;
218         device_t dev = sc->dev;
219         bus_size_t boundary, maxsegsize;
220
221         if (bytes > 4096 && alignment == 4096) {
222                 boundary = 0;
223                 maxsegsize = bytes;
224         } else {
225                 boundary = 4096;
226                 maxsegsize = 4096;
227         }
228
229         /* allocate DMAable memory tags */
230         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
231                                  alignment,             /* alignment */
232                                  boundary,              /* boundary */
233                                  BUS_SPACE_MAXADDR,     /* low */
234                                  BUS_SPACE_MAXADDR,     /* high */
235                                  NULL, NULL,            /* filter */
236                                  bytes,                 /* maxsize */
237                                  1,                     /* num segs */
238                                  maxsegsize,            /* maxsegsize */
239                                  BUS_DMA_COHERENT,      /* flags */
240                                  NULL, NULL,            /* lock */
241                                  &dma->dmat);           /* tag */
242         if (err != 0) {
243                 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
244                 return err;
245         }
246
247         /* allocate DMAable memory & map */
248         err = bus_dmamem_alloc(dma->dmat, &dma->addr,
249                                (BUS_DMA_WAITOK | BUS_DMA_COHERENT
250                                 | BUS_DMA_ZERO),  &dma->map);
251         if (err != 0) {
252                 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
253                 goto abort_with_dmat;
254         }
255
256         /* load the memory */
257         err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
258                               mxge_dmamap_callback,
259                               (void *)&dma->bus_addr, 0);
260         if (err != 0) {
261                 device_printf(dev, "couldn't load map (err = %d)\n", err);
262                 goto abort_with_mem;
263         }
264         return 0;
265
266 abort_with_mem:
267         bus_dmamem_free(dma->dmat, dma->addr, dma->map);
268 abort_with_dmat:
269         (void)bus_dma_tag_destroy(dma->dmat);
270         return err;
271 }
272
273
274 static void
275 mxge_dma_free(mxge_dma_t *dma)
276 {
277         bus_dmamap_unload(dma->dmat, dma->map);
278         bus_dmamem_free(dma->dmat, dma->addr, dma->map);
279         (void)bus_dma_tag_destroy(dma->dmat);
280 }
281
282 /*
283  * The eeprom strings on the lanaiX have the format
284  * SN=x\0
285  * MAC=x:x:x:x:x:x\0
286  * PC=text\0
287  */
288
289 static int
290 mxge_parse_strings(mxge_softc_t *sc)
291 {
292         char *ptr;
293         int i, found_mac, found_sn2;
294         char *endptr;
295
296         ptr = sc->eeprom_strings;
297         found_mac = 0;
298         found_sn2 = 0;
299         while (*ptr != '\0') {
300                 if (strncmp(ptr, "MAC=", 4) == 0) {
301                         ptr += 4;
302                         for (i = 0;;) {
303                                 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
304                                 if (endptr - ptr != 2)
305                                         goto abort;
306                                 ptr = endptr;
307                                 if (++i == 6)
308                                         break;
309                                 if (*ptr++ != ':')
310                                         goto abort;
311                         }
312                         found_mac = 1;
313                 } else if (strncmp(ptr, "PC=", 3) == 0) {
314                         ptr += 3;
315                         strlcpy(sc->product_code_string, ptr,
316                             sizeof(sc->product_code_string));
317                 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
318                         ptr += 3;
319                         strlcpy(sc->serial_number_string, ptr,
320                             sizeof(sc->serial_number_string));
321                 } else if (strncmp(ptr, "SN2=", 4) == 0) {
322                         /* SN2 takes precedence over SN */
323                         ptr += 4;
324                         found_sn2 = 1;
325                         strlcpy(sc->serial_number_string, ptr,
326                             sizeof(sc->serial_number_string));
327                 }
328                 while (*ptr++ != '\0') {}
329         }
330
331         if (found_mac)
332                 return 0;
333
334  abort:
335         device_printf(sc->dev, "failed to parse eeprom_strings\n");
336
337         return ENXIO;
338 }
339
340 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
341 static void
342 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
343 {
344         uint32_t val;
345         unsigned long base, off;
346         char *va, *cfgptr;
347         device_t pdev, mcp55;
348         uint16_t vendor_id, device_id, word;
349         uintptr_t bus, slot, func, ivend, idev;
350         uint32_t *ptr32;
351
352
353         if (!mxge_nvidia_ecrc_enable)
354                 return;
355
356         pdev = device_get_parent(device_get_parent(sc->dev));
357         if (pdev == NULL) {
358                 device_printf(sc->dev, "could not find parent?\n");
359                 return;
360         }
361         vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
362         device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
363
364         if (vendor_id != 0x10de)
365                 return;
366
367         base = 0;
368
369         if (device_id == 0x005d) {
370                 /* ck804, base address is magic */
371                 base = 0xe0000000UL;
372         } else if (device_id >= 0x0374 && device_id <= 0x378) {
373                 /* mcp55, base address stored in chipset */
374                 mcp55 = pci_find_bsf(0, 0, 0);
375                 if (mcp55 &&
376                     0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
377                     0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
378                         word = pci_read_config(mcp55, 0x90, 2);
379                         base = ((unsigned long)word & 0x7ffeU) << 25;
380                 }
381         }
382         if (!base)
383                 return;
384
385         /* XXXX
386            Test below is commented because it is believed that doing
387            config read/write beyond 0xff will access the config space
388            for the next larger function.  Uncomment this and remove
389            the hacky pmap_mapdev() way of accessing config space when
390            FreeBSD grows support for extended pcie config space access
391         */
392 #if 0   
393         /* See if we can, by some miracle, access the extended
394            config space */
395         val = pci_read_config(pdev, 0x178, 4);
396         if (val != 0xffffffff) {
397                 val |= 0x40;
398                 pci_write_config(pdev, 0x178, val, 4);
399                 return;
400         }
401 #endif
402         /* Rather than using normal pci config space writes, we must
403          * map the Nvidia config space ourselves.  This is because on
404          * opteron/nvidia class machine the 0xe000000 mapping is
405          * handled by the nvidia chipset, that means the internal PCI
406          * device (the on-chip northbridge), or the amd-8131 bridge
407          * and things behind them are not visible by this method.
408          */
409
410         BUS_READ_IVAR(device_get_parent(pdev), pdev,
411                       PCI_IVAR_BUS, &bus);
412         BUS_READ_IVAR(device_get_parent(pdev), pdev,
413                       PCI_IVAR_SLOT, &slot);
414         BUS_READ_IVAR(device_get_parent(pdev), pdev,
415                       PCI_IVAR_FUNCTION, &func);
416         BUS_READ_IVAR(device_get_parent(pdev), pdev,
417                       PCI_IVAR_VENDOR, &ivend);
418         BUS_READ_IVAR(device_get_parent(pdev), pdev,
419                       PCI_IVAR_DEVICE, &idev);
420                                         
421         off =  base
422                 + 0x00100000UL * (unsigned long)bus
423                 + 0x00001000UL * (unsigned long)(func
424                                                  + 8 * slot);
425
426         /* map it into the kernel */
427         va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
428         
429
430         if (va == NULL) {
431                 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
432                 return;
433         }
434         /* get a pointer to the config space mapped into the kernel */
435         cfgptr = va + (off & PAGE_MASK);
436
437         /* make sure that we can really access it */
438         vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
439         device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
440         if (! (vendor_id == ivend && device_id == idev)) {
441                 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
442                               vendor_id, device_id);
443                 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
444                 return;
445         }
446
447         ptr32 = (uint32_t*)(cfgptr + 0x178);
448         val = *ptr32;
449
450         if (val == 0xffffffff) {
451                 device_printf(sc->dev, "extended mapping failed\n");
452                 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
453                 return;
454         }
455         *ptr32 = val | 0x40;
456         pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
457         if (mxge_verbose)
458                 device_printf(sc->dev,
459                               "Enabled ECRC on upstream Nvidia bridge "
460                               "at %d:%d:%d\n",
461                               (int)bus, (int)slot, (int)func);
462         return;
463 }
464 #else
465 static void
466 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
467 {
468         device_printf(sc->dev,
469                       "Nforce 4 chipset on non-x86/amd64!?!?!\n");
470         return;
471 }
472 #endif
473
474
475 static int
476 mxge_dma_test(mxge_softc_t *sc, int test_type)
477 {
478         mxge_cmd_t cmd;
479         bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
480         int status;
481         uint32_t len;
482         char *test = " ";
483
484
485         /* Run a small DMA test.
486          * The magic multipliers to the length tell the firmware
487          * to do DMA read, write, or read+write tests.  The
488          * results are returned in cmd.data0.  The upper 16
489          * bits of the return is the number of transfers completed.
490          * The lower 16 bits is the time in 0.5us ticks that the
491          * transfers took to complete.
492          */
493
494         len = sc->tx_boundary;
495
496         cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
497         cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
498         cmd.data2 = len * 0x10000;
499         status = mxge_send_cmd(sc, test_type, &cmd);
500         if (status != 0) {
501                 test = "read";
502                 goto abort;
503         }
504         sc->read_dma = ((cmd.data0>>16) * len * 2) /
505                 (cmd.data0 & 0xffff);
506         cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
507         cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
508         cmd.data2 = len * 0x1;
509         status = mxge_send_cmd(sc, test_type, &cmd);
510         if (status != 0) {
511                 test = "write";
512                 goto abort;
513         }
514         sc->write_dma = ((cmd.data0>>16) * len * 2) /
515                 (cmd.data0 & 0xffff);
516
517         cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
518         cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
519         cmd.data2 = len * 0x10001;
520         status = mxge_send_cmd(sc, test_type, &cmd);
521         if (status != 0) {
522                 test = "read/write";
523                 goto abort;
524         }
525         sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
526                 (cmd.data0 & 0xffff);
527
528 abort:
529         if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
530                 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
531                               test, status);
532
533         return status;
534 }
535
536 /*
537  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
538  * when the PCI-E Completion packets are aligned on an 8-byte
539  * boundary.  Some PCI-E chip sets always align Completion packets; on
540  * the ones that do not, the alignment can be enforced by enabling
541  * ECRC generation (if supported).
542  *
543  * When PCI-E Completion packets are not aligned, it is actually more
544  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
545  *
546  * If the driver can neither enable ECRC nor verify that it has
547  * already been enabled, then it must use a firmware image which works
548  * around unaligned completion packets (ethp_z8e.dat), and it should
549  * also ensure that it never gives the device a Read-DMA which is
550  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
551  * enabled, then the driver should use the aligned (eth_z8e.dat)
552  * firmware image, and set tx_boundary to 4KB.
553  */
554
555 static int
556 mxge_firmware_probe(mxge_softc_t *sc)
557 {
558         device_t dev = sc->dev;
559         int reg, status;
560         uint16_t pectl;
561
562         sc->tx_boundary = 4096;
563         /*
564          * Verify the max read request size was set to 4KB
565          * before trying the test with 4KB.
566          */
567         if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
568                 pectl = pci_read_config(dev, reg + 0x8, 2);
569                 if ((pectl & (5 << 12)) != (5 << 12)) {
570                         device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
571                                       pectl);
572                         sc->tx_boundary = 2048;
573                 }
574         }
575
576         /*
577          * load the optimized firmware (which assumes aligned PCIe
578          * completions) in order to see if it works on this host.
579          */
580         sc->fw_name = mxge_fw_aligned;
581         status = mxge_load_firmware(sc, 1);
582         if (status != 0) {
583                 return status;
584         }
585
586         /*
587          * Enable ECRC if possible
588          */
589         mxge_enable_nvidia_ecrc(sc);
590
591         /*
592          * Run a DMA test which watches for unaligned completions and
593          * aborts on the first one seen.  Not required on Z8ES or newer.
594          */
595         if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
596                 return 0;
597         status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
598         if (status == 0)
599                 return 0; /* keep the aligned firmware */
600
601         if (status != E2BIG)
602                 device_printf(dev, "DMA test failed: %d\n", status);
603         if (status == ENOSYS)
604                 device_printf(dev, "Falling back to ethp! "
605                               "Please install up to date fw\n");
606         return status;
607 }
608
609 static int
610 mxge_select_firmware(mxge_softc_t *sc)
611 {
612         int aligned = 0;
613         int force_firmware = mxge_force_firmware;
614
615         if (sc->throttle)
616                 force_firmware = sc->throttle;
617
618         if (force_firmware != 0) {
619                 if (force_firmware == 1)
620                         aligned = 1;
621                 else
622                         aligned = 0;
623                 if (mxge_verbose)
624                         device_printf(sc->dev,
625                                       "Assuming %s completions (forced)\n",
626                                       aligned ? "aligned" : "unaligned");
627                 goto abort;
628         }
629
630         /* if the PCIe link width is 4 or less, we can use the aligned
631            firmware and skip any checks */
632         if (sc->link_width != 0 && sc->link_width <= 4) {
633                 device_printf(sc->dev,
634                               "PCIe x%d Link, expect reduced performance\n",
635                               sc->link_width);
636                 aligned = 1;
637                 goto abort;
638         }
639
640         if (0 == mxge_firmware_probe(sc))
641                 return 0;
642
643 abort:
644         if (aligned) {
645                 sc->fw_name = mxge_fw_aligned;
646                 sc->tx_boundary = 4096;
647         } else {
648                 sc->fw_name = mxge_fw_unaligned;
649                 sc->tx_boundary = 2048;
650         }
651         return (mxge_load_firmware(sc, 0));
652 }
653
654 static int
655 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
656 {
657
658
659         if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
660                 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
661                               be32toh(hdr->mcp_type));
662                 return EIO;
663         }
664
665         /* save firmware version for sysctl */
666         strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
667         if (mxge_verbose)
668                 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
669
670         sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
671                &sc->fw_ver_minor, &sc->fw_ver_tiny);
672
673         if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
674               && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
675                 device_printf(sc->dev, "Found firmware version %s\n",
676                               sc->fw_version);
677                 device_printf(sc->dev, "Driver needs %d.%d\n",
678                               MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
679                 return EINVAL;
680         }
681         return 0;
682
683 }
684
685 static void *
686 z_alloc(void *nil, u_int items, u_int size)
687 {
688         void *ptr;
689
690         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
691         return ptr;
692 }
693
694 static void
695 z_free(void *nil, void *ptr)
696 {
697         free(ptr, M_TEMP);
698 }
699
700
701 static int
702 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
703 {
704         z_stream zs;
705         char *inflate_buffer;
706         const struct firmware *fw;
707         const mcp_gen_header_t *hdr;
708         unsigned hdr_offset;
709         int status;
710         unsigned int i;
711         char dummy;
712         size_t fw_len;
713
714         fw = firmware_get(sc->fw_name);
715         if (fw == NULL) {
716                 device_printf(sc->dev, "Could not find firmware image %s\n",
717                               sc->fw_name);
718                 return ENOENT;
719         }
720
721
722
723         /* setup zlib and decompress f/w */
724         bzero(&zs, sizeof (zs));
725         zs.zalloc = z_alloc;
726         zs.zfree = z_free;
727         status = inflateInit(&zs);
728         if (status != Z_OK) {
729                 status = EIO;
730                 goto abort_with_fw;
731         }
732
733         /* the uncompressed size is stored as the firmware version,
734            which would otherwise go unused */
735         fw_len = (size_t) fw->version;
736         inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
737         if (inflate_buffer == NULL)
738                 goto abort_with_zs;
739         zs.avail_in = fw->datasize;
740         zs.next_in = __DECONST(char *, fw->data);
741         zs.avail_out = fw_len;
742         zs.next_out = inflate_buffer;
743         status = inflate(&zs, Z_FINISH);
744         if (status != Z_STREAM_END) {
745                 device_printf(sc->dev, "zlib %d\n", status);
746                 status = EIO;
747                 goto abort_with_buffer;
748         }
749
750         /* check id */
751         hdr_offset = htobe32(*(const uint32_t *)
752                              (inflate_buffer + MCP_HEADER_PTR_OFFSET));
753         if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
754                 device_printf(sc->dev, "Bad firmware file");
755                 status = EIO;
756                 goto abort_with_buffer;
757         }
758         hdr = (const void*)(inflate_buffer + hdr_offset);
759
760         status = mxge_validate_firmware(sc, hdr);
761         if (status != 0)
762                 goto abort_with_buffer;
763
764         /* Copy the inflated firmware to NIC SRAM. */
765         for (i = 0; i < fw_len; i += 256) {
766                 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
767                               inflate_buffer + i,
768                               min(256U, (unsigned)(fw_len - i)));
769                 wmb();
770                 dummy = *sc->sram;
771                 wmb();
772         }
773
774         *limit = fw_len;
775         status = 0;
776 abort_with_buffer:
777         free(inflate_buffer, M_TEMP);
778 abort_with_zs:
779         inflateEnd(&zs);
780 abort_with_fw:
781         firmware_put(fw, FIRMWARE_UNLOAD);
782         return status;
783 }
784
785 /*
786  * Enable or disable periodic RDMAs from the host to make certain
787  * chipsets resend dropped PCIe messages
788  */
789
790 static void
791 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
792 {
793         char buf_bytes[72];
794         volatile uint32_t *confirm;
795         volatile char *submit;
796         uint32_t *buf, dma_low, dma_high;
797         int i;
798
799         buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
800
801         /* clear confirmation addr */
802         confirm = (volatile uint32_t *)sc->cmd;
803         *confirm = 0;
804         wmb();
805
806         /* send an rdma command to the PCIe engine, and wait for the
807            response in the confirmation address.  The firmware should
808            write a -1 there to indicate it is alive and well
809         */
810
811         dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
812         dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
813         buf[0] = htobe32(dma_high);             /* confirm addr MSW */
814         buf[1] = htobe32(dma_low);              /* confirm addr LSW */
815         buf[2] = htobe32(0xffffffff);           /* confirm data */
816         dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
817         dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
818         buf[3] = htobe32(dma_high);             /* dummy addr MSW */
819         buf[4] = htobe32(dma_low);              /* dummy addr LSW */
820         buf[5] = htobe32(enable);                       /* enable? */
821
822
823         submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
824
825         mxge_pio_copy(submit, buf, 64);
826         wmb();
827         DELAY(1000);
828         wmb();
829         i = 0;
830         while (*confirm != 0xffffffff && i < 20) {
831                 DELAY(1000);
832                 i++;
833         }
834         if (*confirm != 0xffffffff) {
835                 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
836                               (enable ? "enable" : "disable"), confirm,
837                               *confirm);
838         }
839         return;
840 }
841
842 static int
843 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
844 {
845         mcp_cmd_t *buf;
846         char buf_bytes[sizeof(*buf) + 8];
847         volatile mcp_cmd_response_t *response = sc->cmd;
848         volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
849         uint32_t dma_low, dma_high;
850         int err, sleep_total = 0;
851
852         /* ensure buf is aligned to 8 bytes */
853         buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
854
855         buf->data0 = htobe32(data->data0);
856         buf->data1 = htobe32(data->data1);
857         buf->data2 = htobe32(data->data2);
858         buf->cmd = htobe32(cmd);
859         dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
860         dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
861
862         buf->response_addr.low = htobe32(dma_low);
863         buf->response_addr.high = htobe32(dma_high);
864         mtx_lock(&sc->cmd_mtx);
865         response->result = 0xffffffff;
866         wmb();
867         mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
868
869         /* wait up to 20ms */
870         err = EAGAIN;
871         for (sleep_total = 0; sleep_total <  20; sleep_total++) {
872                 bus_dmamap_sync(sc->cmd_dma.dmat,
873                                 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
874                 wmb();
875                 switch (be32toh(response->result)) {
876                 case 0:
877                         data->data0 = be32toh(response->data);
878                         err = 0;
879                         break;
880                 case 0xffffffff:
881                         DELAY(1000);
882                         break;
883                 case MXGEFW_CMD_UNKNOWN:
884                         err = ENOSYS;
885                         break;
886                 case MXGEFW_CMD_ERROR_UNALIGNED:
887                         err = E2BIG;
888                         break;
889                 case MXGEFW_CMD_ERROR_BUSY:
890                         err = EBUSY;
891                         break;
892                 case MXGEFW_CMD_ERROR_I2C_ABSENT:
893                         err = ENXIO;
894                         break;
895                 default:
896                         device_printf(sc->dev,
897                                       "mxge: command %d "
898                                       "failed, result = %d\n",
899                                       cmd, be32toh(response->result));
900                         err = ENXIO;
901                         break;
902                 }
903                 if (err != EAGAIN)
904                         break;
905         }
906         if (err == EAGAIN)
907                 device_printf(sc->dev, "mxge: command %d timed out"
908                               "result = %d\n",
909                               cmd, be32toh(response->result));
910         mtx_unlock(&sc->cmd_mtx);
911         return err;
912 }
913
914 static int
915 mxge_adopt_running_firmware(mxge_softc_t *sc)
916 {
917         struct mcp_gen_header *hdr;
918         const size_t bytes = sizeof (struct mcp_gen_header);
919         size_t hdr_offset;
920         int status;
921
922         /* find running firmware header */
923         hdr_offset = htobe32(*(volatile uint32_t *)
924                              (sc->sram + MCP_HEADER_PTR_OFFSET));
925
926         if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
927                 device_printf(sc->dev,
928                               "Running firmware has bad header offset (%d)\n",
929                               (int)hdr_offset);
930                 return EIO;
931         }
932
933         /* copy header of running firmware from SRAM to host memory to
934          * validate firmware */
935         hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
936         if (hdr == NULL) {
937                 device_printf(sc->dev, "could not malloc firmware hdr\n");
938                 return ENOMEM;
939         }
940         bus_space_read_region_1(rman_get_bustag(sc->mem_res),
941                                 rman_get_bushandle(sc->mem_res),
942                                 hdr_offset, (char *)hdr, bytes);
943         status = mxge_validate_firmware(sc, hdr);
944         free(hdr, M_DEVBUF);
945
946         /*
947          * check to see if adopted firmware has bug where adopting
948          * it will cause broadcasts to be filtered unless the NIC
949          * is kept in ALLMULTI mode
950          */
951         if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
952             sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
953                 sc->adopted_rx_filter_bug = 1;
954                 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
955                               "working around rx filter bug\n",
956                               sc->fw_ver_major, sc->fw_ver_minor,
957                               sc->fw_ver_tiny);
958         }
959
960         return status;
961 }
962
963
964 static int
965 mxge_load_firmware(mxge_softc_t *sc, int adopt)
966 {
967         volatile uint32_t *confirm;
968         volatile char *submit;
969         char buf_bytes[72];
970         uint32_t *buf, size, dma_low, dma_high;
971         int status, i;
972
973         buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
974
975         size = sc->sram_size;
976         status = mxge_load_firmware_helper(sc, &size);
977         if (status) {
978                 if (!adopt)
979                         return status;
980                 /* Try to use the currently running firmware, if
981                    it is new enough */
982                 status = mxge_adopt_running_firmware(sc);
983                 if (status) {
984                         device_printf(sc->dev,
985                                       "failed to adopt running firmware\n");
986                         return status;
987                 }
988                 device_printf(sc->dev,
989                               "Successfully adopted running firmware\n");
990                 if (sc->tx_boundary == 4096) {
991                         device_printf(sc->dev,
992                                 "Using firmware currently running on NIC"
993                                  ".  For optimal\n");
994                         device_printf(sc->dev,
995                                  "performance consider loading optimized "
996                                  "firmware\n");
997                 }
998                 sc->fw_name = mxge_fw_unaligned;
999                 sc->tx_boundary = 2048;
1000                 return 0;
1001         }
1002         /* clear confirmation addr */
1003         confirm = (volatile uint32_t *)sc->cmd;
1004         *confirm = 0;
1005         wmb();
1006         /* send a reload command to the bootstrap MCP, and wait for the
1007            response in the confirmation address.  The firmware should
1008            write a -1 there to indicate it is alive and well
1009         */
1010
1011         dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1012         dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1013
1014         buf[0] = htobe32(dma_high);     /* confirm addr MSW */
1015         buf[1] = htobe32(dma_low);      /* confirm addr LSW */
1016         buf[2] = htobe32(0xffffffff);   /* confirm data */
1017
1018         /* FIX: All newest firmware should un-protect the bottom of
1019            the sram before handoff. However, the very first interfaces
1020            do not. Therefore the handoff copy must skip the first 8 bytes
1021         */
1022                                         /* where the code starts*/
1023         buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1024         buf[4] = htobe32(size - 8);     /* length of code */
1025         buf[5] = htobe32(8);            /* where to copy to */
1026         buf[6] = htobe32(0);            /* where to jump to */
1027
1028         submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1029         mxge_pio_copy(submit, buf, 64);
1030         wmb();
1031         DELAY(1000);
1032         wmb();
1033         i = 0;
1034         while (*confirm != 0xffffffff && i < 20) {
1035                 DELAY(1000*10);
1036                 i++;
1037                 bus_dmamap_sync(sc->cmd_dma.dmat,
1038                                 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1039         }
1040         if (*confirm != 0xffffffff) {
1041                 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1042                         confirm, *confirm);
1043                 
1044                 return ENXIO;
1045         }
1046         return 0;
1047 }
1048
1049 static int
1050 mxge_update_mac_address(mxge_softc_t *sc)
1051 {
1052         mxge_cmd_t cmd;
1053         uint8_t *addr = sc->mac_addr;
1054         int status;
1055
1056         
1057         cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1058                      | (addr[2] << 8) | addr[3]);
1059
1060         cmd.data1 = ((addr[4] << 8) | (addr[5]));
1061
1062         status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1063         return status;
1064 }
1065
1066 static int
1067 mxge_change_pause(mxge_softc_t *sc, int pause)
1068 {       
1069         mxge_cmd_t cmd;
1070         int status;
1071
1072         if (pause)
1073                 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1074                                        &cmd);
1075         else
1076                 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1077                                        &cmd);
1078
1079         if (status) {
1080                 device_printf(sc->dev, "Failed to set flow control mode\n");
1081                 return ENXIO;
1082         }
1083         sc->pause = pause;
1084         return 0;
1085 }
1086
1087 static void
1088 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1089 {       
1090         mxge_cmd_t cmd;
1091         int status;
1092
1093         if (mxge_always_promisc)
1094                 promisc = 1;
1095
1096         if (promisc)
1097                 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1098                                        &cmd);
1099         else
1100                 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1101                                        &cmd);
1102
1103         if (status) {
1104                 device_printf(sc->dev, "Failed to set promisc mode\n");
1105         }
1106 }
1107
1108 static void
1109 mxge_set_multicast_list(mxge_softc_t *sc)
1110 {
1111         mxge_cmd_t cmd;
1112         struct ifmultiaddr *ifma;
1113         struct ifnet *ifp = sc->ifp;
1114         int err;
1115
1116         /* This firmware is known to not support multicast */
1117         if (!sc->fw_multicast_support)
1118                 return;
1119
1120         /* Disable multicast filtering while we play with the lists*/
1121         err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1122         if (err != 0) {
1123                 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1124                        " error status: %d\n", err);
1125                 return;
1126         }
1127         
1128         if (sc->adopted_rx_filter_bug)
1129                 return;
1130         
1131         if (ifp->if_flags & IFF_ALLMULTI)
1132                 /* request to disable multicast filtering, so quit here */
1133                 return;
1134
1135         /* Flush all the filters */
1136
1137         err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1138         if (err != 0) {
1139                 device_printf(sc->dev,
1140                               "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1141                               ", error status: %d\n", err);
1142                 return;
1143         }
1144
1145         /* Walk the multicast list, and add each address */
1146
1147         if_maddr_rlock(ifp);
1148         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1149                 if (ifma->ifma_addr->sa_family != AF_LINK)
1150                         continue;
1151                 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1152                       &cmd.data0, 4);
1153                 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1154                       &cmd.data1, 2);
1155                 cmd.data0 = htonl(cmd.data0);
1156                 cmd.data1 = htonl(cmd.data1);
1157                 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1158                 if (err != 0) {
1159                         device_printf(sc->dev, "Failed "
1160                                "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1161                                "%d\t", err);
1162                         /* abort, leaving multicast filtering off */
1163                         if_maddr_runlock(ifp);
1164                         return;
1165                 }
1166         }
1167         if_maddr_runlock(ifp);
1168         /* Enable multicast filtering */
1169         err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1170         if (err != 0) {
1171                 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1172                        ", error status: %d\n", err);
1173         }
1174 }
1175
1176 static int
1177 mxge_max_mtu(mxge_softc_t *sc)
1178 {
1179         mxge_cmd_t cmd;
1180         int status;
1181
1182         if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1183                 return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1184
1185         /* try to set nbufs to see if it we can
1186            use virtually contiguous jumbos */
1187         cmd.data0 = 0;
1188         status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1189                                &cmd);
1190         if (status == 0)
1191                 return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1192
1193         /* otherwise, we're limited to MJUMPAGESIZE */
1194         return MJUMPAGESIZE - MXGEFW_PAD;
1195 }
1196
1197 static int
1198 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1199 {
1200         struct mxge_slice_state *ss;
1201         mxge_rx_done_t *rx_done;
1202         volatile uint32_t *irq_claim;
1203         mxge_cmd_t cmd;
1204         int slice, status;
1205
1206         /* try to send a reset command to the card to see if it
1207            is alive */
1208         memset(&cmd, 0, sizeof (cmd));
1209         status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1210         if (status != 0) {
1211                 device_printf(sc->dev, "failed reset\n");
1212                 return ENXIO;
1213         }
1214
1215         mxge_dummy_rdma(sc, 1);
1216
1217
1218         /* set the intrq size */
1219         cmd.data0 = sc->rx_ring_size;
1220         status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1221
1222         /*
1223          * Even though we already know how many slices are supported
1224          * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1225          * has magic side effects, and must be called after a reset.
1226          * It must be called prior to calling any RSS related cmds,
1227          * including assigning an interrupt queue for anything but
1228          * slice 0.  It must also be called *after*
1229          * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1230          * the firmware to compute offsets.
1231          */
1232         
1233         if (sc->num_slices > 1) {
1234                 /* ask the maximum number of slices it supports */
1235                 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1236                                            &cmd);
1237                 if (status != 0) {
1238                         device_printf(sc->dev,
1239                                       "failed to get number of slices\n");
1240                         return status;
1241                 }
1242                 /*
1243                  * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1244                  * to setting up the interrupt queue DMA
1245                  */
1246                 cmd.data0 = sc->num_slices;
1247                 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1248 #ifdef IFNET_BUF_RING
1249                 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1250 #endif
1251                 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1252                                            &cmd);
1253                 if (status != 0) {
1254                         device_printf(sc->dev,
1255                                       "failed to set number of slices\n");
1256                         return status;
1257                 }
1258         }
1259
1260
1261         if (interrupts_setup) {
1262                 /* Now exchange information about interrupts  */
1263                 for (slice = 0; slice < sc->num_slices; slice++) {
1264                         rx_done = &sc->ss[slice].rx_done;
1265                         memset(rx_done->entry, 0, sc->rx_ring_size);
1266                         cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1267                         cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1268                         cmd.data2 = slice;
1269                         status |= mxge_send_cmd(sc,
1270                                                 MXGEFW_CMD_SET_INTRQ_DMA,
1271                                                 &cmd);
1272                 }
1273         }
1274
1275         status |= mxge_send_cmd(sc,
1276                                 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1277         
1278
1279         sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1280
1281         status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1282         irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1283
1284
1285         status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1286                                 &cmd);
1287         sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1288         if (status != 0) {
1289                 device_printf(sc->dev, "failed set interrupt parameters\n");
1290                 return status;
1291         }
1292         
1293
1294         *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1295
1296         
1297         /* run a DMA benchmark */
1298         (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1299
1300         for (slice = 0; slice < sc->num_slices; slice++) {
1301                 ss = &sc->ss[slice];
1302
1303                 ss->irq_claim = irq_claim + (2 * slice);
1304                 /* reset mcp/driver shared state back to 0 */
1305                 ss->rx_done.idx = 0;
1306                 ss->rx_done.cnt = 0;
1307                 ss->tx.req = 0;
1308                 ss->tx.done = 0;
1309                 ss->tx.pkt_done = 0;
1310                 ss->tx.queue_active = 0;
1311                 ss->tx.activate = 0;
1312                 ss->tx.deactivate = 0;
1313                 ss->tx.wake = 0;
1314                 ss->tx.defrag = 0;
1315                 ss->tx.stall = 0;
1316                 ss->rx_big.cnt = 0;
1317                 ss->rx_small.cnt = 0;
1318                 ss->lc.lro_bad_csum = 0;
1319                 ss->lc.lro_queued = 0;
1320                 ss->lc.lro_flushed = 0;
1321                 if (ss->fw_stats != NULL) {
1322                         bzero(ss->fw_stats, sizeof *ss->fw_stats);
1323                 }
1324         }
1325         sc->rdma_tags_available = 15;
1326         status = mxge_update_mac_address(sc);
1327         mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1328         mxge_change_pause(sc, sc->pause);
1329         mxge_set_multicast_list(sc);
1330         if (sc->throttle) {
1331                 cmd.data0 = sc->throttle;
1332                 if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1333                                   &cmd)) {
1334                         device_printf(sc->dev,
1335                                       "can't enable throttle\n");
1336                 }
1337         }
1338         return status;
1339 }
1340
1341 static int
1342 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1343 {
1344         mxge_cmd_t cmd;
1345         mxge_softc_t *sc;
1346         int err;
1347         unsigned int throttle;
1348
1349         sc = arg1;
1350         throttle = sc->throttle;
1351         err = sysctl_handle_int(oidp, &throttle, arg2, req);
1352         if (err != 0) {
1353                 return err;
1354         }
1355
1356         if (throttle == sc->throttle)
1357                 return 0;
1358
1359         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1360                 return EINVAL;
1361         
1362         mtx_lock(&sc->driver_mtx);
1363         cmd.data0 = throttle;
1364         err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1365         if (err == 0)
1366                 sc->throttle = throttle;
1367         mtx_unlock(&sc->driver_mtx);    
1368         return err;
1369 }
1370
1371 static int
1372 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1373 {
1374         mxge_softc_t *sc;
1375         unsigned int intr_coal_delay;
1376         int err;
1377
1378         sc = arg1;
1379         intr_coal_delay = sc->intr_coal_delay;
1380         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1381         if (err != 0) {
1382                 return err;
1383         }
1384         if (intr_coal_delay == sc->intr_coal_delay)
1385                 return 0;
1386
1387         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1388                 return EINVAL;
1389
1390         mtx_lock(&sc->driver_mtx);
1391         *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1392         sc->intr_coal_delay = intr_coal_delay;
1393         
1394         mtx_unlock(&sc->driver_mtx);
1395         return err;
1396 }
1397
1398 static int
1399 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1400 {
1401         mxge_softc_t *sc;
1402         unsigned int enabled;
1403         int err;
1404
1405         sc = arg1;
1406         enabled = sc->pause;
1407         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1408         if (err != 0) {
1409                 return err;
1410         }
1411         if (enabled == sc->pause)
1412                 return 0;
1413
1414         mtx_lock(&sc->driver_mtx);
1415         err = mxge_change_pause(sc, enabled);
1416         mtx_unlock(&sc->driver_mtx);
1417         return err;
1418 }
1419
1420 static int
1421 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1422 {
1423         int err;
1424
1425         if (arg1 == NULL)
1426                 return EFAULT;
1427         arg2 = be32toh(*(int *)arg1);
1428         arg1 = NULL;
1429         err = sysctl_handle_int(oidp, arg1, arg2, req);
1430
1431         return err;
1432 }
1433
1434 static void
1435 mxge_rem_sysctls(mxge_softc_t *sc)
1436 {
1437         struct mxge_slice_state *ss;
1438         int slice;
1439
1440         if (sc->slice_sysctl_tree == NULL)
1441                 return;
1442
1443         for (slice = 0; slice < sc->num_slices; slice++) {
1444                 ss = &sc->ss[slice];
1445                 if (ss == NULL || ss->sysctl_tree == NULL)
1446                         continue;
1447                 sysctl_ctx_free(&ss->sysctl_ctx);
1448                 ss->sysctl_tree = NULL;
1449         }
1450         sysctl_ctx_free(&sc->slice_sysctl_ctx);
1451         sc->slice_sysctl_tree = NULL;
1452 }
1453
1454 static void
1455 mxge_add_sysctls(mxge_softc_t *sc)
1456 {
1457         struct sysctl_ctx_list *ctx;
1458         struct sysctl_oid_list *children;
1459         mcp_irq_data_t *fw;
1460         struct mxge_slice_state *ss;
1461         int slice;
1462         char slice_num[8];
1463
1464         ctx = device_get_sysctl_ctx(sc->dev);
1465         children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1466         fw = sc->ss[0].fw_stats;
1467
1468         /* random information */
1469         SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1470                        "firmware_version",
1471                        CTLFLAG_RD, sc->fw_version,
1472                        0, "firmware version");
1473         SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1474                        "serial_number",
1475                        CTLFLAG_RD, sc->serial_number_string,
1476                        0, "serial number");
1477         SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1478                        "product_code",
1479                        CTLFLAG_RD, sc->product_code_string,
1480                        0, "product_code");
1481         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1482                        "pcie_link_width",
1483                        CTLFLAG_RD, &sc->link_width,
1484                        0, "tx_boundary");
1485         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1486                        "tx_boundary",
1487                        CTLFLAG_RD, &sc->tx_boundary,
1488                        0, "tx_boundary");
1489         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1490                        "write_combine",
1491                        CTLFLAG_RD, &sc->wc,
1492                        0, "write combining PIO?");
1493         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1494                        "read_dma_MBs",
1495                        CTLFLAG_RD, &sc->read_dma,
1496                        0, "DMA Read speed in MB/s");
1497         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1498                        "write_dma_MBs",
1499                        CTLFLAG_RD, &sc->write_dma,
1500                        0, "DMA Write speed in MB/s");
1501         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1502                        "read_write_dma_MBs",
1503                        CTLFLAG_RD, &sc->read_write_dma,
1504                        0, "DMA concurrent Read/Write speed in MB/s");
1505         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1506                        "watchdog_resets",
1507                        CTLFLAG_RD, &sc->watchdog_resets,
1508                        0, "Number of times NIC was reset");
1509
1510
1511         /* performance related tunables */
1512         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513                         "intr_coal_delay",
1514                         CTLTYPE_INT|CTLFLAG_RW, sc,
1515                         0, mxge_change_intr_coal,
1516                         "I", "interrupt coalescing delay in usecs");
1517
1518         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1519                         "throttle",
1520                         CTLTYPE_INT|CTLFLAG_RW, sc,
1521                         0, mxge_change_throttle,
1522                         "I", "transmit throttling");
1523
1524         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1525                         "flow_control_enabled",
1526                         CTLTYPE_INT|CTLFLAG_RW, sc,
1527                         0, mxge_change_flow_control,
1528                         "I", "interrupt coalescing delay in usecs");
1529
1530         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531                        "deassert_wait",
1532                        CTLFLAG_RW, &mxge_deassert_wait,
1533                        0, "Wait for IRQ line to go low in ihandler");
1534
1535         /* stats block from firmware is in network byte order.
1536            Need to swap it */
1537         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538                         "link_up",
1539                         CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1540                         0, mxge_handle_be32,
1541                         "I", "link up");
1542         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543                         "rdma_tags_available",
1544                         CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1545                         0, mxge_handle_be32,
1546                         "I", "rdma_tags_available");
1547         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548                         "dropped_bad_crc32",
1549                         CTLTYPE_INT|CTLFLAG_RD,
1550                         &fw->dropped_bad_crc32,
1551                         0, mxge_handle_be32,
1552                         "I", "dropped_bad_crc32");
1553         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554                         "dropped_bad_phy",
1555                         CTLTYPE_INT|CTLFLAG_RD,
1556                         &fw->dropped_bad_phy,
1557                         0, mxge_handle_be32,
1558                         "I", "dropped_bad_phy");
1559         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560                         "dropped_link_error_or_filtered",
1561                         CTLTYPE_INT|CTLFLAG_RD,
1562                         &fw->dropped_link_error_or_filtered,
1563                         0, mxge_handle_be32,
1564                         "I", "dropped_link_error_or_filtered");
1565         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566                         "dropped_link_overflow",
1567                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1568                         0, mxge_handle_be32,
1569                         "I", "dropped_link_overflow");
1570         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571                         "dropped_multicast_filtered",
1572                         CTLTYPE_INT|CTLFLAG_RD,
1573                         &fw->dropped_multicast_filtered,
1574                         0, mxge_handle_be32,
1575                         "I", "dropped_multicast_filtered");
1576         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577                         "dropped_no_big_buffer",
1578                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1579                         0, mxge_handle_be32,
1580                         "I", "dropped_no_big_buffer");
1581         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582                         "dropped_no_small_buffer",
1583                         CTLTYPE_INT|CTLFLAG_RD,
1584                         &fw->dropped_no_small_buffer,
1585                         0, mxge_handle_be32,
1586                         "I", "dropped_no_small_buffer");
1587         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588                         "dropped_overrun",
1589                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1590                         0, mxge_handle_be32,
1591                         "I", "dropped_overrun");
1592         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593                         "dropped_pause",
1594                         CTLTYPE_INT|CTLFLAG_RD,
1595                         &fw->dropped_pause,
1596                         0, mxge_handle_be32,
1597                         "I", "dropped_pause");
1598         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1599                         "dropped_runt",
1600                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1601                         0, mxge_handle_be32,
1602                         "I", "dropped_runt");
1603
1604         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605                         "dropped_unicast_filtered",
1606                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1607                         0, mxge_handle_be32,
1608                         "I", "dropped_unicast_filtered");
1609
1610         /* verbose printing? */
1611         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1612                        "verbose",
1613                        CTLFLAG_RW, &mxge_verbose,
1614                        0, "verbose printing");
1615
1616         /* add counters exported for debugging from all slices */
1617         sysctl_ctx_init(&sc->slice_sysctl_ctx);
1618         sc->slice_sysctl_tree =
1619                 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1620                                 "slice", CTLFLAG_RD, 0, "");
1621
1622         for (slice = 0; slice < sc->num_slices; slice++) {
1623                 ss = &sc->ss[slice];
1624                 sysctl_ctx_init(&ss->sysctl_ctx);
1625                 ctx = &ss->sysctl_ctx;
1626                 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1627                 sprintf(slice_num, "%d", slice);
1628                 ss->sysctl_tree =
1629                         SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1630                                         CTLFLAG_RD, 0, "");
1631                 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1632                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633                                "rx_small_cnt",
1634                                CTLFLAG_RD, &ss->rx_small.cnt,
1635                                0, "rx_small_cnt");
1636                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637                                "rx_big_cnt",
1638                                CTLFLAG_RD, &ss->rx_big.cnt,
1639                                0, "rx_small_cnt");
1640                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1641                                "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1642                                0, "number of lro merge queues flushed");
1643
1644                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1645                                "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1646                                0, "number of bad csums preventing LRO");
1647
1648                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1649                                "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1650                                0, "number of frames appended to lro merge"
1651                                "queues");
1652
1653 #ifndef IFNET_BUF_RING
1654                 /* only transmit from slice 0 for now */
1655                 if (slice > 0)
1656                         continue;
1657 #endif
1658                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659                                "tx_req",
1660                                CTLFLAG_RD, &ss->tx.req,
1661                                0, "tx_req");
1662
1663                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664                                "tx_done",
1665                                CTLFLAG_RD, &ss->tx.done,
1666                                0, "tx_done");
1667                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668                                "tx_pkt_done",
1669                                CTLFLAG_RD, &ss->tx.pkt_done,
1670                                0, "tx_done");
1671                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672                                "tx_stall",
1673                                CTLFLAG_RD, &ss->tx.stall,
1674                                0, "tx_stall");
1675                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676                                "tx_wake",
1677                                CTLFLAG_RD, &ss->tx.wake,
1678                                0, "tx_wake");
1679                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680                                "tx_defrag",
1681                                CTLFLAG_RD, &ss->tx.defrag,
1682                                0, "tx_defrag");
1683                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684                                "tx_queue_active",
1685                                CTLFLAG_RD, &ss->tx.queue_active,
1686                                0, "tx_queue_active");
1687                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688                                "tx_activate",
1689                                CTLFLAG_RD, &ss->tx.activate,
1690                                0, "tx_activate");
1691                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692                                "tx_deactivate",
1693                                CTLFLAG_RD, &ss->tx.deactivate,
1694                                0, "tx_deactivate");
1695         }
1696 }
1697
1698 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1699    backwards one at a time and handle ring wraps */
1700
1701 static inline void
1702 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1703                             mcp_kreq_ether_send_t *src, int cnt)
1704 {
1705         int idx, starting_slot;
1706         starting_slot = tx->req;
1707         while (cnt > 1) {
1708                 cnt--;
1709                 idx = (starting_slot + cnt) & tx->mask;
1710                 mxge_pio_copy(&tx->lanai[idx],
1711                               &src[cnt], sizeof(*src));
1712                 wmb();
1713         }
1714 }
1715
1716 /*
1717  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1718  * at most 32 bytes at a time, so as to avoid involving the software
1719  * pio handler in the nic.   We re-write the first segment's flags
1720  * to mark them valid only after writing the entire chain
1721  */
1722
1723 static inline void
1724 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1725                   int cnt)
1726 {
1727         int idx, i;
1728         uint32_t *src_ints;
1729         volatile uint32_t *dst_ints;
1730         mcp_kreq_ether_send_t *srcp;
1731         volatile mcp_kreq_ether_send_t *dstp, *dst;
1732         uint8_t last_flags;
1733         
1734         idx = tx->req & tx->mask;
1735
1736         last_flags = src->flags;
1737         src->flags = 0;
1738         wmb();
1739         dst = dstp = &tx->lanai[idx];
1740         srcp = src;
1741
1742         if ((idx + cnt) < tx->mask) {
1743                 for (i = 0; i < (cnt - 1); i += 2) {
1744                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1745                         wmb(); /* force write every 32 bytes */
1746                         srcp += 2;
1747                         dstp += 2;
1748                 }
1749         } else {
1750                 /* submit all but the first request, and ensure
1751                    that it is submitted below */
1752                 mxge_submit_req_backwards(tx, src, cnt);
1753                 i = 0;
1754         }
1755         if (i < cnt) {
1756                 /* submit the first request */
1757                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1758                 wmb(); /* barrier before setting valid flag */
1759         }
1760
1761         /* re-write the last 32-bits with the valid flags */
1762         src->flags = last_flags;
1763         src_ints = (uint32_t *)src;
1764         src_ints+=3;
1765         dst_ints = (volatile uint32_t *)dst;
1766         dst_ints+=3;
1767         *dst_ints =  *src_ints;
1768         tx->req += cnt;
1769         wmb();
1770 }
1771
1772 static int
1773 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1774     struct mxge_pkt_info *pi)
1775 {
1776         struct ether_vlan_header *eh;
1777         uint16_t etype;
1778         int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1779 #if IFCAP_TSO6 && defined(INET6)
1780         int nxt;
1781 #endif
1782
1783         eh = mtod(m, struct ether_vlan_header *);
1784         if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1785                 etype = ntohs(eh->evl_proto);
1786                 pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1787         } else {
1788                 etype = ntohs(eh->evl_encap_proto);
1789                 pi->ip_off = ETHER_HDR_LEN;
1790         }
1791
1792         switch (etype) {
1793         case ETHERTYPE_IP:
1794                 /*
1795                  * ensure ip header is in first mbuf, copy it to a
1796                  * scratch buffer if not
1797                  */
1798                 pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1799                 pi->ip6 = NULL;
1800                 if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1801                         m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1802                             ss->scratch);
1803                         pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1804                 }
1805                 pi->ip_hlen = pi->ip->ip_hl << 2;
1806                 if (!tso)
1807                         return 0;
1808
1809                 if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1810                     sizeof(struct tcphdr))) {
1811                         m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1812                             sizeof(struct tcphdr), ss->scratch);
1813                         pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1814                 }
1815                 pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1816                 break;
1817 #if IFCAP_TSO6 && defined(INET6)
1818         case ETHERTYPE_IPV6:
1819                 pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1820                 if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1821                         m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1822                             ss->scratch);
1823                         pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1824                 }
1825                 nxt = 0;
1826                 pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1827                 pi->ip_hlen -= pi->ip_off;
1828                 if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1829                         return EINVAL;
1830
1831                 if (!tso)
1832                         return 0;
1833
1834                 if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1835                         return EINVAL;
1836
1837                 if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1838                     sizeof(struct tcphdr))) {
1839                         m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1840                             sizeof(struct tcphdr), ss->scratch);
1841                         pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1842                 }
1843                 pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1844                 break;
1845 #endif
1846         default:
1847                 return EINVAL;
1848         }
1849         return 0;
1850 }
1851
1852 #if IFCAP_TSO4
1853
1854 static void
1855 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1856                int busdma_seg_cnt, struct mxge_pkt_info *pi)
1857 {
1858         mxge_tx_ring_t *tx;
1859         mcp_kreq_ether_send_t *req;
1860         bus_dma_segment_t *seg;
1861         uint32_t low, high_swapped;
1862         int len, seglen, cum_len, cum_len_next;
1863         int next_is_first, chop, cnt, rdma_count, small;
1864         uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1865         uint8_t flags, flags_next;
1866         static int once;
1867
1868         mss = m->m_pkthdr.tso_segsz;
1869
1870         /* negative cum_len signifies to the
1871          * send loop that we are still in the
1872          * header portion of the TSO packet.
1873          */
1874
1875         cksum_offset = pi->ip_off + pi->ip_hlen;
1876         cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1877
1878         /* TSO implies checksum offload on this hardware */
1879         if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1880                 /*
1881                  * If packet has full TCP csum, replace it with pseudo hdr
1882                  * sum that the NIC expects, otherwise the NIC will emit
1883                  * packets with bad TCP checksums.
1884                  */
1885                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1886                 if (pi->ip6) {
1887 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1888                         m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1889                         sum = in6_cksum_pseudo(pi->ip6,
1890                             m->m_pkthdr.len - cksum_offset,
1891                             IPPROTO_TCP, 0);
1892 #endif
1893                 } else {
1894 #ifdef INET
1895                         m->m_pkthdr.csum_flags |= CSUM_TCP;
1896                         sum = in_pseudo(pi->ip->ip_src.s_addr,
1897                             pi->ip->ip_dst.s_addr,
1898                             htons(IPPROTO_TCP + (m->m_pkthdr.len -
1899                                     cksum_offset)));
1900 #endif
1901                 }
1902                 m_copyback(m, offsetof(struct tcphdr, th_sum) +
1903                     cksum_offset, sizeof(sum), (caddr_t)&sum);
1904         }
1905         flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1906
1907         
1908         /* for TSO, pseudo_hdr_offset holds mss.
1909          * The firmware figures out where to put
1910          * the checksum by parsing the header. */
1911         pseudo_hdr_offset = htobe16(mss);
1912
1913         if (pi->ip6) {
1914                 /*
1915                  * for IPv6 TSO, the "checksum offset" is re-purposed
1916                  * to store the TCP header len
1917                  */
1918                 cksum_offset = (pi->tcp->th_off << 2);
1919         }
1920
1921         tx = &ss->tx;
1922         req = tx->req_list;
1923         seg = tx->seg_list;
1924         cnt = 0;
1925         rdma_count = 0;
1926         /* "rdma_count" is the number of RDMAs belonging to the
1927          * current packet BEFORE the current send request. For
1928          * non-TSO packets, this is equal to "count".
1929          * For TSO packets, rdma_count needs to be reset
1930          * to 0 after a segment cut.
1931          *
1932          * The rdma_count field of the send request is
1933          * the number of RDMAs of the packet starting at
1934          * that request. For TSO send requests with one ore more cuts
1935          * in the middle, this is the number of RDMAs starting
1936          * after the last cut in the request. All previous
1937          * segments before the last cut implicitly have 1 RDMA.
1938          *
1939          * Since the number of RDMAs is not known beforehand,
1940          * it must be filled-in retroactively - after each
1941          * segmentation cut or at the end of the entire packet.
1942          */
1943
1944         while (busdma_seg_cnt) {
1945                 /* Break the busdma segment up into pieces*/
1946                 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1947                 high_swapped =  htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1948                 len = seg->ds_len;
1949
1950                 while (len) {
1951                         flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1952                         seglen = len;
1953                         cum_len_next = cum_len + seglen;
1954                         (req-rdma_count)->rdma_count = rdma_count + 1;
1955                         if (__predict_true(cum_len >= 0)) {
1956                                 /* payload */
1957                                 chop = (cum_len_next > mss);
1958                                 cum_len_next = cum_len_next % mss;
1959                                 next_is_first = (cum_len_next == 0);
1960                                 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1961                                 flags_next |= next_is_first *
1962                                         MXGEFW_FLAGS_FIRST;
1963                                 rdma_count |= -(chop | next_is_first);
1964                                 rdma_count += chop & !next_is_first;
1965                         } else if (cum_len_next >= 0) {
1966                                 /* header ends */
1967                                 rdma_count = -1;
1968                                 cum_len_next = 0;
1969                                 seglen = -cum_len;
1970                                 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1971                                 flags_next = MXGEFW_FLAGS_TSO_PLD |
1972                                         MXGEFW_FLAGS_FIRST |
1973                                         (small * MXGEFW_FLAGS_SMALL);
1974                             }
1975                         
1976                         req->addr_high = high_swapped;
1977                         req->addr_low = htobe32(low);
1978                         req->pseudo_hdr_offset = pseudo_hdr_offset;
1979                         req->pad = 0;
1980                         req->rdma_count = 1;
1981                         req->length = htobe16(seglen);
1982                         req->cksum_offset = cksum_offset;
1983                         req->flags = flags | ((cum_len & 1) *
1984                                               MXGEFW_FLAGS_ALIGN_ODD);
1985                         low += seglen;
1986                         len -= seglen;
1987                         cum_len = cum_len_next;
1988                         flags = flags_next;
1989                         req++;
1990                         cnt++;
1991                         rdma_count++;
1992                         if (cksum_offset != 0 && !pi->ip6) {
1993                                 if (__predict_false(cksum_offset > seglen))
1994                                         cksum_offset -= seglen;
1995                                 else
1996                                         cksum_offset = 0;
1997                         }
1998                         if (__predict_false(cnt > tx->max_desc))
1999                                 goto drop;
2000                 }
2001                 busdma_seg_cnt--;
2002                 seg++;
2003         }
2004         (req-rdma_count)->rdma_count = rdma_count;
2005
2006         do {
2007                 req--;
2008                 req->flags |= MXGEFW_FLAGS_TSO_LAST;
2009         } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2010
2011         tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2012         mxge_submit_req(tx, tx->req_list, cnt);
2013 #ifdef IFNET_BUF_RING
2014         if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2015                 /* tell the NIC to start polling this slice */
2016                 *tx->send_go = 1;
2017                 tx->queue_active = 1;
2018                 tx->activate++;
2019                 wmb();
2020         }
2021 #endif
2022         return;
2023
2024 drop:
2025         bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2026         m_freem(m);
2027         ss->oerrors++;
2028         if (!once) {
2029                 printf("tx->max_desc exceeded via TSO!\n");
2030                 printf("mss = %d, %ld, %d!\n", mss,
2031                        (long)seg - (long)tx->seg_list, tx->max_desc);
2032                 once = 1;
2033         }
2034         return;
2035
2036 }
2037
2038 #endif /* IFCAP_TSO4 */
2039
2040 #ifdef MXGE_NEW_VLAN_API
2041 /*
2042  * We reproduce the software vlan tag insertion from
2043  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2044  * vlan tag insertion. We need to advertise this in order to have the
2045  * vlan interface respect our csum offload flags.
2046  */
2047 static struct mbuf *
2048 mxge_vlan_tag_insert(struct mbuf *m)
2049 {
2050         struct ether_vlan_header *evl;
2051
2052         M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2053         if (__predict_false(m == NULL))
2054                 return NULL;
2055         if (m->m_len < sizeof(*evl)) {
2056                 m = m_pullup(m, sizeof(*evl));
2057                 if (__predict_false(m == NULL))
2058                         return NULL;
2059         }
2060         /*
2061          * Transform the Ethernet header into an Ethernet header
2062          * with 802.1Q encapsulation.
2063          */
2064         evl = mtod(m, struct ether_vlan_header *);
2065         bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2066               (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2067         evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2068         evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2069         m->m_flags &= ~M_VLANTAG;
2070         return m;
2071 }
2072 #endif /* MXGE_NEW_VLAN_API */
2073
2074 static void
2075 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2076 {
2077         struct mxge_pkt_info pi = {0,0,0,0};
2078         mxge_softc_t *sc;
2079         mcp_kreq_ether_send_t *req;
2080         bus_dma_segment_t *seg;
2081         struct mbuf *m_tmp;
2082         struct ifnet *ifp;
2083         mxge_tx_ring_t *tx;
2084         int cnt, cum_len, err, i, idx, odd_flag;
2085         uint16_t pseudo_hdr_offset;
2086         uint8_t flags, cksum_offset;
2087
2088
2089         sc = ss->sc;
2090         ifp = sc->ifp;
2091         tx = &ss->tx;
2092
2093 #ifdef MXGE_NEW_VLAN_API
2094         if (m->m_flags & M_VLANTAG) {
2095                 m = mxge_vlan_tag_insert(m);
2096                 if (__predict_false(m == NULL))
2097                         goto drop_without_m;
2098         }
2099 #endif
2100         if (m->m_pkthdr.csum_flags &
2101             (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2102                 if (mxge_parse_tx(ss, m, &pi))
2103                         goto drop;
2104         }
2105
2106         /* (try to) map the frame for DMA */
2107         idx = tx->req & tx->mask;
2108         err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2109                                       m, tx->seg_list, &cnt,
2110                                       BUS_DMA_NOWAIT);
2111         if (__predict_false(err == EFBIG)) {
2112                 /* Too many segments in the chain.  Try
2113                    to defrag */
2114                 m_tmp = m_defrag(m, M_NOWAIT);
2115                 if (m_tmp == NULL) {
2116                         goto drop;
2117                 }
2118                 ss->tx.defrag++;
2119                 m = m_tmp;
2120                 err = bus_dmamap_load_mbuf_sg(tx->dmat,
2121                                               tx->info[idx].map,
2122                                               m, tx->seg_list, &cnt,
2123                                               BUS_DMA_NOWAIT);
2124         }
2125         if (__predict_false(err != 0)) {
2126                 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2127                               " packet len = %d\n", err, m->m_pkthdr.len);
2128                 goto drop;
2129         }
2130         bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2131                         BUS_DMASYNC_PREWRITE);
2132         tx->info[idx].m = m;
2133
2134 #if IFCAP_TSO4
2135         /* TSO is different enough, we handle it in another routine */
2136         if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2137                 mxge_encap_tso(ss, m, cnt, &pi);
2138                 return;
2139         }
2140 #endif
2141
2142         req = tx->req_list;
2143         cksum_offset = 0;
2144         pseudo_hdr_offset = 0;
2145         flags = MXGEFW_FLAGS_NO_TSO;
2146
2147         /* checksum offloading? */
2148         if (m->m_pkthdr.csum_flags &
2149             (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2150                 /* ensure ip header is in first mbuf, copy
2151                    it to a scratch buffer if not */
2152                 cksum_offset = pi.ip_off + pi.ip_hlen;
2153                 pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2154                 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2155                 req->cksum_offset = cksum_offset;
2156                 flags |= MXGEFW_FLAGS_CKSUM;
2157                 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2158         } else {
2159                 odd_flag = 0;
2160         }
2161         if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2162                 flags |= MXGEFW_FLAGS_SMALL;
2163
2164         /* convert segments into a request list */
2165         cum_len = 0;
2166         seg = tx->seg_list;
2167         req->flags = MXGEFW_FLAGS_FIRST;
2168         for (i = 0; i < cnt; i++) {
2169                 req->addr_low =
2170                         htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2171                 req->addr_high =
2172                         htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2173                 req->length = htobe16(seg->ds_len);
2174                 req->cksum_offset = cksum_offset;
2175                 if (cksum_offset > seg->ds_len)
2176                         cksum_offset -= seg->ds_len;
2177                 else
2178                         cksum_offset = 0;
2179                 req->pseudo_hdr_offset = pseudo_hdr_offset;
2180                 req->pad = 0; /* complete solid 16-byte block */
2181                 req->rdma_count = 1;
2182                 req->flags |= flags | ((cum_len & 1) * odd_flag);
2183                 cum_len += seg->ds_len;
2184                 seg++;
2185                 req++;
2186                 req->flags = 0;
2187         }
2188         req--;
2189         /* pad runts to 60 bytes */
2190         if (cum_len < 60) {
2191                 req++;
2192                 req->addr_low =
2193                         htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2194                 req->addr_high =
2195                         htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2196                 req->length = htobe16(60 - cum_len);
2197                 req->cksum_offset = 0;
2198                 req->pseudo_hdr_offset = pseudo_hdr_offset;
2199                 req->pad = 0; /* complete solid 16-byte block */
2200                 req->rdma_count = 1;
2201                 req->flags |= flags | ((cum_len & 1) * odd_flag);
2202                 cnt++;
2203         }
2204
2205         tx->req_list[0].rdma_count = cnt;
2206 #if 0
2207         /* print what the firmware will see */
2208         for (i = 0; i < cnt; i++) {
2209                 printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2210                     "cso:%d, flags:0x%x, rdma:%d\n",
2211                     i, (int)ntohl(tx->req_list[i].addr_high),
2212                     (int)ntohl(tx->req_list[i].addr_low),
2213                     (int)ntohs(tx->req_list[i].length),
2214                     (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2215                     tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2216                     tx->req_list[i].rdma_count);
2217         }
2218         printf("--------------\n");
2219 #endif
2220         tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2221         mxge_submit_req(tx, tx->req_list, cnt);
2222 #ifdef IFNET_BUF_RING
2223         if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2224                 /* tell the NIC to start polling this slice */
2225                 *tx->send_go = 1;
2226                 tx->queue_active = 1;
2227                 tx->activate++;
2228                 wmb();
2229         }
2230 #endif
2231         return;
2232
2233 drop:
2234         m_freem(m);
2235 drop_without_m:
2236         ss->oerrors++;
2237         return;
2238 }
2239
2240 #ifdef IFNET_BUF_RING
2241 static void
2242 mxge_qflush(struct ifnet *ifp)
2243 {
2244         mxge_softc_t *sc = ifp->if_softc;
2245         mxge_tx_ring_t *tx;
2246         struct mbuf *m;
2247         int slice;
2248
2249         for (slice = 0; slice < sc->num_slices; slice++) {
2250                 tx = &sc->ss[slice].tx;
2251                 mtx_lock(&tx->mtx);
2252                 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2253                         m_freem(m);
2254                 mtx_unlock(&tx->mtx);
2255         }
2256         if_qflush(ifp);
2257 }
2258
2259 static inline void
2260 mxge_start_locked(struct mxge_slice_state *ss)
2261 {
2262         mxge_softc_t *sc;
2263         struct mbuf *m;
2264         struct ifnet *ifp;
2265         mxge_tx_ring_t *tx;
2266
2267         sc = ss->sc;
2268         ifp = sc->ifp;
2269         tx = &ss->tx;
2270
2271         while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2272                 m = drbr_dequeue(ifp, tx->br);
2273                 if (m == NULL) {
2274                         return;
2275                 }
2276                 /* let BPF see it */
2277                 BPF_MTAP(ifp, m);
2278
2279                 /* give it to the nic */
2280                 mxge_encap(ss, m);
2281         }
2282         /* ran out of transmit slots */
2283         if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2284             && (!drbr_empty(ifp, tx->br))) {
2285                 ss->if_drv_flags |= IFF_DRV_OACTIVE;
2286                 tx->stall++;
2287         }
2288 }
2289
2290 static int
2291 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2292 {
2293         mxge_softc_t *sc;
2294         struct ifnet *ifp;
2295         mxge_tx_ring_t *tx;
2296         int err;
2297
2298         sc = ss->sc;
2299         ifp = sc->ifp;
2300         tx = &ss->tx;
2301
2302         if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2303             IFF_DRV_RUNNING) {
2304                 err = drbr_enqueue(ifp, tx->br, m);
2305                 return (err);
2306         }
2307
2308         if (!drbr_needs_enqueue(ifp, tx->br) &&
2309             ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2310                 /* let BPF see it */
2311                 BPF_MTAP(ifp, m);
2312                 /* give it to the nic */
2313                 mxge_encap(ss, m);
2314         } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315                 return (err);
2316         }
2317         if (!drbr_empty(ifp, tx->br))
2318                 mxge_start_locked(ss);
2319         return (0);
2320 }
2321
2322 static int
2323 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2324 {
2325         mxge_softc_t *sc = ifp->if_softc;
2326         struct mxge_slice_state *ss;
2327         mxge_tx_ring_t *tx;
2328         int err = 0;
2329         int slice;
2330
2331         slice = m->m_pkthdr.flowid;
2332         slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2333
2334         ss = &sc->ss[slice];
2335         tx = &ss->tx;
2336
2337         if (mtx_trylock(&tx->mtx)) {
2338                 err = mxge_transmit_locked(ss, m);
2339                 mtx_unlock(&tx->mtx);
2340         } else {
2341                 err = drbr_enqueue(ifp, tx->br, m);
2342         }
2343
2344         return (err);
2345 }
2346
2347 #else
2348
2349 static inline void
2350 mxge_start_locked(struct mxge_slice_state *ss)
2351 {
2352         mxge_softc_t *sc;
2353         struct mbuf *m;
2354         struct ifnet *ifp;
2355         mxge_tx_ring_t *tx;
2356
2357         sc = ss->sc;
2358         ifp = sc->ifp;
2359         tx = &ss->tx;
2360         while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2361                 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2362                 if (m == NULL) {
2363                         return;
2364                 }
2365                 /* let BPF see it */
2366                 BPF_MTAP(ifp, m);
2367
2368                 /* give it to the nic */
2369                 mxge_encap(ss, m);
2370         }
2371         /* ran out of transmit slots */
2372         if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2373                 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2374                 tx->stall++;
2375         }
2376 }
2377 #endif
2378 static void
2379 mxge_start(struct ifnet *ifp)
2380 {
2381         mxge_softc_t *sc = ifp->if_softc;
2382         struct mxge_slice_state *ss;
2383
2384         /* only use the first slice for now */
2385         ss = &sc->ss[0];
2386         mtx_lock(&ss->tx.mtx);
2387         mxge_start_locked(ss);
2388         mtx_unlock(&ss->tx.mtx);                
2389 }
2390
2391 /*
2392  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2393  * at most 32 bytes at a time, so as to avoid involving the software
2394  * pio handler in the nic.   We re-write the first segment's low
2395  * DMA address to mark it valid only after we write the entire chunk
2396  * in a burst
2397  */
2398 static inline void
2399 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2400                 mcp_kreq_ether_recv_t *src)
2401 {
2402         uint32_t low;
2403
2404         low = src->addr_low;
2405         src->addr_low = 0xffffffff;
2406         mxge_pio_copy(dst, src, 4 * sizeof (*src));
2407         wmb();
2408         mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2409         wmb();
2410         src->addr_low = low;
2411         dst->addr_low = low;
2412         wmb();
2413 }
2414
2415 static int
2416 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2417 {
2418         bus_dma_segment_t seg;
2419         struct mbuf *m;
2420         mxge_rx_ring_t *rx = &ss->rx_small;
2421         int cnt, err;
2422
2423         m = m_gethdr(M_NOWAIT, MT_DATA);
2424         if (m == NULL) {
2425                 rx->alloc_fail++;
2426                 err = ENOBUFS;
2427                 goto done;
2428         }
2429         m->m_len = MHLEN;
2430         err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2431                                       &seg, &cnt, BUS_DMA_NOWAIT);
2432         if (err != 0) {
2433                 m_free(m);
2434                 goto done;
2435         }
2436         rx->info[idx].m = m;
2437         rx->shadow[idx].addr_low =
2438                 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2439         rx->shadow[idx].addr_high =
2440                 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2441
2442 done:
2443         if ((idx & 7) == 7)
2444                 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2445         return err;
2446 }
2447
2448 static int
2449 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2450 {
2451         bus_dma_segment_t seg[3];
2452         struct mbuf *m;
2453         mxge_rx_ring_t *rx = &ss->rx_big;
2454         int cnt, err, i;
2455
2456         m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2457         if (m == NULL) {
2458                 rx->alloc_fail++;
2459                 err = ENOBUFS;
2460                 goto done;
2461         }
2462         m->m_len = rx->mlen;
2463         err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2464                                       seg, &cnt, BUS_DMA_NOWAIT);
2465         if (err != 0) {
2466                 m_free(m);
2467                 goto done;
2468         }
2469         rx->info[idx].m = m;
2470         rx->shadow[idx].addr_low =
2471                 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2472         rx->shadow[idx].addr_high =
2473                 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2474
2475 #if MXGE_VIRT_JUMBOS
2476         for (i = 1; i < cnt; i++) {
2477                 rx->shadow[idx + i].addr_low =
2478                         htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2479                 rx->shadow[idx + i].addr_high =
2480                         htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2481        }
2482 #endif
2483
2484 done:
2485        for (i = 0; i < rx->nbufs; i++) {
2486                 if ((idx & 7) == 7) {
2487                         mxge_submit_8rx(&rx->lanai[idx - 7],
2488                                         &rx->shadow[idx - 7]);
2489                 }
2490                 idx++;
2491         }
2492         return err;
2493 }
2494
2495 #ifdef INET6
2496
/*
 * Sum the 16-bit words starting at raw, consuming len bytes two at a
 * time, and fold the 32-bit accumulator down to 16 bits with
 * end-around carry.  The result is NOT complemented.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t acc = 0;
	int remaining;

	for (remaining = len; remaining > 0; remaining -= 2)
		acc += *raw++;

	/* two folds are enough to absorb any carry out of the low half */
	acc = (acc & 0xffff) + (acc >> 16);
	acc = (acc & 0xffff) + (acc >> 16);
	return (uint16_t)acc;
}
2513
2514 static inline uint16_t
2515 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2516 {
2517         uint32_t partial;
2518         int nxt, cksum_offset;
2519         struct ip6_hdr *ip6 = p;
2520         uint16_t c;
2521
2522         nxt = ip6->ip6_nxt;
2523         cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2524         if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2525                 cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2526                                            IPPROTO_IPV6, &nxt);
2527                 if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2528                         return (1);
2529         }
2530
2531         /*
2532          * IPv6 headers do not contain a checksum, and hence
2533          * do not checksum to zero, so they don't "fall out"
2534          * of the partial checksum calculation like IPv4
2535          * headers do.  We need to fix the partial checksum by
2536          * subtracting the checksum of the IPv6 header.
2537          */
2538
2539         partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2540                                     ETHER_HDR_LEN);
2541         csum += ~partial;
2542         csum +=  (csum < ~partial);
2543         csum = (csum >> 16) + (csum & 0xFFFF);
2544         csum = (csum >> 16) + (csum & 0xFFFF);
2545         c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2546                              csum);
2547         c ^= 0xffff;
2548         return (c);
2549 }
2550 #endif /* INET6 */
2551 /*
2552  *  Myri10GE hardware checksums are not valid if the sender
2553  *  padded the frame with non-zero padding.  This is because
2554  *  the firmware just does a simple 16-bit 1s complement
2555  *  checksum across the entire frame, excluding the first 14
2556  *  bytes.  It is best to simply to check the checksum and
2557  *  tell the stack about it only if the checksum is good
2558  */
2559
2560 static inline uint16_t
2561 mxge_rx_csum(struct mbuf *m, int csum)
2562 {
2563         struct ether_header *eh;
2564 #ifdef INET
2565         struct ip *ip;
2566 #endif
2567 #if defined(INET) || defined(INET6)
2568         int cap = m->m_pkthdr.rcvif->if_capenable;
2569 #endif
2570         uint16_t c, etype;
2571
2572
2573         eh = mtod(m, struct ether_header *);
2574         etype = ntohs(eh->ether_type);
2575         switch (etype) {
2576 #ifdef INET
2577         case ETHERTYPE_IP:
2578                 if ((cap & IFCAP_RXCSUM) == 0)
2579                         return (1);
2580                 ip = (struct ip *)(eh + 1);
2581                 if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2582                         return (1);
2583                 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2584                               htonl(ntohs(csum) + ntohs(ip->ip_len) -
2585                                     (ip->ip_hl << 2) + ip->ip_p));
2586                 c ^= 0xffff;
2587                 break;
2588 #endif
2589 #ifdef INET6
2590         case ETHERTYPE_IPV6:
2591                 if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2592                         return (1);
2593                 c = mxge_rx_csum6((eh + 1), m, csum);
2594                 break;
2595 #endif
2596         default:
2597                 c = 1;
2598         }
2599         return (c);
2600 }
2601
2602 static void
2603 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2604 {
2605         struct ether_vlan_header *evl;
2606         struct ether_header *eh;
2607         uint32_t partial;
2608
2609         evl = mtod(m, struct ether_vlan_header *);
2610         eh = mtod(m, struct ether_header *);
2611
2612         /*
2613          * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2614          * after what the firmware thought was the end of the ethernet
2615          * header.
2616          */
2617
2618         /* put checksum into host byte order */
2619         *csum = ntohs(*csum);
2620         partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2621         (*csum) += ~partial;
2622         (*csum) +=  ((*csum) < ~partial);
2623         (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624         (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2625
2626         /* restore checksum to network byte order;
2627            later consumers expect this */
2628         *csum = htons(*csum);
2629
2630         /* save the tag */
2631 #ifdef MXGE_NEW_VLAN_API        
2632         m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2633 #else
2634         {
2635                 struct m_tag *mtag;
2636                 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2637                                    M_NOWAIT);
2638                 if (mtag == NULL)
2639                         return;
2640                 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2641                 m_tag_prepend(m, mtag);
2642         }
2643
2644 #endif
2645         m->m_flags |= M_VLANTAG;
2646
2647         /*
2648          * Remove the 802.1q header by copying the Ethernet
2649          * addresses over it and adjusting the beginning of
2650          * the data in the mbuf.  The encapsulated Ethernet
2651          * type field is already in place.
2652          */
2653         bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2654               ETHER_HDR_LEN - ETHER_TYPE_LEN);
2655         m_adj(m, ETHER_VLAN_ENCAP_LEN);
2656 }
2657
2658
2659 static inline void
2660 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2661                  uint32_t csum, int lro)
2662 {
2663         mxge_softc_t *sc;
2664         struct ifnet *ifp;
2665         struct mbuf *m;
2666         struct ether_header *eh;
2667         mxge_rx_ring_t *rx;
2668         bus_dmamap_t old_map;
2669         int idx;
2670
2671         sc = ss->sc;
2672         ifp = sc->ifp;
2673         rx = &ss->rx_big;
2674         idx = rx->cnt & rx->mask;
2675         rx->cnt += rx->nbufs;
2676         /* save a pointer to the received mbuf */
2677         m = rx->info[idx].m;
2678         /* try to replace the received mbuf */
2679         if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2680                 /* drop the frame -- the old mbuf is re-cycled */
2681                 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2682                 return;
2683         }
2684
2685         /* unmap the received buffer */
2686         old_map = rx->info[idx].map;
2687         bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2688         bus_dmamap_unload(rx->dmat, old_map);
2689
2690         /* swap the bus_dmamap_t's */
2691         rx->info[idx].map = rx->extra_map;
2692         rx->extra_map = old_map;
2693
2694         /* mcp implicitly skips 1st 2 bytes so that packet is properly
2695          * aligned */
2696         m->m_data += MXGEFW_PAD;
2697
2698         m->m_pkthdr.rcvif = ifp;
2699         m->m_len = m->m_pkthdr.len = len;
2700         ss->ipackets++;
2701         eh = mtod(m, struct ether_header *);
2702         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2703                 mxge_vlan_tag_remove(m, &csum);
2704         }
2705         /* if the checksum is valid, mark it in the mbuf header */
2706         
2707         if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2708             (0 == mxge_rx_csum(m, csum))) {
2709                 /* Tell the stack that the  checksum is good */
2710                 m->m_pkthdr.csum_data = 0xffff;
2711                 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2712                         CSUM_DATA_VALID;
2713
2714 #if defined(INET) || defined (INET6)
2715                 if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2716                         return;
2717 #endif
2718         }
2719         /* flowid only valid if RSS hashing is enabled */
2720         if (sc->num_slices > 1) {
2721                 m->m_pkthdr.flowid = (ss - sc->ss);
2722                 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2723         }
2724         /* pass the frame up the stack */
2725         (*ifp->if_input)(ifp, m);
2726 }
2727
2728 static inline void
2729 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2730                    uint32_t csum, int lro)
2731 {
2732         mxge_softc_t *sc;
2733         struct ifnet *ifp;
2734         struct ether_header *eh;
2735         struct mbuf *m;
2736         mxge_rx_ring_t *rx;
2737         bus_dmamap_t old_map;
2738         int idx;
2739
2740         sc = ss->sc;
2741         ifp = sc->ifp;
2742         rx = &ss->rx_small;
2743         idx = rx->cnt & rx->mask;
2744         rx->cnt++;
2745         /* save a pointer to the received mbuf */
2746         m = rx->info[idx].m;
2747         /* try to replace the received mbuf */
2748         if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2749                 /* drop the frame -- the old mbuf is re-cycled */
2750                 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2751                 return;
2752         }
2753
2754         /* unmap the received buffer */
2755         old_map = rx->info[idx].map;
2756         bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2757         bus_dmamap_unload(rx->dmat, old_map);
2758
2759         /* swap the bus_dmamap_t's */
2760         rx->info[idx].map = rx->extra_map;
2761         rx->extra_map = old_map;
2762
2763         /* mcp implicitly skips 1st 2 bytes so that packet is properly
2764          * aligned */
2765         m->m_data += MXGEFW_PAD;
2766
2767         m->m_pkthdr.rcvif = ifp;
2768         m->m_len = m->m_pkthdr.len = len;
2769         ss->ipackets++;
2770         eh = mtod(m, struct ether_header *);
2771         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2772                 mxge_vlan_tag_remove(m, &csum);
2773         }
2774         /* if the checksum is valid, mark it in the mbuf header */
2775         if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2776             (0 == mxge_rx_csum(m, csum))) {
2777                 /* Tell the stack that the  checksum is good */
2778                 m->m_pkthdr.csum_data = 0xffff;
2779                 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2780                         CSUM_DATA_VALID;
2781
2782 #if defined(INET) || defined (INET6)
2783                 if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2784                         return;
2785 #endif
2786         }
2787         /* flowid only valid if RSS hashing is enabled */
2788         if (sc->num_slices > 1) {
2789                 m->m_pkthdr.flowid = (ss - sc->ss);
2790                 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2791         }
2792         /* pass the frame up the stack */
2793         (*ifp->if_input)(ifp, m);
2794 }
2795
2796 static inline void
2797 mxge_clean_rx_done(struct mxge_slice_state *ss)
2798 {
2799         mxge_rx_done_t *rx_done = &ss->rx_done;
2800         int limit = 0;
2801         uint16_t length;
2802         uint16_t checksum;
2803         int lro;
2804
2805         lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806         while (rx_done->entry[rx_done->idx].length != 0) {
2807                 length = ntohs(rx_done->entry[rx_done->idx].length);
2808                 rx_done->entry[rx_done->idx].length = 0;
2809                 checksum = rx_done->entry[rx_done->idx].checksum;
2810                 if (length <= (MHLEN - MXGEFW_PAD))
2811                         mxge_rx_done_small(ss, length, checksum, lro);
2812                 else
2813                         mxge_rx_done_big(ss, length, checksum, lro);
2814                 rx_done->cnt++;
2815                 rx_done->idx = rx_done->cnt & rx_done->mask;
2816
2817                 /* limit potential for livelock */
2818                 if (__predict_false(++limit > rx_done->mask / 2))
2819                         break;
2820         }
2821 #if defined(INET)  || defined (INET6)
2822         while (!SLIST_EMPTY(&ss->lc.lro_active)) {
2823                 struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
2824                 SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
2825                 tcp_lro_flush(&ss->lc, lro);
2826         }
2827 #endif
2828 }
2829
2830
2831 static inline void
2832 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2833 {
2834         struct ifnet *ifp;
2835         mxge_tx_ring_t *tx;
2836         struct mbuf *m;
2837         bus_dmamap_t map;
2838         int idx;
2839         int *flags;
2840
2841         tx = &ss->tx;
2842         ifp = ss->sc->ifp;
2843         while (tx->pkt_done != mcp_idx) {
2844                 idx = tx->done & tx->mask;
2845                 tx->done++;
2846                 m = tx->info[idx].m;
2847                 /* mbuf and DMA map only attached to the first
2848                    segment per-mbuf */
2849                 if (m != NULL) {
2850                         ss->obytes += m->m_pkthdr.len;
2851                         if (m->m_flags & M_MCAST)
2852                                 ss->omcasts++;
2853                         ss->opackets++;
2854                         tx->info[idx].m = NULL;
2855                         map = tx->info[idx].map;
2856                         bus_dmamap_unload(tx->dmat, map);
2857                         m_freem(m);
2858                 }
2859                 if (tx->info[idx].flag) {
2860                         tx->info[idx].flag = 0;
2861                         tx->pkt_done++;
2862                 }
2863         }
2864         
2865         /* If we have space, clear IFF_OACTIVE to tell the stack that
2866            its OK to send packets */
2867 #ifdef IFNET_BUF_RING
2868         flags = &ss->if_drv_flags;
2869 #else
2870         flags = &ifp->if_drv_flags;
2871 #endif
2872         mtx_lock(&ss->tx.mtx);
2873         if ((*flags) & IFF_DRV_OACTIVE &&
2874             tx->req - tx->done < (tx->mask + 1)/4) {
2875                 *(flags) &= ~IFF_DRV_OACTIVE;
2876                 ss->tx.wake++;
2877                 mxge_start_locked(ss);
2878         }
2879 #ifdef IFNET_BUF_RING
2880         if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2881                 /* let the NIC stop polling this queue, since there
2882                  * are no more transmits pending */
2883                 if (tx->req == tx->done) {
2884                         *tx->send_stop = 1;
2885                         tx->queue_active = 0;
2886                         tx->deactivate++;
2887                         wmb();
2888                 }
2889         }
2890 #endif
2891         mtx_unlock(&ss->tx.mtx);
2892
2893 }
2894
2895 static struct mxge_media_type mxge_xfp_media_types[] =
2896 {
2897         {IFM_10G_CX4,   0x7f,           "10GBASE-CX4 (module)"},
2898         {IFM_10G_SR,    (1 << 7),       "10GBASE-SR"},
2899         {IFM_10G_LR,    (1 << 6),       "10GBASE-LR"},
2900         {0,             (1 << 5),       "10GBASE-ER"},
2901         {IFM_10G_LRM,   (1 << 4),       "10GBASE-LRM"},
2902         {0,             (1 << 3),       "10GBASE-SW"},
2903         {0,             (1 << 2),       "10GBASE-LW"},
2904         {0,             (1 << 1),       "10GBASE-EW"},
2905         {0,             (1 << 0),       "Reserved"}
2906 };
2907 static struct mxge_media_type mxge_sfp_media_types[] =
2908 {
2909         {IFM_10G_TWINAX,      0,        "10GBASE-Twinax"},
2910         {0,             (1 << 7),       "Reserved"},
2911         {IFM_10G_LRM,   (1 << 6),       "10GBASE-LRM"},
2912         {IFM_10G_LR,    (1 << 5),       "10GBASE-LR"},
2913         {IFM_10G_SR,    (1 << 4),       "10GBASE-SR"},
2914         {IFM_10G_TWINAX,(1 << 0),       "10GBASE-Twinax"}
2915 };
2916
2917 static void
2918 mxge_media_set(mxge_softc_t *sc, int media_type)
2919 {
2920
2921         
2922         ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2923                     0, NULL);
2924         ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2925         sc->current_media = media_type;
2926         sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2927 }
2928
2929 static void
2930 mxge_media_init(mxge_softc_t *sc)
2931 {
2932         char *ptr;
2933         int i;
2934
2935         ifmedia_removeall(&sc->media);
2936         mxge_media_set(sc, IFM_AUTO);
2937
2938         /*
2939          * parse the product code to deterimine the interface type
2940          * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2941          * after the 3rd dash in the driver's cached copy of the
2942          * EEPROM's product code string.
2943          */
2944         ptr = sc->product_code_string;
2945         if (ptr == NULL) {
2946                 device_printf(sc->dev, "Missing product code\n");
2947                 return;
2948         }
2949
2950         for (i = 0; i < 3; i++, ptr++) {
2951                 ptr = strchr(ptr, '-');
2952                 if (ptr == NULL) {
2953                         device_printf(sc->dev,
2954                                       "only %d dashes in PC?!?\n", i);
2955                         return;
2956                 }
2957         }
2958         if (*ptr == 'C' || *(ptr +1) == 'C') {
2959                 /* -C is CX4 */
2960                 sc->connector = MXGE_CX4;
2961                 mxge_media_set(sc, IFM_10G_CX4);
2962         } else if (*ptr == 'Q') {
2963                 /* -Q is Quad Ribbon Fiber */
2964                 sc->connector = MXGE_QRF;
2965                 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2966                 /* FreeBSD has no media type for Quad ribbon fiber */
2967         } else if (*ptr == 'R') {
2968                 /* -R is XFP */
2969                 sc->connector = MXGE_XFP;
2970         } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2971                 /* -S or -2S is SFP+ */
2972                 sc->connector = MXGE_SFP;
2973         } else {
2974                 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2975         }
2976 }
2977
2978 /*
2979  * Determine the media type for a NIC.  Some XFPs will identify
2980  * themselves only when their link is up, so this is initiated via a
2981  * link up interrupt.  However, this can potentially take up to
2982  * several milliseconds, so it is run via the watchdog routine, rather
2983  * than in the interrupt handler itself.
2984  */
2985 static void
2986 mxge_media_probe(mxge_softc_t *sc)
2987 {
2988         mxge_cmd_t cmd;
2989         char *cage_type;
2990
2991         struct mxge_media_type *mxge_media_types = NULL;
2992         int i, err, ms, mxge_media_type_entries;
2993         uint32_t byte;
2994
2995         sc->need_media_probe = 0;
2996
2997         if (sc->connector == MXGE_XFP) {
2998                 /* -R is XFP */
2999                 mxge_media_types = mxge_xfp_media_types;
3000                 mxge_media_type_entries =
3001                         sizeof (mxge_xfp_media_types) /
3002                         sizeof (mxge_xfp_media_types[0]);
3003                 byte = MXGE_XFP_COMPLIANCE_BYTE;
3004                 cage_type = "XFP";
3005         } else  if (sc->connector == MXGE_SFP) {
3006                 /* -S or -2S is SFP+ */
3007                 mxge_media_types = mxge_sfp_media_types;
3008                 mxge_media_type_entries =
3009                         sizeof (mxge_sfp_media_types) /
3010                         sizeof (mxge_sfp_media_types[0]);
3011                 cage_type = "SFP+";
3012                 byte = 3;
3013         } else {
3014                 /* nothing to do; media type cannot change */
3015                 return;
3016         }
3017
3018         /*
3019          * At this point we know the NIC has an XFP cage, so now we
3020          * try to determine what is in the cage by using the
3021          * firmware's XFP I2C commands to read the XFP 10GbE compilance
3022          * register.  We read just one byte, which may take over
3023          * a millisecond
3024          */
3025
3026         cmd.data0 = 0;   /* just fetch 1 byte, not all 256 */
3027         cmd.data1 = byte;
3028         err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3029         if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3030                 device_printf(sc->dev, "failed to read XFP\n");
3031         }
3032         if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3033                 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3034         }
3035         if (err != MXGEFW_CMD_OK) {
3036                 return;
3037         }
3038
3039         /* now we wait for the data to be cached */
3040         cmd.data0 = byte;
3041         err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3042         for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3043                 DELAY(1000);
3044                 cmd.data0 = byte;
3045                 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3046         }
3047         if (err != MXGEFW_CMD_OK) {
3048                 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3049                               cage_type, err, ms);
3050                 return;
3051         }
3052                 
3053         if (cmd.data0 == mxge_media_types[0].bitmask) {
3054                 if (mxge_verbose)
3055                         device_printf(sc->dev, "%s:%s\n", cage_type,
3056                                       mxge_media_types[0].name);
3057                 if (sc->current_media != mxge_media_types[0].flag) {
3058                         mxge_media_init(sc);
3059                         mxge_media_set(sc, mxge_media_types[0].flag);
3060                 }
3061                 return;
3062         }
3063         for (i = 1; i < mxge_media_type_entries; i++) {
3064                 if (cmd.data0 & mxge_media_types[i].bitmask) {
3065                         if (mxge_verbose)
3066                                 device_printf(sc->dev, "%s:%s\n",
3067                                               cage_type,
3068                                               mxge_media_types[i].name);
3069
3070                         if (sc->current_media != mxge_media_types[i].flag) {
3071                                 mxge_media_init(sc);
3072                                 mxge_media_set(sc, mxge_media_types[i].flag);
3073                         }
3074                         return;
3075                 }
3076         }
3077         if (mxge_verbose)
3078                 device_printf(sc->dev, "%s media 0x%x unknown\n",
3079                               cage_type, cmd.data0);
3080
3081         return;
3082 }
3083
3084 static void
3085 mxge_intr(void *arg)
3086 {
3087         struct mxge_slice_state *ss = arg;
3088         mxge_softc_t *sc = ss->sc;
3089         mcp_irq_data_t *stats = ss->fw_stats;
3090         mxge_tx_ring_t *tx = &ss->tx;
3091         mxge_rx_done_t *rx_done = &ss->rx_done;
3092         uint32_t send_done_count;
3093         uint8_t valid;
3094
3095
3096 #ifndef IFNET_BUF_RING
3097         /* an interrupt on a non-zero slice is implicitly valid
3098            since MSI-X irqs are not shared */
3099         if (ss != sc->ss) {
3100                 mxge_clean_rx_done(ss);
3101                 *ss->irq_claim = be32toh(3);
3102                 return;
3103         }
3104 #endif
3105
3106         /* make sure the DMA has finished */
3107         if (!stats->valid) {
3108                 return;
3109         }
3110         valid = stats->valid;
3111
3112         if (sc->legacy_irq) {
3113                 /* lower legacy IRQ  */
3114                 *sc->irq_deassert = 0;
3115                 if (!mxge_deassert_wait)
3116                         /* don't wait for conf. that irq is low */
3117                         stats->valid = 0;
3118         } else {
3119                 stats->valid = 0;
3120         }
3121
3122         /* loop while waiting for legacy irq deassertion */
3123         do {
3124                 /* check for transmit completes and receives */
3125                 send_done_count = be32toh(stats->send_done_count);
3126                 while ((send_done_count != tx->pkt_done) ||
3127                        (rx_done->entry[rx_done->idx].length != 0)) {
3128                         if (send_done_count != tx->pkt_done)
3129                                 mxge_tx_done(ss, (int)send_done_count);
3130                         mxge_clean_rx_done(ss);
3131                         send_done_count = be32toh(stats->send_done_count);
3132                 }
3133                 if (sc->legacy_irq && mxge_deassert_wait)
3134                         wmb();
3135         } while (*((volatile uint8_t *) &stats->valid));
3136
3137         /* fw link & error stats meaningful only on the first slice */
3138         if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3139                 if (sc->link_state != stats->link_up) {
3140                         sc->link_state = stats->link_up;
3141                         if (sc->link_state) {
3142                                 if_link_state_change(sc->ifp, LINK_STATE_UP);
3143                                 if (mxge_verbose)
3144                                         device_printf(sc->dev, "link up\n");
3145                         } else {
3146                                 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3147                                 if (mxge_verbose)
3148                                         device_printf(sc->dev, "link down\n");
3149                         }
3150                         sc->need_media_probe = 1;
3151                 }
3152                 if (sc->rdma_tags_available !=
3153                     be32toh(stats->rdma_tags_available)) {
3154                         sc->rdma_tags_available =
3155                                 be32toh(stats->rdma_tags_available);
3156                         device_printf(sc->dev, "RDMA timed out! %d tags "
3157                                       "left\n", sc->rdma_tags_available);
3158                 }
3159
3160                 if (stats->link_down) {
3161                         sc->down_cnt += stats->link_down;
3162                         sc->link_state = 0;
3163                         if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3164                 }
3165         }
3166
3167         /* check to see if we have rx token to pass back */
3168         if (valid & 0x1)
3169             *ss->irq_claim = be32toh(3);
3170         *(ss->irq_claim + 1) = be32toh(3);
3171 }
3172
3173 static void
3174 mxge_init(void *arg)
3175 {
3176         mxge_softc_t *sc = arg;
3177         struct ifnet *ifp = sc->ifp;
3178
3179
3180         mtx_lock(&sc->driver_mtx);
3181         if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3182                 (void) mxge_open(sc);
3183         mtx_unlock(&sc->driver_mtx);
3184 }
3185
3186
3187
3188 static void
3189 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3190 {
3191         int i;
3192
3193 #if defined(INET) || defined(INET6)
3194         tcp_lro_free(&ss->lc);
3195 #endif
3196         for (i = 0; i <= ss->rx_big.mask; i++) {
3197                 if (ss->rx_big.info[i].m == NULL)
3198                         continue;
3199                 bus_dmamap_unload(ss->rx_big.dmat,
3200                                   ss->rx_big.info[i].map);
3201                 m_freem(ss->rx_big.info[i].m);
3202                 ss->rx_big.info[i].m = NULL;
3203         }
3204
3205         for (i = 0; i <= ss->rx_small.mask; i++) {
3206                 if (ss->rx_small.info[i].m == NULL)
3207                         continue;
3208                 bus_dmamap_unload(ss->rx_small.dmat,
3209                                   ss->rx_small.info[i].map);
3210                 m_freem(ss->rx_small.info[i].m);
3211                 ss->rx_small.info[i].m = NULL;
3212         }
3213
3214         /* transmit ring used only on the first slice */
3215         if (ss->tx.info == NULL)
3216                 return;
3217
3218         for (i = 0; i <= ss->tx.mask; i++) {
3219                 ss->tx.info[i].flag = 0;
3220                 if (ss->tx.info[i].m == NULL)
3221                         continue;
3222                 bus_dmamap_unload(ss->tx.dmat,
3223                                   ss->tx.info[i].map);
3224                 m_freem(ss->tx.info[i].m);
3225                 ss->tx.info[i].m = NULL;
3226         }
3227 }
3228
3229 static void
3230 mxge_free_mbufs(mxge_softc_t *sc)
3231 {
3232         int slice;
3233
3234         for (slice = 0; slice < sc->num_slices; slice++)
3235                 mxge_free_slice_mbufs(&sc->ss[slice]);
3236 }
3237
3238 static void
3239 mxge_free_slice_rings(struct mxge_slice_state *ss)
3240 {
3241         int i;
3242
3243
3244         if (ss->rx_done.entry != NULL)
3245                 mxge_dma_free(&ss->rx_done.dma);
3246         ss->rx_done.entry = NULL;
3247
3248         if (ss->tx.req_bytes != NULL)
3249                 free(ss->tx.req_bytes, M_DEVBUF);
3250         ss->tx.req_bytes = NULL;
3251
3252         if (ss->tx.seg_list != NULL)
3253                 free(ss->tx.seg_list, M_DEVBUF);
3254         ss->tx.seg_list = NULL;
3255
3256         if (ss->rx_small.shadow != NULL)
3257                 free(ss->rx_small.shadow, M_DEVBUF);
3258         ss->rx_small.shadow = NULL;
3259
3260         if (ss->rx_big.shadow != NULL)
3261                 free(ss->rx_big.shadow, M_DEVBUF);
3262         ss->rx_big.shadow = NULL;
3263
3264         if (ss->tx.info != NULL) {
3265                 if (ss->tx.dmat != NULL) {
3266                         for (i = 0; i <= ss->tx.mask; i++) {
3267                                 bus_dmamap_destroy(ss->tx.dmat,
3268                                                    ss->tx.info[i].map);
3269                         }
3270                         bus_dma_tag_destroy(ss->tx.dmat);
3271                 }
3272                 free(ss->tx.info, M_DEVBUF);
3273         }
3274         ss->tx.info = NULL;
3275
3276         if (ss->rx_small.info != NULL) {
3277                 if (ss->rx_small.dmat != NULL) {
3278                         for (i = 0; i <= ss->rx_small.mask; i++) {
3279                                 bus_dmamap_destroy(ss->rx_small.dmat,
3280                                                    ss->rx_small.info[i].map);
3281                         }
3282                         bus_dmamap_destroy(ss->rx_small.dmat,
3283                                            ss->rx_small.extra_map);
3284                         bus_dma_tag_destroy(ss->rx_small.dmat);
3285                 }
3286                 free(ss->rx_small.info, M_DEVBUF);
3287         }
3288         ss->rx_small.info = NULL;
3289
3290         if (ss->rx_big.info != NULL) {
3291                 if (ss->rx_big.dmat != NULL) {
3292                         for (i = 0; i <= ss->rx_big.mask; i++) {
3293                                 bus_dmamap_destroy(ss->rx_big.dmat,
3294                                                    ss->rx_big.info[i].map);
3295                         }
3296                         bus_dmamap_destroy(ss->rx_big.dmat,
3297                                            ss->rx_big.extra_map);
3298                         bus_dma_tag_destroy(ss->rx_big.dmat);
3299                 }
3300                 free(ss->rx_big.info, M_DEVBUF);
3301         }
3302         ss->rx_big.info = NULL;
3303 }
3304
3305 static void
3306 mxge_free_rings(mxge_softc_t *sc)
3307 {
3308         int slice;
3309
3310         for (slice = 0; slice < sc->num_slices; slice++)
3311                 mxge_free_slice_rings(&sc->ss[slice]);
3312 }
3313
3314 static int
3315 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3316                        int tx_ring_entries)
3317 {
3318         mxge_softc_t *sc = ss->sc;
3319         size_t bytes;
3320         int err, i;
3321
3322         /* allocate per-slice receive resources */
3323
3324         ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3325         ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3326
3327         /* allocate the rx shadow rings */
3328         bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3329         ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3330
3331         bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3332         ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3333
3334         /* allocate the rx host info rings */
3335         bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3336         ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3337
3338         bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3339         ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3340
3341         /* allocate the rx busdma resources */
3342         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
3343                                  1,                     /* alignment */
3344                                  4096,                  /* boundary */
3345                                  BUS_SPACE_MAXADDR,     /* low */
3346                                  BUS_SPACE_MAXADDR,     /* high */
3347                                  NULL, NULL,            /* filter */
3348                                  MHLEN,                 /* maxsize */
3349                                  1,                     /* num segs */
3350                                  MHLEN,                 /* maxsegsize */
3351                                  BUS_DMA_ALLOCNOW,      /* flags */
3352                                  NULL, NULL,            /* lock */
3353                                  &ss->rx_small.dmat);   /* tag */
3354         if (err != 0) {
3355                 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3356                               err);
3357                 return err;
3358         }
3359
3360         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
3361                                  1,                     /* alignment */
3362 #if MXGE_VIRT_JUMBOS
3363                                  4096,                  /* boundary */
3364 #else
3365                                  0,                     /* boundary */
3366 #endif
3367                                  BUS_SPACE_MAXADDR,     /* low */
3368                                  BUS_SPACE_MAXADDR,     /* high */
3369                                  NULL, NULL,            /* filter */
3370                                  3*4096,                /* maxsize */
3371 #if MXGE_VIRT_JUMBOS
3372                                  3,                     /* num segs */
3373                                  4096,                  /* maxsegsize*/
3374 #else
3375                                  1,                     /* num segs */
3376                                  MJUM9BYTES,            /* maxsegsize*/
3377 #endif
3378                                  BUS_DMA_ALLOCNOW,      /* flags */
3379                                  NULL, NULL,            /* lock */
3380                                  &ss->rx_big.dmat);     /* tag */
3381         if (err != 0) {
3382                 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3383                               err);
3384                 return err;
3385         }
3386         for (i = 0; i <= ss->rx_small.mask; i++) {
3387                 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3388                                         &ss->rx_small.info[i].map);
3389                 if (err != 0) {
3390                         device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3391                                       err);
3392                         return err;
3393                 }
3394         }
3395         err = bus_dmamap_create(ss->rx_small.dmat, 0,
3396                                 &ss->rx_small.extra_map);
3397         if (err != 0) {
3398                 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3399                               err);
3400                 return err;
3401         }
3402
3403         for (i = 0; i <= ss->rx_big.mask; i++) {
3404                 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3405                                         &ss->rx_big.info[i].map);
3406                 if (err != 0) {
3407                         device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3408                                       err);
3409                         return err;
3410                 }
3411         }
3412         err = bus_dmamap_create(ss->rx_big.dmat, 0,
3413                                 &ss->rx_big.extra_map);
3414         if (err != 0) {
3415                 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3416                               err);
3417                 return err;
3418         }
3419
3420         /* now allocate TX resources */
3421
3422 #ifndef IFNET_BUF_RING
3423         /* only use a single TX ring for now */
3424         if (ss != ss->sc->ss)
3425                 return 0;
3426 #endif
3427
3428         ss->tx.mask = tx_ring_entries - 1;
3429         ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3430
3431         
3432         /* allocate the tx request copy block */
3433         bytes = 8 +
3434                 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3435         ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3436         /* ensure req_list entries are aligned to 8 bytes */
3437         ss->tx.req_list = (mcp_kreq_ether_send_t *)
3438                 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3439
3440         /* allocate the tx busdma segment list */
3441         bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3442         ss->tx.seg_list = (bus_dma_segment_t *)
3443                 malloc(bytes, M_DEVBUF, M_WAITOK);
3444
3445         /* allocate the tx host info ring */
3446         bytes = tx_ring_entries * sizeof (*ss->tx.info);
3447         ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3448         
3449         /* allocate the tx busdma resources */
3450         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
3451                                  1,                     /* alignment */
3452                                  sc->tx_boundary,       /* boundary */
3453                                  BUS_SPACE_MAXADDR,     /* low */
3454                                  BUS_SPACE_MAXADDR,     /* high */
3455                                  NULL, NULL,            /* filter */
3456                                  65536 + 256,           /* maxsize */
3457                                  ss->tx.max_desc - 2,   /* num segs */
3458                                  sc->tx_boundary,       /* maxsegsz */
3459                                  BUS_DMA_ALLOCNOW,      /* flags */
3460                                  NULL, NULL,            /* lock */
3461                                  &ss->tx.dmat);         /* tag */
3462         
3463         if (err != 0) {
3464                 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3465                               err);
3466                 return err;
3467         }
3468
3469         /* now use these tags to setup dmamaps for each slot
3470            in the ring */
3471         for (i = 0; i <= ss->tx.mask; i++) {
3472                 err = bus_dmamap_create(ss->tx.dmat, 0,
3473                                         &ss->tx.info[i].map);
3474                 if (err != 0) {
3475                         device_printf(sc->dev, "Err %d  tx dmamap\n",
3476                                       err);
3477                         return err;
3478                 }
3479         }
3480         return 0;
3481
3482 }
3483
3484 static int
3485 mxge_alloc_rings(mxge_softc_t *sc)
3486 {
3487         mxge_cmd_t cmd;
3488         int tx_ring_size;
3489         int tx_ring_entries, rx_ring_entries;
3490         int err, slice;
3491         
3492         /* get ring sizes */
3493         err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3494         tx_ring_size = cmd.data0;
3495         if (err != 0) {
3496                 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3497                 goto abort;
3498         }
3499
3500         tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3501         rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3502         IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3503         sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3504         IFQ_SET_READY(&sc->ifp->if_snd);
3505
3506         for (slice = 0; slice < sc->num_slices; slice++) {
3507                 err = mxge_alloc_slice_rings(&sc->ss[slice],
3508                                              rx_ring_entries,
3509                                              tx_ring_entries);
3510                 if (err != 0)
3511                         goto abort;
3512         }
3513         return 0;
3514
3515 abort:
3516         mxge_free_rings(sc);
3517         return err;
3518
3519 }
3520
3521
3522 static void
3523 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3524 {
3525         int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3526
3527         if (bufsize < MCLBYTES) {
3528                 /* easy, everything fits in a single buffer */
3529                 *big_buf_size = MCLBYTES;
3530                 *cl_size = MCLBYTES;
3531                 *nbufs = 1;
3532                 return;
3533         }
3534
3535         if (bufsize < MJUMPAGESIZE) {
3536                 /* still easy, everything still fits in a single buffer */
3537                 *big_buf_size = MJUMPAGESIZE;
3538                 *cl_size = MJUMPAGESIZE;
3539                 *nbufs = 1;
3540                 return;
3541         }
3542 #if MXGE_VIRT_JUMBOS
3543         /* now we need to use virtually contiguous buffers */
3544         *cl_size = MJUM9BYTES;
3545         *big_buf_size = 4096;
3546         *nbufs = mtu / 4096 + 1;
3547         /* needs to be a power of two, so round up */
3548         if (*nbufs == 3)
3549                 *nbufs = 4;
3550 #else
3551         *cl_size = MJUM9BYTES;
3552         *big_buf_size = MJUM9BYTES;
3553         *nbufs = 1;
3554 #endif
3555 }
3556
3557 static int
3558 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3559 {
3560         mxge_softc_t *sc;
3561         mxge_cmd_t cmd;
3562         bus_dmamap_t map;
3563         int err, i, slice;
3564
3565
3566         sc = ss->sc;
3567         slice = ss - sc->ss;
3568
3569 #if defined(INET) || defined(INET6)
3570         (void)tcp_lro_init(&ss->lc);
3571 #endif
3572         ss->lc.ifp = sc->ifp;
3573         
3574         /* get the lanai pointers to the send and receive rings */
3575
3576         err = 0;
3577 #ifndef IFNET_BUF_RING
3578         /* We currently only send from the first slice */
3579         if (slice == 0) {
3580 #endif
3581                 cmd.data0 = slice;
3582                 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3583                 ss->tx.lanai =
3584                         (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3585                 ss->tx.send_go = (volatile uint32_t *)
3586                         (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3587                 ss->tx.send_stop = (volatile uint32_t *)
3588                 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3589 #ifndef IFNET_BUF_RING
3590         }
3591 #endif
3592         cmd.data0 = slice;
3593         err |= mxge_send_cmd(sc,
3594                              MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3595         ss->rx_small.lanai =
3596                 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3597         cmd.data0 = slice;
3598         err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3599         ss->rx_big.lanai =
3600                 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3601
3602         if (err != 0) {
3603                 device_printf(sc->dev,
3604                               "failed to get ring sizes or locations\n");
3605                 return EIO;
3606         }
3607
3608         /* stock receive rings */
3609         for (i = 0; i <= ss->rx_small.mask; i++) {
3610                 map = ss->rx_small.info[i].map;
3611                 err = mxge_get_buf_small(ss, map, i);
3612                 if (err) {
3613                         device_printf(sc->dev, "alloced %d/%d smalls\n",
3614                                       i, ss->rx_small.mask + 1);
3615                         return ENOMEM;
3616                 }
3617         }
3618         for (i = 0; i <= ss->rx_big.mask; i++) {
3619                 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3620                 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3621         }
3622         ss->rx_big.nbufs = nbufs;
3623         ss->rx_big.cl_size = cl_size;
3624         ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3625                 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3626         for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3627                 map = ss->rx_big.info[i].map;
3628                 err = mxge_get_buf_big(ss, map, i);
3629                 if (err) {
3630                         device_printf(sc->dev, "alloced %d/%d bigs\n",
3631                                       i, ss->rx_big.mask + 1);
3632                         return ENOMEM;
3633                 }
3634         }
3635         return 0;
3636 }
3637
3638 static int
3639 mxge_open(mxge_softc_t *sc)
3640 {
3641         mxge_cmd_t cmd;
3642         int err, big_bytes, nbufs, slice, cl_size, i;
3643         bus_addr_t bus;
3644         volatile uint8_t *itable;
3645         struct mxge_slice_state *ss;
3646
3647         /* Copy the MAC address in case it was overridden */
3648         bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3649
3650         err = mxge_reset(sc, 1);
3651         if (err != 0) {
3652                 device_printf(sc->dev, "failed to reset\n");
3653                 return EIO;
3654         }
3655
3656         if (sc->num_slices > 1) {
3657                 /* setup the indirection table */
3658                 cmd.data0 = sc->num_slices;
3659                 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3660                                     &cmd);
3661
3662                 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3663                                      &cmd);
3664                 if (err != 0) {
3665                         device_printf(sc->dev,
3666                                       "failed to setup rss tables\n");
3667                         return err;
3668                 }
3669
3670                 /* just enable an identity mapping */
3671                 itable = sc->sram + cmd.data0;
3672                 for (i = 0; i < sc->num_slices; i++)
3673                         itable[i] = (uint8_t)i;
3674
3675                 cmd.data0 = 1;
3676                 cmd.data1 = mxge_rss_hash_type;
3677                 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3678                 if (err != 0) {
3679                         device_printf(sc->dev, "failed to enable slices\n");
3680                         return err;
3681                 }
3682         }
3683
3684
3685         mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3686
3687         cmd.data0 = nbufs;
3688         err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3689                             &cmd);
3690         /* error is only meaningful if we're trying to set
3691            MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3692         if (err && nbufs > 1) {
3693                 device_printf(sc->dev,
3694                               "Failed to set alway-use-n to %d\n",
3695                               nbufs);
3696                 return EIO;
3697         }
3698         /* Give the firmware the mtu and the big and small buffer
3699            sizes.  The firmware wants the big buf size to be a power
3700            of two. Luckily, FreeBSD's clusters are powers of two */
3701         cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3702         err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3703         cmd.data0 = MHLEN - MXGEFW_PAD;
3704         err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3705                              &cmd);
3706         cmd.data0 = big_bytes;
3707         err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3708
3709         if (err != 0) {
3710                 device_printf(sc->dev, "failed to setup params\n");
3711                 goto abort;
3712         }
3713
3714         /* Now give him the pointer to the stats block */
3715         for (slice = 0;
3716 #ifdef IFNET_BUF_RING
3717              slice < sc->num_slices;
3718 #else
3719              slice < 1;
3720 #endif
3721              slice++) {
3722                 ss = &sc->ss[slice];
3723                 cmd.data0 =
3724                         MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3725                 cmd.data1 =
3726                         MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3727                 cmd.data2 = sizeof(struct mcp_irq_data);
3728                 cmd.data2 |= (slice << 16);
3729                 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3730         }
3731
3732         if (err != 0) {
3733                 bus = sc->ss->fw_stats_dma.bus_addr;
3734                 bus += offsetof(struct mcp_irq_data, send_done_count);
3735                 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3736                 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3737                 err = mxge_send_cmd(sc,
3738                                     MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3739                                     &cmd);
3740                 /* Firmware cannot support multicast without STATS_DMA_V2 */
3741                 sc->fw_multicast_support = 0;
3742         } else {
3743                 sc->fw_multicast_support = 1;
3744         }
3745
3746         if (err != 0) {
3747                 device_printf(sc->dev, "failed to setup params\n");
3748                 goto abort;
3749         }
3750
3751         for (slice = 0; slice < sc->num_slices; slice++) {
3752                 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3753                 if (err != 0) {
3754                         device_printf(sc->dev, "couldn't open slice %d\n",
3755                                       slice);
3756                         goto abort;
3757                 }
3758         }
3759
3760         /* Finally, start the firmware running */
3761         err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3762         if (err) {
3763                 device_printf(sc->dev, "Couldn't bring up link\n");
3764                 goto abort;
3765         }
3766 #ifdef IFNET_BUF_RING
3767         for (slice = 0; slice < sc->num_slices; slice++) {
3768                 ss = &sc->ss[slice];
3769                 ss->if_drv_flags |= IFF_DRV_RUNNING;
3770                 ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3771         }
3772 #endif
3773         sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3774         sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3775
3776         return 0;
3777
3778
3779 abort:
3780         mxge_free_mbufs(sc);
3781
3782         return err;
3783 }
3784
3785 static int
3786 mxge_close(mxge_softc_t *sc, int down)
3787 {
3788         mxge_cmd_t cmd;
3789         int err, old_down_cnt;
3790 #ifdef IFNET_BUF_RING
3791         struct mxge_slice_state *ss;    
3792         int slice;
3793 #endif
3794
3795 #ifdef IFNET_BUF_RING
3796         for (slice = 0; slice < sc->num_slices; slice++) {
3797                 ss = &sc->ss[slice];
3798                 ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3799         }
3800 #endif
3801         sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3802         if (!down) {
3803                 old_down_cnt = sc->down_cnt;
3804                 wmb();
3805                 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3806                 if (err) {
3807                         device_printf(sc->dev,
3808                                       "Couldn't bring down link\n");
3809                 }
3810                 if (old_down_cnt == sc->down_cnt) {
3811                         /* wait for down irq */
3812                         DELAY(10 * sc->intr_coal_delay);
3813                 }
3814                 wmb();
3815                 if (old_down_cnt == sc->down_cnt) {
3816                         device_printf(sc->dev, "never got down irq\n");
3817                 }
3818         }
3819         mxge_free_mbufs(sc);
3820
3821         return 0;
3822 }
3823
3824 static void
3825 mxge_setup_cfg_space(mxge_softc_t *sc)
3826 {
3827         device_t dev = sc->dev;
3828         int reg;
3829         uint16_t lnk, pectl;
3830
3831         /* find the PCIe link width and set max read request to 4KB*/
3832         if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3833                 lnk = pci_read_config(dev, reg + 0x12, 2);
3834                 sc->link_width = (lnk >> 4) & 0x3f;
3835
3836                 if (sc->pectl == 0) {
3837                         pectl = pci_read_config(dev, reg + 0x8, 2);
3838                         pectl = (pectl & ~0x7000) | (5 << 12);
3839                         pci_write_config(dev, reg + 0x8, pectl, 2);
3840                         sc->pectl = pectl;
3841                 } else {
3842                         /* restore saved pectl after watchdog reset */
3843                         pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3844                 }
3845         }
3846
3847         /* Enable DMA and Memory space access */
3848         pci_enable_busmaster(dev);
3849 }
3850
3851 static uint32_t
3852 mxge_read_reboot(mxge_softc_t *sc)
3853 {
3854         device_t dev = sc->dev;
3855         uint32_t vs;
3856
3857         /* find the vendor specific offset */
3858         if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3859                 device_printf(sc->dev,
3860                               "could not find vendor specific offset\n");
3861                 return (uint32_t)-1;
3862         }
3863         /* enable read32 mode */
3864         pci_write_config(dev, vs + 0x10, 0x3, 1);
3865         /* tell NIC which register to read */
3866         pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3867         return (pci_read_config(dev, vs + 0x14, 4));
3868 }
3869
3870 static void
3871 mxge_watchdog_reset(mxge_softc_t *sc)
3872 {
3873         struct pci_devinfo *dinfo;
3874         struct mxge_slice_state *ss;
3875         int err, running, s, num_tx_slices = 1;
3876         uint32_t reboot;
3877         uint16_t cmd;
3878
3879         err = ENXIO;
3880
3881         device_printf(sc->dev, "Watchdog reset!\n");
3882
3883         /*
3884          * check to see if the NIC rebooted.  If it did, then all of
3885          * PCI config space has been reset, and things like the
3886          * busmaster bit will be zero.  If this is the case, then we
3887          * must restore PCI config space before the NIC can be used
3888          * again
3889          */
3890         cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3891         if (cmd == 0xffff) {
3892                 /*
3893                  * maybe the watchdog caught the NIC rebooting; wait
3894                  * up to 100ms for it to finish.  If it does not come
3895                  * back, then give up
3896                  */
3897                 DELAY(1000*100);
3898                 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3899                 if (cmd == 0xffff) {
3900                         device_printf(sc->dev, "NIC disappeared!\n");
3901                 }
3902         }
3903         if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3904                 /* print the reboot status */
3905                 reboot = mxge_read_reboot(sc);
3906                 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3907                               reboot);
3908                 running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3909                 if (running) {
3910
3911                         /*
3912                          * quiesce NIC so that TX routines will not try to
3913                          * xmit after restoration of BAR
3914                          */
3915
3916                         /* Mark the link as down */
3917                         if (sc->link_state) {
3918                                 sc->link_state = 0;
3919                                 if_link_state_change(sc->ifp,
3920                                                      LINK_STATE_DOWN);
3921                         }
3922 #ifdef IFNET_BUF_RING
3923                         num_tx_slices = sc->num_slices;
3924 #endif
3925                         /* grab all TX locks to ensure no tx  */
3926                         for (s = 0; s < num_tx_slices; s++) {
3927                                 ss = &sc->ss[s];
3928                                 mtx_lock(&ss->tx.mtx);
3929                         }
3930                         mxge_close(sc, 1);
3931                 }
3932                 /* restore PCI configuration space */
3933                 dinfo = device_get_ivars(sc->dev);
3934                 pci_cfg_restore(sc->dev, dinfo);
3935
3936                 /* and redo any changes we made to our config space */
3937                 mxge_setup_cfg_space(sc);
3938
3939                 /* reload f/w */
3940                 err = mxge_load_firmware(sc, 0);
3941                 if (err) {
3942                         device_printf(sc->dev,
3943                                       "Unable to re-load f/w\n");
3944                 }
3945                 if (running) {
3946                         if (!err)
3947                                 err = mxge_open(sc);
3948                         /* release all TX locks */
3949                         for (s = 0; s < num_tx_slices; s++) {
3950                                 ss = &sc->ss[s];
3951 #ifdef IFNET_BUF_RING
3952                                 mxge_start_locked(ss);
3953 #endif
3954                                 mtx_unlock(&ss->tx.mtx);
3955                         }
3956                 }
3957                 sc->watchdog_resets++;
3958         } else {
3959                 device_printf(sc->dev,
3960                               "NIC did not reboot, not resetting\n");
3961                 err = 0;
3962         }
3963         if (err) {
3964                 device_printf(sc->dev, "watchdog reset failed\n");
3965         } else {
3966                 if (sc->dying == 2)
3967                         sc->dying = 0;
3968                 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3969         }
3970 }
3971
3972 static void
3973 mxge_watchdog_task(void *arg, int pending)
3974 {
3975         mxge_softc_t *sc = arg;
3976
3977
3978         mtx_lock(&sc->driver_mtx);
3979         mxge_watchdog_reset(sc);
3980         mtx_unlock(&sc->driver_mtx);
3981 }
3982
3983 static void
3984 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3985 {
3986         tx = &sc->ss[slice].tx;
3987         device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3988         device_printf(sc->dev,
3989                       "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3990                       tx->req, tx->done, tx->queue_active);
3991         device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3992                               tx->activate, tx->deactivate);
3993         device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3994                       tx->pkt_done,
3995                       be32toh(sc->ss->fw_stats->send_done_count));
3996 }
3997
3998 static int
3999 mxge_watchdog(mxge_softc_t *sc)
4000 {
4001         mxge_tx_ring_t *tx;
4002         uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
4003         int i, err = 0;
4004
4005         /* see if we have outstanding transmits, which
4006            have been pending for more than mxge_ticks */
4007         for (i = 0;
4008 #ifdef IFNET_BUF_RING
4009              (i < sc->num_slices) && (err == 0);
4010 #else
4011              (i < 1) && (err == 0);
4012 #endif
4013              i++) {
4014                 tx = &sc->ss[i].tx;             
4015                 if (tx->req != tx->done &&
4016                     tx->watchdog_req != tx->watchdog_done &&
4017                     tx->done == tx->watchdog_done) {
4018                         /* check for pause blocking before resetting */
4019                         if (tx->watchdog_rx_pause == rx_pause) {
4020                                 mxge_warn_stuck(sc, tx, i);
4021                                 taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4022                                 return (ENXIO);
4023                         }
4024                         else
4025                                 device_printf(sc->dev, "Flow control blocking "
4026                                               "xmits, check link partner\n");
4027                 }
4028
4029                 tx->watchdog_req = tx->req;
4030                 tx->watchdog_done = tx->done;
4031                 tx->watchdog_rx_pause = rx_pause;
4032         }
4033
4034         if (sc->need_media_probe)
4035                 mxge_media_probe(sc);
4036         return (err);
4037 }
4038
4039 static uint64_t
4040 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4041 {
4042         struct mxge_softc *sc;
4043         uint64_t rv;
4044
4045         sc = if_getsoftc(ifp);
4046         rv = 0;
4047
4048         switch (cnt) {
4049         case IFCOUNTER_IPACKETS:
4050                 for (int s = 0; s < sc->num_slices; s++)
4051                         rv += sc->ss[s].ipackets;
4052                 return (rv);
4053         case IFCOUNTER_OPACKETS:
4054                 for (int s = 0; s < sc->num_slices; s++)
4055                         rv += sc->ss[s].opackets;
4056                 return (rv);
4057         case IFCOUNTER_OERRORS:
4058                 for (int s = 0; s < sc->num_slices; s++)
4059                         rv += sc->ss[s].oerrors;
4060                 return (rv);
4061 #ifdef IFNET_BUF_RING
4062         case IFCOUNTER_OBYTES:
4063                 for (int s = 0; s < sc->num_slices; s++)
4064                         rv += sc->ss[s].obytes;
4065                 return (rv);
4066         case IFCOUNTER_OMCASTS:
4067                 for (int s = 0; s < sc->num_slices; s++)
4068                         rv += sc->ss[s].omcasts;
4069                 return (rv);
4070         case IFCOUNTER_OQDROPS:
4071                 for (int s = 0; s < sc->num_slices; s++)
4072                         rv += sc->ss[s].tx.br->br_drops;
4073                 return (rv);
4074 #endif
4075         default:
4076                 return (if_get_counter_default(ifp, cnt));
4077         }
4078 }
4079
4080 static void
4081 mxge_tick(void *arg)
4082 {
4083         mxge_softc_t *sc = arg;
4084         u_long pkts = 0;
4085         int err = 0;
4086         int running, ticks;
4087         uint16_t cmd;
4088
4089         ticks = mxge_ticks;
4090         running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4091         if (running) {
4092                 if (!sc->watchdog_countdown) {
4093                         err = mxge_watchdog(sc);
4094                         sc->watchdog_countdown = 4;
4095                 }
4096                 sc->watchdog_countdown--;
4097         }
4098         if (pkts == 0) {
4099                 /* ensure NIC did not suffer h/w fault while idle */
4100                 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);                
4101                 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4102                         sc->dying = 2;
4103                         taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4104                         err = ENXIO;
4105                 }
4106                 /* look less often if NIC is idle */
4107                 ticks *= 4;
4108         }
4109
4110         if (err == 0)
4111                 callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4112
4113 }
4114
/*
 * ifmedia change callback.  Changing the media is not supported, so
 * every request is rejected with EINVAL.
 */
static int
mxge_media_change(struct ifnet *ifp)
{
	(void)ifp;	/* unused */

	return (EINVAL);
}
4120
4121 static int
4122 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4123 {
4124         struct ifnet *ifp = sc->ifp;
4125         int real_mtu, old_mtu;
4126         int err = 0;
4127
4128
4129         real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4130         if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4131                 return EINVAL;
4132         mtx_lock(&sc->driver_mtx);
4133         old_mtu = ifp->if_mtu;
4134         ifp->if_mtu = mtu;
4135         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4136                 mxge_close(sc, 0);
4137                 err = mxge_open(sc);
4138                 if (err != 0) {
4139                         ifp->if_mtu = old_mtu;
4140                         mxge_close(sc, 0);
4141                         (void) mxge_open(sc);
4142                 }
4143         }
4144         mtx_unlock(&sc->driver_mtx);
4145         return err;
4146 }       
4147
4148 static void
4149 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4150 {
4151         mxge_softc_t *sc = ifp->if_softc;
4152         
4153
4154         if (sc == NULL)
4155                 return;
4156         ifmr->ifm_status = IFM_AVALID;
4157         ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4158         ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4159         ifmr->ifm_active |= sc->current_media;
4160 }
4161
4162 static int
4163 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4164 {
4165         mxge_softc_t *sc = ifp->if_softc;
4166         struct ifreq *ifr = (struct ifreq *)data;
4167         int err, mask;
4168
4169         err = 0;
4170         switch (command) {
4171         case SIOCSIFADDR:
4172         case SIOCGIFADDR:
4173                 err = ether_ioctl(ifp, command, data);
4174                 break;
4175
4176         case SIOCSIFMTU:
4177                 err = mxge_change_mtu(sc, ifr->ifr_mtu);
4178                 break;
4179
4180         case SIOCSIFFLAGS:
4181                 mtx_lock(&sc->driver_mtx);
4182                 if (sc->dying) {
4183                         mtx_unlock(&sc->driver_mtx);
4184                         return EINVAL;
4185                 }
4186                 if (ifp->if_flags & IFF_UP) {
4187                         if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4188                                 err = mxge_open(sc);
4189                         } else {
4190                                 /* take care of promis can allmulti
4191                                    flag chages */
4192                                 mxge_change_promisc(sc,
4193                                                     ifp->if_flags & IFF_PROMISC);
4194                                 mxge_set_multicast_list(sc);
4195                         }
4196                 } else {
4197                         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4198                                 mxge_close(sc, 0);
4199                         }
4200                 }
4201                 mtx_unlock(&sc->driver_mtx);
4202                 break;
4203
4204         case SIOCADDMULTI:
4205         case SIOCDELMULTI:
4206                 mtx_lock(&sc->driver_mtx);
4207                 mxge_set_multicast_list(sc);
4208                 mtx_unlock(&sc->driver_mtx);
4209                 break;
4210
4211         case SIOCSIFCAP:
4212                 mtx_lock(&sc->driver_mtx);
4213                 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4214                 if (mask & IFCAP_TXCSUM) {
4215                         if (IFCAP_TXCSUM & ifp->if_capenable) {
4216                                 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4217                                 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4218                         } else {
4219                                 ifp->if_capenable |= IFCAP_TXCSUM;
4220                                 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4221                         }
4222                 } else if (mask & IFCAP_RXCSUM) {
4223                         if (IFCAP_RXCSUM & ifp->if_capenable) {
4224                                 ifp->if_capenable &= ~IFCAP_RXCSUM;
4225                         } else {
4226                                 ifp->if_capenable |= IFCAP_RXCSUM;
4227                         }
4228                 }
4229                 if (mask & IFCAP_TSO4) {
4230                         if (IFCAP_TSO4 & ifp->if_capenable) {
4231                                 ifp->if_capenable &= ~IFCAP_TSO4;
4232                         } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4233                                 ifp->if_capenable |= IFCAP_TSO4;
4234                                 ifp->if_hwassist |= CSUM_TSO;
4235                         } else {
4236                                 printf("mxge requires tx checksum offload"
4237                                        " be enabled to use TSO\n");
4238                                 err = EINVAL;
4239                         }
4240                 }
4241 #if IFCAP_TSO6
4242                 if (mask & IFCAP_TXCSUM_IPV6) {
4243                         if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4244                                 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4245                                                        | IFCAP_TSO6);
4246                                 ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4247                                                       | CSUM_UDP);
4248                         } else {
4249                                 ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4250                                 ifp->if_hwassist |= (CSUM_TCP_IPV6
4251                                                      | CSUM_UDP_IPV6);
4252                         }
4253                 } else if (mask & IFCAP_RXCSUM_IPV6) {
4254                         if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4255                                 ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4256                         } else {
4257                                 ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4258                         }
4259                 }
4260                 if (mask & IFCAP_TSO6) {
4261                         if (IFCAP_TSO6 & ifp->if_capenable) {
4262                                 ifp->if_capenable &= ~IFCAP_TSO6;
4263                         } else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4264                                 ifp->if_capenable |= IFCAP_TSO6;
4265                                 ifp->if_hwassist |= CSUM_TSO;
4266                         } else {
4267                                 printf("mxge requires tx checksum offload"
4268                                        " be enabled to use TSO\n");
4269                                 err = EINVAL;
4270                         }
4271                 }
4272 #endif /*IFCAP_TSO6 */
4273
4274                 if (mask & IFCAP_LRO)
4275                         ifp->if_capenable ^= IFCAP_LRO;
4276                 if (mask & IFCAP_VLAN_HWTAGGING)
4277                         ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4278                 if (mask & IFCAP_VLAN_HWTSO)
4279                         ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4280
4281                 if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4282                     !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4283                         ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4284
4285                 mtx_unlock(&sc->driver_mtx);
4286                 VLAN_CAPABILITIES(ifp);
4287
4288                 break;
4289
4290         case SIOCGIFMEDIA:
4291                 mtx_lock(&sc->driver_mtx);
4292                 mxge_media_probe(sc);
4293                 mtx_unlock(&sc->driver_mtx);
4294                 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4295                                     &sc->media, command);
4296                 break;
4297
4298         default:
4299                 err = ENOTTY;
4300         }
4301         return err;
4302 }
4303
4304 static void
4305 mxge_fetch_tunables(mxge_softc_t *sc)
4306 {
4307
4308         TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4309         TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4310                           &mxge_flow_control);
4311         TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4312                           &mxge_intr_coal_delay);       
4313         TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4314                           &mxge_nvidia_ecrc_enable);    
4315         TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4316                           &mxge_force_firmware);        
4317         TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4318                           &mxge_deassert_wait); 
4319         TUNABLE_INT_FETCH("hw.mxge.verbose",
4320                           &mxge_verbose);       
4321         TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4322         TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4323         TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4324         TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4325         TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4326         TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4327
4328         if (bootverbose)
4329                 mxge_verbose = 1;
4330         if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4331                 mxge_intr_coal_delay = 30;
4332         if (mxge_ticks == 0)
4333                 mxge_ticks = hz / 2;
4334         sc->pause = mxge_flow_control;
4335         if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4336             || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4337                 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4338         }
4339         if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4340             mxge_initial_mtu < ETHER_MIN_LEN)
4341                 mxge_initial_mtu = ETHERMTU_JUMBO;
4342
4343         if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4344                 mxge_throttle = MXGE_MAX_THROTTLE;
4345         if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4346                 mxge_throttle = MXGE_MIN_THROTTLE;
4347         sc->throttle = mxge_throttle;
4348 }
4349
4350
4351 static void
4352 mxge_free_slices(mxge_softc_t *sc)
4353 {
4354         struct mxge_slice_state *ss;
4355         int i;
4356
4357
4358         if (sc->ss == NULL)
4359                 return;
4360
4361         for (i = 0; i < sc->num_slices; i++) {
4362                 ss = &sc->ss[i];
4363                 if (ss->fw_stats != NULL) {
4364                         mxge_dma_free(&ss->fw_stats_dma);
4365                         ss->fw_stats = NULL;
4366 #ifdef IFNET_BUF_RING
4367                         if (ss->tx.br != NULL) {
4368                                 drbr_free(ss->tx.br, M_DEVBUF);
4369                                 ss->tx.br = NULL;
4370                         }
4371 #endif
4372                         mtx_destroy(&ss->tx.mtx);
4373                 }
4374                 if (ss->rx_done.entry != NULL) {
4375                         mxge_dma_free(&ss->rx_done.dma);
4376                         ss->rx_done.entry = NULL;
4377                 }
4378         }
4379         free(sc->ss, M_DEVBUF);
4380         sc->ss = NULL;
4381 }
4382
4383 static int
4384 mxge_alloc_slices(mxge_softc_t *sc)
4385 {
4386         mxge_cmd_t cmd;
4387         struct mxge_slice_state *ss;
4388         size_t bytes;
4389         int err, i, max_intr_slots;
4390
4391         err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4392         if (err != 0) {
4393                 device_printf(sc->dev, "Cannot determine rx ring size\n");
4394                 return err;
4395         }
4396         sc->rx_ring_size = cmd.data0;
4397         max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4398         
4399         bytes = sizeof (*sc->ss) * sc->num_slices;
4400         sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4401         if (sc->ss == NULL)
4402                 return (ENOMEM);
4403         for (i = 0; i < sc->num_slices; i++) {
4404                 ss = &sc->ss[i];
4405
4406                 ss->sc = sc;
4407
4408                 /* allocate per-slice rx interrupt queues */
4409                 
4410                 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4411                 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4412                 if (err != 0)
4413                         goto abort;
4414                 ss->rx_done.entry = ss->rx_done.dma.addr;
4415                 bzero(ss->rx_done.entry, bytes);
4416
4417                 /*
4418                  * allocate the per-slice firmware stats; stats
4419                  * (including tx) are used used only on the first
4420                  * slice for now
4421                  */
4422 #ifndef IFNET_BUF_RING
4423                 if (i > 0)
4424                         continue;
4425 #endif
4426
4427                 bytes = sizeof (*ss->fw_stats);
4428                 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4429                                      sizeof (*ss->fw_stats), 64);
4430                 if (err != 0)
4431                         goto abort;
4432                 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4433                 snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4434                          "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4435                 mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4436 #ifdef IFNET_BUF_RING
4437                 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4438                                            &ss->tx.mtx);
4439 #endif
4440         }
4441
4442         return (0);
4443
4444 abort:
4445         mxge_free_slices(sc);
4446         return (ENOMEM);
4447 }
4448
4449 static void
4450 mxge_slice_probe(mxge_softc_t *sc)
4451 {
4452         mxge_cmd_t cmd;
4453         char *old_fw;
4454         int msix_cnt, status, max_intr_slots;
4455
4456         sc->num_slices = 1;
4457         /*
4458          *  don't enable multiple slices if they are not enabled,
4459          *  or if this is not an SMP system
4460          */
4461         
4462         if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4463                 return;
4464
4465         /* see how many MSI-X interrupts are available */
4466         msix_cnt = pci_msix_count(sc->dev);
4467         if (msix_cnt < 2)
4468                 return;
4469
4470         /* now load the slice aware firmware see what it supports */
4471         old_fw = sc->fw_name;
4472         if (old_fw == mxge_fw_aligned)
4473                 sc->fw_name = mxge_fw_rss_aligned;
4474         else
4475                 sc->fw_name = mxge_fw_rss_unaligned;
4476         status = mxge_load_firmware(sc, 0);
4477         if (status != 0) {
4478                 device_printf(sc->dev, "Falling back to a single slice\n");
4479                 return;
4480         }
4481         
4482         /* try to send a reset command to the card to see if it
4483            is alive */
4484         memset(&cmd, 0, sizeof (cmd));
4485         status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4486         if (status != 0) {
4487                 device_printf(sc->dev, "failed reset\n");
4488                 goto abort_with_fw;
4489         }
4490
4491         /* get rx ring size */
4492         status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4493         if (status != 0) {
4494                 device_printf(sc->dev, "Cannot determine rx ring size\n");
4495                 goto abort_with_fw;
4496         }
4497         max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4498
4499         /* tell it the size of the interrupt queues */
4500         cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4501         status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4502         if (status != 0) {
4503                 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4504                 goto abort_with_fw;
4505         }
4506
4507         /* ask the maximum number of slices it supports */
4508         status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4509         if (status != 0) {
4510                 device_printf(sc->dev,
4511                               "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4512                 goto abort_with_fw;
4513         }
4514         sc->num_slices = cmd.data0;
4515         if (sc->num_slices > msix_cnt)
4516                 sc->num_slices = msix_cnt;
4517
4518         if (mxge_max_slices == -1) {
4519                 /* cap to number of CPUs in system */
4520                 if (sc->num_slices > mp_ncpus)
4521                         sc->num_slices = mp_ncpus;
4522         } else {
4523                 if (sc->num_slices > mxge_max_slices)
4524                         sc->num_slices = mxge_max_slices;
4525         }
4526         /* make sure it is a power of two */
4527         while (sc->num_slices & (sc->num_slices - 1))
4528                 sc->num_slices--;
4529
4530         if (mxge_verbose)
4531                 device_printf(sc->dev, "using %d slices\n",
4532                               sc->num_slices);
4533         
4534         return;
4535
4536 abort_with_fw:
4537         sc->fw_name = old_fw;
4538         (void) mxge_load_firmware(sc, 0);
4539 }
4540
4541 static int
4542 mxge_add_msix_irqs(mxge_softc_t *sc)
4543 {
4544         size_t bytes;
4545         int count, err, i, rid;
4546
4547         rid = PCIR_BAR(2);
4548         sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4549                                                     &rid, RF_ACTIVE);
4550
4551         if (sc->msix_table_res == NULL) {
4552                 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4553                 return ENXIO;
4554         }
4555
4556         count = sc->num_slices;
4557         err = pci_alloc_msix(sc->dev, &count);
4558         if (err != 0) {
4559                 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4560                               "err = %d \n", sc->num_slices, err);
4561                 goto abort_with_msix_table;
4562         }
4563         if (count < sc->num_slices) {
4564                 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4565                               count, sc->num_slices);
4566                 device_printf(sc->dev,
4567                               "Try setting hw.mxge.max_slices to %d\n",
4568                               count);
4569                 err = ENOSPC;
4570                 goto abort_with_msix;
4571         }
4572         bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4573         sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4574         if (sc->msix_irq_res == NULL) {
4575                 err = ENOMEM;
4576                 goto abort_with_msix;
4577         }
4578
4579         for (i = 0; i < sc->num_slices; i++) {
4580                 rid = i + 1;
4581                 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4582                                                           SYS_RES_IRQ,
4583                                                           &rid, RF_ACTIVE);
4584                 if (sc->msix_irq_res[i] == NULL) {
4585                         device_printf(sc->dev, "couldn't allocate IRQ res"
4586                                       " for message %d\n", i);
4587                         err = ENXIO;
4588                         goto abort_with_res;
4589                 }
4590         }
4591
4592         bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4593         sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4594
4595         for (i = 0; i < sc->num_slices; i++) {
4596                 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4597                                      INTR_TYPE_NET | INTR_MPSAFE,
4598 #if __FreeBSD_version > 700030
4599                                      NULL,
4600 #endif
4601                                      mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4602                 if (err != 0) {
4603                         device_printf(sc->dev, "couldn't setup intr for "
4604                                       "message %d\n", i);
4605                         goto abort_with_intr;
4606                 }
4607                 bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4608                                   sc->msix_ih[i], "s%d", i);
4609         }
4610
4611         if (mxge_verbose) {
4612                 device_printf(sc->dev, "using %d msix IRQs:",
4613                               sc->num_slices);
4614                 for (i = 0; i < sc->num_slices; i++)
4615                         printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4616                 printf("\n");
4617         }
4618         return (0);
4619
4620 abort_with_intr:
4621         for (i = 0; i < sc->num_slices; i++) {
4622                 if (sc->msix_ih[i] != NULL) {
4623                         bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4624                                           sc->msix_ih[i]);
4625                         sc->msix_ih[i] = NULL;
4626                 }
4627         }
4628         free(sc->msix_ih, M_DEVBUF);
4629
4630
4631 abort_with_res:
4632         for (i = 0; i < sc->num_slices; i++) {
4633                 rid = i + 1;
4634                 if (sc->msix_irq_res[i] != NULL)
4635                         bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4636                                              sc->msix_irq_res[i]);
4637                 sc->msix_irq_res[i] = NULL;
4638         }
4639         free(sc->msix_irq_res, M_DEVBUF);
4640
4641
4642 abort_with_msix:
4643         pci_release_msi(sc->dev);
4644
4645 abort_with_msix_table:
4646         bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4647                              sc->msix_table_res);
4648
4649         return err;
4650 }
4651
4652 static int
4653 mxge_add_single_irq(mxge_softc_t *sc)
4654 {
4655         int count, err, rid;
4656
4657         count = pci_msi_count(sc->dev);
4658         if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4659                 rid = 1;
4660         } else {
4661                 rid = 0;
4662                 sc->legacy_irq = 1;
4663         }
4664         sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4665                                          1, RF_SHAREABLE | RF_ACTIVE);
4666         if (sc->irq_res == NULL) {
4667                 device_printf(sc->dev, "could not alloc interrupt\n");
4668                 return ENXIO;
4669         }
4670         if (mxge_verbose)
4671                 device_printf(sc->dev, "using %s irq %ld\n",
4672                               sc->legacy_irq ? "INTx" : "MSI",
4673                               rman_get_start(sc->irq_res));
4674         err = bus_setup_intr(sc->dev, sc->irq_res,
4675                              INTR_TYPE_NET | INTR_MPSAFE,
4676 #if __FreeBSD_version > 700030
4677                              NULL,
4678 #endif
4679                              mxge_intr, &sc->ss[0], &sc->ih);
4680         if (err != 0) {
4681                 bus_release_resource(sc->dev, SYS_RES_IRQ,
4682                                      sc->legacy_irq ? 0 : 1, sc->irq_res);
4683                 if (!sc->legacy_irq)
4684                         pci_release_msi(sc->dev);
4685         }
4686         return err;
4687 }
4688
4689 static void
4690 mxge_rem_msix_irqs(mxge_softc_t *sc)
4691 {
4692         int i, rid;
4693
4694         for (i = 0; i < sc->num_slices; i++) {
4695                 if (sc->msix_ih[i] != NULL) {
4696                         bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4697                                           sc->msix_ih[i]);
4698                         sc->msix_ih[i] = NULL;
4699                 }
4700         }
4701         free(sc->msix_ih, M_DEVBUF);
4702
4703         for (i = 0; i < sc->num_slices; i++) {
4704                 rid = i + 1;
4705                 if (sc->msix_irq_res[i] != NULL)
4706                         bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4707                                              sc->msix_irq_res[i]);
4708                 sc->msix_irq_res[i] = NULL;
4709         }
4710         free(sc->msix_irq_res, M_DEVBUF);
4711
4712         bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4713                              sc->msix_table_res);
4714
4715         pci_release_msi(sc->dev);
4716         return;
4717 }
4718
4719 static void
4720 mxge_rem_single_irq(mxge_softc_t *sc)
4721 {
4722         bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4723         bus_release_resource(sc->dev, SYS_RES_IRQ,
4724                              sc->legacy_irq ? 0 : 1, sc->irq_res);
4725         if (!sc->legacy_irq)
4726                 pci_release_msi(sc->dev);
4727 }
4728
4729 static void
4730 mxge_rem_irq(mxge_softc_t *sc)
4731 {
4732         if (sc->num_slices > 1)
4733                 mxge_rem_msix_irqs(sc);
4734         else
4735                 mxge_rem_single_irq(sc);
4736 }
4737
4738 static int
4739 mxge_add_irq(mxge_softc_t *sc)
4740 {
4741         int err;
4742
4743         if (sc->num_slices > 1)
4744                 err = mxge_add_msix_irqs(sc);
4745         else
4746                 err = mxge_add_single_irq(sc);
4747         
4748         if (0 && err == 0 && sc->num_slices > 1) {
4749                 mxge_rem_msix_irqs(sc);
4750                 err = mxge_add_msix_irqs(sc);
4751         }
4752         return err;
4753 }
4754
4755
4756 static int
4757 mxge_attach(device_t dev)
4758 {
4759         mxge_cmd_t cmd;
4760         mxge_softc_t *sc = device_get_softc(dev);
4761         struct ifnet *ifp;
4762         int err, rid;
4763
4764         sc->dev = dev;
4765         mxge_fetch_tunables(sc);
4766
4767         TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4768         sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4769                                   taskqueue_thread_enqueue, &sc->tq);
4770         if (sc->tq == NULL) {
4771                 err = ENOMEM;
4772                 goto abort_with_nothing;
4773         }
4774
4775         err = bus_dma_tag_create(bus_get_dma_tag(dev),  /* parent */
4776                                  1,                     /* alignment */
4777                                  0,                     /* boundary */
4778                                  BUS_SPACE_MAXADDR,     /* low */
4779                                  BUS_SPACE_MAXADDR,     /* high */
4780                                  NULL, NULL,            /* filter */
4781                                  65536 + 256,           /* maxsize */
4782                                  MXGE_MAX_SEND_DESC,    /* num segs */
4783                                  65536,                 /* maxsegsize */
4784                                  0,                     /* flags */
4785                                  NULL, NULL,            /* lock */
4786                                  &sc->parent_dmat);     /* tag */
4787
4788         if (err != 0) {
4789                 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4790                               err);
4791                 goto abort_with_tq;
4792         }
4793
4794         ifp = sc->ifp = if_alloc(IFT_ETHER);
4795         if (ifp == NULL) {
4796                 device_printf(dev, "can not if_alloc()\n");
4797                 err = ENOSPC;
4798                 goto abort_with_parent_dmat;
4799         }
4800         if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4801
4802         snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4803                  device_get_nameunit(dev));
4804         mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4805         snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4806                  "%s:drv", device_get_nameunit(dev));
4807         mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4808                  MTX_NETWORK_LOCK, MTX_DEF);
4809
4810         callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4811
4812         mxge_setup_cfg_space(sc);
4813         
4814         /* Map the board into the kernel */
4815         rid = PCIR_BARS;
4816         sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4817                                          ~0, 1, RF_ACTIVE);
4818         if (sc->mem_res == NULL) {
4819                 device_printf(dev, "could not map memory\n");
4820                 err = ENXIO;
4821                 goto abort_with_lock;
4822         }
4823         sc->sram = rman_get_virtual(sc->mem_res);
4824         sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4825         if (sc->sram_size > rman_get_size(sc->mem_res)) {
4826                 device_printf(dev, "impossible memory region size %ld\n",
4827                               rman_get_size(sc->mem_res));
4828                 err = ENXIO;
4829                 goto abort_with_mem_res;
4830         }
4831
4832         /* make NULL terminated copy of the EEPROM strings section of
4833            lanai SRAM */
4834         bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4835         bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4836                                 rman_get_bushandle(sc->mem_res),
4837                                 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4838                                 sc->eeprom_strings,
4839                                 MXGE_EEPROM_STRINGS_SIZE - 2);
4840         err = mxge_parse_strings(sc);
4841         if (err != 0)
4842                 goto abort_with_mem_res;
4843
4844         /* Enable write combining for efficient use of PCIe bus */
4845         mxge_enable_wc(sc);
4846
4847         /* Allocate the out of band dma memory */
4848         err = mxge_dma_alloc(sc, &sc->cmd_dma,
4849                              sizeof (mxge_cmd_t), 64);
4850         if (err != 0)
4851                 goto abort_with_mem_res;
4852         sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4853         err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4854         if (err != 0)
4855                 goto abort_with_cmd_dma;
4856
4857         err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4858         if (err != 0)
4859                 goto abort_with_zeropad_dma;
4860
4861         /* select & load the firmware */
4862         err = mxge_select_firmware(sc);
4863         if (err != 0)
4864                 goto abort_with_dmabench;
4865         sc->intr_coal_delay = mxge_intr_coal_delay;
4866
4867         mxge_slice_probe(sc);
4868         err = mxge_alloc_slices(sc);
4869         if (err != 0)
4870                 goto abort_with_dmabench;
4871
4872         err = mxge_reset(sc, 0);
4873         if (err != 0)
4874                 goto abort_with_slices;
4875
4876         err = mxge_alloc_rings(sc);
4877         if (err != 0) {
4878                 device_printf(sc->dev, "failed to allocate rings\n");
4879                 goto abort_with_slices;
4880         }
4881
4882         err = mxge_add_irq(sc);
4883         if (err != 0) {
4884                 device_printf(sc->dev, "failed to add irq\n");
4885                 goto abort_with_rings;
4886         }
4887
4888         ifp->if_baudrate = IF_Gbps(10);
4889         ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4890                 IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4891                 IFCAP_RXCSUM_IPV6;
4892 #if defined(INET) || defined(INET6)
4893         ifp->if_capabilities |= IFCAP_LRO;
4894 #endif
4895
4896 #ifdef MXGE_NEW_VLAN_API
4897         ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4898
4899         /* Only FW 1.4.32 and newer can do TSO over vlans */
4900         if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4901             sc->fw_ver_tiny >= 32)
4902                 ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4903 #endif
4904         sc->max_mtu = mxge_max_mtu(sc);
4905         if (sc->max_mtu >= 9000)
4906                 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4907         else
4908                 device_printf(dev, "MTU limited to %d.  Install "
4909                               "latest firmware for 9000 byte jumbo support\n",
4910                               sc->max_mtu - ETHER_HDR_LEN);
4911         ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4912         ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4913         /* check to see if f/w supports TSO for IPv6 */
4914         if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4915                 if (CSUM_TCP_IPV6)
4916                         ifp->if_capabilities |= IFCAP_TSO6;
4917                 sc->max_tso6_hlen = min(cmd.data0,
4918                                         sizeof (sc->ss[0].scratch));
4919         }
4920         ifp->if_capenable = ifp->if_capabilities;
4921         if (sc->lro_cnt == 0)
4922                 ifp->if_capenable &= ~IFCAP_LRO;
4923         ifp->if_init = mxge_init;
4924         ifp->if_softc = sc;
4925         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4926         ifp->if_ioctl = mxge_ioctl;
4927         ifp->if_start = mxge_start;
4928         ifp->if_get_counter = mxge_get_counter;
4929         /* Initialise the ifmedia structure */
4930         ifmedia_init(&sc->media, 0, mxge_media_change,
4931                      mxge_media_status);
4932         mxge_media_init(sc);
4933         mxge_media_probe(sc);
4934         sc->dying = 0;
4935         ether_ifattach(ifp, sc->mac_addr);
4936         /* ether_ifattach sets mtu to ETHERMTU */
4937         if (mxge_initial_mtu != ETHERMTU)
4938                 mxge_change_mtu(sc, mxge_initial_mtu);
4939
4940         mxge_add_sysctls(sc);
4941 #ifdef IFNET_BUF_RING
4942         ifp->if_transmit = mxge_transmit;
4943         ifp->if_qflush = mxge_qflush;
4944 #endif
4945         taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4946                                 device_get_nameunit(sc->dev));
4947         callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4948         return 0;
4949
4950 abort_with_rings:
4951         mxge_free_rings(sc);
4952 abort_with_slices:
4953         mxge_free_slices(sc);
4954 abort_with_dmabench:
4955         mxge_dma_free(&sc->dmabench_dma);
4956 abort_with_zeropad_dma:
4957         mxge_dma_free(&sc->zeropad_dma);
4958 abort_with_cmd_dma:
4959         mxge_dma_free(&sc->cmd_dma);
4960 abort_with_mem_res:
4961         bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4962 abort_with_lock:
4963         pci_disable_busmaster(dev);
4964         mtx_destroy(&sc->cmd_mtx);
4965         mtx_destroy(&sc->driver_mtx);
4966         if_free(ifp);
4967 abort_with_parent_dmat:
4968         bus_dma_tag_destroy(sc->parent_dmat);
4969 abort_with_tq:
4970         if (sc->tq != NULL) {
4971                 taskqueue_drain(sc->tq, &sc->watchdog_task);
4972                 taskqueue_free(sc->tq);
4973                 sc->tq = NULL;
4974         }
4975 abort_with_nothing:
4976         return err;
4977 }
4978
/*
 * Detach the device: stop the interface and release what the softc holds.
 *
 * Refuses with EBUSY, leaving everything in place, while
 * mxge_vlans_active() reports true.  Otherwise the interface is closed
 * if it is running, detached from the network stack, and the remaining
 * resources are released in the same order as the abort_with_* unwind
 * labels at the end of mxge_attach() (rings, slices, DMA areas, memory
 * BAR, bus mastering, mutexes, ifnet, parent DMA tag).
 *
 * Returns 0 on success, EBUSY if vlans are still active.
 */
4979 static int
4980 mxge_detach(device_t dev)
4981 {
4982         mxge_softc_t *sc = device_get_softc(dev);
4983
             /* Bail out before touching any state; nothing is torn down. */
4984         if (mxge_vlans_active(sc)) {
4985                 device_printf(sc->dev,
4986                               "Detach vlans before removing module\n");
4987                 return EBUSY;
4988         }
             /*
              * Flag the softc as dying (mxge_attach() sets this to 0) and
              * close a running interface, both under the driver mutex.
              * NOTE(review): presumably other paths test sc->dying to stop
              * rescheduling work -- confirm against mxge_tick/watchdog.
              */
4989         mtx_lock(&sc->driver_mtx);
4990         sc->dying = 1;
4991         if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4992                 mxge_close(sc, 0);
4993         mtx_unlock(&sc->driver_mtx);
4994         ether_ifdetach(sc->ifp);
             /*
              * Wait for any queued watchdog task, then free the taskqueue.
              * The pointer is cleared after the free.
              */
4995         if (sc->tq != NULL) {
4996                 taskqueue_drain(sc->tq, &sc->watchdog_task);
4997                 taskqueue_free(sc->tq);
4998                 sc->tq = NULL;
4999         }
             /*
              * co_hdl was initialised with driver_mtx in mxge_attach(); the
              * mutex has already been dropped above, so it is not held
              * across the drain.
              */
5000         callout_drain(&sc->co_hdl);
5001         ifmedia_removeall(&sc->media);
             /* NOTE(review): presumably disables the dummy RDMA (arg 0) -- confirm. */
5002         mxge_dummy_rdma(sc, 0);
5003         mxge_rem_sysctls(sc);
             /* From here on: same release order as mxge_attach()'s error unwind. */
5004         mxge_rem_irq(sc);
5005         mxge_free_rings(sc);
5006         mxge_free_slices(sc);
5007         mxge_dma_free(&sc->dmabench_dma);
5008         mxge_dma_free(&sc->zeropad_dma);
5009         mxge_dma_free(&sc->cmd_dma);
5010         bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5011         pci_disable_busmaster(dev);
5012         mtx_destroy(&sc->cmd_mtx);
5013         mtx_destroy(&sc->driver_mtx);
5014         if_free(sc->ifp);
5015         bus_dma_tag_destroy(sc->parent_dmat);
5016         return 0;
5017 }
5018
/*
 * Shutdown handler for the device.  Performs no work: `dev' is unused
 * and the function always returns 0.
 */
5019 static int
5020 mxge_shutdown(device_t dev)
5021 {
5022         return 0;
5023 }
5024
5025 /*
5026   This file uses Myri10GE driver indentation.
5027
5028   Local Variables:
5029   c-file-style:"linux"
5030   tab-width:8
5031   End:
5032 */