/******************************************************************************

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/zlib.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>              /* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
        int rev;


        if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
            ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
             (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
                rev = pci_get_revid(dev);
                switch (rev) {
                case MXGE_PCI_REV_Z8E:
                        device_set_desc(dev, "Myri10G-PCIE-8A");
                        break;
                case MXGE_PCI_REV_Z8ES:
                        device_set_desc(dev, "Myri10G-PCIE-8B");
                        break;
                default:
                        device_set_desc(dev, "Myri10G-PCIE-8??");
                        device_printf(dev, "Unrecognized rev %d NIC\n",
                                      rev);
                        break;
                }
                return 0;
        }
        return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
        vm_offset_t len;
        int err;

        sc->wc = 1;
        len = rman_get_size(sc->mem_res);
        err = pmap_change_attr((vm_offset_t) sc->sram,
                               len, PAT_WRITE_COMBINING);
        if (err != 0) {
                device_printf(sc->dev, "pmap_change_attr failed, %d\n",
                              err);
                sc->wc = 0;
        }
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
                         int error)
{
        if (error == 0) {
                *(bus_addr_t *) arg = segs->ds_addr;
        }
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
                   bus_size_t alignment)
{
        int err;
        device_t dev = sc->dev;
        bus_size_t boundary, maxsegsize;

        if (bytes > 4096 && alignment == 4096) {
                boundary = 0;
                maxsegsize = bytes;
        } else {
                boundary = 4096;
                maxsegsize = 4096;
        }

        /* allocate DMAable memory tags */
        err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
                                 alignment,             /* alignment */
                                 boundary,              /* boundary */
                                 BUS_SPACE_MAXADDR,     /* low */
                                 BUS_SPACE_MAXADDR,     /* high */
                                 NULL, NULL,            /* filter */
                                 bytes,                 /* maxsize */
                                 1,                     /* num segs */
                                 maxsegsize,            /* maxsegsize */
                                 BUS_DMA_COHERENT,      /* flags */
                                 NULL, NULL,            /* lock */
                                 &dma->dmat);           /* tag */
        if (err != 0) {
                device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
                return err;
        }

        /* allocate DMAable memory & map */
        err = bus_dmamem_alloc(dma->dmat, &dma->addr,
                               (BUS_DMA_WAITOK | BUS_DMA_COHERENT
                                | BUS_DMA_ZERO), &dma->map);
        if (err != 0) {
                device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
                goto abort_with_dmat;
        }

        /* load the memory */
        err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
                              mxge_dmamap_callback,
                              (void *)&dma->bus_addr, 0);
        if (err != 0) {
                device_printf(dev, "couldn't load map (err = %d)\n", err);
                goto abort_with_mem;
        }
        return 0;

abort_with_mem:
        bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
        (void)bus_dma_tag_destroy(dma->dmat);
        return err;
}
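
/*
 * Illustrative usage sketch (not part of this file): callers pair
 * mxge_dma_alloc() with mxge_dma_free() below; the 4096-byte size,
 * alignment, and error handling here are assumptions for the sketch:
 *
 *	mxge_dma_t scratch;
 *
 *	if (mxge_dma_alloc(sc, &scratch, 4096, 4096) == 0) {
 *		... use scratch.addr (KVA) and scratch.bus_addr (DMA) ...
 *		mxge_dma_free(&scratch);
 *	}
 */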


static void
mxge_dma_free(mxge_dma_t *dma)
{
        bus_dmamap_unload(dma->dmat, dma->map);
        bus_dmamem_free(dma->dmat, dma->addr, dma->map);
        (void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
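/*
 * For example (illustrative values only), a raw string block might
 * read, with NUL bytes shown as \0:
 *
 *   SN=123456\0MAC=00:60:dd:47:8a:2c\0PC=10G-PCIE-8A-C\0\0
 *
 * mxge_parse_strings() below walks these NUL-terminated records.
 */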

static int
mxge_parse_strings(mxge_softc_t *sc)
{
        char *ptr;
        int i, found_mac, found_sn2;
        char *endptr;

        ptr = sc->eeprom_strings;
        found_mac = 0;
        found_sn2 = 0;
        while (*ptr != '\0') {
                if (strncmp(ptr, "MAC=", 4) == 0) {
                        ptr += 4;
                        for (i = 0;;) {
                                sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
                                if (endptr - ptr != 2)
                                        goto abort;
                                ptr = endptr;
                                if (++i == 6)
                                        break;
                                if (*ptr++ != ':')
                                        goto abort;
                        }
                        found_mac = 1;
                } else if (strncmp(ptr, "PC=", 3) == 0) {
                        ptr += 3;
                        strlcpy(sc->product_code_string, ptr,
                            sizeof(sc->product_code_string));
                } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
                        ptr += 3;
                        strlcpy(sc->serial_number_string, ptr,
                            sizeof(sc->serial_number_string));
                } else if (strncmp(ptr, "SN2=", 4) == 0) {
                        /* SN2 takes precedence over SN */
                        ptr += 4;
                        found_sn2 = 1;
                        strlcpy(sc->serial_number_string, ptr,
                            sizeof(sc->serial_number_string));
                }
                while (*ptr++ != '\0') {}
        }

        if (found_mac)
                return 0;

 abort:
        device_printf(sc->dev, "failed to parse eeprom_strings\n");

        return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
        uint32_t val;
        unsigned long base, off;
        char *va, *cfgptr;
        device_t pdev, mcp55;
        uint16_t vendor_id, device_id, word;
        uintptr_t bus, slot, func, ivend, idev;
        uint32_t *ptr32;


        if (!mxge_nvidia_ecrc_enable)
                return;

        pdev = device_get_parent(device_get_parent(sc->dev));
        if (pdev == NULL) {
                device_printf(sc->dev, "could not find parent?\n");
                return;
        }
        vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
        device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

        if (vendor_id != 0x10de)
                return;

        base = 0;

        if (device_id == 0x005d) {
                /* ck804, base address is magic */
                base = 0xe0000000UL;
        } else if (device_id >= 0x0374 && device_id <= 0x378) {
                /* mcp55, base address stored in chipset */
                mcp55 = pci_find_bsf(0, 0, 0);
                if (mcp55 &&
                    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
                    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
                        word = pci_read_config(mcp55, 0x90, 2);
                        base = ((unsigned long)word & 0x7ffeU) << 25;
                }
        }
        if (!base)
                return;

        /* XXXX
           Test below is commented because it is believed that doing
           config read/write beyond 0xff will access the config space
           for the next larger function.  Uncomment this and remove
           the hacky pmap_mapdev() way of accessing config space when
           FreeBSD grows support for extended pcie config space access
        */
#if 0
        /* See if we can, by some miracle, access the extended
           config space */
        val = pci_read_config(pdev, 0x178, 4);
        if (val != 0xffffffff) {
                val |= 0x40;
                pci_write_config(pdev, 0x178, val, 4);
                return;
        }
#endif
        /* Rather than using normal pci config space writes, we must
         * map the Nvidia config space ourselves.  This is because on
         * opteron/nvidia class machine the 0xe000000 mapping is
         * handled by the nvidia chipset, that means the internal PCI
         * device (the on-chip northbridge), or the amd-8131 bridge
         * and things behind them are not visible by this method.
         */

        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_BUS, &bus);
        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_SLOT, &slot);
        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_FUNCTION, &func);
        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_VENDOR, &ivend);
        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_DEVICE, &idev);

        off = base
                + 0x00100000UL * (unsigned long)bus
                + 0x00001000UL * (unsigned long)(func
                                                 + 8 * slot);

        /* map it into the kernel */
        va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


        if (va == NULL) {
                device_printf(sc->dev, "pmap_mapdev() failed\n");
                return;
        }
        /* get a pointer to the config space mapped into the kernel */
        cfgptr = va + (off & PAGE_MASK);

        /* make sure that we can really access it */
        vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
        device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
        if (! (vendor_id == ivend && device_id == idev)) {
                device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
                              vendor_id, device_id);
                pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
                return;
        }

        ptr32 = (uint32_t*)(cfgptr + 0x178);
        val = *ptr32;

        if (val == 0xffffffff) {
                device_printf(sc->dev, "extended mapping failed\n");
                pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
                return;
        }
        *ptr32 = val | 0x40;
        pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
        if (mxge_verbose)
                device_printf(sc->dev,
                              "Enabled ECRC on upstream Nvidia bridge "
                              "at %d:%d:%d\n",
                              (int)bus, (int)slot, (int)func);
        return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
        device_printf(sc->dev,
                      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
        return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
        mxge_cmd_t cmd;
        bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
        int status;
        uint32_t len;
        char *test = " ";


        /* Run a small DMA test.
         * The magic multipliers to the length tell the firmware
         * to do DMA read, write, or read+write tests.  The
         * results are returned in cmd.data0.  The upper 16
         * bits of the return is the number of transfers completed.
         * The lower 16 bits is the time in 0.5us ticks that the
         * transfers took to complete.
         */
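        /*
         * Worked example (illustrative numbers): if cmd.data0 comes
         * back as 0x00400100, the upper half (0x0040 = 64) says 64
         * transfers completed and the lower half (0x0100 = 256) says
         * they took 256 * 0.5us = 128us.  With len = 4096, the read
         * computation below yields (64 * 4096 * 2) / 256 = 2048, i.e.
         * 2048 MB/s, since bytes-per-microsecond equals MB/s.
         */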

        len = sc->tx_boundary;

        cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
        cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
        cmd.data2 = len * 0x10000;
        status = mxge_send_cmd(sc, test_type, &cmd);
        if (status != 0) {
                test = "read";
                goto abort;
        }
        sc->read_dma = ((cmd.data0>>16) * len * 2) /
                (cmd.data0 & 0xffff);
        cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
        cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
        cmd.data2 = len * 0x1;
        status = mxge_send_cmd(sc, test_type, &cmd);
        if (status != 0) {
                test = "write";
                goto abort;
        }
        sc->write_dma = ((cmd.data0>>16) * len * 2) /
                (cmd.data0 & 0xffff);

        cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
        cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
        cmd.data2 = len * 0x10001;
        status = mxge_send_cmd(sc, test_type, &cmd);
        if (status != 0) {
                test = "read/write";
                goto abort;
        }
        sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
                (cmd.data0 & 0xffff);

abort:
        if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
                device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
                              test, status);

        return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
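/*
 * In short, the policy implemented below (restating the comment above):
 *
 *	completions known aligned (or ECRC enabled):
 *		fw_name = "mxge_eth_z8e",  tx_boundary = 4096
 *	completions possibly unaligned:
 *		fw_name = "mxge_ethp_z8e", tx_boundary = 2048
 *
 * mxge_firmware_probe() decides experimentally, by loading the aligned
 * firmware and running MXGEFW_CMD_UNALIGNED_TEST.
 */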

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
        device_t dev = sc->dev;
        int reg, status;
        uint16_t pectl;

        sc->tx_boundary = 4096;
        /*
         * Verify the max read request size was set to 4KB
         * before trying the test with 4KB.
         */
        if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
                pectl = pci_read_config(dev, reg + 0x8, 2);
                if ((pectl & (5 << 12)) != (5 << 12)) {
                        device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
                                      pectl);
                        sc->tx_boundary = 2048;
                }
        }

        /*
         * load the optimized firmware (which assumes aligned PCIe
         * completions) in order to see if it works on this host.
         */
        sc->fw_name = mxge_fw_aligned;
        status = mxge_load_firmware(sc, 1);
        if (status != 0) {
                return status;
        }

        /*
         * Enable ECRC if possible
         */
        mxge_enable_nvidia_ecrc(sc);

        /*
         * Run a DMA test which watches for unaligned completions and
         * aborts on the first one seen.  Not required on Z8ES or newer.
         */
        if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
                return 0;
        status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
        if (status == 0)
                return 0; /* keep the aligned firmware */

        if (status != E2BIG)
                device_printf(dev, "DMA test failed: %d\n", status);
        if (status == ENOSYS)
                device_printf(dev, "Falling back to ethp! "
                              "Please install up to date fw\n");
        return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
        int aligned = 0;
        int force_firmware = mxge_force_firmware;

        if (sc->throttle)
                force_firmware = sc->throttle;

        if (force_firmware != 0) {
                if (force_firmware == 1)
                        aligned = 1;
                else
                        aligned = 0;
                if (mxge_verbose)
                        device_printf(sc->dev,
                                      "Assuming %s completions (forced)\n",
                                      aligned ? "aligned" : "unaligned");
                goto abort;
        }

        /* if the PCIe link width is 4 or less, we can use the aligned
           firmware and skip any checks */
        if (sc->link_width != 0 && sc->link_width <= 4) {
                device_printf(sc->dev,
                              "PCIe x%d Link, expect reduced performance\n",
                              sc->link_width);
                aligned = 1;
                goto abort;
        }

        if (0 == mxge_firmware_probe(sc))
                return 0;

abort:
        if (aligned) {
                sc->fw_name = mxge_fw_aligned;
                sc->tx_boundary = 4096;
        } else {
                sc->fw_name = mxge_fw_unaligned;
                sc->tx_boundary = 2048;
        }
        return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


        if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
                device_printf(sc->dev, "Bad firmware type: 0x%x\n",
                              be32toh(hdr->mcp_type));
                return EIO;
        }

        /* save firmware version for sysctl */
        strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
        if (mxge_verbose)
                device_printf(sc->dev, "firmware id: %s\n", hdr->version);

        sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
               &sc->fw_ver_minor, &sc->fw_ver_tiny);

        if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
              && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
                device_printf(sc->dev, "Found firmware version %s\n",
                              sc->fw_version);
                device_printf(sc->dev, "Driver needs %d.%d\n",
                              MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
                return EINVAL;
        }
        return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
        z_stream zs;
        char *inflate_buffer;
        const struct firmware *fw;
        const mcp_gen_header_t *hdr;
        unsigned hdr_offset;
        int status;
        unsigned int i;
        char dummy;
        size_t fw_len;

        fw = firmware_get(sc->fw_name);
        if (fw == NULL) {
                device_printf(sc->dev, "Could not find firmware image %s\n",
                              sc->fw_name);
                return ENOENT;
        }



        /* setup zlib and decompress f/w */
        bzero(&zs, sizeof (zs));
        zs.zalloc = z_alloc;
        zs.zfree = z_free;
        status = inflateInit(&zs);
        if (status != Z_OK) {
                status = EIO;
                goto abort_with_fw;
        }

        /* the uncompressed size is stored as the firmware version,
           which would otherwise go unused */
        fw_len = (size_t) fw->version;
        inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
        if (inflate_buffer == NULL) {
                status = ENOMEM;
                goto abort_with_zs;
        }
        zs.avail_in = fw->datasize;
        zs.next_in = __DECONST(char *, fw->data);
        zs.avail_out = fw_len;
        zs.next_out = inflate_buffer;
        status = inflate(&zs, Z_FINISH);
        if (status != Z_STREAM_END) {
                device_printf(sc->dev, "zlib %d\n", status);
                status = EIO;
                goto abort_with_buffer;
        }

        /* check id */
        hdr_offset = htobe32(*(const uint32_t *)
                             (inflate_buffer + MCP_HEADER_PTR_OFFSET));
        if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
                device_printf(sc->dev, "Bad firmware file\n");
                status = EIO;
                goto abort_with_buffer;
        }
        hdr = (const void*)(inflate_buffer + hdr_offset);

        status = mxge_validate_firmware(sc, hdr);
        if (status != 0)
                goto abort_with_buffer;

        /* Copy the inflated firmware to NIC SRAM. */
        for (i = 0; i < fw_len; i += 256) {
                mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
                              inflate_buffer + i,
                              min(256U, (unsigned)(fw_len - i)));
                wmb();
                dummy = *sc->sram;
                wmb();
        }

        *limit = fw_len;
        status = 0;
abort_with_buffer:
        free(inflate_buffer, M_TEMP);
abort_with_zs:
        inflateEnd(&zs);
abort_with_fw:
        firmware_put(fw, FIRMWARE_UNLOAD);
        return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
        char buf_bytes[72];
        volatile uint32_t *confirm;
        volatile char *submit;
        uint32_t *buf, dma_low, dma_high;
        int i;

        buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

        /* clear confirmation addr */
        confirm = (volatile uint32_t *)sc->cmd;
        *confirm = 0;
        wmb();

        /* send an rdma command to the PCIe engine, and wait for the
           response in the confirmation address.  The firmware should
           write a -1 there to indicate it is alive and well
        */

        dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
        dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
        buf[0] = htobe32(dma_high);             /* confirm addr MSW */
        buf[1] = htobe32(dma_low);              /* confirm addr LSW */
        buf[2] = htobe32(0xffffffff);           /* confirm data */
        dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
        dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
        buf[3] = htobe32(dma_high);             /* dummy addr MSW */
        buf[4] = htobe32(dma_low);              /* dummy addr LSW */
        buf[5] = htobe32(enable);               /* enable? */


        submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

        mxge_pio_copy(submit, buf, 64);
        wmb();
        DELAY(1000);
        wmb();
        i = 0;
        while (*confirm != 0xffffffff && i < 20) {
                DELAY(1000);
                i++;
        }
        if (*confirm != 0xffffffff) {
                device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
                              (enable ? "enable" : "disable"), confirm,
                              *confirm);
        }
        return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
        mcp_cmd_t *buf;
        char buf_bytes[sizeof(*buf) + 8];
        volatile mcp_cmd_response_t *response = sc->cmd;
        volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
        uint32_t dma_low, dma_high;
        int err, sleep_total = 0;

        /* ensure buf is aligned to 8 bytes */
        buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

        buf->data0 = htobe32(data->data0);
        buf->data1 = htobe32(data->data1);
        buf->data2 = htobe32(data->data2);
        buf->cmd = htobe32(cmd);
        dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
        dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

        buf->response_addr.low = htobe32(dma_low);
        buf->response_addr.high = htobe32(dma_high);
        mtx_lock(&sc->cmd_mtx);
        response->result = 0xffffffff;
        wmb();
        mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

        /* wait up to 20ms */
        err = EAGAIN;
        for (sleep_total = 0; sleep_total < 20; sleep_total++) {
                bus_dmamap_sync(sc->cmd_dma.dmat,
                                sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
                wmb();
                switch (be32toh(response->result)) {
                case 0:
                        data->data0 = be32toh(response->data);
                        err = 0;
                        break;
                case 0xffffffff:
                        DELAY(1000);
                        break;
                case MXGEFW_CMD_UNKNOWN:
                        err = ENOSYS;
                        break;
                case MXGEFW_CMD_ERROR_UNALIGNED:
                        err = E2BIG;
                        break;
                case MXGEFW_CMD_ERROR_BUSY:
                        err = EBUSY;
                        break;
                case MXGEFW_CMD_ERROR_I2C_ABSENT:
                        err = ENXIO;
                        break;
                default:
                        device_printf(sc->dev,
                                      "mxge: command %d "
                                      "failed, result = %d\n",
                                      cmd, be32toh(response->result));
                        err = ENXIO;
                        break;
                }
                if (err != EAGAIN)
                        break;
        }
        if (err == EAGAIN)
                device_printf(sc->dev, "mxge: command %d timed out, "
                              "result = %d\n",
                              cmd, be32toh(response->result));
        mtx_unlock(&sc->cmd_mtx);
        return err;
}
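
/*
 * Typical mxge_send_cmd() usage (this exact call appears in
 * mxge_reset() below): the caller fills in cmd.data0..data2 as the
 * command's arguments, and any result comes back in data->data0:
 *
 *	cmd.data0 = sc->rx_ring_size;
 *	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
 */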

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
        struct mcp_gen_header *hdr;
        const size_t bytes = sizeof (struct mcp_gen_header);
        size_t hdr_offset;
        int status;

        /* find running firmware header */
        hdr_offset = htobe32(*(volatile uint32_t *)
                             (sc->sram + MCP_HEADER_PTR_OFFSET));

        if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
                device_printf(sc->dev,
                              "Running firmware has bad header offset (%d)\n",
                              (int)hdr_offset);
                return EIO;
        }

        /* copy header of running firmware from SRAM to host memory to
         * validate firmware */
        hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
        if (hdr == NULL) {
                device_printf(sc->dev, "could not malloc firmware hdr\n");
                return ENOMEM;
        }
        bus_space_read_region_1(rman_get_bustag(sc->mem_res),
                                rman_get_bushandle(sc->mem_res),
                                hdr_offset, (char *)hdr, bytes);
        status = mxge_validate_firmware(sc, hdr);
        free(hdr, M_DEVBUF);

        /*
         * check to see if adopted firmware has bug where adopting
         * it will cause broadcasts to be filtered unless the NIC
         * is kept in ALLMULTI mode
         */
        if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
            sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
                sc->adopted_rx_filter_bug = 1;
                device_printf(sc->dev, "Adopting fw %d.%d.%d: "
                              "working around rx filter bug\n",
                              sc->fw_ver_major, sc->fw_ver_minor,
                              sc->fw_ver_tiny);
        }

        return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
        volatile uint32_t *confirm;
        volatile char *submit;
        char buf_bytes[72];
        uint32_t *buf, size, dma_low, dma_high;
        int status, i;

        buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

        size = sc->sram_size;
        status = mxge_load_firmware_helper(sc, &size);
        if (status) {
                if (!adopt)
                        return status;
                /* Try to use the currently running firmware, if
                   it is new enough */
                status = mxge_adopt_running_firmware(sc);
                if (status) {
                        device_printf(sc->dev,
                                      "failed to adopt running firmware\n");
                        return status;
                }
                device_printf(sc->dev,
                              "Successfully adopted running firmware\n");
                if (sc->tx_boundary == 4096) {
                        device_printf(sc->dev,
                                "Using firmware currently running on NIC"
                                 ".  For optimal\n");
                        device_printf(sc->dev,
                                 "performance consider loading optimized "
                                 "firmware\n");
                }
                sc->fw_name = mxge_fw_unaligned;
                sc->tx_boundary = 2048;
                return 0;
        }
        /* clear confirmation addr */
        confirm = (volatile uint32_t *)sc->cmd;
        *confirm = 0;
        wmb();
        /* send a reload command to the bootstrap MCP, and wait for the
           response in the confirmation address.  The firmware should
           write a -1 there to indicate it is alive and well
        */

        dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
        dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

        buf[0] = htobe32(dma_high);     /* confirm addr MSW */
        buf[1] = htobe32(dma_low);      /* confirm addr LSW */
        buf[2] = htobe32(0xffffffff);   /* confirm data */

        /* FIX: All newest firmware should un-protect the bottom of
           the sram before handoff. However, the very first interfaces
           do not. Therefore the handoff copy must skip the first 8 bytes
        */
                                        /* where the code starts*/
        buf[3] = htobe32(MXGE_FW_OFFSET + 8);
        buf[4] = htobe32(size - 8);     /* length of code */
        buf[5] = htobe32(8);            /* where to copy to */
        buf[6] = htobe32(0);            /* where to jump to */

        submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
        mxge_pio_copy(submit, buf, 64);
        wmb();
        DELAY(1000);
        wmb();
        i = 0;
        while (*confirm != 0xffffffff && i < 20) {
                DELAY(1000*10);
                i++;
                bus_dmamap_sync(sc->cmd_dma.dmat,
                                sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
        }
        if (*confirm != 0xffffffff) {
                device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
                        confirm, *confirm);

                return ENXIO;
        }
        return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
        mxge_cmd_t cmd;
        uint8_t *addr = sc->mac_addr;
        int status;


        cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
                     | (addr[2] << 8) | addr[3]);

        cmd.data1 = ((addr[4] << 8) | (addr[5]));

        status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
        return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
        mxge_cmd_t cmd;
        int status;

        if (pause)
                status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
                                       &cmd);
        else
                status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
                                       &cmd);

        if (status) {
                device_printf(sc->dev, "Failed to set flow control mode\n");
                return ENXIO;
        }
        sc->pause = pause;
        return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
        mxge_cmd_t cmd;
        int status;

        if (mxge_always_promisc)
                promisc = 1;

        if (promisc)
                status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
                                       &cmd);
        else
                status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
                                       &cmd);

        if (status) {
                device_printf(sc->dev, "Failed to set promisc mode\n");
        }
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
        mxge_cmd_t cmd;
        struct ifmultiaddr *ifma;
        struct ifnet *ifp = sc->ifp;
        int err;

        /* This firmware is known to not support multicast */
        if (!sc->fw_multicast_support)
                return;

        /* Disable multicast filtering while we play with the lists */
        err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
        if (err != 0) {
                device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
                       " error status: %d\n", err);
                return;
        }

        if (sc->adopted_rx_filter_bug)
                return;

        if (ifp->if_flags & IFF_ALLMULTI)
                /* request to disable multicast filtering, so quit here */
                return;

        /* Flush all the filters */

        err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
        if (err != 0) {
                device_printf(sc->dev,
                              "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
                              ", error status: %d\n", err);
                return;
        }

        /* Walk the multicast list, and add each address */

        if_maddr_rlock(ifp);
        TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
                if (ifma->ifma_addr->sa_family != AF_LINK)
                        continue;
                bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
                      &cmd.data0, 4);
                bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
                      &cmd.data1, 2);
                cmd.data0 = htonl(cmd.data0);
                cmd.data1 = htonl(cmd.data1);
                err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
                if (err != 0) {
                        device_printf(sc->dev, "Failed "
                               "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
                               "%d\n", err);
                        /* abort, leaving multicast filtering off */
                        if_maddr_runlock(ifp);
                        return;
                }
        }
        if_maddr_runlock(ifp);
        /* Enable multicast filtering */
        err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
        if (err != 0) {
                device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
                       ", error status: %d\n", err);
        }
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
        mxge_cmd_t cmd;
        int status;

        if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
                return MXGEFW_MAX_MTU - MXGEFW_PAD;

        /* try to set nbufs to see if we can
           use virtually contiguous jumbos */
        cmd.data0 = 0;
        status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
                               &cmd);
        if (status == 0)
                return MXGEFW_MAX_MTU - MXGEFW_PAD;

        /* otherwise, we're limited to MJUMPAGESIZE */
        return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
        struct mxge_slice_state *ss;
        mxge_rx_done_t *rx_done;
        volatile uint32_t *irq_claim;
        mxge_cmd_t cmd;
        int slice, status;

        /* try to send a reset command to the card to see if it
           is alive */
        memset(&cmd, 0, sizeof (cmd));
        status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
        if (status != 0) {
                device_printf(sc->dev, "failed reset\n");
                return ENXIO;
        }

        mxge_dummy_rdma(sc, 1);


        /* set the intrq size */
        cmd.data0 = sc->rx_ring_size;
        status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

        /*
         * Even though we already know how many slices are supported
         * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
         * has magic side effects, and must be called after a reset.
         * It must be called prior to calling any RSS related cmds,
         * including assigning an interrupt queue for anything but
         * slice 0.  It must also be called *after*
         * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
         * the firmware to compute offsets.
         */

        if (sc->num_slices > 1) {
                /* ask the maximum number of slices it supports */
                status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
                                           &cmd);
                if (status != 0) {
                        device_printf(sc->dev,
                                      "failed to get number of slices\n");
                        return status;
                }
                /*
                 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
                 * to setting up the interrupt queue DMA
                 */
                cmd.data0 = sc->num_slices;
                cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
                cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
                status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
                                           &cmd);
                if (status != 0) {
                        device_printf(sc->dev,
                                      "failed to set number of slices\n");
                        return status;
                }
        }


        if (interrupts_setup) {
                /* Now exchange information about interrupts  */
                for (slice = 0; slice < sc->num_slices; slice++) {
                        rx_done = &sc->ss[slice].rx_done;
                        memset(rx_done->entry, 0, sc->rx_ring_size);
                        cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
                        cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
                        cmd.data2 = slice;
                        status |= mxge_send_cmd(sc,
                                                MXGEFW_CMD_SET_INTRQ_DMA,
                                                &cmd);
                }
        }

        status |= mxge_send_cmd(sc,
                                MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


        sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

        status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
        irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


        status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
                                &cmd);
        sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
        if (status != 0) {
                device_printf(sc->dev, "failed to set interrupt parameters\n");
                return status;
        }


        *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


        /* run a DMA benchmark */
        (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

        for (slice = 0; slice < sc->num_slices; slice++) {
                ss = &sc->ss[slice];

                ss->irq_claim = irq_claim + (2 * slice);
                /* reset mcp/driver shared state back to 0 */
                ss->rx_done.idx = 0;
                ss->rx_done.cnt = 0;
                ss->tx.req = 0;
                ss->tx.done = 0;
                ss->tx.pkt_done = 0;
                ss->tx.queue_active = 0;
                ss->tx.activate = 0;
                ss->tx.deactivate = 0;
                ss->tx.wake = 0;
                ss->tx.defrag = 0;
                ss->tx.stall = 0;
                ss->rx_big.cnt = 0;
                ss->rx_small.cnt = 0;
                ss->lc.lro_bad_csum = 0;
                ss->lc.lro_queued = 0;
                ss->lc.lro_flushed = 0;
                if (ss->fw_stats != NULL) {
                        bzero(ss->fw_stats, sizeof *ss->fw_stats);
                }
        }
        sc->rdma_tags_available = 15;
        status = mxge_update_mac_address(sc);
        mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
        mxge_change_pause(sc, sc->pause);
        mxge_set_multicast_list(sc);
        if (sc->throttle) {
                cmd.data0 = sc->throttle;
                if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
                                  &cmd)) {
                        device_printf(sc->dev,
                                      "can't enable throttle\n");
                }
        }
        return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
        mxge_cmd_t cmd;
        mxge_softc_t *sc;
        int err;
        unsigned int throttle;

        sc = arg1;
        throttle = sc->throttle;
        err = sysctl_handle_int(oidp, &throttle, arg2, req);
        if (err != 0) {
                return err;
        }

        if (throttle == sc->throttle)
                return 0;

        if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
                return EINVAL;

        mtx_lock(&sc->driver_mtx);
        cmd.data0 = throttle;
        err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
        if (err == 0)
                sc->throttle = throttle;
        mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

        mtx_lock(&sc->driver_mtx);
        *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
        sc->intr_coal_delay = intr_coal_delay;

        mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

        mtx_lock(&sc->driver_mtx);
        err = mxge_change_pause(sc, enabled);
        mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}
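
/*
 * mxge_handle_be32() exists because the firmware statistics block is
 * DMA'd to the host in network byte order; the read-only sysctls for
 * its fields pass a pointer to the big-endian word as arg1 and let
 * the handler byte-swap it (see the "link_up" sysctl registered in
 * mxge_add_sysctls() below).
 */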

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
        struct mxge_slice_state *ss;
        int slice;

        if (sc->slice_sysctl_tree == NULL)
                return;

        for (slice = 0; slice < sc->num_slices; slice++) {
                ss = &sc->ss[slice];
                if (ss == NULL || ss->sysctl_tree == NULL)
                        continue;
                sysctl_ctx_free(&ss->sysctl_ctx);
                ss->sysctl_tree = NULL;
        }
        sysctl_ctx_free(&sc->slice_sysctl_ctx);
        sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
        struct sysctl_ctx_list *ctx;
        struct sysctl_oid_list *children;
        mcp_irq_data_t *fw;
        struct mxge_slice_state *ss;
        int slice;
        char slice_num[8];

        ctx = device_get_sysctl_ctx(sc->dev);
        children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
        fw = sc->ss[0].fw_stats;

        /* random information */
        SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
                       "firmware_version",
                       CTLFLAG_RD, sc->fw_version,
                       0, "firmware version");
        SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
                       "serial_number",
                       CTLFLAG_RD, sc->serial_number_string,
                       0, "serial number");
        SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
                       "product_code",
                       CTLFLAG_RD, sc->product_code_string,
                       0, "product_code");
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
                       "pcie_link_width",
                       CTLFLAG_RD, &sc->link_width,
                       0, "PCIe link width");
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
                       "tx_boundary",
                       CTLFLAG_RD, &sc->tx_boundary,
                       0, "tx_boundary");
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
                       "write_combine",
                       CTLFLAG_RD, &sc->wc,
                       0, "write combining PIO?");
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
                       "read_dma_MBs",
                       CTLFLAG_RD, &sc->read_dma,
                       0, "DMA Read speed in MB/s");
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
                       "write_dma_MBs",
                       CTLFLAG_RD, &sc->write_dma,
                       0, "DMA Write speed in MB/s");
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
                       "read_write_dma_MBs",
                       CTLFLAG_RD, &sc->read_write_dma,
                       0, "DMA concurrent Read/Write speed in MB/s");
        SYSCTL_ADD_INT(ctx, children, OID_AUTO,
                       "watchdog_resets",
                       CTLFLAG_RD, &sc->watchdog_resets,
                       0, "Number of times NIC was reset");


        /* performance related tunables */
        SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
                        "intr_coal_delay",
                        CTLTYPE_INT|CTLFLAG_RW, sc,
                        0, mxge_change_intr_coal,
                        "I", "interrupt coalescing delay in usecs");

        SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
                        "throttle",
                        CTLTYPE_INT|CTLFLAG_RW, sc,
                        0, mxge_change_throttle,
                        "I", "transmit throttling");

1524         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1525                         "flow_control_enabled",
1526                         CTLTYPE_INT|CTLFLAG_RW, sc,
1527                         0, mxge_change_flow_control,
1528                         "I", "interrupt coalescing delay in usecs");
1529
1530         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1531                        "deassert_wait",
1532                        CTLFLAG_RW, &mxge_deassert_wait,
1533                        0, "Wait for IRQ line to go low in ihandler");
1534
1535         /* stats block from firmware is in network byte order.
1536            Need to swap it */
1537         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1538                         "link_up",
1539                         CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1540                         0, mxge_handle_be32,
1541                         "I", "link up");
1542         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543                         "rdma_tags_available",
1544                         CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1545                         0, mxge_handle_be32,
1546                         "I", "rdma_tags_available");
1547         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548                         "dropped_bad_crc32",
1549                         CTLTYPE_INT|CTLFLAG_RD,
1550                         &fw->dropped_bad_crc32,
1551                         0, mxge_handle_be32,
1552                         "I", "dropped_bad_crc32");
1553         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554                         "dropped_bad_phy",
1555                         CTLTYPE_INT|CTLFLAG_RD,
1556                         &fw->dropped_bad_phy,
1557                         0, mxge_handle_be32,
1558                         "I", "dropped_bad_phy");
1559         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560                         "dropped_link_error_or_filtered",
1561                         CTLTYPE_INT|CTLFLAG_RD,
1562                         &fw->dropped_link_error_or_filtered,
1563                         0, mxge_handle_be32,
1564                         "I", "dropped_link_error_or_filtered");
1565         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1566                         "dropped_link_overflow",
1567                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1568                         0, mxge_handle_be32,
1569                         "I", "dropped_link_overflow");
1570         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1571                         "dropped_multicast_filtered",
1572                         CTLTYPE_INT|CTLFLAG_RD,
1573                         &fw->dropped_multicast_filtered,
1574                         0, mxge_handle_be32,
1575                         "I", "dropped_multicast_filtered");
1576         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577                         "dropped_no_big_buffer",
1578                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1579                         0, mxge_handle_be32,
1580                         "I", "dropped_no_big_buffer");
1581         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582                         "dropped_no_small_buffer",
1583                         CTLTYPE_INT|CTLFLAG_RD,
1584                         &fw->dropped_no_small_buffer,
1585                         0, mxge_handle_be32,
1586                         "I", "dropped_no_small_buffer");
1587         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588                         "dropped_overrun",
1589                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1590                         0, mxge_handle_be32,
1591                         "I", "dropped_overrun");
1592         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1593                         "dropped_pause",
1594                         CTLTYPE_INT|CTLFLAG_RD,
1595                         &fw->dropped_pause,
1596                         0, mxge_handle_be32,
1597                         "I", "dropped_pause");
1598         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1599                         "dropped_runt",
1600                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1601                         0, mxge_handle_be32,
1602                         "I", "dropped_runt");
1603
1604         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605                         "dropped_unicast_filtered",
1606                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1607                         0, mxge_handle_be32,
1608                         "I", "dropped_unicast_filtered");
1609
1610         /* verbose printing? */
1611         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1612                        "verbose",
1613                        CTLFLAG_RW, &mxge_verbose,
1614                        0, "verbose printing");
1615
1616         /* add counters exported for debugging from all slices */
1617         sysctl_ctx_init(&sc->slice_sysctl_ctx);
1618         sc->slice_sysctl_tree =
1619                 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1620                                 "slice", CTLFLAG_RD, 0, "");
1621
1622         for (slice = 0; slice < sc->num_slices; slice++) {
1623                 ss = &sc->ss[slice];
1624                 sysctl_ctx_init(&ss->sysctl_ctx);
1625                 ctx = &ss->sysctl_ctx;
1626                 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1627                 sprintf(slice_num, "%d", slice);
1628                 ss->sysctl_tree =
1629                         SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1630                                         CTLFLAG_RD, 0, "");
1631                 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1632                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633                                "rx_small_cnt",
1634                                CTLFLAG_RD, &ss->rx_small.cnt,
1635                                0, "rx_small_cnt");
1636                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637                                "rx_big_cnt",
1638                                CTLFLAG_RD, &ss->rx_big.cnt,
1639                                0, "rx_small_cnt");
1640                 SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1641                                "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1642                                0, "number of lro merge queues flushed");
1643
1644                 SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1645                                "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1646                                0, "number of bad csums preventing LRO");
1647
1648                 SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1649                                "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1650                                0, "number of frames appended to lro merge"
1651                                "queues");
1652
1653 #ifndef IFNET_BUF_RING
1654                 /* only transmit from slice 0 for now */
1655                 if (slice > 0)
1656                         continue;
1657 #endif
1658                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659                                "tx_req",
1660                                CTLFLAG_RD, &ss->tx.req,
1661                                0, "tx_req");
1662
1663                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1664                                "tx_done",
1665                                CTLFLAG_RD, &ss->tx.done,
1666                                0, "tx_done");
1667                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1668                                "tx_pkt_done",
1669                                CTLFLAG_RD, &ss->tx.pkt_done,
1670                                0, "tx_done");
1671                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1672                                "tx_stall",
1673                                CTLFLAG_RD, &ss->tx.stall,
1674                                0, "tx_stall");
1675                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1676                                "tx_wake",
1677                                CTLFLAG_RD, &ss->tx.wake,
1678                                0, "tx_wake");
1679                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1680                                "tx_defrag",
1681                                CTLFLAG_RD, &ss->tx.defrag,
1682                                0, "tx_defrag");
1683                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1684                                "tx_queue_active",
1685                                CTLFLAG_RD, &ss->tx.queue_active,
1686                                0, "tx_queue_active");
1687                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1688                                "tx_activate",
1689                                CTLFLAG_RD, &ss->tx.activate,
1690                                0, "tx_activate");
1691                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1692                                "tx_deactivate",
1693                                CTLFLAG_RD, &ss->tx.deactivate,
1694                                0, "tx_deactivate");
1695         }
1696 }
1697
1698 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1699    backwards one at a time and handle ring wraps */
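/* (the first request is deliberately left unwritten here: the caller,
   mxge_submit_req(), copies it last so its valid flags reach the NIC
   only after the rest of the chain is visible) */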
1700
1701 static inline void
1702 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1703                             mcp_kreq_ether_send_t *src, int cnt)
1704 {
1705         int idx, starting_slot;
1706         starting_slot = tx->req;
1707         while (cnt > 1) {
1708                 cnt--;
1709                 idx = (starting_slot + cnt) & tx->mask;
1710                 mxge_pio_copy(&tx->lanai[idx],
1711                               &src[cnt], sizeof(*src));
1712                 wmb();
1713         }
1714 }
1715
1716 /*
1717  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1718  * at most 32 bytes at a time, so as to avoid involving the software
1719  * pio handler in the nic.   We re-write the first segment's flags
1720  * to mark them valid only after writing the entire chain
1721  */
1722
1723 static inline void
1724 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1725                   int cnt)
1726 {
1727         int idx, i;
1728         uint32_t *src_ints;
1729         volatile uint32_t *dst_ints;
1730         mcp_kreq_ether_send_t *srcp;
1731         volatile mcp_kreq_ether_send_t *dstp, *dst;
1732         uint8_t last_flags;
1733         
1734         idx = tx->req & tx->mask;
1735
1736         last_flags = src->flags;
1737         src->flags = 0;
1738         wmb();
1739         dst = dstp = &tx->lanai[idx];
1740         srcp = src;
1741
1742         if ((idx + cnt) < tx->mask) {
1743                 for (i = 0; i < (cnt - 1); i += 2) {
1744                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1745                         wmb(); /* force write every 32 bytes */
1746                         srcp += 2;
1747                         dstp += 2;
1748                 }
1749         } else {
1750                 /* submit all but the first request, and ensure
1751                    that it is submitted below */
1752                 mxge_submit_req_backwards(tx, src, cnt);
1753                 i = 0;
1754         }
1755         if (i < cnt) {
1756                 /* submit the first request */
1757                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1758                 wmb(); /* barrier before setting valid flag */
1759         }
1760
1761         /* re-write the last 32-bits with the valid flags */
1762         src->flags = last_flags;
1763         src_ints = (uint32_t *)src;
1764         src_ints += 3;
1765         dst_ints = (volatile uint32_t *)dst;
1766         dst_ints += 3;
1767         *dst_ints = *src_ints;
1768         tx->req += cnt;
1769         wmb();
1770 }
1771
1772 static int
1773 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1774     struct mxge_pkt_info *pi)
1775 {
1776         struct ether_vlan_header *eh;
1777         uint16_t etype;
1778         int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1779 #if IFCAP_TSO6 && defined(INET6)
1780         int nxt;
1781 #endif
1782
1783         eh = mtod(m, struct ether_vlan_header *);
1784         if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1785                 etype = ntohs(eh->evl_proto);
1786                 pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1787         } else {
1788                 etype = ntohs(eh->evl_encap_proto);
1789                 pi->ip_off = ETHER_HDR_LEN;
1790         }
1791
1792         switch (etype) {
1793         case ETHERTYPE_IP:
1794                 /*
1795                  * ensure ip header is in first mbuf, copy it to a
1796                  * scratch buffer if not
1797                  */
1798                 pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1799                 pi->ip6 = NULL;
1800                 if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1801                         m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1802                             ss->scratch);
1803                         pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1804                 }
1805                 pi->ip_hlen = pi->ip->ip_hl << 2;
1806                 if (!tso)
1807                         return 0;
1808
1809                 if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1810                     sizeof(struct tcphdr))) {
1811                         m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1812                             sizeof(struct tcphdr), ss->scratch);
1813                         pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1814                 }
1815                 pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1816                 break;
1817 #if IFCAP_TSO6 && defined(INET6)
1818         case ETHERTYPE_IPV6:
1819                 pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1820                 if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1821                         m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1822                             ss->scratch);
1823                         pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1824                 }
1825                 nxt = 0;
1826                 pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1827                 pi->ip_hlen -= pi->ip_off;
1828                 if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1829                         return EINVAL;
1830
1831                 if (!tso)
1832                         return 0;
1833
1834                 if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1835                         return EINVAL;
1836
1837                 if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1838                     sizeof(struct tcphdr))) {
1839                         m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1840                             sizeof(struct tcphdr), ss->scratch);
1841                         pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1842                 }
1843                 pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1844                 break;
1845 #endif
1846         default:
1847                 return EINVAL;
1848         }
1849         return 0;
1850 }
1851
1852 #if IFCAP_TSO4
1853
1854 static void
1855 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1856                int busdma_seg_cnt, struct mxge_pkt_info *pi)
1857 {
1858         mxge_tx_ring_t *tx;
1859         mcp_kreq_ether_send_t *req;
1860         bus_dma_segment_t *seg;
1861         uint32_t low, high_swapped;
1862         int len, seglen, cum_len, cum_len_next;
1863         int next_is_first, chop, cnt, rdma_count, small;
1864         uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1865         uint8_t flags, flags_next;
1866         static int once;
1867
1868         mss = m->m_pkthdr.tso_segsz;
1869
1870         /* negative cum_len signifies to the
1871          * send loop that we are still in the
1872          * header portion of the TSO packet.
1873          */
1874
1875         cksum_offset = pi->ip_off + pi->ip_hlen;
1876         cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1877
1878         /* TSO implies checksum offload on this hardware */
1879         if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1880                 /*
1881                  * If packet has full TCP csum, replace it with pseudo hdr
1882                  * sum that the NIC expects, otherwise the NIC will emit
1883                  * packets with bad TCP checksums.
1884                  */
1885                 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1886                 if (pi->ip6) {
1887 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1888                         m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1889                         sum = in6_cksum_pseudo(pi->ip6,
1890                             m->m_pkthdr.len - cksum_offset,
1891                             IPPROTO_TCP, 0);
1892 #endif
1893                 } else {
1894 #ifdef INET
1895                         m->m_pkthdr.csum_flags |= CSUM_TCP;
1896                         sum = in_pseudo(pi->ip->ip_src.s_addr,
1897                             pi->ip->ip_dst.s_addr,
1898                             htons(IPPROTO_TCP + (m->m_pkthdr.len -
1899                                     cksum_offset)));
1900 #endif
1901                 }
1902                 m_copyback(m, offsetof(struct tcphdr, th_sum) +
1903                     cksum_offset, sizeof(sum), (caddr_t)&sum);
1904         }
1905         flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1906
1907         
1908         /* for TSO, pseudo_hdr_offset holds mss.
1909          * The firmware figures out where to put
1910          * the checksum by parsing the header. */
1911         pseudo_hdr_offset = htobe16(mss);
1912
1913         if (pi->ip6) {
1914                 /*
1915                  * for IPv6 TSO, the "checksum offset" is re-purposed
1916                  * to store the TCP header len
1917                  */
1918                 cksum_offset = (pi->tcp->th_off << 2);
1919         }
1920
1921         tx = &ss->tx;
1922         req = tx->req_list;
1923         seg = tx->seg_list;
1924         cnt = 0;
1925         rdma_count = 0;
1926         /* "rdma_count" is the number of RDMAs belonging to the
1927          * current packet BEFORE the current send request. For
1928          * non-TSO packets, this is equal to "count".
1929          * For TSO packets, rdma_count needs to be reset
1930          * to 0 after a segment cut.
1931          *
1932          * The rdma_count field of the send request is
1933          * the number of RDMAs of the packet starting at
1934          * that request. For TSO send requests with one or more cuts
1935          * in the middle, this is the number of RDMAs starting
1936          * after the last cut in the request. All previous
1937          * segments before the last cut implicitly have 1 RDMA.
1938          *
1939          * Since the number of RDMAs is not known beforehand,
1940          * it must be filled-in retroactively - after each
1941          * segmentation cut or at the end of the entire packet.
1942          */
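	/*
	 * Worked example (assumed layout): a packet whose header fills
	 * descriptor 0 and whose payload spans descriptors 1..4, with a
	 * single mss cut inside descriptor 3.  Descriptors before the
	 * cut implicitly carry 1 RDMA each; the back-fill stores of
	 * (req - rdma_count)->rdma_count below record the run length
	 * for the descriptors after the last cut.
	 */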
1943
1944         while (busdma_seg_cnt) {
1945                 /* Break the busdma segment up into pieces */
1946                 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1947                 high_swapped =  htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1948                 len = seg->ds_len;
1949
1950                 while (len) {
1951                         flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1952                         seglen = len;
1953                         cum_len_next = cum_len + seglen;
1954                         (req-rdma_count)->rdma_count = rdma_count + 1;
1955                         if (__predict_true(cum_len >= 0)) {
1956                                 /* payload */
1957                                 chop = (cum_len_next > mss);
1958                                 cum_len_next = cum_len_next % mss;
1959                                 next_is_first = (cum_len_next == 0);
1960                                 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1961                                 flags_next |= next_is_first *
1962                                         MXGEFW_FLAGS_FIRST;
1963                                 rdma_count |= -(chop | next_is_first);
1964                                 rdma_count += chop & !next_is_first;
1965                         } else if (cum_len_next >= 0) {
1966                                 /* header ends */
1967                                 rdma_count = -1;
1968                                 cum_len_next = 0;
1969                                 seglen = -cum_len;
1970                                 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1971                                 flags_next = MXGEFW_FLAGS_TSO_PLD |
1972                                         MXGEFW_FLAGS_FIRST |
1973                                         (small * MXGEFW_FLAGS_SMALL);
1974                         }
1975                         
1976                         req->addr_high = high_swapped;
1977                         req->addr_low = htobe32(low);
1978                         req->pseudo_hdr_offset = pseudo_hdr_offset;
1979                         req->pad = 0;
1980                         req->rdma_count = 1;
1981                         req->length = htobe16(seglen);
1982                         req->cksum_offset = cksum_offset;
1983                         req->flags = flags | ((cum_len & 1) *
1984                                               MXGEFW_FLAGS_ALIGN_ODD);
1985                         low += seglen;
1986                         len -= seglen;
1987                         cum_len = cum_len_next;
1988                         flags = flags_next;
1989                         req++;
1990                         cnt++;
1991                         rdma_count++;
1992                         if (cksum_offset != 0 && !pi->ip6) {
1993                                 if (__predict_false(cksum_offset > seglen))
1994                                         cksum_offset -= seglen;
1995                                 else
1996                                         cksum_offset = 0;
1997                         }
1998                         if (__predict_false(cnt > tx->max_desc))
1999                                 goto drop;
2000                 }
2001                 busdma_seg_cnt--;
2002                 seg++;
2003         }
2004         (req-rdma_count)->rdma_count = rdma_count;
2005
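	/*
	 * Walk backwards from the final descriptor, tagging TSO_LAST
	 * until we tag the descriptor that begins the last frame
	 * (one marked CHOP or FIRST).
	 */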
2006         do {
2007                 req--;
2008                 req->flags |= MXGEFW_FLAGS_TSO_LAST;
2009         } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2010
2011         tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2012         mxge_submit_req(tx, tx->req_list, cnt);
2013 #ifdef IFNET_BUF_RING
2014         if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2015                 /* tell the NIC to start polling this slice */
2016                 *tx->send_go = 1;
2017                 tx->queue_active = 1;
2018                 tx->activate++;
2019                 wmb();
2020         }
2021 #endif
2022         return;
2023
2024 drop:
2025         bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2026         m_freem(m);
2027         ss->oerrors++;
2028         if (!once) {
2029                 printf("tx->max_desc exceeded via TSO!\n");
2030                 printf("mss = %d, %ld, %d!\n", mss,
2031                        (long)seg - (long)tx->seg_list, tx->max_desc);
2032                 once = 1;
2033         }
2034         return;
2035
2036 }
2037
2038 #endif /* IFCAP_TSO4 */
2039
2040 #ifdef MXGE_NEW_VLAN_API
2041 /*
2042  * We reproduce the software vlan tag insertion from
2043  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2044  * vlan tag insertion. We need to advertise this in order to have the
2045  * vlan interface respect our csum offload flags.
2046  */
2047 static struct mbuf *
2048 mxge_vlan_tag_insert(struct mbuf *m)
2049 {
2050         struct ether_vlan_header *evl;
2051
2052         M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2053         if (__predict_false(m == NULL))
2054                 return NULL;
2055         if (m->m_len < sizeof(*evl)) {
2056                 m = m_pullup(m, sizeof(*evl));
2057                 if (__predict_false(m == NULL))
2058                         return NULL;
2059         }
2060         /*
2061          * Transform the Ethernet header into an Ethernet header
2062          * with 802.1Q encapsulation.
2063          */
2064         evl = mtod(m, struct ether_vlan_header *);
2065         bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2066               (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2067         evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2068         evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2069         m->m_flags &= ~M_VLANTAG;
2070         return m;
2071 }
2072 #endif /* MXGE_NEW_VLAN_API */
2073
2074 static void
2075 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2076 {
2077         struct mxge_pkt_info pi = {0,0,0,0};
2078         mxge_softc_t *sc;
2079         mcp_kreq_ether_send_t *req;
2080         bus_dma_segment_t *seg;
2081         struct mbuf *m_tmp;
2082         struct ifnet *ifp;
2083         mxge_tx_ring_t *tx;
2084         int cnt, cum_len, err, i, idx, odd_flag;
2085         uint16_t pseudo_hdr_offset;
2086         uint8_t flags, cksum_offset;
2087
2088
2089         sc = ss->sc;
2090         ifp = sc->ifp;
2091         tx = &ss->tx;
2092
2093 #ifdef MXGE_NEW_VLAN_API
2094         if (m->m_flags & M_VLANTAG) {
2095                 m = mxge_vlan_tag_insert(m);
2096                 if (__predict_false(m == NULL))
2097                         goto drop_without_m;
2098         }
2099 #endif
2100         if (m->m_pkthdr.csum_flags &
2101             (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2102                 if (mxge_parse_tx(ss, m, &pi))
2103                         goto drop;
2104         }
2105
2106         /* (try to) map the frame for DMA */
2107         idx = tx->req & tx->mask;
2108         err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2109                                       m, tx->seg_list, &cnt,
2110                                       BUS_DMA_NOWAIT);
2111         if (__predict_false(err == EFBIG)) {
2112                 /* Too many segments in the chain.  Try
2113                    to defrag */
2114                 m_tmp = m_defrag(m, M_NOWAIT);
2115                 if (m_tmp == NULL) {
2116                         goto drop;
2117                 }
2118                 ss->tx.defrag++;
2119                 m = m_tmp;
2120                 err = bus_dmamap_load_mbuf_sg(tx->dmat,
2121                                               tx->info[idx].map,
2122                                               m, tx->seg_list, &cnt,
2123                                               BUS_DMA_NOWAIT);
2124         }
2125         if (__predict_false(err != 0)) {
2126                 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2127                               " packet len = %d\n", err, m->m_pkthdr.len);
2128                 goto drop;
2129         }
2130         bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2131                         BUS_DMASYNC_PREWRITE);
2132         tx->info[idx].m = m;
2133
2134 #if IFCAP_TSO4
2135         /* TSO is different enough, we handle it in another routine */
2136         if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2137                 mxge_encap_tso(ss, m, cnt, &pi);
2138                 return;
2139         }
2140 #endif
2141
2142         req = tx->req_list;
2143         cksum_offset = 0;
2144         pseudo_hdr_offset = 0;
2145         flags = MXGEFW_FLAGS_NO_TSO;
2146
2147         /* checksum offloading? */
2148         if (m->m_pkthdr.csum_flags &
2149             (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2150                 /* compute where the partial checksum starts and
2151                    where the firmware should store the result */
2152                 cksum_offset = pi.ip_off + pi.ip_hlen;
2153                 pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2154                 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2155                 req->cksum_offset = cksum_offset;
2156                 flags |= MXGEFW_FLAGS_CKSUM;
2157                 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2158         } else {
2159                 odd_flag = 0;
2160         }
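	/*
	 * Note on the fields set above (our reading of the firmware
	 * interface): cksum_offset is where the firmware begins its
	 * partial checksum, and pseudo_hdr_offset is where it deposits
	 * the result -- the stack left the checksum field offset in
	 * csum_data, relative to the start of the transport header.
	 */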
2161         if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2162                 flags |= MXGEFW_FLAGS_SMALL;
2163
2164         /* convert segments into a request list */
2165         cum_len = 0;
2166         seg = tx->seg_list;
2167         req->flags = MXGEFW_FLAGS_FIRST;
2168         for (i = 0; i < cnt; i++) {
2169                 req->addr_low =
2170                         htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2171                 req->addr_high =
2172                         htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2173                 req->length = htobe16(seg->ds_len);
2174                 req->cksum_offset = cksum_offset;
2175                 if (cksum_offset > seg->ds_len)
2176                         cksum_offset -= seg->ds_len;
2177                 else
2178                         cksum_offset = 0;
2179                 req->pseudo_hdr_offset = pseudo_hdr_offset;
2180                 req->pad = 0; /* complete solid 16-byte block */
2181                 req->rdma_count = 1;
2182                 req->flags |= flags | ((cum_len & 1) * odd_flag);
2183                 cum_len += seg->ds_len;
2184                 seg++;
2185                 req++;
2186                 req->flags = 0;
2187         }
2188         req--;
2189         /* pad runts to 60 bytes (ETHER_MIN_LEN minus the 4-byte FCS) */
2190         if (cum_len < 60) {
2191                 req++;
2192                 req->addr_low =
2193                         htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2194                 req->addr_high =
2195                         htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2196                 req->length = htobe16(60 - cum_len);
2197                 req->cksum_offset = 0;
2198                 req->pseudo_hdr_offset = pseudo_hdr_offset;
2199                 req->pad = 0; /* complete solid 16-byte block */
2200                 req->rdma_count = 1;
2201                 req->flags |= flags | ((cum_len & 1) * odd_flag);
2202                 cnt++;
2203         }
2204
2205         tx->req_list[0].rdma_count = cnt;
2206 #if 0
2207         /* print what the firmware will see */
2208         for (i = 0; i < cnt; i++) {
2209                 printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2210                     "cso:%d, flags:0x%x, rdma:%d\n",
2211                     i, (int)ntohl(tx->req_list[i].addr_high),
2212                     (int)ntohl(tx->req_list[i].addr_low),
2213                     (int)ntohs(tx->req_list[i].length),
2214                     (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2215                     tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2216                     tx->req_list[i].rdma_count);
2217         }
2218         printf("--------------\n");
2219 #endif
2220         tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2221         mxge_submit_req(tx, tx->req_list, cnt);
2222 #ifdef IFNET_BUF_RING
2223         if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2224                 /* tell the NIC to start polling this slice */
2225                 *tx->send_go = 1;
2226                 tx->queue_active = 1;
2227                 tx->activate++;
2228                 wmb();
2229         }
2230 #endif
2231         return;
2232
2233 drop:
2234         m_freem(m);
2235 drop_without_m:
2236         ss->oerrors++;
2237         return;
2238 }
2239
2240 #ifdef IFNET_BUF_RING
2241 static void
2242 mxge_qflush(struct ifnet *ifp)
2243 {
2244         mxge_softc_t *sc = ifp->if_softc;
2245         mxge_tx_ring_t *tx;
2246         struct mbuf *m;
2247         int slice;
2248
2249         for (slice = 0; slice < sc->num_slices; slice++) {
2250                 tx = &sc->ss[slice].tx;
2251                 mtx_lock(&tx->mtx);
2252                 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2253                         m_freem(m);
2254                 mtx_unlock(&tx->mtx);
2255         }
2256         if_qflush(ifp);
2257 }
2258
2259 static inline void
2260 mxge_start_locked(struct mxge_slice_state *ss)
2261 {
2262         mxge_softc_t *sc;
2263         struct mbuf *m;
2264         struct ifnet *ifp;
2265         mxge_tx_ring_t *tx;
2266
2267         sc = ss->sc;
2268         ifp = sc->ifp;
2269         tx = &ss->tx;
2270
2271         while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2272                 m = drbr_dequeue(ifp, tx->br);
2273                 if (m == NULL) {
2274                         return;
2275                 }
2276                 /* let BPF see it */
2277                 BPF_MTAP(ifp, m);
2278
2279                 /* give it to the nic */
2280                 mxge_encap(ss, m);
2281         }
2282         /* ran out of transmit slots */
2283         if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2284             && (!drbr_empty(ifp, tx->br))) {
2285                 ss->if_drv_flags |= IFF_DRV_OACTIVE;
2286                 tx->stall++;
2287         }
2288 }
2289
2290 static int
2291 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2292 {
2293         mxge_softc_t *sc;
2294         struct ifnet *ifp;
2295         mxge_tx_ring_t *tx;
2296         int err;
2297
2298         sc = ss->sc;
2299         ifp = sc->ifp;
2300         tx = &ss->tx;
2301
2302         if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2303             IFF_DRV_RUNNING) {
2304                 err = drbr_enqueue(ifp, tx->br, m);
2305                 return (err);
2306         }
2307
2308         if (!drbr_needs_enqueue(ifp, tx->br) &&
2309             ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2310                 /* let BPF see it */
2311                 BPF_MTAP(ifp, m);
2312                 /* give it to the nic */
2313                 mxge_encap(ss, m);
2314         } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315                 return (err);
2316         }
2317         if (!drbr_empty(ifp, tx->br))
2318                 mxge_start_locked(ss);
2319         return (0);
2320 }
2321
2322 static int
2323 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2324 {
2325         mxge_softc_t *sc = ifp->if_softc;
2326         struct mxge_slice_state *ss;
2327         mxge_tx_ring_t *tx;
2328         int err = 0;
2329         int slice;
2330
2331         slice = m->m_pkthdr.flowid;
2332         slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2333
2334         ss = &sc->ss[slice];
2335         tx = &ss->tx;
2336
2337         if (mtx_trylock(&tx->mtx)) {
2338                 err = mxge_transmit_locked(ss, m);
2339                 mtx_unlock(&tx->mtx);
2340         } else {
2341                 err = drbr_enqueue(ifp, tx->br, m);
2342         }
2343
2344         return (err);
2345 }
2346
2347 #else
2348
2349 static inline void
2350 mxge_start_locked(struct mxge_slice_state *ss)
2351 {
2352         mxge_softc_t *sc;
2353         struct mbuf *m;
2354         struct ifnet *ifp;
2355         mxge_tx_ring_t *tx;
2356
2357         sc = ss->sc;
2358         ifp = sc->ifp;
2359         tx = &ss->tx;
2360         while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2361                 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2362                 if (m == NULL) {
2363                         return;
2364                 }
2365                 /* let BPF see it */
2366                 BPF_MTAP(ifp, m);
2367
2368                 /* give it to the nic */
2369                 mxge_encap(ss, m);
2370         }
2371         /* ran out of transmit slots */
2372         if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2373                 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2374                 tx->stall++;
2375         }
2376 }
2377 #endif
2378 static void
2379 mxge_start(struct ifnet *ifp)
2380 {
2381         mxge_softc_t *sc = ifp->if_softc;
2382         struct mxge_slice_state *ss;
2383
2384         /* only use the first slice for now */
2385         ss = &sc->ss[0];
2386         mtx_lock(&ss->tx.mtx);
2387         mxge_start_locked(ss);
2388         mtx_unlock(&ss->tx.mtx);                
2389 }
2390
2391 /*
2392  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2393  * at most 32 bytes at a time, so as to avoid involving the software
2394  * pio handler in the nic.   We re-write the first segment's low
2395  * DMA address to mark it valid only after we write the entire chunk
2396  * in a burst
2397  */
2398 static inline void
2399 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2400                 mcp_kreq_ether_recv_t *src)
2401 {
2402         uint32_t low;
2403
2404         low = src->addr_low;
2405         src->addr_low = 0xffffffff;
2406         mxge_pio_copy(dst, src, 4 * sizeof (*src));
2407         wmb();
2408         mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2409         wmb();
2410         src->addr_low = low;
2411         dst->addr_low = low;
2412         wmb();
2413 }
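/*
 * This is why the refill paths below post receive buffers only when
 * (idx & 7) == 7: descriptors are handed to the NIC in aligned
 * groups of eight.
 */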
2414
2415 static int
2416 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2417 {
2418         bus_dma_segment_t seg;
2419         struct mbuf *m;
2420         mxge_rx_ring_t *rx = &ss->rx_small;
2421         int cnt, err;
2422
2423         m = m_gethdr(M_NOWAIT, MT_DATA);
2424         if (m == NULL) {
2425                 rx->alloc_fail++;
2426                 err = ENOBUFS;
2427                 goto done;
2428         }
2429         m->m_len = MHLEN;
2430         err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2431                                       &seg, &cnt, BUS_DMA_NOWAIT);
2432         if (err != 0) {
2433                 m_free(m);
2434                 goto done;
2435         }
2436         rx->info[idx].m = m;
2437         rx->shadow[idx].addr_low =
2438                 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2439         rx->shadow[idx].addr_high =
2440                 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2441
2442 done:
2443         if ((idx & 7) == 7)
2444                 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2445         return err;
2446 }
2447
2448 static int
2449 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2450 {
2451         bus_dma_segment_t seg[3];
2452         struct mbuf *m;
2453         mxge_rx_ring_t *rx = &ss->rx_big;
2454         int cnt, err, i;
2455
2456         m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2457         if (m == NULL) {
2458                 rx->alloc_fail++;
2459                 err = ENOBUFS;
2460                 goto done;
2461         }
2462         m->m_len = rx->mlen;
2463         err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2464                                       seg, &cnt, BUS_DMA_NOWAIT);
2465         if (err != 0) {
2466                 m_free(m);
2467                 goto done;
2468         }
2469         rx->info[idx].m = m;
2470         rx->shadow[idx].addr_low =
2471                 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2472         rx->shadow[idx].addr_high =
2473                 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2474
2475 #if MXGE_VIRT_JUMBOS
2476         for (i = 1; i < cnt; i++) {
2477                 rx->shadow[idx + i].addr_low =
2478                         htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2479                 rx->shadow[idx + i].addr_high =
2480                         htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2481         }
2482 #endif
2483
2484 done:
2485        for (i = 0; i < rx->nbufs; i++) {
2486                 if ((idx & 7) == 7) {
2487                         mxge_submit_8rx(&rx->lanai[idx - 7],
2488                                         &rx->shadow[idx - 7]);
2489                 }
2490                 idx++;
2491         }
2492         return err;
2493 }
2494
2495 #ifdef INET6
2496
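/*
 * Plain 16-bit one's-complement sum over a buffer.  Assumes an even
 * byte count (true for the header spans it is used on below); the
 * one's-complement sum is endianness-neutral, so no byte swapping is
 * needed here.
 */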
2497 static uint16_t
2498 mxge_csum_generic(uint16_t *raw, int len)
2499 {
2500         uint32_t csum;
2501
2502
2503         csum = 0;
2504         while (len > 0) {
2505                 csum += *raw;
2506                 raw++;
2507                 len -= 2;
2508         }
2509         csum = (csum >> 16) + (csum & 0xffff);
2510         csum = (csum >> 16) + (csum & 0xffff);
2511         return (uint16_t)csum;
2512 }
2513
2514 static inline uint16_t
2515 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2516 {
2517         uint32_t partial;
2518         int nxt, cksum_offset;
2519         struct ip6_hdr *ip6 = p;
2520         uint16_t c;
2521
2522         nxt = ip6->ip6_nxt;
2523         cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2524         if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2525                 cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2526                                            IPPROTO_IPV6, &nxt);
2527                 if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2528                         return (1);
2529         }
2530
2531         /*
2532          * IPv6 headers do not contain a checksum, and hence
2533          * do not checksum to zero, so they don't "fall out"
2534          * of the partial checksum calculation like IPv4
2535          * headers do.  We need to fix the partial checksum by
2536          * subtracting the checksum of the IPv6 header.
2537          */
2538
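	/*
	 * Adding ~partial and then folding the end-around carry is
	 * one's-complement subtraction: it removes the IPv6 header's
	 * contribution from the firmware's partial checksum.
	 */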
2539         partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2540                                     ETHER_HDR_LEN);
2541         csum += ~partial;
2542         csum +=  (csum < ~partial);
2543         csum = (csum >> 16) + (csum & 0xFFFF);
2544         csum = (csum >> 16) + (csum & 0xFFFF);
2545         c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2546                              csum);
2547         c ^= 0xffff;
2548         return (c);
2549 }
2550 #endif /* INET6 */
2551 /*
2552  *  Myri10GE hardware checksums are not valid if the sender
2553  *  padded the frame with non-zero padding.  This is because
2554  *  the firmware just does a simple 16-bit 1s complement
2555  *  checksum across the entire frame, excluding the first 14
2556  *  bytes.  It is best to simply check the checksum and
2557  *  tell the stack about it only if the checksum is good
2558  */
2559
2560 static inline uint16_t
2561 mxge_rx_csum(struct mbuf *m, int csum)
2562 {
2563         struct ether_header *eh;
2564 #ifdef INET
2565         struct ip *ip;
2566 #endif
2567 #if defined(INET) || defined(INET6)
2568         int cap = m->m_pkthdr.rcvif->if_capenable;
2569 #endif
2570         uint16_t c, etype;
2571
2572
2573         eh = mtod(m, struct ether_header *);
2574         etype = ntohs(eh->ether_type);
2575         switch (etype) {
2576 #ifdef INET
2577         case ETHERTYPE_IP:
2578                 if ((cap & IFCAP_RXCSUM) == 0)
2579                         return (1);
2580                 ip = (struct ip *)(eh + 1);
2581                 if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2582                         return (1);
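		/*
		 * The firmware checksum covers everything past the
		 * Ethernet header; a valid IP header sums to zero in
		 * one's-complement, so adding the pseudo-header fields
		 * and complementing leaves 0 exactly when the TCP/UDP
		 * checksum is good.
		 */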
2583                 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2584                               htonl(ntohs(csum) + ntohs(ip->ip_len) -
2585                                     (ip->ip_hl << 2) + ip->ip_p));
2586                 c ^= 0xffff;
2587                 break;
2588 #endif
2589 #ifdef INET6
2590         case ETHERTYPE_IPV6:
2591                 if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2592                         return (1);
2593                 c = mxge_rx_csum6((eh + 1), m, csum);
2594                 break;
2595 #endif
2596         default:
2597                 c = 1;
2598         }
2599         return (c);
2600 }
2601
2602 static void
2603 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2604 {
2605         struct ether_vlan_header *evl;
2606         struct ether_header *eh;
2607         uint32_t partial;
2608
2609         evl = mtod(m, struct ether_vlan_header *);
2610         eh = mtod(m, struct ether_header *);
2611
2612         /*
2613          * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2614          * after what the firmware thought was the end of the ethernet
2615          * header.
2616          */
2617
2618         /* put checksum into host byte order */
2619         *csum = ntohs(*csum);
2620         partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2621         (*csum) += ~partial;
2622         (*csum) +=  ((*csum) < ~partial);
2623         (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624         (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2625
2626         /* restore checksum to network byte order;
2627            later consumers expect this */
2628         *csum = htons(*csum);
2629
2630         /* save the tag */
2631 #ifdef MXGE_NEW_VLAN_API        
2632         m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2633 #else
2634         {
2635                 struct m_tag *mtag;
2636                 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2637                                    M_NOWAIT);
2638                 if (mtag == NULL)
2639                         return;
2640                 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2641                 m_tag_prepend(m, mtag);
2642         }
2643
2644 #endif
2645         m->m_flags |= M_VLANTAG;
2646
2647         /*
2648          * Remove the 802.1q header by copying the Ethernet
2649          * addresses over it and adjusting the beginning of
2650          * the data in the mbuf.  The encapsulated Ethernet
2651          * type field is already in place.
2652          */
2653         bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2654               ETHER_HDR_LEN - ETHER_TYPE_LEN);
2655         m_adj(m, ETHER_VLAN_ENCAP_LEN);
2656 }
2657
2658
2659 static inline void
2660 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2661                  uint32_t csum, int lro)
2662 {
2663         mxge_softc_t *sc;
2664         struct ifnet *ifp;
2665         struct mbuf *m;
2666         struct ether_header *eh;
2667         mxge_rx_ring_t *rx;
2668         bus_dmamap_t old_map;
2669         int idx;
2670
2671         sc = ss->sc;
2672         ifp = sc->ifp;
2673         rx = &ss->rx_big;
2674         idx = rx->cnt & rx->mask;
2675         rx->cnt += rx->nbufs;
2676         /* save a pointer to the received mbuf */
2677         m = rx->info[idx].m;
2678         /* try to replace the received mbuf */
2679         if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2680                 /* drop the frame -- the old mbuf is re-cycled */
2681                 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2682                 return;
2683         }
2684
2685         /* unmap the received buffer */
2686         old_map = rx->info[idx].map;
2687         bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2688         bus_dmamap_unload(rx->dmat, old_map);
2689
2690         /* swap the bus_dmamap_t's */
2691         rx->info[idx].map = rx->extra_map;
2692         rx->extra_map = old_map;
2693
2694         /* mcp implicitly skips 1st 2 bytes so that packet is properly
2695          * aligned */
2696         m->m_data += MXGEFW_PAD;
2697
2698         m->m_pkthdr.rcvif = ifp;
2699         m->m_len = m->m_pkthdr.len = len;
2700         ss->ipackets++;
2701         eh = mtod(m, struct ether_header *);
2702         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2703                 mxge_vlan_tag_remove(m, &csum);
2704         }
2705         /* if the checksum is valid, mark it in the mbuf header */
2706         
2707         if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2708             (0 == mxge_rx_csum(m, csum))) {
2709                 /* Tell the stack that the checksum is good */
2710                 m->m_pkthdr.csum_data = 0xffff;
2711                 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2712                         CSUM_DATA_VALID;
2713
2714 #if defined(INET) || defined (INET6)
2715                 if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2716                         return;
2717 #endif
2718         }
2719         /* flowid only valid if RSS hashing is enabled */
2720         if (sc->num_slices > 1) {
2721                 m->m_pkthdr.flowid = (ss - sc->ss);
2722                 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2723         }
2724         /* pass the frame up the stack */
2725         (*ifp->if_input)(ifp, m);
2726 }
2727
2728 static inline void
2729 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2730                    uint32_t csum, int lro)
2731 {
2732         mxge_softc_t *sc;
2733         struct ifnet *ifp;
2734         struct ether_header *eh;
2735         struct mbuf *m;
2736         mxge_rx_ring_t *rx;
2737         bus_dmamap_t old_map;
2738         int idx;
2739
2740         sc = ss->sc;
2741         ifp = sc->ifp;
2742         rx = &ss->rx_small;
2743         idx = rx->cnt & rx->mask;
2744         rx->cnt++;
2745         /* save a pointer to the received mbuf */
2746         m = rx->info[idx].m;
2747         /* try to replace the received mbuf */
2748         if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2749                 /* drop the frame -- the old mbuf is re-cycled */
2750                 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2751                 return;
2752         }
2753
2754         /* unmap the received buffer */
2755         old_map = rx->info[idx].map;
2756         bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2757         bus_dmamap_unload(rx->dmat, old_map);
2758
2759         /* swap the bus_dmamap_t's */
2760         rx->info[idx].map = rx->extra_map;
2761         rx->extra_map = old_map;
2762
2763         /* mcp implicitly skips 1st 2 bytes so that packet is properly
2764          * aligned */
2765         m->m_data += MXGEFW_PAD;
2766
2767         m->m_pkthdr.rcvif = ifp;
2768         m->m_len = m->m_pkthdr.len = len;
2769         ss->ipackets++;
2770         eh = mtod(m, struct ether_header *);
2771         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2772                 mxge_vlan_tag_remove(m, &csum);
2773         }
2774         /* if the checksum is valid, mark it in the mbuf header */
2775         if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2776             (0 == mxge_rx_csum(m, csum))) {
2777                 /* Tell the stack that the checksum is good */
2778                 m->m_pkthdr.csum_data = 0xffff;
2779                 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2780                         CSUM_DATA_VALID;
2781
2782 #if defined(INET) || defined (INET6)
2783                 if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2784                         return;
2785 #endif
2786         }
2787         /* flowid only valid if RSS hashing is enabled */
2788         if (sc->num_slices > 1) {
2789                 m->m_pkthdr.flowid = (ss - sc->ss);
2790                 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2791         }
2792         /* pass the frame up the stack */
2793         (*ifp->if_input)(ifp, m);
2794 }
2795
2796 static inline void
2797 mxge_clean_rx_done(struct mxge_slice_state *ss)
2798 {
2799         mxge_rx_done_t *rx_done = &ss->rx_done;
2800         int limit = 0;
2801         uint16_t length;
2802         uint16_t checksum;
2803         int lro;
2804
2805         lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806         while (rx_done->entry[rx_done->idx].length != 0) {
2807                 length = ntohs(rx_done->entry[rx_done->idx].length);
2808                 rx_done->entry[rx_done->idx].length = 0;
2809                 checksum = rx_done->entry[rx_done->idx].checksum;
2810                 if (length <= (MHLEN - MXGEFW_PAD))
2811                         mxge_rx_done_small(ss, length, checksum, lro);
2812                 else
2813                         mxge_rx_done_big(ss, length, checksum, lro);
2814                 rx_done->cnt++;
2815                 rx_done->idx = rx_done->cnt & rx_done->mask;
2816
2817                 /* limit potential for livelock */
2818                 if (__predict_false(++limit > rx_done->mask / 2))
2819                         break;
2820         }
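	/* flush LRO-held segments once per receive batch, not per frame */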
2821 #if defined(INET)  || defined (INET6)
2822         tcp_lro_flush_all(&ss->lc);
2823 #endif
2824 }
2825
2826
2827 static inline void
2828 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2829 {
2830         struct ifnet *ifp;
2831         mxge_tx_ring_t *tx;
2832         struct mbuf *m;
2833         bus_dmamap_t map;
2834         int idx;
2835         int *flags;
2836
2837         tx = &ss->tx;
2838         ifp = ss->sc->ifp;
2839         while (tx->pkt_done != mcp_idx) {
2840                 idx = tx->done & tx->mask;
2841                 tx->done++;
2842                 m = tx->info[idx].m;
2843                 /* the mbuf and DMA map are attached only to the
2844                    first descriptor of each mbuf */
2845                 if (m != NULL) {
2846                         ss->obytes += m->m_pkthdr.len;
2847                         if (m->m_flags & M_MCAST)
2848                                 ss->omcasts++;
2849                         ss->opackets++;
2850                         tx->info[idx].m = NULL;
2851                         map = tx->info[idx].map;
2852                         bus_dmamap_unload(tx->dmat, map);
2853                         m_freem(m);
2854                 }
2855                 if (tx->info[idx].flag) {
2856                         tx->info[idx].flag = 0;
2857                         tx->pkt_done++;
2858                 }
2859         }
2860         
2861         /* If we have space, clear IFF_DRV_OACTIVE to tell the stack that
2862            it's OK to send packets */
2863 #ifdef IFNET_BUF_RING
2864         flags = &ss->if_drv_flags;
2865 #else
2866         flags = &ifp->if_drv_flags;
2867 #endif
2868         mtx_lock(&ss->tx.mtx);
2869         if ((*flags) & IFF_DRV_OACTIVE &&
2870             tx->req - tx->done < (tx->mask + 1)/4) {
2871                 *(flags) &= ~IFF_DRV_OACTIVE;
2872                 ss->tx.wake++;
2873                 mxge_start_locked(ss);
2874         }
2875 #ifdef IFNET_BUF_RING
2876         if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2877                 /* let the NIC stop polling this queue, since there
2878                  * are no more transmits pending */
2879                 if (tx->req == tx->done) {
2880                         *tx->send_stop = 1;
2881                         tx->queue_active = 0;
2882                         tx->deactivate++;
2883                         wmb();
2884                 }
2885         }
2886 #endif
2887         mtx_unlock(&ss->tx.mtx);
2888
2889 }
2890
2891 static struct mxge_media_type mxge_xfp_media_types[] =
2892 {
2893         {IFM_10G_CX4,   0x7f,           "10GBASE-CX4 (module)"},
2894         {IFM_10G_SR,    (1 << 7),       "10GBASE-SR"},
2895         {IFM_10G_LR,    (1 << 6),       "10GBASE-LR"},
2896         {0,             (1 << 5),       "10GBASE-ER"},
2897         {IFM_10G_LRM,   (1 << 4),       "10GBASE-LRM"},
2898         {0,             (1 << 3),       "10GBASE-SW"},
2899         {0,             (1 << 2),       "10GBASE-LW"},
2900         {0,             (1 << 1),       "10GBASE-EW"},
2901         {0,             (1 << 0),       "Reserved"}
2902 };
2903 static struct mxge_media_type mxge_sfp_media_types[] =
2904 {
2905         {IFM_10G_TWINAX,      0,        "10GBASE-Twinax"},
2906         {0,             (1 << 7),       "Reserved"},
2907         {IFM_10G_LRM,   (1 << 6),       "10GBASE-LRM"},
2908         {IFM_10G_LR,    (1 << 5),       "10GBASE-LR"},
2909         {IFM_10G_SR,    (1 << 4),       "10GBASE-SR"},
2910         {IFM_10G_TWINAX,(1 << 0),       "10GBASE-Twinax"}
2911 };
2912
2913 static void
2914 mxge_media_set(mxge_softc_t *sc, int media_type)
2915 {
2916
2917         
2918         ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2919                     0, NULL);
2920         ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2921         sc->current_media = media_type;
2922         sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2923 }
2924
2925 static void
2926 mxge_media_init(mxge_softc_t *sc)
2927 {
2928         char *ptr;
2929         int i;
2930
2931         ifmedia_removeall(&sc->media);
2932         mxge_media_set(sc, IFM_AUTO);
2933
2934         /*
2935          * parse the product code to determine the interface type
2936          * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2937          * after the 3rd dash in the driver's cached copy of the
2938          * EEPROM's product code string.
2939          */
2940         ptr = sc->product_code_string;
2941         if (ptr == NULL) {
2942                 device_printf(sc->dev, "Missing product code\n");
2943                 return;
2944         }
2945
2946         for (i = 0; i < 3; i++, ptr++) {
2947                 ptr = strchr(ptr, '-');
2948                 if (ptr == NULL) {
2949                         device_printf(sc->dev,
2950                                       "only %d dashes in PC?!?\n", i);
2951                         return;
2952                 }
2953         }
2954         if (*ptr == 'C' || *(ptr + 1) == 'C') {
2955                 /* -C is CX4 */
2956                 sc->connector = MXGE_CX4;
2957                 mxge_media_set(sc, IFM_10G_CX4);
2958         } else if (*ptr == 'Q') {
2959                 /* -Q is Quad Ribbon Fiber */
2960                 sc->connector = MXGE_QRF;
2961                 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2962                 /* FreeBSD has no media type for Quad ribbon fiber */
2963         } else if (*ptr == 'R') {
2964                 /* -R is XFP */
2965                 sc->connector = MXGE_XFP;
2966         } else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2967                 /* -S or -2S is SFP+ */
2968                 sc->connector = MXGE_SFP;
2969         } else {
2970                 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2971         }
2972 }
2973
2974 /*
2975  * Determine the media type for a NIC.  Some XFPs will identify
2976  * themselves only when their link is up, so this is initiated via a
2977  * link up interrupt.  However, this can potentially take up to
2978  * several milliseconds, so it is run via the watchdog routine, rather
2979  * than in the interrupt handler itself.
2980  */
2981 static void
2982 mxge_media_probe(mxge_softc_t *sc)
2983 {
2984         mxge_cmd_t cmd;
2985         char *cage_type;
2986
2987         struct mxge_media_type *mxge_media_types = NULL;
2988         int i, err, ms, mxge_media_type_entries;
2989         uint32_t byte;
2990
2991         sc->need_media_probe = 0;
2992
2993         if (sc->connector == MXGE_XFP) {
2994                 /* -R is XFP */
2995                 mxge_media_types = mxge_xfp_media_types;
2996                 mxge_media_type_entries =
2997                         sizeof (mxge_xfp_media_types) /
2998                         sizeof (mxge_xfp_media_types[0]);
2999                 byte = MXGE_XFP_COMPLIANCE_BYTE;
3000                 cage_type = "XFP";
3001         } else  if (sc->connector == MXGE_SFP) {
3002                 /* -S or -2S is SFP+ */
3003                 mxge_media_types = mxge_sfp_media_types;
3004                 mxge_media_type_entries =
3005                         sizeof (mxge_sfp_media_types) /
3006                         sizeof (mxge_sfp_media_types[0]);
3007                 cage_type = "SFP+";
3008                 byte = 3;
3009         } else {
3010                 /* nothing to do; media type cannot change */
3011                 return;
3012         }
3013
3014         /*
3015          * At this point we know the NIC has an XFP cage, so now we
3016          * try to determine what is in the cage by using the
3017          * firmware's XFP I2C commands to read the XFP 10GbE compliance
3018          * register.  We read just one byte, which may take over
3019          * a millisecond.
3020          */
3021
3022         cmd.data0 = 0;   /* just fetch 1 byte, not all 256 */
3023         cmd.data1 = byte;
3024         err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3025         if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3026                 device_printf(sc->dev, "failed to read XFP\n");
3027         }
3028         if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3029                 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3030         }
3031         if (err != MXGEFW_CMD_OK) {
3032                 return;
3033         }
3034
3035         /* now we wait for the data to be cached */
3036         cmd.data0 = byte;
3037         err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3038         for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3039                 DELAY(1000);
3040                 cmd.data0 = byte;
3041                 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3042         }
3043         if (err != MXGEFW_CMD_OK) {
3044                 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3045                               cage_type, err, ms);
3046                 return;
3047         }
3048                 
3049         if (cmd.data0 == mxge_media_types[0].bitmask) {
3050                 if (mxge_verbose)
3051                         device_printf(sc->dev, "%s:%s\n", cage_type,
3052                                       mxge_media_types[0].name);
3053                 if (sc->current_media != mxge_media_types[0].flag) {
3054                         mxge_media_init(sc);
3055                         mxge_media_set(sc, mxge_media_types[0].flag);
3056                 }
3057                 return;
3058         }
3059         for (i = 1; i < mxge_media_type_entries; i++) {
3060                 if (cmd.data0 & mxge_media_types[i].bitmask) {
3061                         if (mxge_verbose)
3062                                 device_printf(sc->dev, "%s:%s\n",
3063                                               cage_type,
3064                                               mxge_media_types[i].name);
3065
3066                         if (sc->current_media != mxge_media_types[i].flag) {
3067                                 mxge_media_init(sc);
3068                                 mxge_media_set(sc, mxge_media_types[i].flag);
3069                         }
3070                         return;
3071                 }
3072         }
3073         if (mxge_verbose)
3074                 device_printf(sc->dev, "%s media 0x%x unknown\n",
3075                               cage_type, cmd.data0);
3076
3077         return;
3078 }
3079
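/*
 * Interrupt handler.  Drains tx completions and received frames
 * until the firmware's stats block goes invalid (for legacy irqs,
 * until the deassert is confirmed), then updates link state from
 * the stats block and writes the irq claim registers.
 */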
3080 static void
3081 mxge_intr(void *arg)
3082 {
3083         struct mxge_slice_state *ss = arg;
3084         mxge_softc_t *sc = ss->sc;
3085         mcp_irq_data_t *stats = ss->fw_stats;
3086         mxge_tx_ring_t *tx = &ss->tx;
3087         mxge_rx_done_t *rx_done = &ss->rx_done;
3088         uint32_t send_done_count;
3089         uint8_t valid;
3090
3091
3092 #ifndef IFNET_BUF_RING
3093         /* an interrupt on a non-zero slice is implicitly valid
3094            since MSI-X irqs are not shared */
3095         if (ss != sc->ss) {
3096                 mxge_clean_rx_done(ss);
3097                 *ss->irq_claim = be32toh(3);
3098                 return;
3099         }
3100 #endif
3101
3102         /* make sure the DMA has finished */
3103         if (!stats->valid) {
3104                 return;
3105         }
3106         valid = stats->valid;
3107
3108         if (sc->legacy_irq) {
3109                 /* lower legacy IRQ  */
3110                 *sc->irq_deassert = 0;
3111                 if (!mxge_deassert_wait)
3112                         /* don't wait for confirmation that the irq is low */
3113                         stats->valid = 0;
3114         } else {
3115                 stats->valid = 0;
3116         }
3117
3118         /* loop while waiting for legacy irq deassertion */
3119         do {
3120                 /* check for transmit completes and receives */
3121                 send_done_count = be32toh(stats->send_done_count);
3122                 while ((send_done_count != tx->pkt_done) ||
3123                        (rx_done->entry[rx_done->idx].length != 0)) {
3124                         if (send_done_count != tx->pkt_done)
3125                                 mxge_tx_done(ss, (int)send_done_count);
3126                         mxge_clean_rx_done(ss);
3127                         send_done_count = be32toh(stats->send_done_count);
3128                 }
3129                 if (sc->legacy_irq && mxge_deassert_wait)
3130                         wmb();
3131         } while (*((volatile uint8_t *) &stats->valid));
3132
3133         /* fw link & error stats meaningful only on the first slice */
3134         if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3135                 if (sc->link_state != stats->link_up) {
3136                         sc->link_state = stats->link_up;
3137                         if (sc->link_state) {
3138                                 if_link_state_change(sc->ifp, LINK_STATE_UP);
3139                                 if (mxge_verbose)
3140                                         device_printf(sc->dev, "link up\n");
3141                         } else {
3142                                 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3143                                 if (mxge_verbose)
3144                                         device_printf(sc->dev, "link down\n");
3145                         }
3146                         sc->need_media_probe = 1;
3147                 }
3148                 if (sc->rdma_tags_available !=
3149                     be32toh(stats->rdma_tags_available)) {
3150                         sc->rdma_tags_available =
3151                                 be32toh(stats->rdma_tags_available);
3152                         device_printf(sc->dev, "RDMA timed out! %d tags "
3153                                       "left\n", sc->rdma_tags_available);
3154                 }
3155
3156                 if (stats->link_down) {
3157                         sc->down_cnt += stats->link_down;
3158                         sc->link_state = 0;
3159                         if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3160                 }
3161         }
3162
3163         /* check to see if we have an rx token to pass back */
3164         if (valid & 0x1)
3165             *ss->irq_claim = be32toh(3);
3166         *(ss->irq_claim + 1) = be32toh(3);
3167 }
3168
3169 static void
3170 mxge_init(void *arg)
3171 {
3172         mxge_softc_t *sc = arg;
3173         struct ifnet *ifp = sc->ifp;
3174
3175
3176         mtx_lock(&sc->driver_mtx);
3177         if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3178                 (void) mxge_open(sc);
3179         mtx_unlock(&sc->driver_mtx);
3180 }
3181
3182
3183
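/*
 * Free every mbuf still attached to a slice's receive and transmit
 * rings, unloading the busdma maps first, and release the slice's
 * LRO state.
 */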
3184 static void
3185 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3186 {
3187         int i;
3188
3189 #if defined(INET) || defined(INET6)
3190         tcp_lro_free(&ss->lc);
3191 #endif
3192         for (i = 0; i <= ss->rx_big.mask; i++) {
3193                 if (ss->rx_big.info[i].m == NULL)
3194                         continue;
3195                 bus_dmamap_unload(ss->rx_big.dmat,
3196                                   ss->rx_big.info[i].map);
3197                 m_freem(ss->rx_big.info[i].m);
3198                 ss->rx_big.info[i].m = NULL;
3199         }
3200
3201         for (i = 0; i <= ss->rx_small.mask; i++) {
3202                 if (ss->rx_small.info[i].m == NULL)
3203                         continue;
3204                 bus_dmamap_unload(ss->rx_small.dmat,
3205                                   ss->rx_small.info[i].map);
3206                 m_freem(ss->rx_small.info[i].m);
3207                 ss->rx_small.info[i].m = NULL;
3208         }
3209
3210         /* transmit ring used only on the first slice */
3211         if (ss->tx.info == NULL)
3212                 return;
3213
3214         for (i = 0; i <= ss->tx.mask; i++) {
3215                 ss->tx.info[i].flag = 0;
3216                 if (ss->tx.info[i].m == NULL)
3217                         continue;
3218                 bus_dmamap_unload(ss->tx.dmat,
3219                                   ss->tx.info[i].map);
3220                 m_freem(ss->tx.info[i].m);
3221                 ss->tx.info[i].m = NULL;
3222         }
3223 }
3224
3225 static void
3226 mxge_free_mbufs(mxge_softc_t *sc)
3227 {
3228         int slice;
3229
3230         for (slice = 0; slice < sc->num_slices; slice++)
3231                 mxge_free_slice_mbufs(&sc->ss[slice]);
3232 }
3233
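/*
 * Release a slice's ring memory: the rx completion DMA block, the
 * tx request copy block and segment list, the rx shadow rings, and
 * the host info rings along with their busdma maps and tags.
 */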
3234 static void
3235 mxge_free_slice_rings(struct mxge_slice_state *ss)
3236 {
3237         int i;
3238
3239
3240         if (ss->rx_done.entry != NULL)
3241                 mxge_dma_free(&ss->rx_done.dma);
3242         ss->rx_done.entry = NULL;
3243
3244         if (ss->tx.req_bytes != NULL)
3245                 free(ss->tx.req_bytes, M_DEVBUF);
3246         ss->tx.req_bytes = NULL;
3247
3248         if (ss->tx.seg_list != NULL)
3249                 free(ss->tx.seg_list, M_DEVBUF);
3250         ss->tx.seg_list = NULL;
3251
3252         if (ss->rx_small.shadow != NULL)
3253                 free(ss->rx_small.shadow, M_DEVBUF);
3254         ss->rx_small.shadow = NULL;
3255
3256         if (ss->rx_big.shadow != NULL)
3257                 free(ss->rx_big.shadow, M_DEVBUF);
3258         ss->rx_big.shadow = NULL;
3259
3260         if (ss->tx.info != NULL) {
3261                 if (ss->tx.dmat != NULL) {
3262                         for (i = 0; i <= ss->tx.mask; i++) {
3263                                 bus_dmamap_destroy(ss->tx.dmat,
3264                                                    ss->tx.info[i].map);
3265                         }
3266                         bus_dma_tag_destroy(ss->tx.dmat);
3267                 }
3268                 free(ss->tx.info, M_DEVBUF);
3269         }
3270         ss->tx.info = NULL;
3271
3272         if (ss->rx_small.info != NULL) {
3273                 if (ss->rx_small.dmat != NULL) {
3274                         for (i = 0; i <= ss->rx_small.mask; i++) {
3275                                 bus_dmamap_destroy(ss->rx_small.dmat,
3276                                                    ss->rx_small.info[i].map);
3277                         }
3278                         bus_dmamap_destroy(ss->rx_small.dmat,
3279                                            ss->rx_small.extra_map);
3280                         bus_dma_tag_destroy(ss->rx_small.dmat);
3281                 }
3282                 free(ss->rx_small.info, M_DEVBUF);
3283         }
3284         ss->rx_small.info = NULL;
3285
3286         if (ss->rx_big.info != NULL) {
3287                 if (ss->rx_big.dmat != NULL) {
3288                         for (i = 0; i <= ss->rx_big.mask; i++) {
3289                                 bus_dmamap_destroy(ss->rx_big.dmat,
3290                                                    ss->rx_big.info[i].map);
3291                         }
3292                         bus_dmamap_destroy(ss->rx_big.dmat,
3293                                            ss->rx_big.extra_map);
3294                         bus_dma_tag_destroy(ss->rx_big.dmat);
3295                 }
3296                 free(ss->rx_big.info, M_DEVBUF);
3297         }
3298         ss->rx_big.info = NULL;
3299 }
3300
3301 static void
3302 mxge_free_rings(mxge_softc_t *sc)
3303 {
3304         int slice;
3305
3306         for (slice = 0; slice < sc->num_slices; slice++)
3307                 mxge_free_slice_rings(&sc->ss[slice]);
3308 }
3309
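/*
 * Allocate one slice's rings and busdma resources.  The rx
 * completion ring is twice the size of a receive ring so that it
 * can hold completions from both the small and big rings.
 */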
3310 static int
3311 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3312                        int tx_ring_entries)
3313 {
3314         mxge_softc_t *sc = ss->sc;
3315         size_t bytes;
3316         int err, i;
3317
3318         /* allocate per-slice receive resources */
3319
3320         ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3321         ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3322
3323         /* allocate the rx shadow rings */
3324         bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3325         ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3326
3327         bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3328         ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3329
3330         /* allocate the rx host info rings */
3331         bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3332         ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3333
3334         bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3335         ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3336
3337         /* allocate the rx busdma resources */
3338         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
3339                                  1,                     /* alignment */
3340                                  4096,                  /* boundary */
3341                                  BUS_SPACE_MAXADDR,     /* low */
3342                                  BUS_SPACE_MAXADDR,     /* high */
3343                                  NULL, NULL,            /* filter */
3344                                  MHLEN,                 /* maxsize */
3345                                  1,                     /* num segs */
3346                                  MHLEN,                 /* maxsegsize */
3347                                  BUS_DMA_ALLOCNOW,      /* flags */
3348                                  NULL, NULL,            /* lock */
3349                                  &ss->rx_small.dmat);   /* tag */
3350         if (err != 0) {
3351                 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3352                               err);
3353                 return err;
3354         }
3355
3356         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
3357                                  1,                     /* alignment */
3358 #if MXGE_VIRT_JUMBOS
3359                                  4096,                  /* boundary */
3360 #else
3361                                  0,                     /* boundary */
3362 #endif
3363                                  BUS_SPACE_MAXADDR,     /* low */
3364                                  BUS_SPACE_MAXADDR,     /* high */
3365                                  NULL, NULL,            /* filter */
3366                                  3*4096,                /* maxsize */
3367 #if MXGE_VIRT_JUMBOS
3368                                  3,                     /* num segs */
3369                                  4096,                  /* maxsegsize*/
3370 #else
3371                                  1,                     /* num segs */
3372                                  MJUM9BYTES,            /* maxsegsize*/
3373 #endif
3374                                  BUS_DMA_ALLOCNOW,      /* flags */
3375                                  NULL, NULL,            /* lock */
3376                                  &ss->rx_big.dmat);     /* tag */
3377         if (err != 0) {
3378                 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3379                               err);
3380                 return err;
3381         }
3382         for (i = 0; i <= ss->rx_small.mask; i++) {
3383                 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3384                                         &ss->rx_small.info[i].map);
3385                 if (err != 0) {
3386                         device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3387                                       err);
3388                         return err;
3389                 }
3390         }
3391         err = bus_dmamap_create(ss->rx_small.dmat, 0,
3392                                 &ss->rx_small.extra_map);
3393         if (err != 0) {
3394                 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3395                               err);
3396                 return err;
3397         }
3398
3399         for (i = 0; i <= ss->rx_big.mask; i++) {
3400                 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3401                                         &ss->rx_big.info[i].map);
3402                 if (err != 0) {
3403                         device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3404                                       err);
3405                         return err;
3406                 }
3407         }
3408         err = bus_dmamap_create(ss->rx_big.dmat, 0,
3409                                 &ss->rx_big.extra_map);
3410         if (err != 0) {
3411                 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3412                               err);
3413                 return err;
3414         }
3415
3416         /* now allocate TX resources */
3417
3418 #ifndef IFNET_BUF_RING
3419         /* only use a single TX ring for now */
3420         if (ss != ss->sc->ss)
3421                 return 0;
3422 #endif
3423
3424         ss->tx.mask = tx_ring_entries - 1;
3425         ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3426
3427         
3428         /* allocate the tx request copy block */
3429         bytes = 8 +
3430                 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3431         ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3432         /* ensure req_list entries are aligned to 8 bytes */
3433         ss->tx.req_list = (mcp_kreq_ether_send_t *)
3434                 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3435
3436         /* allocate the tx busdma segment list */
3437         bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3438         ss->tx.seg_list = (bus_dma_segment_t *)
3439                 malloc(bytes, M_DEVBUF, M_WAITOK);
3440
3441         /* allocate the tx host info ring */
3442         bytes = tx_ring_entries * sizeof (*ss->tx.info);
3443         ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3444         
3445         /* allocate the tx busdma resources */
3446         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
3447                                  1,                     /* alignment */
3448                                  sc->tx_boundary,       /* boundary */
3449                                  BUS_SPACE_MAXADDR,     /* low */
3450                                  BUS_SPACE_MAXADDR,     /* high */
3451                                  NULL, NULL,            /* filter */
3452                                  65536 + 256,           /* maxsize */
3453                                  ss->tx.max_desc - 2,   /* num segs */
3454                                  sc->tx_boundary,       /* maxsegsz */
3455                                  BUS_DMA_ALLOCNOW,      /* flags */
3456                                  NULL, NULL,            /* lock */
3457                                  &ss->tx.dmat);         /* tag */
3458         
3459         if (err != 0) {
3460                 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3461                               err);
3462                 return err;
3463         }
3464
3465         /* now use these tags to set up dmamaps for each slot
3466            in the ring */
3467         for (i = 0; i <= ss->tx.mask; i++) {
3468                 err = bus_dmamap_create(ss->tx.dmat, 0,
3469                                         &ss->tx.info[i].map);
3470                 if (err != 0) {
3471                         device_printf(sc->dev, "Err %d  tx dmamap\n",
3472                                       err);
3473                         return err;
3474                 }
3475         }
3476         return 0;
3477
3478 }
3479
3480 static int
3481 mxge_alloc_rings(mxge_softc_t *sc)
3482 {
3483         mxge_cmd_t cmd;
3484         int tx_ring_size;
3485         int tx_ring_entries, rx_ring_entries;
3486         int err, slice;
3487         
3488         /* get ring sizes */
3489         err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3490         tx_ring_size = cmd.data0;
3491         if (err != 0) {
3492                 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3493                 goto abort;
3494         }
3495
3496         tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3497         rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3498         IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3499         sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3500         IFQ_SET_READY(&sc->ifp->if_snd);
3501
3502         for (slice = 0; slice < sc->num_slices; slice++) {
3503                 err = mxge_alloc_slice_rings(&sc->ss[slice],
3504                                              rx_ring_entries,
3505                                              tx_ring_entries);
3506                 if (err != 0)
3507                         goto abort;
3508         }
3509         return 0;
3510
3511 abort:
3512         mxge_free_rings(sc);
3513         return err;
3514
3515 }
3516
3517
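/*
 * Choose the big receive buffer geometry for a given MTU.  A
 * standard 1500 byte MTU (1520 bytes once headers and the firmware
 * pad are added) fits in one 2KB cluster; a 9000 byte jumbo MTU
 * needs a 9KB cluster, or several virtually contiguous 4KB pages
 * when MXGE_VIRT_JUMBOS is set.
 */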
3518 static void
3519 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3520 {
3521         int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3522
3523         if (bufsize < MCLBYTES) {
3524                 /* easy, everything fits in a single buffer */
3525                 *big_buf_size = MCLBYTES;
3526                 *cl_size = MCLBYTES;
3527                 *nbufs = 1;
3528                 return;
3529         }
3530
3531         if (bufsize < MJUMPAGESIZE) {
3532                 /* still easy, everything still fits in a single buffer */
3533                 *big_buf_size = MJUMPAGESIZE;
3534                 *cl_size = MJUMPAGESIZE;
3535                 *nbufs = 1;
3536                 return;
3537         }
3538 #if MXGE_VIRT_JUMBOS
3539         /* now we need to use virtually contiguous buffers */
3540         *cl_size = MJUM9BYTES;
3541         *big_buf_size = 4096;
3542         *nbufs = mtu / 4096 + 1;
3543         /* needs to be a power of two, so round up */
3544         if (*nbufs == 3)
3545                 *nbufs = 4;
3546 #else
3547         *cl_size = MJUM9BYTES;
3548         *big_buf_size = MJUM9BYTES;
3549         *nbufs = 1;
3550 #endif
3551 }
3552
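/*
 * Bring up one slice: initialize LRO, fetch the lanai addresses of
 * the slice's send and receive rings from the firmware, and stock
 * the small and big receive rings with mbufs.
 */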
3553 static int
3554 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3555 {
3556         mxge_softc_t *sc;
3557         mxge_cmd_t cmd;
3558         bus_dmamap_t map;
3559         int err, i, slice;
3560
3561
3562         sc = ss->sc;
3563         slice = ss - sc->ss;
3564
3565 #if defined(INET) || defined(INET6)
3566         if (tcp_lro_init(&ss->lc) != 0)
3567                 return (ENOMEM);
3568         ss->lc.ifp = sc->ifp;
3569 #endif
3570         /* get the lanai pointers to the send and receive rings */
3571
3572         err = 0;
3573 #ifndef IFNET_BUF_RING
3574         /* We currently only send from the first slice */
3575         if (slice == 0) {
3576 #endif
3577                 cmd.data0 = slice;
3578                 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3579                 ss->tx.lanai =
3580                         (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3581                 ss->tx.send_go = (volatile uint32_t *)
3582                         (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3583                 ss->tx.send_stop = (volatile uint32_t *)
3584                         (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3585 #ifndef IFNET_BUF_RING
3586         }
3587 #endif
3588         cmd.data0 = slice;
3589         err |= mxge_send_cmd(sc,
3590                              MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3591         ss->rx_small.lanai =
3592                 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3593         cmd.data0 = slice;
3594         err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3595         ss->rx_big.lanai =
3596                 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3597
3598         if (err != 0) {
3599                 device_printf(sc->dev,
3600                               "failed to get ring sizes or locations\n");
3601                 return EIO;
3602         }
3603
3604         /* stock receive rings */
3605         for (i = 0; i <= ss->rx_small.mask; i++) {
3606                 map = ss->rx_small.info[i].map;
3607                 err = mxge_get_buf_small(ss, map, i);
3608                 if (err) {
3609                         device_printf(sc->dev, "alloced %d/%d smalls\n",
3610                                       i, ss->rx_small.mask + 1);
3611                         return ENOMEM;
3612                 }
3613         }
3614         for (i = 0; i <= ss->rx_big.mask; i++) {
3615                 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3616                 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3617         }
3618         ss->rx_big.nbufs = nbufs;
3619         ss->rx_big.cl_size = cl_size;
3620         ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3621                 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3622         for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3623                 map = ss->rx_big.info[i].map;
3624                 err = mxge_get_buf_big(ss, map, i);
3625                 if (err) {
3626                         device_printf(sc->dev, "alloced %d/%d bigs\n",
3627                                       i, ss->rx_big.mask + 1);
3628                         return ENOMEM;
3629                 }
3630         }
3631         return 0;
3632 }
3633
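/*
 * Bring the interface up: reset the NIC, program the RSS
 * indirection table when using multiple slices, tell the firmware
 * the MTU and buffer sizes, point it at the per-slice stats blocks,
 * open each slice, and finally start the firmware's ethernet engine.
 */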
3634 static int
3635 mxge_open(mxge_softc_t *sc)
3636 {
3637         mxge_cmd_t cmd;
3638         int err, big_bytes, nbufs, slice, cl_size, i;
3639         bus_addr_t bus;
3640         volatile uint8_t *itable;
3641         struct mxge_slice_state *ss;
3642
3643         /* Copy the MAC address in case it was overridden */
3644         bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3645
3646         err = mxge_reset(sc, 1);
3647         if (err != 0) {
3648                 device_printf(sc->dev, "failed to reset\n");
3649                 return EIO;
3650         }
3651
3652         if (sc->num_slices > 1) {
3653                 /* setup the indirection table */
3654                 cmd.data0 = sc->num_slices;
3655                 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3656                                     &cmd);
3657
3658                 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3659                                      &cmd);
3660                 if (err != 0) {
3661                         device_printf(sc->dev,
3662                                       "failed to setup rss tables\n");
3663                         return err;
3664                 }
3665
3666                 /* just enable an identity mapping */
3667                 itable = sc->sram + cmd.data0;
3668                 for (i = 0; i < sc->num_slices; i++)
3669                         itable[i] = (uint8_t)i;
3670
3671                 cmd.data0 = 1;
3672                 cmd.data1 = mxge_rss_hash_type;
3673                 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3674                 if (err != 0) {
3675                         device_printf(sc->dev, "failed to enable slices\n");
3676                         return err;
3677                 }
3678         }
3679
3680
3681         mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3682
3683         cmd.data0 = nbufs;
3684         err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3685                             &cmd);
3686         /* error is only meaningful if we're trying to set
3687            MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3688         if (err && nbufs > 1) {
3689                 device_printf(sc->dev,
3690                               "Failed to set always-use-n to %d\n",
3691                               nbufs);
3692                 return EIO;
3693         }
3694         /* Give the firmware the mtu and the big and small buffer
3695            sizes.  The firmware wants the big buf size to be a power
3696            of two. Luckily, FreeBSD's clusters are powers of two */
3697         cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3698         err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3699         cmd.data0 = MHLEN - MXGEFW_PAD;
3700         err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3701                              &cmd);
3702         cmd.data0 = big_bytes;
3703         err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3704
3705         if (err != 0) {
3706                 device_printf(sc->dev, "failed to setup params\n");
3707                 goto abort;
3708         }
3709
3710         /* Now give the firmware the pointer to the stats block */
3711         for (slice = 0;
3712 #ifdef IFNET_BUF_RING
3713              slice < sc->num_slices;
3714 #else
3715              slice < 1;
3716 #endif
3717              slice++) {
3718                 ss = &sc->ss[slice];
3719                 cmd.data0 =
3720                         MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3721                 cmd.data1 =
3722                         MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3723                 cmd.data2 = sizeof(struct mcp_irq_data);
3724                 cmd.data2 |= (slice << 16);
3725                 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3726         }
3727
3728         if (err != 0) {
3729                 bus = sc->ss->fw_stats_dma.bus_addr;
3730                 bus += offsetof(struct mcp_irq_data, send_done_count);
3731                 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3732                 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3733                 err = mxge_send_cmd(sc,
3734                                     MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3735                                     &cmd);
3736                 /* Firmware cannot support multicast without STATS_DMA_V2 */
3737                 sc->fw_multicast_support = 0;
3738         } else {
3739                 sc->fw_multicast_support = 1;
3740         }
3741
3742         if (err != 0) {
3743                 device_printf(sc->dev, "failed to setup params\n");
3744                 goto abort;
3745         }
3746
3747         for (slice = 0; slice < sc->num_slices; slice++) {
3748                 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3749                 if (err != 0) {
3750                         device_printf(sc->dev, "couldn't open slice %d\n",
3751                                       slice);
3752                         goto abort;
3753                 }
3754         }
3755
3756         /* Finally, start the firmware running */
3757         err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3758         if (err) {
3759                 device_printf(sc->dev, "Couldn't bring up link\n");
3760                 goto abort;
3761         }
3762 #ifdef IFNET_BUF_RING
3763         for (slice = 0; slice < sc->num_slices; slice++) {
3764                 ss = &sc->ss[slice];
3765                 ss->if_drv_flags |= IFF_DRV_RUNNING;
3766                 ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3767         }
3768 #endif
3769         sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3770         sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3771
3772         return 0;
3773
3774
3775 abort:
3776         mxge_free_mbufs(sc);
3777
3778         return err;
3779 }
3780
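/*
 * Bring the interface down.  Unless the NIC is already down (the
 * watchdog reset path passes down != 0), ask the firmware to stop
 * and wait for the "down" irq before freeing the receive mbufs.
 */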
3781 static int
3782 mxge_close(mxge_softc_t *sc, int down)
3783 {
3784         mxge_cmd_t cmd;
3785         int err, old_down_cnt;
3786 #ifdef IFNET_BUF_RING
3787         struct mxge_slice_state *ss;    
3788         int slice;
3789 #endif
3790
3791 #ifdef IFNET_BUF_RING
3792         for (slice = 0; slice < sc->num_slices; slice++) {
3793                 ss = &sc->ss[slice];
3794                 ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3795         }
3796 #endif
3797         sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3798         if (!down) {
3799                 old_down_cnt = sc->down_cnt;
3800                 wmb();
3801                 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3802                 if (err) {
3803                         device_printf(sc->dev,
3804                                       "Couldn't bring down link\n");
3805                 }
3806                 if (old_down_cnt == sc->down_cnt) {
3807                         /* wait for down irq */
3808                         DELAY(10 * sc->intr_coal_delay);
3809                 }
3810                 wmb();
3811                 if (old_down_cnt == sc->down_cnt) {
3812                         device_printf(sc->dev, "never got down irq\n");
3813                 }
3814         }
3815         mxge_free_mbufs(sc);
3816
3817         return 0;
3818 }
3819
3820 static void
3821 mxge_setup_cfg_space(mxge_softc_t *sc)
3822 {
3823         device_t dev = sc->dev;
3824         int reg;
3825         uint16_t lnk, pectl;
3826
3827         /* find the PCIe link width and set max read request to 4KB */
3828         if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3829                 lnk = pci_read_config(dev, reg + 0x12, 2);
3830                 sc->link_width = (lnk >> 4) & 0x3f;
3831
3832                 if (sc->pectl == 0) {
3833                         pectl = pci_read_config(dev, reg + 0x8, 2);
3834                         pectl = (pectl & ~0x7000) | (5 << 12);
3835                         pci_write_config(dev, reg + 0x8, pectl, 2);
3836                         sc->pectl = pectl;
3837                 } else {
3838                         /* restore saved pectl after watchdog reset */
3839                         pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3840                 }
3841         }
3842
3843         /* Enable DMA and Memory space access */
3844         pci_enable_busmaster(dev);
3845 }
3846
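/*
 * Read the firmware's reboot status register through the PCI
 * vendor-specific capability.  Config space access keeps working
 * after a NIC reboot, when the BARs may no longer be usable.
 */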
3847 static uint32_t
3848 mxge_read_reboot(mxge_softc_t *sc)
3849 {
3850         device_t dev = sc->dev;
3851         uint32_t vs;
3852
3853         /* find the vendor specific offset */
3854         if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3855                 device_printf(sc->dev,
3856                               "could not find vendor specific offset\n");
3857                 return (uint32_t)-1;
3858         }
3859         /* enable read32 mode */
3860         pci_write_config(dev, vs + 0x10, 0x3, 1);
3861         /* tell NIC which register to read */
3862         pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3863         return (pci_read_config(dev, vs + 0x14, 4));
3864 }
3865
3866 static void
3867 mxge_watchdog_reset(mxge_softc_t *sc)
3868 {
3869         struct pci_devinfo *dinfo;
3870         struct mxge_slice_state *ss;
3871         int err, running, s, num_tx_slices = 1;
3872         uint32_t reboot;
3873         uint16_t cmd;
3874
3875         err = ENXIO;
3876
3877         device_printf(sc->dev, "Watchdog reset!\n");
3878
3879         /*
3880          * check to see if the NIC rebooted.  If it did, then all of
3881          * PCI config space has been reset, and things like the
3882          * busmaster bit will be zero.  If this is the case, then we
3883          * must restore PCI config space before the NIC can be used
3884          * again
3885          */
3886         cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3887         if (cmd == 0xffff) {
3888                 /*
3889                  * maybe the watchdog caught the NIC rebooting; wait
3890                  * up to 100ms for it to finish.  If it does not come
3891                  * back, then give up
3892                  */
3893                 DELAY(1000*100);
3894                 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3895                 if (cmd == 0xffff) {
3896                         device_printf(sc->dev, "NIC disappeared!\n");
3897                 }
3898         }
3899         if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3900                 /* print the reboot status */
3901                 reboot = mxge_read_reboot(sc);
3902                 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3903                               reboot);
3904                 running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3905                 if (running) {
3906
3907                         /*
3908                          * quiesce NIC so that TX routines will not try to
3909                          * xmit after restoration of BAR
3910                          */
3911
3912                         /* Mark the link as down */
3913                         if (sc->link_state) {
3914                                 sc->link_state = 0;
3915                                 if_link_state_change(sc->ifp,
3916                                                      LINK_STATE_DOWN);
3917                         }
3918 #ifdef IFNET_BUF_RING
3919                         num_tx_slices = sc->num_slices;
3920 #endif
3921                         /* grab all TX locks to ensure no tx */
3922                         for (s = 0; s < num_tx_slices; s++) {
3923                                 ss = &sc->ss[s];
3924                                 mtx_lock(&ss->tx.mtx);
3925                         }
3926                         mxge_close(sc, 1);
3927                 }
3928                 /* restore PCI configuration space */
3929                 dinfo = device_get_ivars(sc->dev);
3930                 pci_cfg_restore(sc->dev, dinfo);
3931
3932                 /* and redo any changes we made to our config space */
3933                 mxge_setup_cfg_space(sc);
3934
3935                 /* reload f/w */
3936                 err = mxge_load_firmware(sc, 0);
3937                 if (err) {
3938                         device_printf(sc->dev,
3939                                       "Unable to re-load f/w\n");
3940                 }
3941                 if (running) {
3942                         if (!err)
3943                                 err = mxge_open(sc);
3944                         /* release all TX locks */
3945                         for (s = 0; s < num_tx_slices; s++) {
3946                                 ss = &sc->ss[s];
3947 #ifdef IFNET_BUF_RING
3948                                 mxge_start_locked(ss);
3949 #endif
3950                                 mtx_unlock(&ss->tx.mtx);
3951                         }
3952                 }
3953                 sc->watchdog_resets++;
3954         } else {
3955                 device_printf(sc->dev,
3956                               "NIC did not reboot, not resetting\n");
3957                 err = 0;
3958         }
3959         if (err) {
3960                 device_printf(sc->dev, "watchdog reset failed\n");
3961         } else {
3962                 if (sc->dying == 2)
3963                         sc->dying = 0;
3964                 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3965         }
3966 }
3967
3968 static void
3969 mxge_watchdog_task(void *arg, int pending)
3970 {
3971         mxge_softc_t *sc = arg;
3972
3973
3974         mtx_lock(&sc->driver_mtx);
3975         mxge_watchdog_reset(sc);
3976         mtx_unlock(&sc->driver_mtx);
3977 }
3978
3979 static void
3980 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3981 {
3982         tx = &sc->ss[slice].tx;
3983         device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3984         device_printf(sc->dev,
3985                       "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3986                       tx->req, tx->done, tx->queue_active);
3987         device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3988                               tx->activate, tx->deactivate);
3989         device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3990                       tx->pkt_done,
3991                       be32toh(sc->ss->fw_stats->send_done_count));
3992 }
3993
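/*
 * Transmit watchdog: a ring with sends outstanding that has made no
 * progress since the last tick is either throttled by pause frames
 * from the link partner or wedged; only the latter case enqueues
 * the watchdog reset task.  Also re-probes the media if a link
 * change requested it.
 */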
3994 static int
3995 mxge_watchdog(mxge_softc_t *sc)
3996 {
3997         mxge_tx_ring_t *tx;
3998         uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3999         int i, err = 0;
4000
4001         /* see if we have outstanding transmits that have
4002            been pending for more than mxge_ticks */
4003         for (i = 0;
4004 #ifdef IFNET_BUF_RING
4005              (i < sc->num_slices) && (err == 0);
4006 #else
4007              (i < 1) && (err == 0);
4008 #endif
4009              i++) {
4010                 tx = &sc->ss[i].tx;             
4011                 if (tx->req != tx->done &&
4012                     tx->watchdog_req != tx->watchdog_done &&
4013                     tx->done == tx->watchdog_done) {
4014                         /* check for pause blocking before resetting */
4015                         if (tx->watchdog_rx_pause == rx_pause) {
4016                                 mxge_warn_stuck(sc, tx, i);
4017                                 taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4018                                 return (ENXIO);
4019                         }
4020                         else
4021                                 device_printf(sc->dev, "Flow control blocking "
4022                                               "xmits, check link partner\n");
4023                 }
4024
4025                 tx->watchdog_req = tx->req;
4026                 tx->watchdog_done = tx->done;
4027                 tx->watchdog_rx_pause = rx_pause;
4028         }
4029
4030         if (sc->need_media_probe)
4031                 mxge_media_probe(sc);
4032         return (err);
4033 }
4034
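/*
 * if_get_counter method: statistics are kept per-slice, so sum
 * them across all slices on demand.
 */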
4035 static uint64_t
4036 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4037 {
4038         struct mxge_softc *sc;
4039         uint64_t rv;
4040
4041         sc = if_getsoftc(ifp);
4042         rv = 0;
4043
4044         switch (cnt) {
4045         case IFCOUNTER_IPACKETS:
4046                 for (int s = 0; s < sc->num_slices; s++)
4047                         rv += sc->ss[s].ipackets;
4048                 return (rv);
4049         case IFCOUNTER_OPACKETS:
4050                 for (int s = 0; s < sc->num_slices; s++)
4051                         rv += sc->ss[s].opackets;
4052                 return (rv);
4053         case IFCOUNTER_OERRORS:
4054                 for (int s = 0; s < sc->num_slices; s++)
4055                         rv += sc->ss[s].oerrors;
4056                 return (rv);
4057 #ifdef IFNET_BUF_RING
4058         case IFCOUNTER_OBYTES:
4059                 for (int s = 0; s < sc->num_slices; s++)
4060                         rv += sc->ss[s].obytes;
4061                 return (rv);
4062         case IFCOUNTER_OMCASTS:
4063                 for (int s = 0; s < sc->num_slices; s++)
4064                         rv += sc->ss[s].omcasts;
4065                 return (rv);
4066         case IFCOUNTER_OQDROPS:
4067                 for (int s = 0; s < sc->num_slices; s++)
4068                         rv += sc->ss[s].tx.br->br_drops;
4069                 return (rv);
4070 #endif
4071         default:
4072                 return (if_get_counter_default(ifp, cnt));
4073         }
4074 }
4075
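/*
 * Periodic callout: runs the transmit watchdog every few ticks
 * while the interface is up, checks that the NIC has not suffered
 * a hardware fault, and re-arms itself unless a reset is pending.
 */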
4076 static void
4077 mxge_tick(void *arg)
4078 {
4079         mxge_softc_t *sc = arg;
4081         int err = 0;
4082         int running, ticks;
4083         uint16_t cmd;
4084
4085         ticks = mxge_ticks;
4086         running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4087         if (running) {
4088                 if (!sc->watchdog_countdown) {
4089                         err = mxge_watchdog(sc);
4090                         sc->watchdog_countdown = 4;
4091                 }
4092                 sc->watchdog_countdown--;
4093         }
4094         /* ensure NIC did not suffer h/w fault while idle */
4095         cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4096         if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4097                 sc->dying = 2;
4098                 taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4099                 err = ENXIO;
4100         }
4101         /* poll at a reduced rate */
4102         ticks *= 4;
4105
4106         if (err == 0)
4107                 callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4108
4109 }
4110
4111 static int
4112 mxge_media_change(struct ifnet *ifp)
4113 {
4114         return EINVAL;
4115 }
4116
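/*
 * Change the MTU.  Receive buffer geometry depends on the MTU, so
 * a running interface is closed and re-opened; if that fails, the
 * old MTU is restored.
 */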
4117 static int
4118 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4119 {
4120         struct ifnet *ifp = sc->ifp;
4121         int real_mtu, old_mtu;
4122         int err = 0;
4123
4124
4125         real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4126         if (real_mtu > sc->max_mtu || real_mtu < 60)
4127                 return EINVAL;
4128         mtx_lock(&sc->driver_mtx);
4129         old_mtu = ifp->if_mtu;
4130         ifp->if_mtu = mtu;
4131         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4132                 mxge_close(sc, 0);
4133                 err = mxge_open(sc);
4134                 if (err != 0) {
4135                         ifp->if_mtu = old_mtu;
4136                         mxge_close(sc, 0);
4137                         (void) mxge_open(sc);
4138                 }
4139         }
4140         mtx_unlock(&sc->driver_mtx);
4141         return err;
4142 }       
4143
4144 static void
4145 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4146 {
4147         mxge_softc_t *sc = ifp->if_softc;
4148         
4149
4150         if (sc == NULL)
4151                 return;
4152         ifmr->ifm_status = IFM_AVALID;
4153         ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4154         ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4155         ifmr->ifm_active |= sc->current_media;
4156 }
4157
4158 static int
4159 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4160 {
4161         mxge_softc_t *sc = ifp->if_softc;
4162         struct ifreq *ifr = (struct ifreq *)data;
4163         int err, mask;
4164
4165         err = 0;
4166         switch (command) {
4167         case SIOCSIFADDR:
4168         case SIOCGIFADDR:
4169                 err = ether_ioctl(ifp, command, data);
4170                 break;
4171
4172         case SIOCSIFMTU:
4173                 err = mxge_change_mtu(sc, ifr->ifr_mtu);
4174                 break;
4175
4176         case SIOCSIFFLAGS:
4177                 mtx_lock(&sc->driver_mtx);
4178                 if (sc->dying) {
4179                         mtx_unlock(&sc->driver_mtx);
4180                         return EINVAL;
4181                 }
4182                 if (ifp->if_flags & IFF_UP) {
4183                         if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4184                                 err = mxge_open(sc);
4185                         } else {
4186                                 /* take care of promisc and allmulti
4187                                    flag changes */
4188                                 mxge_change_promisc(sc,
4189                                                     ifp->if_flags & IFF_PROMISC);
4190                                 mxge_set_multicast_list(sc);
4191                         }
4192                 } else {
4193                         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4194                                 mxge_close(sc, 0);
4195                         }
4196                 }
4197                 mtx_unlock(&sc->driver_mtx);
4198                 break;
4199
4200         case SIOCADDMULTI:
4201         case SIOCDELMULTI:
4202                 mtx_lock(&sc->driver_mtx);
4203                 mxge_set_multicast_list(sc);
4204                 mtx_unlock(&sc->driver_mtx);
4205                 break;
4206
4207         case SIOCSIFCAP:
4208                 mtx_lock(&sc->driver_mtx);
4209                 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4210                 if (mask & IFCAP_TXCSUM) {
4211                         if (IFCAP_TXCSUM & ifp->if_capenable) {
4212                                 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4213                                 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4214                         } else {
4215                                 ifp->if_capenable |= IFCAP_TXCSUM;
4216                                 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4217                         }
4218                 } else if (mask & IFCAP_RXCSUM) {
4219                         if (IFCAP_RXCSUM & ifp->if_capenable) {
4220                                 ifp->if_capenable &= ~IFCAP_RXCSUM;
4221                         } else {
4222                                 ifp->if_capenable |= IFCAP_RXCSUM;
4223                         }
4224                 }
4225                 if (mask & IFCAP_TSO4) {
4226                         if (IFCAP_TSO4 & ifp->if_capenable) {
4227                                 ifp->if_capenable &= ~IFCAP_TSO4;
4228                         } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4229                                 ifp->if_capenable |= IFCAP_TSO4;
4230                                 ifp->if_hwassist |= CSUM_TSO;
4231                         } else {
4232                                 printf("mxge requires tx checksum offload"
4233                                        " be enabled to use TSO\n");
4234                                 err = EINVAL;
4235                         }
4236                 }
4237 #if IFCAP_TSO6
4238                 if (mask & IFCAP_TXCSUM_IPV6) {
4239                         if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4240                                 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4241                                                        | IFCAP_TSO6);
4242                                 ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4243                                                       | CSUM_UDP_IPV6);
4244                         } else {
4245                                 ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4246                                 ifp->if_hwassist |= (CSUM_TCP_IPV6
4247                                                      | CSUM_UDP_IPV6);
4248                         }
4249                 } else if (mask & IFCAP_RXCSUM_IPV6) {
4250                         if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4251                                 ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4252                         } else {
4253                                 ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4254                         }
4255                 }
4256                 if (mask & IFCAP_TSO6) {
4257                         if (IFCAP_TSO6 & ifp->if_capenable) {
4258                                 ifp->if_capenable &= ~IFCAP_TSO6;
4259                         } else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4260                                 ifp->if_capenable |= IFCAP_TSO6;
4261                                 ifp->if_hwassist |= CSUM_TSO;
4262                         } else {
4263                                 printf("mxge requires tx checksum offload"
4264                                        " be enabled to use TSO\n");
4265                                 err = EINVAL;
4266                         }
4267                 }
4268 #endif /*IFCAP_TSO6 */
4269
4270                 if (mask & IFCAP_LRO)
4271                         ifp->if_capenable ^= IFCAP_LRO;
4272                 if (mask & IFCAP_VLAN_HWTAGGING)
4273                         ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4274                 if (mask & IFCAP_VLAN_HWTSO)
4275                         ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4276
4277                 if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4278                     !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4279                         ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4280
4281                 mtx_unlock(&sc->driver_mtx);
4282                 VLAN_CAPABILITIES(ifp);
4283
4284                 break;
4285
4286         case SIOCGIFMEDIA:
4287                 mtx_lock(&sc->driver_mtx);
4288                 mxge_media_probe(sc);
4289                 mtx_unlock(&sc->driver_mtx);
4290                 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4291                                     &sc->media, command);
4292                 break;
4293
4294         default:
4295                 err = ENOTTY;
4296         }
4297         return err;
4298 }
4299
4300 static void
4301 mxge_fetch_tunables(mxge_softc_t *sc)
4302 {
4303
4304         TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4305         TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4306                           &mxge_flow_control);
4307         TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4308                           &mxge_intr_coal_delay);
4309         TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4310                           &mxge_nvidia_ecrc_enable);
4311         TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4312                           &mxge_force_firmware);
4313         TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4314                           &mxge_deassert_wait);
4315         TUNABLE_INT_FETCH("hw.mxge.verbose",
4316                           &mxge_verbose);
4317         TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4318         TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
             /* the rss hash type tunable is accepted under both spellings */
4319         TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4320         TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4321         TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4322         TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4323
4324         if (bootverbose)
4325                 mxge_verbose = 1;
4326         if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4327                 mxge_intr_coal_delay = 30;
4328         if (mxge_ticks == 0)
4329                 mxge_ticks = hz / 2;
4330         sc->pause = mxge_flow_control;
4331         if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4332             || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4333                 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4334         }
4335         if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4336             mxge_initial_mtu < ETHER_MIN_LEN)
4337                 mxge_initial_mtu = ETHERMTU_JUMBO;
4338
4339         if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4340                 mxge_throttle = MXGE_MAX_THROTTLE;
4341         if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4342                 mxge_throttle = MXGE_MIN_THROTTLE;
4343         sc->throttle = mxge_throttle;
4344 }
4345
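     /*
      * The tunables fetched above can be set from loader.conf(5), e.g.
      * (illustrative values only):
      *
      *   hw.mxge.max_slices="4"
      *   hw.mxge.intr_coal_delay="30"
      *   hw.mxge.flow_control_enabled="1"
      *   hw.mxge.initial_mtu="9000"
      */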
4346
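     /*
      * Free the per-slice state allocated by mxge_alloc_slices(): the
      * firmware stats DMA block, the tx buf_ring (with IFNET_BUF_RING)
      * and tx mutex, the rx interrupt queue, and the slice array.
      */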
4347 static void
4348 mxge_free_slices(mxge_softc_t *sc)
4349 {
4350         struct mxge_slice_state *ss;
4351         int i;
4352
4353
4354         if (sc->ss == NULL)
4355                 return;
4356
4357         for (i = 0; i < sc->num_slices; i++) {
4358                 ss = &sc->ss[i];
4359                 if (ss->fw_stats != NULL) {
4360                         mxge_dma_free(&ss->fw_stats_dma);
4361                         ss->fw_stats = NULL;
4362 #ifdef IFNET_BUF_RING
4363                         if (ss->tx.br != NULL) {
4364                                 drbr_free(ss->tx.br, M_DEVBUF);
4365                                 ss->tx.br = NULL;
4366                         }
4367 #endif
4368                         mtx_destroy(&ss->tx.mtx);
4369                 }
4370                 if (ss->rx_done.entry != NULL) {
4371                         mxge_dma_free(&ss->rx_done.dma);
4372                         ss->rx_done.entry = NULL;
4373                 }
4374         }
4375         free(sc->ss, M_DEVBUF);
4376         sc->ss = NULL;
4377 }
4378
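     /*
      * Allocate the per-slice state: an rx interrupt queue sized from
      * the firmware's rx ring size for every slice, plus the firmware
      * stats block, tx mutex and (with IFNET_BUF_RING) tx buf_ring.
      * On any failure the partial state is unwound via
      * mxge_free_slices().
      */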
4379 static int
4380 mxge_alloc_slices(mxge_softc_t *sc)
4381 {
4382         mxge_cmd_t cmd;
4383         struct mxge_slice_state *ss;
4384         size_t bytes;
4385         int err, i, max_intr_slots;
4386
4387         err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4388         if (err != 0) {
4389                 device_printf(sc->dev, "Cannot determine rx ring size\n");
4390                 return err;
4391         }
4392         sc->rx_ring_size = cmd.data0;
4393         max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4394         
4395         bytes = sizeof (*sc->ss) * sc->num_slices;
4396         sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4397         if (sc->ss == NULL)
4398                 return (ENOMEM);
4399         for (i = 0; i < sc->num_slices; i++) {
4400                 ss = &sc->ss[i];
4401
4402                 ss->sc = sc;
4403
4404                 /* allocate per-slice rx interrupt queues */
4405                 
4406                 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4407                 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4408                 if (err != 0)
4409                         goto abort;
4410                 ss->rx_done.entry = ss->rx_done.dma.addr;
4411                 bzero(ss->rx_done.entry, bytes);
4412
4413                 /*
4414                  * allocate the per-slice firmware stats; without
4415                  * IFNET_BUF_RING, stats (including tx) are used
4416                  * only on the first slice
4417                  */
4418 #ifndef IFNET_BUF_RING
4419                 if (i > 0)
4420                         continue;
4421 #endif
4422
4423                 bytes = sizeof (*ss->fw_stats);
4424                 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4425                                      bytes, 64);
4426                 if (err != 0)
4427                         goto abort;
4428                 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4429                 snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4430                          "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4431                 mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4432 #ifdef IFNET_BUF_RING
4433                 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4434                                            &ss->tx.mtx);
4435 #endif
4436         }
4437
4438         return (0);
4439
4440 abort:
4441         mxge_free_slices(sc);
4442         return (ENOMEM);
4443 }
4444
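     /*
      * Decide how many slices (queues) to use.  Multiple slices need
      * the tunable to allow them, an SMP system, at least two MSI-X
      * vectors and working RSS firmware; the count is then capped by
      * the MSI-X vector count and mp_ncpus (or hw.mxge.max_slices)
      * and rounded down to a power of two.
      */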
4445 static void
4446 mxge_slice_probe(mxge_softc_t *sc)
4447 {
4448         mxge_cmd_t cmd;
4449         char *old_fw;
4450         int msix_cnt, status, max_intr_slots;
4451
4452         sc->num_slices = 1;
4453         /*
4454          *  don't use multiple slices if the tunable disables
4455          *  them, or if this is not an SMP system
4456          */
4457         
4458         if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4459                 return;
4460
4461         /* see how many MSI-X interrupts are available */
4462         msix_cnt = pci_msix_count(sc->dev);
4463         if (msix_cnt < 2)
4464                 return;
4465
4466         /* now load the slice-aware firmware to see what it supports */
4467         old_fw = sc->fw_name;
4468         if (old_fw == mxge_fw_aligned)
4469                 sc->fw_name = mxge_fw_rss_aligned;
4470         else
4471                 sc->fw_name = mxge_fw_rss_unaligned;
4472         status = mxge_load_firmware(sc, 0);
4473         if (status != 0) {
4474                 device_printf(sc->dev, "Falling back to a single slice\n");
4475                 return;
4476         }
4477         
4478         /* try to send a reset command to the card to see if it
4479            is alive */
4480         memset(&cmd, 0, sizeof (cmd));
4481         status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4482         if (status != 0) {
4483                 device_printf(sc->dev, "failed reset\n");
4484                 goto abort_with_fw;
4485         }
4486
4487         /* get rx ring size */
4488         status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4489         if (status != 0) {
4490                 device_printf(sc->dev, "Cannot determine rx ring size\n");
4491                 goto abort_with_fw;
4492         }
4493         max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4494
4495         /* tell it the size of the interrupt queues */
4496         cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4497         status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4498         if (status != 0) {
4499                 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4500                 goto abort_with_fw;
4501         }
4502
4503         /* ask for the maximum number of slices it supports */
4504         status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4505         if (status != 0) {
4506                 device_printf(sc->dev,
4507                               "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4508                 goto abort_with_fw;
4509         }
4510         sc->num_slices = cmd.data0;
4511         if (sc->num_slices > msix_cnt)
4512                 sc->num_slices = msix_cnt;
4513
4514         if (mxge_max_slices == -1) {
4515                 /* cap to number of CPUs in system */
4516                 if (sc->num_slices > mp_ncpus)
4517                         sc->num_slices = mp_ncpus;
4518         } else {
4519                 if (sc->num_slices > mxge_max_slices)
4520                         sc->num_slices = mxge_max_slices;
4521         }
4522         /* round down to the nearest power of two */
4523         while (sc->num_slices & (sc->num_slices - 1))
4524                 sc->num_slices--;
4525
4526         if (mxge_verbose)
4527                 device_printf(sc->dev, "using %d slices\n",
4528                               sc->num_slices);
4529         
4530         return;
4531
4532 abort_with_fw:
4533         sc->fw_name = old_fw;
4534         (void) mxge_load_firmware(sc, 0);
4535 }
4536
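     /*
      * Allocate one MSI-X vector per slice and hook each vector to
      * mxge_intr() with its slice state as the argument.
      */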
4537 static int
4538 mxge_add_msix_irqs(mxge_softc_t *sc)
4539 {
4540         size_t bytes;
4541         int count, err, i, rid;
4542
4543         rid = PCIR_BAR(2);
4544         sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4545                                                     &rid, RF_ACTIVE);
4546
4547         if (sc->msix_table_res == NULL) {
4548                 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4549                 return ENXIO;
4550         }
4551
4552         count = sc->num_slices;
4553         err = pci_alloc_msix(sc->dev, &count);
4554         if (err != 0) {
4555                 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4556                               "err = %d\n", sc->num_slices, err);
4557                 goto abort_with_msix_table;
4558         }
4559         if (count < sc->num_slices) {
4560                 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4561                               sc->num_slices, count);
4562                 device_printf(sc->dev,
4563                               "Try setting hw.mxge.max_slices to %d\n",
4564                               count);
4565                 err = ENOSPC;
4566                 goto abort_with_msix;
4567         }
4568         bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4569         sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4570         if (sc->msix_irq_res == NULL) {
4571                 err = ENOMEM;
4572                 goto abort_with_msix;
4573         }
4574
4575         for (i = 0; i < sc->num_slices; i++) {
4576                 rid = i + 1;
4577                 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4578                                                           SYS_RES_IRQ,
4579                                                           &rid, RF_ACTIVE);
4580                 if (sc->msix_irq_res[i] == NULL) {
4581                         device_printf(sc->dev, "couldn't allocate IRQ res"
4582                                       " for message %d\n", i);
4583                         err = ENXIO;
4584                         goto abort_with_res;
4585                 }
4586         }
4587
4588         bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4589         sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
             if (sc->msix_ih == NULL) {
                     err = ENOMEM;
                     goto abort_with_res;
             }

4591         for (i = 0; i < sc->num_slices; i++) {
4592                 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4593                                      INTR_TYPE_NET | INTR_MPSAFE,
4594 #if __FreeBSD_version > 700030
4595                                      NULL,
4596 #endif
4597                                      mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4598                 if (err != 0) {
4599                         device_printf(sc->dev, "couldn't setup intr for "
4600                                       "message %d\n", i);
4601                         goto abort_with_intr;
4602                 }
4603                 bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4604                                   sc->msix_ih[i], "s%d", i);
4605         }
4606
4607         if (mxge_verbose) {
4608                 device_printf(sc->dev, "using %d msix IRQs:",
4609                               sc->num_slices);
4610                 for (i = 0; i < sc->num_slices; i++)
4611                         printf(" %ju", rman_get_start(sc->msix_irq_res[i]));
4612                 printf("\n");
4613         }
4614         return (0);
4615
4616 abort_with_intr:
4617         for (i = 0; i < sc->num_slices; i++) {
4618                 if (sc->msix_ih[i] != NULL) {
4619                         bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4620                                           sc->msix_ih[i]);
4621                         sc->msix_ih[i] = NULL;
4622                 }
4623         }
4624         free(sc->msix_ih, M_DEVBUF);
4625
4626
4627 abort_with_res:
4628         for (i = 0; i < sc->num_slices; i++) {
4629                 rid = i + 1;
4630                 if (sc->msix_irq_res[i] != NULL)
4631                         bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4632                                              sc->msix_irq_res[i]);
4633                 sc->msix_irq_res[i] = NULL;
4634         }
4635         free(sc->msix_irq_res, M_DEVBUF);
4636
4637
4638 abort_with_msix:
4639         pci_release_msi(sc->dev);
4640
4641 abort_with_msix_table:
4642         bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4643                              sc->msix_table_res);
4644
4645         return err;
4646 }
4647
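     /*
      * Fall back to a single MSI, or failing that a legacy INTx,
      * interrupt; all interrupt processing then runs on slice 0.
      */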
4648 static int
4649 mxge_add_single_irq(mxge_softc_t *sc)
4650 {
4651         int count, err, rid;
4652
4653         count = pci_msi_count(sc->dev);
4654         if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4655                 rid = 1;
4656         } else {
4657                 rid = 0;
4658                 sc->legacy_irq = 1;
4659         }
4660         sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4661                                              RF_SHAREABLE | RF_ACTIVE);
4662         if (sc->irq_res == NULL) {
4663                 device_printf(sc->dev, "could not alloc interrupt\n");
4664                 return ENXIO;
4665         }
4666         if (mxge_verbose)
4667                 device_printf(sc->dev, "using %s irq %ju\n",
4668                               sc->legacy_irq ? "INTx" : "MSI",
4669                               rman_get_start(sc->irq_res));
4670         err = bus_setup_intr(sc->dev, sc->irq_res,
4671                              INTR_TYPE_NET | INTR_MPSAFE,
4672 #if __FreeBSD_version > 700030
4673                              NULL,
4674 #endif
4675                              mxge_intr, &sc->ss[0], &sc->ih);
4676         if (err != 0) {
4677                 bus_release_resource(sc->dev, SYS_RES_IRQ,
4678                                      sc->legacy_irq ? 0 : 1, sc->irq_res);
4679                 if (!sc->legacy_irq)
4680                         pci_release_msi(sc->dev);
4681         }
4682         return err;
4683 }
4684
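     /* Undo everything set up by mxge_add_msix_irqs(). */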
4685 static void
4686 mxge_rem_msix_irqs(mxge_softc_t *sc)
4687 {
4688         int i, rid;
4689
4690         for (i = 0; i < sc->num_slices; i++) {
4691                 if (sc->msix_ih[i] != NULL) {
4692                         bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4693                                           sc->msix_ih[i]);
4694                         sc->msix_ih[i] = NULL;
4695                 }
4696         }
4697         free(sc->msix_ih, M_DEVBUF);
4698
4699         for (i = 0; i < sc->num_slices; i++) {
4700                 rid = i + 1;
4701                 if (sc->msix_irq_res[i] != NULL)
4702                         bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4703                                              sc->msix_irq_res[i]);
4704                 sc->msix_irq_res[i] = NULL;
4705         }
4706         free(sc->msix_irq_res, M_DEVBUF);
4707
4708         bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4709                              sc->msix_table_res);
4710
4711         pci_release_msi(sc->dev);
4712         return;
4713 }
4714
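     /* Undo everything set up by mxge_add_single_irq(). */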
4715 static void
4716 mxge_rem_single_irq(mxge_softc_t *sc)
4717 {
4718         bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4719         bus_release_resource(sc->dev, SYS_RES_IRQ,
4720                              sc->legacy_irq ? 0 : 1, sc->irq_res);
4721         if (!sc->legacy_irq)
4722                 pci_release_msi(sc->dev);
4723 }
4724
4725 static void
4726 mxge_rem_irq(mxge_softc_t *sc)
4727 {
4728         if (sc->num_slices > 1)
4729                 mxge_rem_msix_irqs(sc);
4730         else
4731                 mxge_rem_single_irq(sc);
4732 }
4733
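     /* Dispatch on the slice count: MSI-X when multi-slice, else MSI/INTx. */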
4734 static int
4735 mxge_add_irq(mxge_softc_t *sc)
4736 {
4737         int err;
4738
4739         if (sc->num_slices > 1)
4740                 err = mxge_add_msix_irqs(sc);
4741         else
4742                 err = mxge_add_single_irq(sc);
4743         
             /* intentionally disabled ("0 &&"): would re-add the MSI-X irqs */
4744         if (0 && err == 0 && sc->num_slices > 1) {
4745                 mxge_rem_msix_irqs(sc);
4746                 err = mxge_add_msix_irqs(sc);
4747         }
4748         return err;
4749 }
4750
4751
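     /*
      * Device attach: map the NIC's SRAM, parse the EEPROM strings,
      * load and reset the firmware, size and allocate slices, rings
      * and interrupts, then attach the ifnet and start the tick
      * callout.  Each abort_* label below unwinds exactly the state
      * established before the corresponding failure.
      */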
4752 static int
4753 mxge_attach(device_t dev)
4754 {
4755         mxge_cmd_t cmd;
4756         mxge_softc_t *sc = device_get_softc(dev);
4757         struct ifnet *ifp;
4758         int err, rid;
4759
4760         sc->dev = dev;
4761         mxge_fetch_tunables(sc);
4762
4763         TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4764         sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4765                                   taskqueue_thread_enqueue, &sc->tq);
4766         if (sc->tq == NULL) {
4767                 err = ENOMEM;
4768                 goto abort_with_nothing;
4769         }
4770
4771         err = bus_dma_tag_create(bus_get_dma_tag(dev),  /* parent */
4772                                  1,                     /* alignment */
4773                                  0,                     /* boundary */
4774                                  BUS_SPACE_MAXADDR,     /* low */
4775                                  BUS_SPACE_MAXADDR,     /* high */
4776                                  NULL, NULL,            /* filter */
4777                                  65536 + 256,           /* maxsize */
4778                                  MXGE_MAX_SEND_DESC,    /* num segs */
4779                                  65536,                 /* maxsegsize */
4780                                  0,                     /* flags */
4781                                  NULL, NULL,            /* lock */
4782                                  &sc->parent_dmat);     /* tag */
4783
4784         if (err != 0) {
4785                 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4786                               err);
4787                 goto abort_with_tq;
4788         }
4789
4790         ifp = sc->ifp = if_alloc(IFT_ETHER);
4791         if (ifp == NULL) {
4792                 device_printf(dev, "can not if_alloc()\n");
4793                 err = ENOSPC;
4794                 goto abort_with_parent_dmat;
4795         }
4796         if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4797
4798         snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4799                  device_get_nameunit(dev));
4800         mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4801         snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4802                  "%s:drv", device_get_nameunit(dev));
4803         mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4804                  MTX_NETWORK_LOCK, MTX_DEF);
4805
4806         callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4807
4808         mxge_setup_cfg_space(sc);
4809         
4810         /* Map the board into the kernel */
4811         rid = PCIR_BARS;
4812         sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4813                                              RF_ACTIVE);
4814         if (sc->mem_res == NULL) {
4815                 device_printf(dev, "could not map memory\n");
4816                 err = ENXIO;
4817                 goto abort_with_lock;
4818         }
4819         sc->sram = rman_get_virtual(sc->mem_res);
4820         sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4821         if (sc->sram_size > rman_get_size(sc->mem_res)) {
4822                 device_printf(dev, "impossible memory region size %ju\n",
4823                               rman_get_size(sc->mem_res));
4824                 err = ENXIO;
4825                 goto abort_with_mem_res;
4826         }
4827
4828         /* make a NUL-terminated copy of the EEPROM strings section of
4829            LANai SRAM */
4830         bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4831         bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4832                                 rman_get_bushandle(sc->mem_res),
4833                                 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4834                                 sc->eeprom_strings,
4835                                 MXGE_EEPROM_STRINGS_SIZE - 2);
4836         err = mxge_parse_strings(sc);
4837         if (err != 0)
4838                 goto abort_with_mem_res;
4839
4840         /* Enable write combining for efficient use of PCIe bus */
4841         mxge_enable_wc(sc);
4842
4843         /* Allocate the out of band dma memory */
4844         err = mxge_dma_alloc(sc, &sc->cmd_dma,
4845                              sizeof (mxge_cmd_t), 64);
4846         if (err != 0)
4847                 goto abort_with_mem_res;
4848         sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4849         err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4850         if (err != 0)
4851                 goto abort_with_cmd_dma;
4852
4853         err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4854         if (err != 0)
4855                 goto abort_with_zeropad_dma;
4856
4857         /* select & load the firmware */
4858         err = mxge_select_firmware(sc);
4859         if (err != 0)
4860                 goto abort_with_dmabench;
4861         sc->intr_coal_delay = mxge_intr_coal_delay;
4862
4863         mxge_slice_probe(sc);
4864         err = mxge_alloc_slices(sc);
4865         if (err != 0)
4866                 goto abort_with_dmabench;
4867
4868         err = mxge_reset(sc, 0);
4869         if (err != 0)
4870                 goto abort_with_slices;
4871
4872         err = mxge_alloc_rings(sc);
4873         if (err != 0) {
4874                 device_printf(sc->dev, "failed to allocate rings\n");
4875                 goto abort_with_slices;
4876         }
4877
4878         err = mxge_add_irq(sc);
4879         if (err != 0) {
4880                 device_printf(sc->dev, "failed to add irq\n");
4881                 goto abort_with_rings;
4882         }
4883
4884         ifp->if_baudrate = IF_Gbps(10);
4885         ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4886                 IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4887                 IFCAP_RXCSUM_IPV6;
4888 #if defined(INET) || defined(INET6)
4889         ifp->if_capabilities |= IFCAP_LRO;
4890 #endif
4891
4892 #ifdef MXGE_NEW_VLAN_API
4893         ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4894
4895         /* Only FW 1.4.32 and newer can do TSO over vlans */
4896         if (sc->fw_ver_major > 1 || (sc->fw_ver_major == 1 &&
4897             (sc->fw_ver_minor > 4 ||
                  (sc->fw_ver_minor == 4 && sc->fw_ver_tiny >= 32))))
4898                 ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4899 #endif
4900         sc->max_mtu = mxge_max_mtu(sc);
4901         if (sc->max_mtu >= 9000)
4902                 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4903         else
4904                 device_printf(dev, "MTU limited to %d.  Install "
4905                               "latest firmware for 9000 byte jumbo support\n",
4906                               sc->max_mtu - ETHER_HDR_LEN);
4907         ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4908         ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4909         /* check to see if f/w supports TSO for IPv6 */
4910         if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4911                 ifp->if_capabilities |= IFCAP_TSO6;
4913                 sc->max_tso6_hlen = min(cmd.data0,
4914                                         sizeof (sc->ss[0].scratch));
4915         }
4916         ifp->if_capenable = ifp->if_capabilities;
4917         if (sc->lro_cnt == 0)
4918                 ifp->if_capenable &= ~IFCAP_LRO;
4919         ifp->if_init = mxge_init;
4920         ifp->if_softc = sc;
4921         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4922         ifp->if_ioctl = mxge_ioctl;
4923         ifp->if_start = mxge_start;
4924         ifp->if_get_counter = mxge_get_counter;
4925         /* Initialise the ifmedia structure */
4926         ifmedia_init(&sc->media, 0, mxge_media_change,
4927                      mxge_media_status);
4928         mxge_media_init(sc);
4929         mxge_media_probe(sc);
4930         sc->dying = 0;
4931         ether_ifattach(ifp, sc->mac_addr);
4932         /* ether_ifattach sets mtu to ETHERMTU */
4933         if (mxge_initial_mtu != ETHERMTU)
4934                 mxge_change_mtu(sc, mxge_initial_mtu);
4935
4936         mxge_add_sysctls(sc);
4937 #ifdef IFNET_BUF_RING
4938         ifp->if_transmit = mxge_transmit;
4939         ifp->if_qflush = mxge_qflush;
4940 #endif
4941         taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4942                                 device_get_nameunit(sc->dev));
4943         callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4944         return 0;
4945
4946 abort_with_rings:
4947         mxge_free_rings(sc);
4948 abort_with_slices:
4949         mxge_free_slices(sc);
4950 abort_with_dmabench:
4951         mxge_dma_free(&sc->dmabench_dma);
4952 abort_with_zeropad_dma:
4953         mxge_dma_free(&sc->zeropad_dma);
4954 abort_with_cmd_dma:
4955         mxge_dma_free(&sc->cmd_dma);
4956 abort_with_mem_res:
4957         bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4958 abort_with_lock:
4959         pci_disable_busmaster(dev);
4960         mtx_destroy(&sc->cmd_mtx);
4961         mtx_destroy(&sc->driver_mtx);
4962         if_free(ifp);
4963 abort_with_parent_dmat:
4964         bus_dma_tag_destroy(sc->parent_dmat);
4965 abort_with_tq:
4966         if (sc->tq != NULL) {
4967                 taskqueue_drain(sc->tq, &sc->watchdog_task);
4968                 taskqueue_free(sc->tq);
4969                 sc->tq = NULL;
4970         }
4971 abort_with_nothing:
4972         return err;
4973 }
4974
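     /*
      * Device detach: refuse while vlans are still configured on the
      * interface; otherwise close it and release everything acquired
      * by mxge_attach(), in roughly reverse order.
      */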
4975 static int
4976 mxge_detach(device_t dev)
4977 {
4978         mxge_softc_t *sc = device_get_softc(dev);
4979
4980         if (mxge_vlans_active(sc)) {
4981                 device_printf(sc->dev,
4982                               "Detach vlans before removing module\n");
4983                 return EBUSY;
4984         }
4985         mtx_lock(&sc->driver_mtx);
4986         sc->dying = 1;
4987         if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4988                 mxge_close(sc, 0);
4989         mtx_unlock(&sc->driver_mtx);
4990         ether_ifdetach(sc->ifp);
4991         if (sc->tq != NULL) {
4992                 taskqueue_drain(sc->tq, &sc->watchdog_task);
4993                 taskqueue_free(sc->tq);
4994                 sc->tq = NULL;
4995         }
4996         callout_drain(&sc->co_hdl);
4997         ifmedia_removeall(&sc->media);
4998         mxge_dummy_rdma(sc, 0);
4999         mxge_rem_sysctls(sc);
5000         mxge_rem_irq(sc);
5001         mxge_free_rings(sc);
5002         mxge_free_slices(sc);
5003         mxge_dma_free(&sc->dmabench_dma);
5004         mxge_dma_free(&sc->zeropad_dma);
5005         mxge_dma_free(&sc->cmd_dma);
5006         bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5007         pci_disable_busmaster(dev);
5008         mtx_destroy(&sc->cmd_mtx);
5009         mtx_destroy(&sc->driver_mtx);
5010         if_free(sc->ifp);
5011         bus_dma_tag_destroy(sc->parent_dmat);
5012         return 0;
5013 }
5014
5015 static int
5016 mxge_shutdown(device_t dev)
5017 {
5018         return 0;
5019 }
5020
5021 /*
5022   This file uses Myri10GE driver indentation.
5023
5024   Local Variables:
5025   c-file-style:"linux"
5026   tab-width:8
5027   End:
5028 */