1 /**************************************************************************
3 Copyright (c) 2007-2008, Chelsio Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Chelsio Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
39 #include <sys/module.h>
40 #include <sys/pciio.h>
42 #include <machine/bus.h>
43 #include <machine/resource.h>
44 #include <sys/bus_dma.h>
45 #include <sys/kthread.h>
47 #include <sys/ioccom.h>
49 #include <sys/linker.h>
50 #include <sys/firmware.h>
51 #include <sys/socket.h>
52 #include <sys/sockio.h>
54 #include <sys/sched.h>
56 #include <sys/sysctl.h>
57 #include <sys/queue.h>
58 #include <sys/taskqueue.h>
59 #include <sys/unistd.h>
60 #include <sys/syslog.h>
63 #include <net/ethernet.h>
65 #include <net/if_arp.h>
66 #include <net/if_dl.h>
67 #include <net/if_media.h>
68 #include <net/if_types.h>
70 #include <netinet/in_systm.h>
71 #include <netinet/in.h>
72 #include <netinet/if_ether.h>
73 #include <netinet/ip.h>
74 #include <netinet/ip6.h>
75 #include <netinet/sctp_crc32.h>
76 #include <netinet/sctp.h>
77 #include <netinet/tcp.h>
78 #include <netinet/udp.h>
81 #include <dev/pci/pcireg.h>
82 #include <dev/pci/pcivar.h>
83 #include <dev/pci/pci_private.h>
89 #include <cxgb_include.h>
92 #include <dev/cxgb/cxgb_include.h>
93 #include <dev/cxgb/sys/mvec.h>
97 extern struct sysctl_oid_list sysctl__hw_cxgb_children;
/*
 * Tunables controlling the per-CPU TX path.  All three backing variables
 * are plain ints, so they are registered with SYSCTL_INT; the previous
 * SYSCTL_UINT registrations mismatched the storage signedness.
 */
98 static int cxgb_pcpu_tx_coalesce = 0;
99 TUNABLE_INT("hw.cxgb.tx_coalesce", &cxgb_pcpu_tx_coalesce);
100 SYSCTL_INT(_hw_cxgb, OID_AUTO, tx_coalesce, CTLFLAG_RDTUN, &cxgb_pcpu_tx_coalesce, 0,
101 "coalesce small packets into a single work request");
/* Interval (in ticks) the per-queue service threads sleep between scans. */
103 static int sleep_ticks = 1;
104 TUNABLE_INT("hw.cxgb.sleep_ticks", &sleep_ticks);
105 SYSCTL_INT(_hw_cxgb, OID_AUTO, sleep_ticks, CTLFLAG_RDTUN, &sleep_ticks, 0,
106 "ticks to sleep between checking pcpu queues");
/* Capacity of each TX queue's software mbuf ring. */
108 int cxgb_txq_buf_ring_size = TX_ETH_Q_SIZE;
109 TUNABLE_INT("hw.cxgb.txq_mr_size", &cxgb_txq_buf_ring_size);
110 SYSCTL_INT(_hw_cxgb, OID_AUTO, txq_mr_size, CTLFLAG_RDTUN, &cxgb_txq_buf_ring_size, 0,
111 "size of per-queue mbuf ring");
/* Forward declarations for the per-CPU transmit path below. */
114 static inline int32_t cxgb_pcpu_calc_cookie(struct ifnet *ifp, struct mbuf *immpkt);
115 static void cxgb_pcpu_start_proc(void *arg);
116 #ifdef IFNET_MULTIQUEUE
117 static int cxgb_pcpu_cookie_to_qidx(struct port_info *, uint32_t cookie);
119 static int cxgb_tx(struct sge_qset *qs, uint32_t txmax);
/*
 * Place an mbuf on a queue set's software TX ring.  This path is only
 * meaningful in the IFNET_MULTIQUEUE build; reaching it otherwise is a
 * bug, hence the panic.
 */
122 cxgb_pcpu_enqueue_packet_(struct sge_qset *qs, struct mbuf *m)
127 #ifndef IFNET_MULTIQUEUE
128 panic("not expecting enqueue without multiqueue");
/* Sanity-check the mbuf before it goes on the ring. */
130 KASSERT(m != NULL, ("null mbuf"));
131 KASSERT(m->m_type == MT_DATA, ("bad mbuf type %d", m->m_type));
/* Refuse new work while the queue set is being torn down. */
132 if (qs->qs_flags & QS_EXITING) {
136 txq = &qs->txq[TXQ_ETH];
137 err = buf_ring_enqueue(&txq->txq_mr, m);
/*
 * Check whether a transmitter is already active; if not, the follow-up
 * (not visible here) presumably kicks the queue's service thread.
 */
142 if ((qs->txq[TXQ_ETH].flags & TXQ_TRANSMITTING) == 0)
/*
 * ifnet-level enqueue: map the mbuf's RSS hash to a TX queue set and
 * hand the packet to cxgb_pcpu_enqueue_packet_().
 */
149 cxgb_pcpu_enqueue_packet(struct ifnet *ifp, struct mbuf *m)
151 struct port_info *pi = ifp->if_softc;
154 #ifdef IFNET_MULTIQUEUE
/* The queue index is derived from the RSS hash stashed in the pkthdr. */
157 calc_cookie = m->m_pkthdr.rss_hash;
158 qidx = cxgb_pcpu_cookie_to_qidx(pi, calc_cookie);
162 qs = &pi->adapter->sge.qs[qidx];
163 err = cxgb_pcpu_enqueue_packet_(qs, m);
/*
 * Pull packet(s) from a TX queue into m_vec[] for encapsulation.
 * Without IFNET_MULTIQUEUE this simply dequeues from the ifnet send
 * queue; with it, packets are drained from the per-queue buf_ring and
 * small ones may be coalesced into a single work request.
 */
168 cxgb_dequeue_packet(struct sge_txq *txq, struct mbuf **m_vec)
172 int count, size, coalesced;
174 #ifndef IFNET_MULTIQUEUE
175 struct port_info *pi = txq->port;
/* Immediate packets are a multiqueue-only concept. */
177 if (txq->immpkt != NULL)
178 panic("immediate packet set");
179 mtx_assert(&txq->lock, MA_OWNED);
181 IFQ_DRV_DEQUEUE(&pi->ifp->if_snd, m);
189 coalesced = count = size = 0;
190 qs = txq_to_qset(txq, TXQ_ETH);
/* No new work while the queue set is tearing down. */
191 if (qs->qs_flags & QS_EXITING)
/* A pending immediate packet takes priority over the ring. */
194 if (txq->immpkt != NULL) {
195 DPRINTF("immediate packet\n");
196 m_vec[0] = txq->immpkt;
200 sc = qs->port->adapter;
202 m = buf_ring_dequeue(&txq->txq_mr);
207 KASSERT(m->m_type == MT_DATA,
208 ("m=%p is bad mbuf type %d from ring cons=%d prod=%d", m,
209 m->m_type, txq->txq_mr.br_cons, txq->txq_mr.br_prod));
/*
 * TSO packets, oversized payloads, chained mbufs, or disabled
 * coalescing all force a single-packet work request.
 */
211 if (m->m_pkthdr.tso_segsz > 0 || m->m_pkthdr.len > TX_WR_SIZE_MAX ||
212 m->m_next != NULL || (cxgb_pcpu_tx_coalesce == 0)) {
216 size = m->m_pkthdr.len;
/* Greedily pull additional packets while they fit in the request. */
217 for (m = buf_ring_peek(&txq->txq_mr); m != NULL;
218 m = buf_ring_peek(&txq->txq_mr)) {
/* Stop at the first packet that would not fit or cannot coalesce. */
220 if (m->m_pkthdr.tso_segsz > 0 ||
221 size + m->m_pkthdr.len > TX_WR_SIZE_MAX || m->m_next != NULL)
224 buf_ring_dequeue(&txq->txq_mr);
225 size += m->m_pkthdr.len;
/* Hard cap on the number of packets per work request. */
228 if (count == TX_WR_COUNT_MAX)
233 txq->txq_coalesced += coalesced;
/*
 * Hash a connection 4-tuple (local/remote address + port) into an RSS
 * cookie.  IPv4 copies 4-byte addresses, IPv6 16-byte ones, into a flat
 * buffer which is then run through the SCTP CRC32 routines and masked
 * down to an index into the RSS indirection table.
 */
239 cxgb_pcpu_get_cookie(struct ifnet *ifp, struct in6_addr *lip, uint16_t lport, struct in6_addr *rip, uint16_t rport, int ipv6)
248 * Can definitely bypass bcopy XXX
/* IPv4 layout: rip[0..3], lip[4..7], rport[8..9], lport[10..11]. */
252 bcopy(rip, &buf[0], 4);
253 bcopy(lip, &buf[4], 4);
254 bcopy(&rport, &buf[8], 2);
255 bcopy(&lport, &buf[10], 2);
/* IPv6 layout: full 16-byte addresses followed by the two ports. */
258 bcopy(rip, &buf[0], 16);
259 bcopy(lip, &buf[16], 16);
260 bcopy(&rport, &buf[32], 2);
261 bcopy(&lport, &buf[34], 2);
/* CRC32 (borrowed from the SCTP checksum code) over the tuple buffer. */
265 base = update_crc32(base, buf, count);
266 base = sctp_csum_finalize(base);
269 * Indirection table is 128 bits
270 * -> cookie indexes into indirection table which maps connection to queue
271 * -> RSS map maps queue to CPU
273 cookie = (base & (RSS_TABLE_SIZE-1));
/*
 * Compute the RSS cookie for an outgoing packet by parsing its Ethernet,
 * IP/IPv6, and transport (TCP/UDP/SCTP) headers, then hashing the
 * resulting 4-tuple via cxgb_pcpu_get_cookie().
 */
280 cxgb_pcpu_calc_cookie(struct ifnet *ifp, struct mbuf *immpkt)
282 struct in6_addr lip, rip;
283 uint16_t lport, rport;
284 struct ether_header *eh;
291 uint8_t *next, proto;
/*
 * NOTE(review): header parsing below assumes the Ethernet/IP/transport
 * headers are contiguous in the first mbuf -- confirm against callers.
 */
306 eh = mtod(immpkt, struct ether_header *);
307 etype = ntohs(eh->ether_type);
/* IPv4: addresses are 4 bytes; transport header follows the IP header. */
311 ip = (struct ip *)(eh + 1);
312 next = (uint8_t *)(ip + 1);
313 bcopy(&ip->ip_src, &lip, 4);
314 bcopy(&ip->ip_dst, &rip, 4);
/* IPv6: copy full 16-byte addresses. */
318 ip6 = (struct ip6_hdr *)(eh + 1);
319 next = (uint8_t *)(ip6 + 1);
320 bcopy(&ip6->ip6_src, &lip, sizeof(struct in6_addr));
321 bcopy(&ip6->ip6_dst, &rip, sizeof(struct in6_addr));
/* Skip a hop-by-hop options header, if present, to find the protocol. */
322 if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
325 hbh = (struct ip6_hbh *)(ip6 + 1);
326 proto = hbh->ip6h_nxt;
328 proto = ip6->ip6_nxt;
333 * Default to queue zero
/* Extract source/destination ports per transport protocol. */
340 th = (struct tcphdr *)next;
341 lport = th->th_sport;
342 rport = th->th_dport;
345 uh = (struct udphdr *)next;
346 lport = uh->uh_sport;
347 rport = uh->uh_dport;
350 sh = (struct sctphdr *)next;
351 lport = sh->src_port;
352 rport = sh->dest_port;
361 cookie = cxgb_pcpu_get_cookie(ifp, &lip, lport, &rip, rport, (etype == ETHERTYPE_IPV6));
/*
 * Drain and free all pending TX state for a queue set: the software
 * send queue, the mbuf ring, and any outstanding TX descriptors.
 * Performed under the TX queue lock.
 */
367 cxgb_pcpu_free(struct sge_qset *qs)
370 struct sge_txq *txq = &qs->txq[TXQ_ETH];
372 mtx_lock(&txq->lock);
/* Drain the software send queue. */
373 while ((m = mbufq_dequeue(&txq->sendq)) != NULL)
/* Drain the lock-free mbuf ring. */
375 while ((m = buf_ring_dequeue(&txq->txq_mr)) != NULL)
378 t3_free_tx_desc_all(txq);
379 mtx_unlock(&txq->lock);
/*
 * Reclaim completed TX descriptors for a queue.  Must run on the CPU
 * bound to this queue set and with the TX queue lock held.  Clears the
 * stopped/OACTIVE state once descriptors free up.  Returns the number
 * of descriptors reclaimed.
 */
383 cxgb_pcpu_reclaim_tx(struct sge_txq *txq)
386 struct sge_qset *qs = txq_to_qset(txq, TXQ_ETH);
/* This path is per-CPU: assert we are on the queue set's bound CPU. */
389 KASSERT(qs->qs_cpuid == curcpu, ("cpu qset mismatch cpuid=%d curcpu=%d",
390 qs->qs_cpuid, curcpu));
392 mtx_assert(&txq->lock, MA_OWNED);
394 reclaimable = desc_reclaimable(txq);
395 if (reclaimable == 0)
398 t3_free_tx_desc(txq, reclaimable);
400 txq->cleaned += reclaimable;
401 txq->in_use -= reclaimable;
/* Descriptors freed up: re-enable a previously stopped queue. */
402 if (isset(&qs->txq_stopped, TXQ_ETH)) {
403 qs->port->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
404 clrbit(&qs->txq_stopped, TXQ_ETH);
407 return (reclaimable);
/*
 * Core transmit routine for a queue set, called with the TX queue lock
 * held.  Optionally enqueues an immediate packet, reclaims completed
 * descriptors when enough have accumulated (or when flushing), and then
 * pushes pending work to the hardware via cxgb_tx().  tx_flush requests
 * a full drain rather than a bounded burst.
 */
411 cxgb_pcpu_start_(struct sge_qset *qs, struct mbuf *immpkt, int tx_flush)
413 int i, err, initerr, flush, reclaimed, stopped;
414 struct port_info *pi;
420 initerr = err = i = reclaimed = 0;
422 txq = &qs->txq[TXQ_ETH];
424 mtx_assert(&txq->lock, MA_OWNED);
/* Bail-out conditions: link down, queue exiting, interface not up. */
427 if (!pi->link_config.link_ok)
429 else if (qs->qs_flags & QS_EXITING)
431 else if ((pi->ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
433 else if ((pi->ifp->if_flags & IFF_UP) == 0)
/*
 * If the ring already has packets, queue the immediate packet behind
 * them to preserve ordering; otherwise stage it as txq->immpkt.
 */
437 if (!buf_ring_empty(&txq->txq_mr))
438 initerr = cxgb_pcpu_enqueue_packet_(qs, immpkt);
440 txq->immpkt = immpkt;
/* ENOBUFS is tolerated here; any other setup error aborts. */
444 if (initerr && initerr != ENOBUFS) {
446 log(LOG_WARNING, "cxgb link down\n");
/*
 * Reclaim when flushing, or once more than half the ring's
 * descriptors are reclaimable.
 */
452 if ((tx_flush && (desc_reclaimable(txq) > 0)) ||
453 (desc_reclaimable(txq) > (TX_ETH_Q_SIZE>>1))) {
457 device_printf(qs->port->adapter->dev,
458 "cpuid=%d curcpu=%d reclaimable=%d txq=%p txq->cidx=%d txq->pidx=%d ",
459 qs->qs_cpuid, curcpu, desc_reclaimable(txq),
460 txq, txq->cidx, txq->pidx);
462 reclaimed = cxgb_pcpu_reclaim_tx(txq);
464 printf("reclaimed=%d\n", reclaimed);
/* Transmit only if there is pending work and the queue is not stopped. */
467 stopped = isset(&qs->txq_stopped, TXQ_ETH);
468 flush = (((!buf_ring_empty(&txq->txq_mr) || (!IFQ_DRV_IS_EMPTY(&pi->ifp->if_snd))) && !stopped) || txq->immpkt);
/* A flush may consume the whole ring; a normal start is bounded. */
469 max_desc = tx_flush ? TX_ETH_Q_SIZE : TX_START_MAX_DESC;
472 DPRINTF("stopped=%d flush=%d max_desc=%d\n",
473 stopped, flush, max_desc);
475 err = flush ? cxgb_tx(qs, max_desc) : ENOSPC;
/*
 * During a flush with work still pending, drop thread priority
 * (presumably to yield while looping until the queue drains).
 */
478 if ((tx_flush && flush && err == 0) &&
479 (!buf_ring_empty(&txq->txq_mr) ||
480 !IFQ_DRV_IS_EMPTY(&pi->ifp->if_snd))) {
481 struct thread *td = curthread;
485 sched_prio(td, PRI_MIN_TIMESHARE);
490 device_printf(qs->port->adapter->dev,
491 "exceeded max enqueue tries\n");
/* Report the setup error, if any, in preference to the TX error. */
496 err = (initerr != 0) ? initerr : err;
/*
 * Entry point for transmitting a packet on the per-CPU path.  Selects a
 * queue set from the packet's RSS hash (falling back to the port's first
 * queue set), then either transmits immediately under a trylock or
 * defers by enqueuing on the software ring.
 */
502 cxgb_pcpu_start(struct ifnet *ifp, struct mbuf *immpkt)
505 int err, qidx, locked, resid;
506 struct port_info *pi;
508 struct sge_txq *txq = NULL /* gcc is dumb */;
514 qidx = resid = err = cookie = locked = 0;
516 #ifdef IFNET_MULTIQUEUE
/* Steer by RSS hash when the packet carries one. */
517 if (immpkt && (immpkt->m_pkthdr.rss_hash != 0)) {
518 cookie = immpkt->m_pkthdr.rss_hash;
519 qidx = cxgb_pcpu_cookie_to_qidx(pi, cookie);
520 DPRINTF("hash=0x%x qidx=%d cpu=%d\n", immpkt->m_pkthdr.rss_hash, qidx, curcpu);
521 qs = &pi->adapter->sge.qs[qidx];
/* No hash (or no multiqueue): use the port's first queue set. */
524 qs = &pi->adapter->sge.qs[pi->first_qset];
526 txq = &qs->txq[TXQ_ETH];
/*
 * Transmit inline only when coalescing is off/unprofitable and the
 * queue lock can be taken without blocking; otherwise defer.
 */
528 if (((sc->tunq_coalesce == 0) ||
529 (buf_ring_count(&txq->txq_mr) >= TX_WR_COUNT_MAX) ||
530 (cxgb_pcpu_tx_coalesce == 0)) && mtx_trylock(&txq->lock)) {
532 printf("doing immediate transmit\n");
534 txq->flags |= TXQ_TRANSMITTING;
535 err = cxgb_pcpu_start_(qs, immpkt, FALSE);
536 txq->flags &= ~TXQ_TRANSMITTING;
/* Note whether enough work remains to warrant another pass. */
537 resid = (buf_ring_count(&txq->txq_mr) > 64) || (desc_reclaimable(txq) > 64);
538 mtx_unlock(&txq->lock);
541 printf("deferred coalesce=%jx ring_count=%d mtx_owned=%d\n",
542 sc->tunq_coalesce, buf_ring_count(&txq->txq_mr), mtx_owned(&txq->lock));
543 err = cxgb_pcpu_enqueue_packet_(qs, immpkt);
546 if (resid && (txq->flags & TXQ_TRANSMITTING) == 0)
/* ENOSPC is flow control, not failure, from the caller's view. */
549 return ((err == ENOSPC) ? 0 : err);
/*
 * Legacy if_start handler: skip when the link is down or the send queue
 * is empty, otherwise kick the per-CPU start path with no immediate
 * packet.
 */
553 cxgb_start(struct ifnet *ifp)
555 struct port_info *p = ifp->if_softc;
557 if (!p->link_config.link_ok)
560 if (IFQ_DRV_IS_EMPTY(&ifp->if_snd))
563 cxgb_pcpu_start(ifp, NULL);
/*
 * Per-queue-set service kthread.  Binds itself to the queue set's CPU,
 * then loops: flush pending TX work, process RX responses and refill
 * free lists, and sleep briefly when idle.  Exits (freeing the queue
 * set) once QS_EXITING is observed.
 */
567 cxgb_pcpu_start_proc(void *arg)
569 struct sge_qset *qs = arg;
571 struct sge_txq *txq = &qs->txq[TXQ_ETH];
572 int idleticks, err = 0;
574 struct adapter *sc = qs->port->adapter;
/* Aim for roughly 1ms sleeps, never less than one tick. */
578 sleep_ticks = max(hz/1000, 1);
579 qs->qs_flags |= QS_RUNNING;
581 sched_bind(td, qs->qs_cpuid);
/* Stagger thread start-up per CPU. */
584 DELAY(qs->qs_cpuid*100000);
586 printf("bound to %d running on %d\n", qs->qs_cpuid, curcpu);
589 if (qs->qs_flags & QS_EXITING)
/* Interface stopped: note any work left stranded on the queues. */
592 if ((qs->port->ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
594 if (!buf_ring_empty(&txq->txq_mr) ||
595 !mbufq_empty(&txq->sendq))
599 idleticks = sleep_ticks;
/* Flush TX under a trylock so we never block the hot path. */
600 if (mtx_trylock(&txq->lock)) {
601 txq->flags |= TXQ_TRANSMITTING;
602 err = cxgb_pcpu_start_(qs, NULL, TRUE);
603 txq->flags &= ~TXQ_TRANSMITTING;
604 mtx_unlock(&txq->lock);
/* Service RX: drain responses, refill free lists, re-arm the GTS. */
608 if (mtx_trylock(&qs->rspq.lock)) {
609 process_responses(sc, qs, -1);
611 refill_fl_service(sc, &qs->fl[0]);
612 refill_fl_service(sc, &qs->fl[1]);
613 t3_write_reg(sc, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
614 V_NEWTIMER(qs->rspq.next_holdoff) | V_NEWINDEX(qs->rspq.cidx));
616 mtx_unlock(&qs->rspq.lock);
/* Diagnostic: TX ring still non-empty even though TX succeeded. */
619 if ((!buf_ring_empty(&txq->txq_mr)) && err == 0) {
621 printf("head=%p cons=%d prod=%d\n",
622 txq->sendq.head, txq->txq_mr.br_cons,
623 txq->txq_mr.br_prod);
627 tsleep(qs, 1, "cxgbidle", idleticks);
631 device_printf(qs->port->adapter->dev, "exiting thread for cpu%d\n", qs->qs_cpuid);
/* Tear down the queue set and signal the unload path we are done. */
635 t3_free_qset(qs->port->adapter, qs);
637 qs->qs_flags &= ~QS_RUNNING;
641 #ifdef IFNET_MULTIQUEUE
/*
 * Map an RSS cookie to a queue index within the port's queue-set range.
 * Channel 0 uses the low half of the RSS table; assumes nqsets is a
 * power of two (the mask only works then).
 */
643 cxgb_pcpu_cookie_to_qidx(struct port_info *pi, uint32_t cookie)
649 * Will probably need to be changed for 4-port XXX
651 tmp = pi->tx_chan ? cookie : cookie & ((RSS_TABLE_SIZE>>1)-1);
652 DPRINTF(" tmp=%d ", tmp);
653 qidx = (tmp & (pi->nqsets -1)) + pi->first_qset;
/*
 * Spawn one service kthread per queue set across all ports, binding
 * queue sets to CPUs round-robin (qset index modulo mp_ncpus).
 */
660 cxgb_pcpu_startup_threads(struct adapter *sc)
666 for (i = 0; i < (sc)->params.nports; ++i) {
667 struct port_info *pi = adap2pinfo(sc, i);
669 #ifdef IFNET_MULTIQUEUE
674 for (j = 0; j < nqsets; ++j) {
677 qs = &sc->sge.qs[pi->first_qset + j];
/* Round-robin CPU assignment for the queue set's bound thread. */
679 qs->qs_cpuid = ((pi->first_qset + j) % mp_ncpus);
680 device_printf(sc->dev, "starting thread for %d\n",
683 kthread_create(cxgb_pcpu_start_proc, qs, &p,
684 RFNOWAIT, 0, "cxgbsp");
/*
 * Ask every queue-set service thread to exit (QS_EXITING) and wait,
 * polling with timed sleeps, until each clears QS_RUNNING.
 */
691 cxgb_pcpu_shutdown_threads(struct adapter *sc)
696 for (i = 0; i < sc->params.nports; i++) {
697 struct port_info *pi = &sc->port[i];
698 int first = pi->first_qset;
700 #ifdef IFNET_MULTIQUEUE
705 for (j = 0; j < nqsets; j++) {
706 struct sge_qset *qs = &sc->sge.qs[first + j];
708 qs->qs_flags |= QS_EXITING;
710 tsleep(&sc, PRI_MIN_TIMESHARE, "cxgb unload 0", hz>>2);
/* Re-assert EXITING and keep waiting until the thread notices. */
711 while (qs->qs_flags & QS_RUNNING) {
712 qs->qs_flags |= QS_EXITING;
713 device_printf(sc->dev, "qset thread %d still running - sleeping\n", first + j);
714 tsleep(&sc, PRI_MIN_TIMESHARE, "cxgb unload 1", 2*hz);
/*
 * Update the adapter's per-queue "fill" flag: set when descriptor usage
 * exceeds 3/4 of the ring, cleared when it falls back below 3/4.
 */
721 check_pkt_coalesce(struct sge_qset *qs)
726 txq = &qs->txq[TXQ_ETH];
727 sc = qs->port->adapter;
/* Both comparisons use the same 3/4-of-ring threshold. */
729 if (sc->tunq_fill[qs->idx] && (txq->in_use < (txq->size - (txq->size>>2))))
730 sc->tunq_fill[qs->idx] = 0;
731 else if (!sc->tunq_fill[qs->idx] && (txq->in_use > (txq->size - (txq->size>>2))))
732 sc->tunq_fill[qs->idx] = 1;
736 cxgb_tx(struct sge_qset *qs, uint32_t txmax)
739 struct ifnet *ifp = qs->port->ifp;
740 int i, err, in_use_init, count;
741 struct mbuf *m_vec[TX_WR_COUNT_MAX];
743 txq = &qs->txq[TXQ_ETH];
745 in_use_init = txq->in_use;
748 for (i = 0; i < TX_WR_COUNT_MAX; i++)
751 mtx_assert(&txq->lock, MA_OWNED);
752 while ((txq->in_use - in_use_init < txmax) &&
753 (txq->size > txq->in_use + TX_MAX_DESC)) {
754 check_pkt_coalesce(qs);
755 count = cxgb_dequeue_packet(txq, m_vec);
760 ETHER_BPF_MTAP(ifp, m_vec[0]);
762 if ((err = t3_encap(qs, m_vec, count)) != 0)
764 txq->txq_enqueued += count;
768 if (__predict_false(err)) {
770 ifp->if_drv_flags |= IFF_DRV_OACTIVE;
771 IFQ_LOCK(&ifp->if_snd);
772 IFQ_DRV_PREPEND(&ifp->if_snd, m_vec[0]);
773 IFQ_UNLOCK(&ifp->if_snd);
776 else if ((err == 0) && (txq->size <= txq->in_use + TX_MAX_DESC) &&
777 (ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
778 setbit(&qs->txq_stopped, TXQ_ETH);
779 ifp->if_drv_flags |= IFF_DRV_OACTIVE;
784 if ((err == 0) && (txq->size <= txq->in_use + TX_MAX_DESC)) {
787 setbit(&qs->txq_stopped, TXQ_ETH);
788 ifp->if_drv_flags |= IFF_DRV_OACTIVE;
796 for (i = 0; i < count; i++)