sys/dev/gve/gve_tx.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 2023 Google LLC
   5  *
   6  * Redistribution and use in source and binary forms, with or without modification,
   7  * are permitted provided that the following conditions are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright notice, this
  10  *    list of conditions and the following disclaimer.
  11  *
  12  * 2. Redistributions in binary form must reproduce the above copyright notice,
  13  *    this list of conditions and the following disclaimer in the documentation
  14  *    and/or other materials provided with the distribution.
  15  *
  16  * 3. Neither the name of the copyright holder nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software without
  18  *    specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  22  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  23  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
  24  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  25  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
  27  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  29  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  */
  31 #include "gve.h"
  32 #include "gve_adminq.h"
  33
/*
 * Length in bytes of the first segment (the one described by the packet
 * descriptor) for packets that are neither TCP nor UDP, capped at the packet
 * length.  gve_xmit() describes this value as spec-stipulated.
 */
#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182
  35
  36 static int
  37 gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx)
  38 {
  39         struct gve_queue_page_list *qpl = tx->com.qpl;
  40         struct gve_tx_fifo *fifo = &tx->fifo;
  41
  42         fifo->size = qpl->num_pages * PAGE_SIZE;
  43         fifo->base = qpl->kva;
  44         atomic_store_int(&fifo->available, fifo->size);
  45         fifo->head = 0;
  46
  47         return (0);
  48 }
  49
  50 static void
  51 gve_tx_free_ring(struct gve_priv *priv, int i)
  52 {
  53         struct gve_tx_ring *tx = &priv->tx[i];
  54         struct gve_ring_com *com = &tx->com;
  55
  56         /* Safe to call even if never alloced */
  57         gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);
  58
  59         if (tx->br != NULL) {
  60                 buf_ring_free(tx->br, M_DEVBUF);
  61                 tx->br = NULL;
  62         }
  63
  64         if (mtx_initialized(&tx->ring_mtx))
  65                 mtx_destroy(&tx->ring_mtx);
  66
  67         if (tx->info != NULL) {
  68                 free(tx->info, M_GVE);
  69                 tx->info = NULL;
  70         }
  71
  72         if (tx->desc_ring != NULL) {
  73                 gve_dma_free_coherent(&tx->desc_ring_mem);
  74                 tx->desc_ring = NULL;
  75         }
  76
  77         if (com->q_resources != NULL) {
  78                 gve_dma_free_coherent(&com->q_resources_mem);
  79                 com->q_resources = NULL;
  80         }
  81 }
  82
  83 static int
  84 gve_tx_alloc_ring(struct gve_priv *priv, int i)
  85 {
  86         struct gve_tx_ring *tx = &priv->tx[i];
  87         struct gve_ring_com *com = &tx->com;
  88         char mtx_name[16];
  89         int err;
  90
  91         com->priv = priv;
  92         com->id = i;
  93
  94         com->qpl = &priv->qpls[i];
  95         if (com->qpl == NULL) {
  96                 device_printf(priv->dev, "No QPL left for tx ring %d\n", i);
  97                 return (ENOMEM);
  98         }
  99
 100         err = gve_tx_fifo_init(priv, tx);
 101         if (err != 0)
 102                 goto abort;
 103
 104         tx->info = malloc(sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt,
 105             M_GVE, M_WAITOK | M_ZERO);
 106
 107         sprintf(mtx_name, "gvetx%d", i);
 108         mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF);
 109
 110         tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF,
 111             M_WAITOK, &tx->ring_mtx);
 112
 113         gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS);
 114
 115         err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources),
 116             PAGE_SIZE, &com->q_resources_mem);
 117         if (err != 0) {
 118                 device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i);
 119                 goto abort;
 120         }
 121         com->q_resources = com->q_resources_mem.cpu_addr;
 122
 123         err = gve_dma_alloc_coherent(priv,
 124             sizeof(union gve_tx_desc) * priv->tx_desc_cnt,
 125             CACHE_LINE_SIZE, &tx->desc_ring_mem);
 126         if (err != 0) {
 127                 device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i);
 128                 goto abort;
 129         }
 130         tx->desc_ring = tx->desc_ring_mem.cpu_addr;
 131
 132         return (0);
 133
 134 abort:
 135         gve_tx_free_ring(priv, i);
 136         return (err);
 137 }
 138
 139 int
 140 gve_alloc_tx_rings(struct gve_priv *priv)
 141 {
 142         int err = 0;
 143         int i;
 144
 145         priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues,
 146             M_GVE, M_WAITOK | M_ZERO);
 147
 148         for (i = 0; i < priv->tx_cfg.num_queues; i++) {
 149                 err = gve_tx_alloc_ring(priv, i);
 150                 if (err != 0)
 151                         goto free_rings;
 152
 153         }
 154
 155         return (0);
 156
 157 free_rings:
 158         while (i--)
 159                 gve_tx_free_ring(priv, i);
 160         free(priv->tx, M_GVE);
 161         return (err);
 162 }
 163
 164 void
 165 gve_free_tx_rings(struct gve_priv *priv)
 166 {
 167         int i;
 168
 169         for (i = 0; i < priv->tx_cfg.num_queues; i++)
 170                 gve_tx_free_ring(priv, i);
 171
 172         free(priv->tx, M_GVE);
 173 }
 174
 175 static void
 176 gve_tx_clear_desc_ring(struct gve_tx_ring *tx)
 177 {
 178         struct gve_ring_com *com = &tx->com;
 179         int i;
 180
 181         for (i = 0; i < com->priv->tx_desc_cnt; i++) {
 182                 tx->desc_ring[i] = (union gve_tx_desc){};
 183                 tx->info[i] = (struct gve_tx_buffer_state){};
 184         }
 185
 186         bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
 187             BUS_DMASYNC_PREWRITE);
 188 }
 189
 190 static void
 191 gve_clear_tx_ring(struct gve_priv *priv, int i)
 192 {
 193         struct gve_tx_ring *tx = &priv->tx[i];
 194         struct gve_tx_fifo *fifo = &tx->fifo;
 195
 196         tx->req = 0;
 197         tx->done = 0;
 198         tx->mask = priv->tx_desc_cnt - 1;
 199
 200         atomic_store_int(&fifo->available, fifo->size);
 201         fifo->head = 0;
 202
 203         gve_tx_clear_desc_ring(tx);
 204 }
 205
 206 static void
 207 gve_start_tx_ring(struct gve_priv *priv, int i)
 208 {
 209         struct gve_tx_ring *tx = &priv->tx[i];
 210         struct gve_ring_com *com = &tx->com;
 211
 212         NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx);
 213         com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK,
 214             taskqueue_thread_enqueue, &com->cleanup_tq);
 215         taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d",
 216             device_get_nameunit(priv->dev), i);
 217
 218         TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx);
 219         tx->xmit_tq = taskqueue_create_fast("gve tx xmit",
 220             M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq);
 221         taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit",
 222             device_get_nameunit(priv->dev), i);
 223 }
 224
 225 int
 226 gve_create_tx_rings(struct gve_priv *priv)
 227 {
 228         struct gve_ring_com *com;
 229         struct gve_tx_ring *tx;
 230         int err;
 231         int i;
 232
 233         if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK))
 234                 return (0);
 235
 236         for (i = 0; i < priv->tx_cfg.num_queues; i++)
 237                 gve_clear_tx_ring(priv, i);
 238
 239         err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues);
 240         if (err != 0)
 241                 return (err);
 242
 243         bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map,
 244             BUS_DMASYNC_POSTREAD);
 245
 246         for (i = 0; i < priv->tx_cfg.num_queues; i++) {
 247                 tx = &priv->tx[i];
 248                 com = &tx->com;
 249
 250                 com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index);
 251
 252                 bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map,
 253                     BUS_DMASYNC_POSTREAD);
 254                 com->db_offset = 4 * be32toh(com->q_resources->db_index);
 255                 com->counter_idx = be32toh(com->q_resources->counter_index);
 256
 257                 gve_start_tx_ring(priv, i);
 258         }
 259
 260         gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
 261         return (0);
 262 }
 263
 264 static void
 265 gve_stop_tx_ring(struct gve_priv *priv, int i)
 266 {
 267         struct gve_tx_ring *tx = &priv->tx[i];
 268         struct gve_ring_com *com = &tx->com;
 269
 270         if (com->cleanup_tq != NULL) {
 271                 taskqueue_quiesce(com->cleanup_tq);
 272                 taskqueue_free(com->cleanup_tq);
 273                 com->cleanup_tq = NULL;
 274         }
 275
 276         if (tx->xmit_tq != NULL) {
 277                 taskqueue_quiesce(tx->xmit_tq);
 278                 taskqueue_free(tx->xmit_tq);
 279                 tx->xmit_tq = NULL;
 280         }
 281 }
 282
 283 int
 284 gve_destroy_tx_rings(struct gve_priv *priv)
 285 {
 286         int err;
 287         int i;
 288
 289         for (i = 0; i < priv->tx_cfg.num_queues; i++)
 290                 gve_stop_tx_ring(priv, i);
 291
 292         if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) {
 293                 err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues);
 294                 if (err != 0)
 295                         return (err);
 296                 gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK);
 297         }
 298
 299         return (0);
 300 }
 301
 302 int
 303 gve_tx_intr(void *arg)
 304 {
 305         struct gve_tx_ring *tx = arg;
 306         struct gve_priv *priv = tx->com.priv;
 307         struct gve_ring_com *com = &tx->com;
 308
 309         if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
 310                 return (FILTER_STRAY);
 311
 312         gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK);
 313         taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task);
 314         return (FILTER_HANDLED);
 315 }
 316
 317 static uint32_t
 318 gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx)
 319 {
 320         bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map,
 321             BUS_DMASYNC_POSTREAD);
 322         uint32_t counter = priv->counters[tx->com.counter_idx];
 323         return (be32toh(counter));
 324 }
 325
 326 static void
 327 gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes)
 328 {
 329         atomic_add_int(&fifo->available, bytes);
 330 }
 331
 332 void
 333 gve_tx_cleanup_tq(void *arg, int pending)
 334 {
 335         struct gve_tx_ring *tx = arg;
 336         struct gve_priv *priv = tx->com.priv;
 337         uint32_t nic_done = gve_tx_load_event_counter(priv, tx);
 338         uint32_t todo = nic_done - tx->done;
 339         size_t space_freed = 0;
 340         int i, j;
 341
 342         if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
 343                 return;
 344
 345         for (j = 0; j < todo; j++) {
 346                 uint32_t idx = tx->done & tx->mask;
 347                 struct gve_tx_buffer_state *info = &tx->info[idx];
 348                 struct mbuf *mbuf = info->mbuf;
 349
 350                 tx->done++;
 351                 if (mbuf == NULL)
 352                         continue;
 353
 354                 info->mbuf = NULL;
 355                 counter_enter();
 356                 counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len);
 357                 counter_u64_add_protected(tx->stats.tpackets, 1);
 358                 counter_exit();
 359                 m_freem(mbuf);
 360
 361                 for (i = 0; i < GVE_TX_MAX_DESCS; i++) {
 362                         space_freed += info->iov[i].iov_len + info->iov[i].iov_padding;
 363                         info->iov[i].iov_len = 0;
 364                         info->iov[i].iov_padding = 0;
 365                 }
 366         }
 367
 368         gve_tx_free_fifo(&tx->fifo, space_freed);
 369
 370         gve_db_bar_write_4(priv, tx->com.irq_db_offset,
 371             GVE_IRQ_ACK | GVE_IRQ_EVENT);
 372
 373         /*
 374          * Completions born before this barrier MAY NOT cause the NIC to send an
 375          * interrupt but they will still be handled by the enqueue below.
 376          * Completions born after the barrier WILL trigger an interrupt.
 377          */
 378         mb();
 379
 380         nic_done = gve_tx_load_event_counter(priv, tx);
 381         todo = nic_done - tx->done;
 382         if (todo != 0) {
 383                 gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK);
 384                 taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task);
 385         }
 386 }
 387
 388 static void
 389 gve_dma_sync_for_device(struct gve_queue_page_list *qpl,
 390                         uint64_t iov_offset, uint64_t iov_len)
 391 {
 392         uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE;
 393         uint64_t first_page = iov_offset / PAGE_SIZE;
 394         struct gve_dma_handle *dma;
 395         uint64_t page;
 396
 397         for (page = first_page; page <= last_page; page++) {
 398                 dma = &(qpl->dmas[page]);
 399                 bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
 400         }
 401 }
 402
 403 static void
 404 gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf)
 405 {
 406         mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
 407         mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
 408         mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid);
 409         mtd_desc->reserved0 = 0;
 410         mtd_desc->reserved1 = 0;
 411 }
 412
 413 static void
 414 gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso,
 415     uint16_t l4_hdr_offset, uint32_t desc_cnt,
 416     uint16_t first_seg_len, uint64_t addr, bool has_csum_flag,
 417     int csum_offset, uint16_t pkt_len)
 418 {
 419         if (is_tso) {
 420                 pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM;
 421                 pkt_desc->l4_csum_offset = csum_offset >> 1;
 422                 pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
 423         } else if (has_csum_flag) {
 424                 pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
 425                 pkt_desc->l4_csum_offset = csum_offset >> 1;
 426                 pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1;
 427         } else {
 428                 pkt_desc->type_flags = GVE_TXD_STD;
 429                 pkt_desc->l4_csum_offset = 0;
 430                 pkt_desc->l4_hdr_offset = 0;
 431         }
 432         pkt_desc->desc_cnt = desc_cnt;
 433         pkt_desc->len = htobe16(pkt_len);
 434         pkt_desc->seg_len = htobe16(first_seg_len);
 435         pkt_desc->seg_addr = htobe64(addr);
 436 }
 437
 438 static void
 439 gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc,
 440     bool is_tso, uint16_t len, uint64_t addr,
 441     bool is_ipv6, uint8_t l3_off, uint16_t tso_mss)
 442 {
 443         seg_desc->type_flags = GVE_TXD_SEG;
 444         if (is_tso) {
 445                 if (is_ipv6)
 446                         seg_desc->type_flags |= GVE_TXSF_IPV6;
 447                 seg_desc->l3_offset = l3_off >> 1;
 448                 seg_desc->mss = htobe16(tso_mss);
 449         }
 450         seg_desc->seg_len = htobe16(len);
 451         seg_desc->seg_addr = htobe64(addr);
 452 }
 453
 454 static inline uint32_t
 455 gve_tx_avail(struct gve_tx_ring *tx)
 456 {
 457         return (tx->mask + 1 - (tx->req - tx->done));
 458 }
 459
 460 static bool
 461 gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes)
 462 {
 463         return (atomic_load_int(&fifo->available) >= bytes);
 464 }
 465
 466 static inline bool
 467 gve_can_tx(struct gve_tx_ring *tx, int bytes_required)
 468 {
 469         return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) &&
 470             gve_tx_fifo_can_alloc(&tx->fifo, bytes_required));
 471 }
 472
 473 static int
 474 gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes)
 475 {
 476         return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head;
 477 }
 478
 479 static inline int
 480 gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len,
 481     uint16_t pkt_len)
 482 {
 483         int pad_bytes, align_hdr_pad;
 484         int bytes;
 485
 486         pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
 487         /* We need to take into account the header alignment padding. */
 488         align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len;
 489         bytes = align_hdr_pad + pad_bytes + pkt_len;
 490
 491         return (bytes);
 492 }
 493
 494 static int
 495 gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes,
 496     struct gve_tx_iovec iov[2])
 497 {
 498         size_t overflow, padding;
 499         uint32_t aligned_head;
 500         int nfrags = 0;
 501
 502         if (bytes == 0)
 503                 return (0);
 504
 505         /*
 506          * This check happens before we know how much padding is needed to
 507          * align to a cacheline boundary for the payload, but that is fine,
 508          * because the FIFO head always start aligned, and the FIFO's boundaries
 509          * are aligned, so if there is space for the data, there is space for
 510          * the padding to the next alignment.
 511          */
 512         KASSERT(gve_tx_fifo_can_alloc(fifo, bytes),
 513             ("Allocating gve tx fifo when there is no room"));
 514
 515         nfrags++;
 516
 517         iov[0].iov_offset = fifo->head;
 518         iov[0].iov_len = bytes;
 519         fifo->head += bytes;
 520
 521         if (fifo->head > fifo->size) {
 522                 /*
 523                  * If the allocation did not fit in the tail fragment of the
 524                  * FIFO, also use the head fragment.
 525                  */
 526                 nfrags++;
 527                 overflow = fifo->head - fifo->size;
 528                 iov[0].iov_len -= overflow;
 529                 iov[1].iov_offset = 0;  /* Start of fifo*/
 530                 iov[1].iov_len = overflow;
 531
 532                 fifo->head = overflow;
 533         }
 534
 535         /* Re-align to a cacheline boundary */
 536         aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE);
 537         padding = aligned_head - fifo->head;
 538         iov[nfrags - 1].iov_padding = padding;
 539         atomic_add_int(&fifo->available, -(bytes + padding));
 540         fifo->head = aligned_head;
 541
 542         if (fifo->head == fifo->size)
 543                 fifo->head = 0;
 544
 545         return (nfrags);
 546 }
 547
 548 /* Only error this returns is ENOBUFS when the tx fifo is short of space */
 549 static int
 550 gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf)
 551 {
 552         bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false;
 553         int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset;
 554         uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len;
 555         int pad_bytes, hdr_nfrags, payload_nfrags;
 556         struct gve_tx_pkt_desc *pkt_desc;
 557         struct gve_tx_seg_desc *seg_desc;
 558         struct gve_tx_mtd_desc *mtd_desc;
 559         struct gve_tx_buffer_state *info;
 560         uint32_t idx = tx->req & tx->mask;
 561         struct ether_header *eh;
 562         struct mbuf *mbuf_next;
 563         int payload_iov = 2;
 564         int bytes_required;
 565         struct ip6_hdr *ip6;
 566         struct tcphdr *th;
 567         uint32_t next_idx;
 568         uint8_t l3_off;
 569         struct ip *ip;
 570         int i;
 571
 572         info = &tx->info[idx];
 573         csum_flags = mbuf->m_pkthdr.csum_flags;
 574         pkt_len = mbuf->m_pkthdr.len;
 575         is_tso = csum_flags & CSUM_TSO;
 576         has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
 577             CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
 578         mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0;
 579         tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0;
 580
 581         eh = mtod(mbuf, struct ether_header *);
 582         KASSERT(eh->ether_type != ETHERTYPE_VLAN,
 583             ("VLAN-tagged packets not supported"));
 584
 585         is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6;
 586         l3_off = ETHER_HDR_LEN;
 587         mbuf_next = m_getptr(mbuf, l3_off, &offset);
 588
 589         if (is_ipv6) {
 590                 ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset));
 591                 l4_off = l3_off + sizeof(struct ip6_hdr);
 592                 is_tcp = (ip6->ip6_nxt == IPPROTO_TCP);
 593                 is_udp = (ip6->ip6_nxt == IPPROTO_UDP);
 594                 mbuf_next = m_getptr(mbuf, l4_off, &offset);
 595         } else if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 596                 ip = (struct ip *)(mtodo(mbuf_next, offset));
 597                 l4_off = l3_off + (ip->ip_hl << 2);
 598                 is_tcp = (ip->ip_p == IPPROTO_TCP);
 599                 is_udp = (ip->ip_p == IPPROTO_UDP);
 600                 mbuf_next = m_getptr(mbuf, l4_off, &offset);
 601         }
 602
 603         l4_data_off = 0;
 604         if (is_tcp) {
 605                 th = (struct tcphdr *)(mtodo(mbuf_next, offset));
 606                 l4_data_off = l4_off + (th->th_off << 2);
 607         } else if (is_udp)
 608                 l4_data_off = l4_off + sizeof(struct udphdr);
 609
 610         if (has_csum_flag) {
 611                 if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0)
 612                         csum_offset = offsetof(struct tcphdr, th_sum);
 613                 else
 614                         csum_offset = offsetof(struct udphdr, uh_sum);
 615         }
 616
 617         /*
 618          * If this packet is neither a TCP nor a UDP packet, the first segment,
 619          * the one represented by the packet descriptor, will carry the
 620          * spec-stipulated minimum of 182B.
 621          */
 622         if (l4_data_off != 0)
 623                 first_seg_len = l4_data_off;
 624         else
 625                 first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES);
 626
 627         bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len);
 628         if (__predict_false(!gve_can_tx(tx, bytes_required))) {
 629                 counter_enter();
 630                 counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_device, 1);
 631                 counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
 632                 counter_exit();
 633                 return (ENOBUFS);
 634         }
 635
 636         /* So that the cleanup taskqueue can free the mbuf eventually. */
 637         info->mbuf = mbuf;
 638
 639         /*
 640          * We don't want to split the header, so if necessary, pad to the end
 641          * of the fifo and then put the header at the beginning of the fifo.
 642          */
 643         pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len);
 644         hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes,
 645             &info->iov[0]);
 646         KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0"));
 647         payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len,
 648             &info->iov[payload_iov]);
 649
 650         pkt_desc = &tx->desc_ring[idx].pkt;
 651         gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off,
 652             1 + mtd_desc_nr + payload_nfrags, first_seg_len,
 653             info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset,
 654             pkt_len);
 655
 656         m_copydata(mbuf, 0, first_seg_len,
 657             (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset);
 658         gve_dma_sync_for_device(tx->com.qpl,
 659             info->iov[hdr_nfrags - 1].iov_offset,
 660             info->iov[hdr_nfrags - 1].iov_len);
 661         copy_offset = first_seg_len;
 662
 663         if (mtd_desc_nr == 1) {
 664                 next_idx = (tx->req + 1) & tx->mask;
 665                 mtd_desc = &tx->desc_ring[next_idx].mtd;
 666                 gve_tx_fill_mtd_desc(mtd_desc, mbuf);
 667         }
 668
 669         for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
 670                 next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
 671                 seg_desc = &tx->desc_ring[next_idx].seg;
 672
 673                 gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len,
 674                     info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss);
 675
 676                 m_copydata(mbuf, copy_offset, info->iov[i].iov_len,
 677                     (char *)tx->fifo.base + info->iov[i].iov_offset);
 678                 gve_dma_sync_for_device(tx->com.qpl,
 679                     info->iov[i].iov_offset, info->iov[i].iov_len);
 680                 copy_offset += info->iov[i].iov_len;
 681         }
 682
 683         tx->req += (1 + mtd_desc_nr + payload_nfrags);
 684         if (is_tso) {
 685                 counter_enter();
 686                 counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
 687                 counter_exit();
 688         }
 689         return (0);
 690 }
 691
 692 static void
 693 gve_xmit_br(struct gve_tx_ring *tx)
 694 {
 695         struct gve_priv *priv = tx->com.priv;
 696         struct ifnet *ifp = priv->ifp;
 697         struct mbuf *mbuf;
 698
 699         while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 &&
 700             (mbuf = drbr_peek(ifp, tx->br)) != NULL) {
 701
 702                 if (__predict_false(gve_xmit(tx, mbuf) != 0)) {
 703                         drbr_putback(ifp, tx->br, mbuf);
 704                         taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
 705                         break;
 706                 }
 707
 708                 drbr_advance(ifp, tx->br);
 709                 BPF_MTAP(ifp, mbuf);
 710
 711                 bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
 712                     BUS_DMASYNC_PREWRITE);
 713                 gve_db_bar_write_4(priv, tx->com.db_offset, tx->req);
 714         }
 715 }
 716
 717 void
 718 gve_xmit_tq(void *arg, int pending)
 719 {
 720         struct gve_tx_ring *tx = (struct gve_tx_ring *)arg;
 721
 722         GVE_RING_LOCK(tx);
 723         gve_xmit_br(tx);
 724         GVE_RING_UNLOCK(tx);
 725 }
 726
 727 static bool
 728 is_vlan_tagged_pkt(struct mbuf *mbuf)
 729 {
 730         struct ether_header *eh;
 731
 732         eh = mtod(mbuf, struct ether_header *);
 733         return (ntohs(eh->ether_type) == ETHERTYPE_VLAN);
 734 }
 735
 736 int
 737 gve_xmit_ifp(if_t ifp, struct mbuf *mbuf)
 738 {
 739         struct gve_priv *priv = if_getsoftc(ifp);
 740         struct gve_tx_ring *tx;
 741         bool is_br_empty;
 742         int err;
 743         uint32_t i;
 744
 745         if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0))
 746                 return (ENODEV);
 747
 748         if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE)
 749                 i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues;
 750         else
 751                 i = curcpu % priv->tx_cfg.num_queues;
 752         tx = &priv->tx[i];
 753
 754         if (__predict_false(is_vlan_tagged_pkt(mbuf))) {
 755                 counter_enter();
 756                 counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1);
 757                 counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
 758                 counter_exit();
 759                 m_freem(mbuf);
 760                 return (ENODEV);
 761         }
 762
 763         is_br_empty = drbr_empty(ifp, tx->br);
 764         err = drbr_enqueue(ifp, tx->br, mbuf);
 765         if (__predict_false(err != 0)) {
 766                 taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
 767                 counter_enter();
 768                 counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1);
 769                 counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1);
 770                 counter_exit();
 771                 return (err);
 772         }
 773
 774         /*
 775          * If the mbuf we just enqueued is the only one on the ring, then
 776          * transmit it right away in the interests of low latency.
 777          */
 778         if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) {
 779                 gve_xmit_br(tx);
 780                 GVE_RING_UNLOCK(tx);
 781         } else {
 782                 taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task);
 783         }
 784
 785         return (0);
 786 }
 787
 788 void
 789 gve_qflush(if_t ifp)
 790 {
 791         struct gve_priv *priv = if_getsoftc(ifp);
 792         struct gve_tx_ring *tx;
 793         int i;
 794
 795         for (i = 0; i < priv->tx_cfg.num_queues; ++i) {
 796                 tx = &priv->tx[i];
 797                 if (drbr_empty(ifp, tx->br) == 0) {
 798                         GVE_RING_LOCK(tx);
 799                         drbr_flush(ifp, tx->br);
 800                         GVE_RING_UNLOCK(tx);
 801                 }
 802         }
 803
 804         if_qflush(ifp);
 805 }