/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2021 Microsoft Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/counter.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_media.h>
#include <netinet/tcp_lro.h>

#include "gdma.h"
#include "hw_channel.h"
/* Microsoft Azure Network Adapter (MANA)'s definitions
 *
 * Structures labeled with "HW DATA" are exchanged with the hardware. All of
 * them are naturally aligned and hence don't need __packed.
 */
/* MANA protocol version */
#define MANA_MAJOR_VERSION 0
#define MANA_MINOR_VERSION 1
#define MANA_MICRO_VERSION 1
#define DRV_MODULE_NAME "mana"

#ifndef DRV_MODULE_VERSION
#define DRV_MODULE_VERSION \
	__XSTRING(MANA_MAJOR_VERSION) "." \
	__XSTRING(MANA_MINOR_VERSION) "." \
	__XSTRING(MANA_MICRO_VERSION)
#endif
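/*
 * For example (illustrative): __XSTRING() stringifies each version
 * component, so with the values above DRV_MODULE_VERSION expands to
 * "0" "." "1" "." "1", which the compiler concatenates into "0.1.1".
 */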
#define DEVICE_NAME "Microsoft Azure Network Adapter (MANA)"
#define DEVICE_DESC "MANA adapter"
/*
 * Supported PCI vendor and device IDs
 */
#ifndef PCI_VENDOR_ID_MICROSOFT
#define PCI_VENDOR_ID_MICROSOFT 0x1414
#endif

#define PCI_DEV_ID_MANA_VF 0x00ba
typedef struct _mana_vendor_id_t {
	uint16_t vendor_id;
	uint16_t device_id;
} mana_vendor_id_t;
typedef uint64_t mana_handle_t;
#define INVALID_MANA_HANDLE ((mana_handle_t)-1)
enum TRI_STATE {
	TRI_STATE_UNKNOWN = -1,
	TRI_STATE_FALSE = 0,
	TRI_STATE_TRUE = 1
};
/* The number of entries in the hardware indirection table must be a power of 2 */
#define MANA_INDIRECT_TABLE_SIZE 64
#define MANA_INDIRECT_TABLE_MASK (MANA_INDIRECT_TABLE_SIZE - 1)
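/*
 * Illustrative sketch (not part of the original header): because the
 * table size is a power of 2, an RSS hash is reduced to a table slot
 * with a mask rather than a modulo, e.g.:
 *
 *	idx = rss_hash & MANA_INDIRECT_TABLE_MASK;
 *	rxq = apc->indir_table[idx];
 */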
/* The Toeplitz hash key's length in bytes: should be a multiple of 8 */
#define MANA_HASH_KEY_SIZE 40
#define COMP_ENTRY_SIZE 64

#define MIN_FRAME_SIZE 146
#define ADAPTER_MTU_SIZE 1500
#define DEFAULT_FRAME_SIZE (ADAPTER_MTU_SIZE + 14)
#define MAX_FRAME_SIZE 4096
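/*
 * For example (illustrative): with the default 1500-byte MTU and the
 * 14-byte Ethernet header, DEFAULT_FRAME_SIZE = 1500 + 14 = 1514 bytes.
 */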
#define RX_BUFFERS_PER_QUEUE 512

#define MAX_SEND_BUFFERS_PER_QUEUE 256

#define EQ_SIZE (8 * PAGE_SIZE)
#define LOG2_EQ_THROTTLE 3

#define MAX_PORTS_IN_MANA_DEV 8
struct mana_send_buf_info {
	struct mbuf *mbuf;
	bus_dmamap_t dma_map;

	/* Required to store the result of mana_gd_post_work_request.
	 * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the
	 * work queue when the WQE is consumed.
	 */
	struct gdma_posted_wqe_info wqe_inf;
};
struct mana_stats {
	counter_u64_t packets;		/* rx, tx */
	counter_u64_t bytes;		/* rx, tx */
	counter_u64_t stop;		/* tx */
	counter_u64_t wakeup;		/* tx */
	counter_u64_t collapse;		/* tx */
	counter_u64_t collapse_err;	/* tx */
	counter_u64_t dma_mapping_err;	/* rx, tx */
	counter_u64_t mbuf_alloc_fail;	/* rx */
	counter_u64_t alt_chg;		/* tx */
	counter_u64_t alt_reset;	/* tx */
	counter_u64_t cqe_err;		/* tx */
	counter_u64_t cqe_unknown_type;	/* tx */
};
struct mana_txq {
	struct gdma_queue *gdma_sq;

	union {
		uint32_t gdma_txq_id;
		struct {
			uint32_t reserved1 :10;
			uint32_t vsq_frame :14;
			uint32_t reserved2 :8;
		};
	};

	/* Store index to the array of tx_qp in port structure */
	int idx;
	/* The alternative txq idx when this txq is under heavy load */
	int alt_txq_idx;

	/* The mbufs are sent to the HW and we are waiting for the CQEs. */
	struct mana_send_buf_info *tx_buf_info;
	uint16_t next_to_use;
	uint16_t next_to_complete;

	atomic_t pending_sends;

	struct buf_ring *txq_br;
	struct mtx txq_mtx;
	char txq_mtx_name[16];

	struct task enqueue_task;
	struct taskqueue *enqueue_tq;

	struct mana_stats stats;
};
/* Max WQE size is 512B. The first 8B is for the GDMA Out of Band (OOB);
 * next is the Client OOB, which can be either 8B or 24B. Thus, with an
 * 8B Client OOB the max space for SGL entries in a single WQE is
 * 512 - 8 - 8 = 496B. Since each SGL entry is 16B in size, the max
 * number of SGLs in a WQE is 496/16 = 31.
 * Save one for emergency use, so set the allowed MAX_MBUF_FRAGS to 30.
 */
#define MAX_MBUF_FRAGS 30
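/*
 * Illustrative compile-time check (added here as an assumption, not
 * part of the original header): MAX_MBUF_FRAGS fragments plus the one
 * emergency SGL entry, at 16B each, must fit in a 512B WQE alongside
 * the 8B GDMA OOB and an 8B Client OOB.
 */
_Static_assert((MAX_MBUF_FRAGS + 1) * 16 + 8 + 8 <= 512,
    "MAX_MBUF_FRAGS must keep the SGL within the 512B max WQE");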
#define MANA_TSO_MAXSEG_SZ PAGE_SIZE
#define MANA_TSO_MAX_SZ IP_MAXPACKET
/* mbuf data and frags dma mappings */
struct mana_mbuf_head {
	bus_addr_t dma_handle[MAX_MBUF_FRAGS + 1];

	uint32_t size[MAX_MBUF_FRAGS + 1];
};

#define MANA_HEADROOM sizeof(struct mana_mbuf_head)
enum mana_tx_pkt_format {
	MANA_SHORT_PKT_FMT = 0,
	MANA_LONG_PKT_FMT = 1,
};
struct mana_tx_short_oob {
	uint32_t pkt_fmt :2;
	uint32_t is_outer_ipv4 :1;
	uint32_t is_outer_ipv6 :1;
	uint32_t comp_iphdr_csum :1;
	uint32_t comp_tcp_csum :1;
	uint32_t comp_udp_csum :1;
	uint32_t supress_txcqe_gen :1;
	uint32_t vcq_num :24;

	uint32_t trans_off :10; /* Transport header offset */
	uint32_t vsq_frame :14;
	uint32_t short_vp_offset :8;
}; /* HW DATA */
struct mana_tx_long_oob {
	uint32_t is_encap :1;
	uint32_t inner_is_ipv6 :1;
	uint32_t inner_tcp_opt :1;
	uint32_t inject_vlan_pri_tag :1;
	uint32_t reserved1 :12;
	uint32_t pcp :3; /* 802.1Q */
	uint32_t dei :1; /* 802.1Q */
	uint32_t vlan_id :12; /* 802.1Q */

	uint32_t inner_frame_offset :10;
	uint32_t inner_ip_rel_offset :6;
	uint32_t long_vp_offset :12;
	uint32_t reserved2 :4;
}; /* HW DATA */
struct mana_tx_oob {
	struct mana_tx_short_oob s_oob;
	struct mana_tx_long_oob l_oob;
}; /* HW DATA */
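/*
 * Illustrative sketch (an assumption, not from the original header):
 * the transmit path selects the OOB format per packet, falling back to
 * the long OOB only when its extra fields (e.g. VLAN tag injection)
 * are needed:
 *
 *	pkg.tx_oob.s_oob.pkt_fmt = need_long_oob ?
 *	    MANA_LONG_PKT_FMT : MANA_SHORT_PKT_FMT;
 */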
enum mana_cqe_type {
	CQE_INVALID = 0,
	CQE_RX_OKAY = 1,
	CQE_RX_COALESCED_4 = 2,
	CQE_RX_OBJECT_FENCE = 3,
	CQE_RX_TRUNCATED = 4,

	CQE_TX_OKAY = 32,
	CQE_TX_SA_DROP = 33,
	CQE_TX_MTU_DROP = 34,
	CQE_TX_INVALID_OOB = 35,
	CQE_TX_INVALID_ETH_TYPE = 36,
	CQE_TX_HDR_PROCESSING_ERROR = 37,
	CQE_TX_VF_DISABLED = 38,
	CQE_TX_VPORT_IDX_OUT_OF_RANGE = 39,
	CQE_TX_VPORT_DISABLED = 40,
	CQE_TX_VLAN_TAGGING_VIOLATION = 41,
};
#define MANA_CQE_COMPLETION 1
struct mana_cqe_header {
	uint32_t cqe_type :6;
	uint32_t client_type :2;
	uint32_t vendor_err :24;
}; /* HW DATA */
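/*
 * Illustrative sketch (not part of the original header): a completion
 * handler reads this common header first and dispatches on the type:
 *
 *	struct mana_cqe_header *hdr = (struct mana_cqe_header *)cqe;
 *	if (hdr->cqe_type == CQE_RX_OKAY)
 *		... process the receive completion ...
 */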
/* NDIS HASH Types */
#define NDIS_HASH_IPV4 BIT(0)
#define NDIS_HASH_TCP_IPV4 BIT(1)
#define NDIS_HASH_UDP_IPV4 BIT(2)
#define NDIS_HASH_IPV6 BIT(3)
#define NDIS_HASH_TCP_IPV6 BIT(4)
#define NDIS_HASH_UDP_IPV6 BIT(5)
#define NDIS_HASH_IPV6_EX BIT(6)
#define NDIS_HASH_TCP_IPV6_EX BIT(7)
#define NDIS_HASH_UDP_IPV6_EX BIT(8)
#define MANA_HASH_L3 (NDIS_HASH_IPV4 | NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX)
#define MANA_HASH_L4 \
	(NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 | \
	NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX)

#define NDIS_HASH_IPV4_L3_MASK (NDIS_HASH_IPV4)
#define NDIS_HASH_IPV4_L4_MASK (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4)
#define NDIS_HASH_IPV6_L3_MASK (NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX)
#define NDIS_HASH_IPV6_L4_MASK \
	(NDIS_HASH_TCP_IPV6 | NDIS_HASH_UDP_IPV6 | \
	NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX)
#define NDIS_HASH_IPV4_MASK \
	(NDIS_HASH_IPV4_L3_MASK | NDIS_HASH_IPV4_L4_MASK)
#define NDIS_HASH_IPV6_MASK \
	(NDIS_HASH_IPV6_L3_MASK | NDIS_HASH_IPV6_L4_MASK)
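/*
 * Illustrative helper (an assumption, not part of the original header):
 * the rx_hashtype reported in a receive completion can be tested
 * against these masks, e.g. to tell a 4-tuple (L4) hash from an
 * address-only (L3) hash:
 */
static inline bool
mana_rx_hash_is_l4(uint32_t hashtype)
{
	return ((hashtype & MANA_HASH_L4) != 0);
}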
struct mana_rxcomp_perpkt_info {
	uint32_t pkt_len :16;
	uint32_t reserved1 :16;
	uint32_t reserved2;
	uint32_t pkt_hash;
}; /* HW DATA */

#define MANA_RXCOMP_OOB_NUM_PPI 4
/* Receive completion OOB */
struct mana_rxcomp_oob {
	struct mana_cqe_header cqe_hdr;

	uint32_t rx_vlan_id :12;
	uint32_t rx_vlantag_present :1;
	uint32_t rx_outer_iphdr_csum_succeed :1;
	uint32_t rx_outer_iphdr_csum_fail :1;
	uint32_t reserved1 :1;
	uint32_t rx_hashtype :9;
	uint32_t rx_iphdr_csum_succeed :1;
	uint32_t rx_iphdr_csum_fail :1;
	uint32_t rx_tcp_csum_succeed :1;
	uint32_t rx_tcp_csum_fail :1;
	uint32_t rx_udp_csum_succeed :1;
	uint32_t rx_udp_csum_fail :1;
	uint32_t reserved2 :1;

	struct mana_rxcomp_perpkt_info ppi[MANA_RXCOMP_OOB_NUM_PPI];

	uint32_t rx_wqe_offset;
}; /* HW DATA */
struct mana_tx_comp_oob {
	struct mana_cqe_header cqe_hdr;

	uint32_t tx_data_offset;

	uint32_t tx_sgl_offset :5;
	uint32_t tx_wqe_offset :27;

	uint32_t reserved[12];
}; /* HW DATA */
enum mana_cq_type {
	MANA_CQ_TYPE_RX,
	MANA_CQ_TYPE_TX,
};

#define CQE_POLLING_BUFFER 512

struct mana_cq {
	struct gdma_queue *gdma_cq;

	/* Cache the CQ id (used to verify if each CQE comes to the right CQ) */
	uint32_t gdma_id;

	/* Type of the CQ: TX or RX */
	enum mana_cq_type type;

	/* Pointer to the mana_rxq that is pushing RX CQEs to the queue.
	 * Only valid, and must be non-NULL, if type is MANA_CQ_TYPE_RX.
	 */
	struct mana_rxq *rxq;

	/* Pointer to the mana_txq that is pushing TX CQEs to the queue.
	 * Only valid, and must be non-NULL, if type is MANA_CQ_TYPE_TX.
	 */
	struct mana_txq *txq;

	/* Taskqueue and related structs */
	struct task cleanup_task;
	struct taskqueue *cleanup_tq;

	/* Budget for one cleanup task */
	int budget;

	/* Buffer which the CQ handler can copy the CQE's into. */
	struct gdma_comp gdma_comp_buf[CQE_POLLING_BUFFER];
};
struct mana_recv_buf_oob {
	/* A valid GDMA work request representing the data buffer. */
	struct gdma_wqe_request wqe_req;

	struct mbuf *mbuf;
	bus_dmamap_t dma_map;

	/* SGL of the buffer going to be sent as part of the work request. */
	uint32_t num_sge;
	struct gdma_sge sgl[MAX_RX_WQE_SGL_ENTRIES];

	/* Required to store the result of mana_gd_post_work_request.
	 * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the
	 * work queue when the WQE is consumed.
	 */
	struct gdma_posted_wqe_info wqe_inf;
};
struct mana_rxq {
	struct gdma_queue *gdma_rq;
	/* Cache the gdma receive queue id */
	uint32_t gdma_id;

	/* Index of RQ in the vPort, not gdma receive queue id */
	uint32_t rxq_idx;

	struct completion fence_event;

	struct mana_cq rx_cq;

	/* Total number of receive buffers to be allocated */
	uint32_t num_rx_buf;

	struct mana_stats stats;

	/* MUST BE THE LAST MEMBER:
	 * Each receive buffer has an associated mana_recv_buf_oob.
	 */
	struct mana_recv_buf_oob rx_oobs[];
};
struct mana_tx_qp {
	struct mana_txq txq;

	struct mana_cq tx_cq;

	mana_handle_t tx_object;
};
struct mana_port_stats {
	counter_u64_t rx_packets;
	counter_u64_t tx_packets;

	counter_u64_t rx_bytes;
	counter_u64_t tx_bytes;

	counter_u64_t rx_drops;
	counter_u64_t tx_drops;

	counter_u64_t stop_queue;
	counter_u64_t wake_queue;
};
struct mana_context {
	struct gdma_dev *gdma_dev;

	if_t ports[MAX_PORTS_IN_MANA_DEV];
};
struct mana_port_context {
	struct mana_context *ac;

	struct ifmedia media;

	struct sx apc_lock;

	/* DMA tag used for queue bufs of the entire port */
	bus_dma_tag_t rx_buf_tag;
	bus_dma_tag_t tx_buf_tag;

	uint8_t mac_addr[ETHER_ADDR_LEN];

	enum TRI_STATE rss_state;

	mana_handle_t default_rxobj;
	bool tx_shortform_allowed;
	uint16_t tx_vp_offset;

	struct mana_tx_qp *tx_qp;

	/* Indirection Table for RX & TX. The values are queue indexes */
	uint32_t indir_table[MANA_INDIRECT_TABLE_SIZE];

	/* Indirection table containing RxObject Handles */
	mana_handle_t rxobj_table[MANA_INDIRECT_TABLE_SIZE];

	/* Hash key used by the NIC */
	uint8_t hashkey[MANA_HASH_KEY_SIZE];

	/* This points to an array of num_queues of RQ pointers. */
	struct mana_rxq **rxqs;

	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
	unsigned int max_queues;
	unsigned int num_queues;

	mana_handle_t port_handle;

	bool port_st_save; /* Saved port state */

	bool bind_cleanup_thread_cpu;
	int last_tx_cq_bind_cpu;
	int last_rx_cq_bind_cpu;

	struct mana_port_stats port_stats;

	struct sysctl_oid_list *port_list;
	struct sysctl_ctx_list que_sysctl_ctx;
};
#define MANA_APC_LOCK_INIT(apc) \
	sx_init(&(apc)->apc_lock, "MANA port lock")
#define MANA_APC_LOCK_DESTROY(apc) sx_destroy(&(apc)->apc_lock)
#define MANA_APC_LOCK_LOCK(apc) sx_xlock(&(apc)->apc_lock)
#define MANA_APC_LOCK_UNLOCK(apc) sx_unlock(&(apc)->apc_lock)
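/*
 * Illustrative usage (an assumption, not from the original header):
 * callers serialize port reconfiguration with the exclusive lock, e.g.:
 *
 *	MANA_APC_LOCK_LOCK(apc);
 *	err = mana_config_rss(apc, TRI_STATE_TRUE, true, true);
 *	MANA_APC_LOCK_UNLOCK(apc);
 */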
int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx,
    bool update_hash, bool update_tab);

int mana_alloc_queues(if_t ndev);
int mana_attach(if_t ndev);
int mana_detach(if_t ndev);

int mana_probe(struct gdma_dev *gd);
void mana_remove(struct gdma_dev *gd);
struct mana_obj_spec {
	uint32_t queue_index;
	uint64_t gdma_region;
	uint32_t queue_size;
	uint32_t attached_eq;
	uint32_t modr_ctx_id;
};
enum mana_command_code {
	MANA_QUERY_DEV_CONFIG = 0x20001,
	MANA_QUERY_GF_STAT = 0x20002,
	MANA_CONFIG_VPORT_TX = 0x20003,
	MANA_CREATE_WQ_OBJ = 0x20004,
	MANA_DESTROY_WQ_OBJ = 0x20005,
	MANA_FENCE_RQ = 0x20006,
	MANA_CONFIG_VPORT_RX = 0x20007,
	MANA_QUERY_VPORT_CONFIG = 0x20008,
};
/* Query Device Configuration */
struct mana_query_device_cfg_req {
	struct gdma_req_hdr hdr;

	/* Driver Capability flags */
	uint64_t drv_cap_flags1;
	uint64_t drv_cap_flags2;
	uint64_t drv_cap_flags3;
	uint64_t drv_cap_flags4;

	uint32_t proto_major_ver;
	uint32_t proto_minor_ver;
	uint32_t proto_micro_ver;
}; /* HW DATA */
struct mana_query_device_cfg_resp {
	struct gdma_resp_hdr hdr;

	uint64_t pf_cap_flags1;
	uint64_t pf_cap_flags2;
	uint64_t pf_cap_flags3;
	uint64_t pf_cap_flags4;

	uint16_t max_num_vports;

	uint32_t max_num_eqs;
}; /* HW DATA */
/* Query vPort Configuration */
struct mana_query_vport_cfg_req {
	struct gdma_req_hdr hdr;
	uint32_t vport_index;
}; /* HW DATA */
struct mana_query_vport_cfg_resp {
	struct gdma_resp_hdr hdr;

	uint32_t num_indirection_ent;

	uint8_t reserved2[2];
}; /* HW DATA */
/* Configure vPort */
struct mana_config_vport_req {
	struct gdma_req_hdr hdr;

	uint32_t doorbell_pageid;
}; /* HW DATA */
struct mana_config_vport_resp {
	struct gdma_resp_hdr hdr;
	uint16_t tx_vport_offset;
	uint8_t short_form_allowed;
}; /* HW DATA */
/* Create WQ Object */
struct mana_create_wqobj_req {
	struct gdma_req_hdr hdr;

	uint64_t wq_gdma_region;
	uint64_t cq_gdma_region;

	uint32_t cq_moderation_ctx_id;
	uint32_t cq_parent_qid;
}; /* HW DATA */
struct mana_create_wqobj_resp {
	struct gdma_resp_hdr hdr;

	mana_handle_t wq_obj;
}; /* HW DATA */
/* Destroy WQ Object */
struct mana_destroy_wqobj_req {
	struct gdma_req_hdr hdr;

	mana_handle_t wq_obj_handle;
}; /* HW DATA */
struct mana_destroy_wqobj_resp {
	struct gdma_resp_hdr hdr;
}; /* HW DATA */
struct mana_fence_rq_req {
	struct gdma_req_hdr hdr;
	mana_handle_t wq_obj_handle;
}; /* HW DATA */
struct mana_fence_rq_resp {
	struct gdma_resp_hdr hdr;
}; /* HW DATA */
/* Configure vPort Rx Steering */
struct mana_cfg_rx_steer_req {
	struct gdma_req_hdr hdr;

	uint16_t num_indir_entries;
	uint16_t indir_tab_offset;

	uint8_t update_default_rxobj;
	uint8_t update_hashkey;
	uint8_t update_indir_tab;

	mana_handle_t default_rxobj;
	uint8_t hashkey[MANA_HASH_KEY_SIZE];
}; /* HW DATA */
struct mana_cfg_rx_steer_resp {
	struct gdma_resp_hdr hdr;
}; /* HW DATA */
#define MANA_MAX_NUM_QUEUES 16

#define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1)
struct mana_tx_package {
	struct gdma_wqe_request wqe_req;
	struct gdma_sge sgl_array[MAX_MBUF_FRAGS];

	struct mana_tx_oob tx_oob;

	struct gdma_posted_wqe_info wqe_info;
};
int mana_restart(struct mana_port_context *apc);

int mana_create_wq_obj(struct mana_port_context *apc,
    mana_handle_t vport,
    uint32_t wq_type, struct mana_obj_spec *wq_spec,
    struct mana_obj_spec *cq_spec,
    mana_handle_t *wq_obj);

void mana_destroy_wq_obj(struct mana_port_context *apc, uint32_t wq_type,
    mana_handle_t wq_obj);

int mana_cfg_vport(struct mana_port_context *apc, uint32_t protection_dom_id,
    uint32_t doorbell_pg_id);

void mana_uncfg_vport(struct mana_port_context *apc);