/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 * TODO:
 *    - create async event for smart and log
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <pthread_np.h>
#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>
static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
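/* Example: with NVME_MPSMIN == 0 this is 1 << 12 == 4096, i.e. 4 KiB pages */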
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
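/*
 * Example (assuming NVME_MDTS, defined elsewhere in this file, is 9): the
 * largest transfer would be (1 << 9) * 4096 bytes = 2 MiB, described by up
 * to 512 + 1 page descriptors.
 */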
/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)
/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
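/*
 * Example: num_squeues = 4 and num_cqueues = 2 (one-based counts) encode as
 * (3 & 0xffff) | ((1 & 0xffff) << 16) == 0x00010003, matching the zero-based
 * NSQA/NCQA fields returned for the Number of Queues feature.
 */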
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI = 0x04,
	NVME_CR_INTMS = 0x0c,
	NVME_CR_INTMC = 0x10,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI = 0x34,

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV = 0xFFFF0000,

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	uint16_t	tail;	/* nvme progress */
	uint16_t	head;	/* guest progress */

struct nvme_submission_queue {
	struct nvme_command *qbase;
	uint16_t	head;	/* nvme progress */
	uint16_t	tail;	/* guest progress */
	uint16_t	cqid;	/* completion queue id */

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	uint32_t	sectsz_bits;
	uint32_t	deallocate:1;
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer Size (MDTS) and given the number
 * of default iovec's in a struct blockif_req.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
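/*
 * Example: if NVME_MAX_IOVEC works out to 129 descriptors and blockif.h
 * provides BLOCKIF_IOV_MAX == 128, one extra iovec of padding is needed;
 * when blockif already has enough, the pad is zero (the actual values
 * depend on NVME_MDTS and blockif.h).
 */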
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;

	/* command information */

	uint64_t	prev_gpaddr;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	bool	namespace_specific;

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;
/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
	PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
	PCI_NVME_AE_INFO_FW_ACTIVATION,
	PCI_NVME_AE_INFO_TELEMETRY_CHANGE,
	PCI_NVME_AE_INFO_ANA_CHANGE,
	PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AE_INFO_LBA_STATUS_ALERT,
	PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AE_INFO_MAX,
} pci_nvme_async_info;
/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;

	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];

	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);
/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
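/*
 * Example: cpywithpad(cd->sn, sizeof(cd->sn), "BHYVE-0001", ' ') stores
 * "BHYVE-0001" followed by ten spaces in the 20-byte field; identify
 * strings are space padded and not NUL terminated, as NVMe expects.
 */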
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;

pci_nvme_status_genc(uint16_t *status, uint16_t code)

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);

	struct nvme_submission_queue *sq = sc->submit_queues;

	for (i = 0; i < sc->num_squeues; i++)
		pthread_mutex_init(&sq[i].mtx, NULL);

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);

	struct nvme_completion_queue *cq = sc->compl_queues;

	for (i = 0; i < sc->num_cqueues; i++)
		pthread_mutex_init(&cq[i].mtx, NULL);
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;

	cd->power_state[0].mp = 10;
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
crc16(uint16_t crc, const void *buffer, unsigned int len)
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};
	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
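/*
 * crc16() is the table-driven, reflected form of CRC-16 (poly 0x8005),
 * consuming one byte per iteration; e.g. crc16(0, data, strlen(data)) as
 * used below when synthesizing an EUI-64.
 */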
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));

		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
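		/*
		 * Resulting layout: the upper 48 bits come from the FreeBSD
		 * OUI-based NVMe block (with a CRC-16 of the VM name and PCI
		 * address folded into its low bits) and the low 16 bits hold
		 * the namespace ID.
		 */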
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
pci_nvme_init_logpages(struct pci_nvme_softc *sc)

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = 310;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;
pci_nvme_init_features(struct pci_nvme_softc *sc)

	sc->feat[0].set = nvme_feature_invalid_cb;
	sc->feat[0].get = nvme_feature_invalid_cb;

	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
	    nvme_feature_iv_config;
	/* Enable all AENs by default */
	sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
	    nvme_feature_invalid_cb;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
	    nvme_feature_invalid_cb;
pci_nvme_aer_reset(struct pci_nvme_softc *sc)

	STAILQ_INIT(&sc->aer_list);

pci_nvme_aer_init(struct pci_nvme_softc *sc)

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);

pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);

	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
pci_nvme_aer_available(struct pci_nvme_softc *sc)

	return (sc->aer_count != 0);

pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1));
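/*
 * Example: with ctrldata.aerl == 3 (a zero-based value), the host may keep
 * up to four Asynchronous Event Requests outstanding; a fifth is rejected
 * with ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED by the submission path below.
 */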
/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
	struct pci_nvme_aer *aer = NULL;

	if (pci_nvme_aer_limit_reached(sc))

	aer = calloc(1, sizeof(struct pci_nvme_aer));

	/* Save the Command ID for use in the completion message */

	pthread_mutex_lock(&sc->aer_mtx);
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);
/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
	pthread_mutex_unlock(&sc->aer_mtx);
pci_nvme_aen_reset(struct pci_nvme_softc *sc)

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;

pci_nvme_aen_init(struct pci_nvme_softc *sc)

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)

	pci_nvme_aen_reset(sc);

/* Notify the AEN thread of pending work */
pci_nvme_aen_notify(struct pci_nvme_softc *sc)

	pthread_cond_signal(&sc->aen_cond);
/*
 * Post an Asynchronous Event Notification
 */
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
		pthread_mutex_unlock(&sc->aen_mtx);

	aen->event_data = event_data;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);
pci_nvme_aen_process(struct pci_nvme_softc *sc)
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);

			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__,
		    atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				break;
			}
			if (((1 << aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AE_INFO_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AE_INFO_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;

		default:
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;

		aer = pci_nvme_aer_get(sc);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid,
		    (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype,	/* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		pci_generate_msix(sc->nsc_pi, 0);
	struct pci_nvme_softc *sc;

	pthread_mutex_lock(&sc->aen_mtx);
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	pthread_mutex_unlock(&sc->aen_mtx);
pci_nvme_reset_locked(struct pci_nvme_softc *sc)

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);
pci_nvme_reset(struct pci_nvme_softc *sc)
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)

	if (len > (8 * 1024)) {

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);

	if (dir == NVME_COPY_TO_PRP)
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
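	/*
	 * The controller inverts the Phase Tag on each pass through the
	 * queue, which lets the host tell newly posted entries apart from
	 * stale ones it has already consumed.
	 */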
	cqe->sqhd = sq->head;
	cqe->status = status;

	if (cq->tail >= cq->size) {

	pthread_mutex_unlock(&cq->mtx);
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);

		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);

	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_completion) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	uint8_t logpage = command->cdw10 & 0xFF;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
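	/*
	 * Example: to read a full 4 KiB log page the host sets the zero-based
	 * NUMDU/NUMDL dword count to 1023, giving (1023 + 1) * 4 = 4096 bytes.
	 */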
	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}

		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);

	compl->status = status;
nvme_fid_to_name(uint8_t fid)

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operational Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
	uint32_t cdw11 = command->cdw11;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat->set(sc, feat, command, compl);

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];

	feat->get(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);

		req->nvme_sq = &sc->submit_queues[0];

		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)
	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
	    sc->aer_count, sc->ctrldata.aerl, command->cid);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);

	/*
	 * Raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	compl->status = NVME_NO_STATUS;
	pci_nvme_aen_notify(sc);
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
				break;
			}
			nvme_opc_format_nvm(sc, cmd, &compl);
			break;
		default:
			DPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);

		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);

	DPRINTF("setting sqhead %u", sqhead);

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
 */
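/*
 * Example: a 4 KiB write adds 8 (4096 / 512) to write_dunits_remainder;
 * because the remainder is initialized to 999, the first successful write
 * immediately accounts one full data unit, which is the rounding up the
 * spec requires.
 */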
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
 */
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nlb)
	size_t	offset, bytes;

	/* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nlb << nvstore->sectsz_bits;
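	/*
	 * Example: with 512 byte sectors (sectsz_bits == 9), any slba at or
	 * above 1 << 55 would overflow the 64-bit byte offset; the shift
	 * check above rejects it before offset is computed.
	 */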
	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size - offset) < bytes)
		return (true);
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)

	req->nvme_sq = NULL;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
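/*
 * iosemlock counts free ioreq slots: pci_nvme_get_ioreq() sem_wait()s
 * before taking a request off ioreqs_free, and the sem_post() above
 * returns the slot, so submissions block rather than fail when all
 * ioslots are in flight.
 */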
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
pci_nvme_io_done(struct blockif_req *br, int err)
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;

	if (is_write)
		dir = NVME_COPY_TO_PRP;
	else
		dir = NVME_COPY_FROM_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);

	} else if (bytes <= PAGE_SIZE) {
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
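			/*
			 * Example: a transfer needing 600 pages spans two PRP
			 * lists; entry NVME_PRP2_ITEMS - 1 (the 512th) of the
			 * first 4 KiB list is not a data pointer but the guest
			 * address of the next list, which the test above
			 * follows whenever more than a page of data remains.
			 */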
			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);

	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2321 nvme_opc_write_read(struct pci_nvme_softc *sc,
2322 struct nvme_command *cmd,
2323 struct pci_nvme_blockstore *nvstore,
2324 struct pci_nvme_ioreq *req,
    uint16_t *status)
{
    uint64_t lba, nblocks, bytes;
    size_t offset;
    bool is_write = cmd->opc == NVME_OPC_WRITE;
    bool pending = false;
2332 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2333 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2335 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
        WPRINTF("%s command would exceed LBA range", __func__);
        pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
        goto out;
    }
2341 bytes = nblocks << nvstore->sectsz_bits;
2342 if (bytes > NVME_MAX_DATA_SIZE) {
        WPRINTF("%s command would exceed MDTS", __func__);
        pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
        goto out;
    }
2348 offset = lba << nvstore->sectsz_bits;
2351 req->io_req.br_offset = lba;
2353 /* PRP bits 1:0 must be zero */
2354 cmd->prp1 &= ~0x3UL;
2355 cmd->prp2 &= ~0x3UL;
2357 if (nvstore->type == NVME_STOR_RAM) {
2358 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2359 cmd->prp2, offset, bytes, is_write);
    } else {
        *status = nvme_write_read_blockif(sc, nvstore, req,
            cmd->prp1, cmd->prp2, offset, bytes, is_write);
    }
    if (*status == NVME_NO_STATUS)
        pending = true;

out:
    if (!pending)
        pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

    return (pending);
}
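/*
 * Worked example (assuming a 512-byte sector store, sectsz_bits == 9):
 * cdw10 = 0x10 and cdw11 = 0 give lba 16; cdw12 & 0xFFFF == 7 gives
 * nblocks 8 (the field is zero-based), so offset = 16 << 9 = 8192 and
 * bytes = 8 << 9 = 4096, well under NVME_MAX_DATA_SIZE.
 */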
2375 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
    struct pci_nvme_ioreq *req = br->br_param;
    struct pci_nvme_softc *sc = req->sc;
    bool done = true;
    uint16_t status;
    if (err) {
        pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2384 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2385 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
    } else {
        struct iovec *iov = req->io_req.br_iov;

        req->prev_gpaddr++;
        iov += req->prev_gpaddr;
2392 /* The iov_* values already include the sector size */
2393 req->io_req.br_offset = (off_t)iov->iov_base;
2394 req->io_req.br_resid = iov->iov_len;
        if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
            pci_nvme_status_genc(&status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
        } else
            done = false;
    }

    if (done) {
2403 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2404 req->cid, 0, status);
        pci_nvme_release_ioreq(sc, req);
    }
}
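/*
 * Design note: rather than growing struct pci_nvme_ioreq, the
 * deallocate state machine above recycles prev_gpaddr as the index of
 * the range currently being trimmed and prev_size as the total range
 * count, reissuing blockif_delete() from the callback until the last
 * iovec entry completes.
 */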
2410 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2411 struct nvme_command *cmd,
2412 struct pci_nvme_blockstore *nvstore,
2413 struct pci_nvme_ioreq *req,
    uint16_t *status)
{
    struct nvme_dsm_range *range;
    uint32_t nr, r, non_zero, dr;
    bool pending = false;
2421 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2422 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2426 nr = cmd->cdw10 & 0xff;
2428 /* copy locally because a range entry could straddle PRPs */
2429 range = calloc(1, NVME_MAX_DSM_TRIM);
2430 if (range == NULL) {
        pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
        goto out;
    }
2434 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2435 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2437 /* Check for invalid ranges and the number of non-zero lengths */
    non_zero = 0;
    for (r = 0; r <= nr; r++) {
2440 if (pci_nvme_out_of_range(nvstore,
2441 range[r].starting_lba, range[r].length)) {
            pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
            goto out;
        }
        if (range[r].length != 0)
            non_zero++;
    }
2449 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2450 size_t offset, bytes;
        int sectsz_bits = sc->nvstore.sectsz_bits;
        int err;
        /*
         * DSM calls are advisory only, and compliant controllers
         * may choose to take no action (i.e. return Success).
         */
2457 if (!nvstore->deallocate) {
            pci_nvme_status_genc(status, NVME_SC_SUCCESS);
            goto out;
        }
2462 /* If all ranges have a zero length, return Success */
2463 if (non_zero == 0) {
            pci_nvme_status_genc(status, NVME_SC_SUCCESS);
            goto out;
        }
        if (req == NULL) {
            pci_nvme_status_genc(status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
            goto out;
        }
2473 offset = range[0].starting_lba << sectsz_bits;
2474 bytes = range[0].length << sectsz_bits;
        /*
         * If the request is for more than a single range, store
         * the ranges in the br_iov. Optimize for the common case
         * of a single range.
         *
         * Note that NVMe Number of Ranges is a zero based value
         */
2483 req->io_req.br_iovcnt = 0;
2484 req->io_req.br_offset = offset;
2485 req->io_req.br_resid = bytes;
        if (nr == 0) {
            req->io_req.br_callback = pci_nvme_io_done;
        } else {
            struct iovec *iov = req->io_req.br_iov;
2492 for (r = 0, dr = 0; r <= nr; r++) {
2493 offset = range[r].starting_lba << sectsz_bits;
                bytes = range[r].length << sectsz_bits;
                if (bytes == 0)
                    continue;

                if ((nvstore->size - offset) < bytes) {
                    pci_nvme_status_genc(status,
                        NVME_SC_LBA_OUT_OF_RANGE);
                    goto out;
                }
                iov[dr].iov_base = (void *)offset;
                iov[dr].iov_len = bytes;
                dr++;
            }

            req->io_req.br_callback = pci_nvme_dealloc_sm;
            /*
             * Use prev_gpaddr to track the current entry and
             * prev_size to track the number of entries
             */
2513 req->prev_gpaddr = 0;
2514 req->prev_size = dr;
        }

        err = blockif_delete(nvstore->ctx, &req->io_req);
        if (err)
            pci_nvme_status_genc(status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
        else
            pending = true;
    }

out:
    free(range);

    return (pending);
}
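/*
 * Example DSM payload (illustrative values): cdw10[7:0] is the
 * zero-based Number of Ranges, so nr == 1 selects two entries, e.g.:
 *
 *	struct nvme_dsm_range ranges[2] = {
 *		{ .length = 8,  .starting_lba = 0  },
 *		{ .length = 16, .starting_lba = 64 },
 *	};
 *
 * With 512-byte sectors this deallocates 4KiB at byte offset 0 and
 * 8KiB at byte offset 32768, and non_zero above ends up as 2.
 */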
2529 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
    struct nvme_submission_queue *sq;
    uint16_t status;
    uint16_t sqhead;
2535 /* handle all submissions up to sq->tail index */
2536 sq = &sc->submit_queues[idx];
    pthread_mutex_lock(&sq->mtx);

    sqhead = sq->head;
    DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
        idx, sqhead, sq->tail, sq->qbase);
2544 while (sqhead != atomic_load_acq_short(&sq->tail)) {
        struct nvme_command *cmd;
        struct pci_nvme_ioreq *req;
        uint32_t nsid;
        bool pending;

        pending = false;
        req = NULL;
        status = 0;

        cmd = &sq->qbase[sqhead];
2555 sqhead = (sqhead + 1) % sq->size;
2557 nsid = le32toh(cmd->nsid);
2558 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
            pci_nvme_status_genc(&status,
                NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
            status |=
                NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
            goto complete;
        }
        req = pci_nvme_get_ioreq(sc);
        if (req == NULL) {
            pci_nvme_status_genc(&status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
            WPRINTF("%s: unable to allocate IO req", __func__);
            goto complete;
        }
        req->nvme_sq = sq;
        req->sqid = idx;
        req->opc = cmd->opc;
2576 req->cid = cmd->cid;
2577 req->nsid = cmd->nsid;
        switch (cmd->opc) {
        case NVME_OPC_FLUSH:
            pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
                req, &status);
            break;
        case NVME_OPC_WRITE:
        case NVME_OPC_READ:
            pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
                req, &status);
            break;
        case NVME_OPC_WRITE_ZEROES:
            /* TODO: write zeroes
            WPRINTF("%s write zeroes lba 0x%lx blocks %u",
                __func__, lba, cmd->cdw12 & 0xFFFF); */
            pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
            break;
        case NVME_OPC_DATASET_MANAGEMENT:
            pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
                req, &status);
            break;
        default:
            WPRINTF("%s unhandled io command 0x%x",
                __func__, cmd->opc);
            pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
        }
complete:
        if (!pending) {
            pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                status);
            if (req != NULL)
                pci_nvme_release_ioreq(sc, req);
        }
    }
    sq->head = sqhead;

    pthread_mutex_unlock(&sq->mtx);
}
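/*
 * Concurrency note (summary of the scheme above): the guest publishes
 * a new tail via the doorbell path's atomic_store_short() and this
 * loop consumes entries until head catches the acquire-loaded tail,
 * so command memory written before the doorbell ring is visible here.
 */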
2619 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2620 uint64_t idx, int is_sq, uint64_t value)
2622 DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2623 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
    if (is_sq) {
        if (idx > sc->num_squeues) {
            WPRINTF("%s queue index %lu overflow from "
                "guest (max %u)",
                __func__, idx, sc->num_squeues);
            return;
        }

        atomic_store_short(&sc->submit_queues[idx].tail,
            (uint16_t)value);
        if (idx == 0) {
            pci_nvme_handle_admin_cmd(sc, value);
        } else {
            /* submission queue; handle new entries in SQ */
            if (idx > sc->num_squeues) {
                WPRINTF("%s SQ index %lu overflow from "
                    "guest (max %u)",
                    __func__, idx, sc->num_squeues);
                return;
            }
            pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
        }
    } else {
        if (idx > sc->num_cqueues) {
            WPRINTF("%s queue index %lu overflow from "
                "guest (max %u)",
                __func__, idx, sc->num_cqueues);
            return;
        }

        atomic_store_short(&sc->compl_queues[idx].head,
            (uint16_t)value);
    }
}
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
    const char *s = iswrite ? "WRITE" : "READ";

    switch (offset) {
    case NVME_CR_CAP_LOW:
        DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
        break;
    case NVME_CR_CAP_HI:
        DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
        break;
    case NVME_CR_VS:
        DPRINTF("%s %s NVME_CR_VS", func, s);
        break;
    case NVME_CR_INTMS:
        DPRINTF("%s %s NVME_CR_INTMS", func, s);
        break;
    case NVME_CR_INTMC:
        DPRINTF("%s %s NVME_CR_INTMC", func, s);
        break;
    case NVME_CR_CC:
        DPRINTF("%s %s NVME_CR_CC", func, s);
        break;
    case NVME_CR_CSTS:
        DPRINTF("%s %s NVME_CR_CSTS", func, s);
        break;
    case NVME_CR_NSSR:
        DPRINTF("%s %s NVME_CR_NSSR", func, s);
        break;
    case NVME_CR_AQA:
        DPRINTF("%s %s NVME_CR_AQA", func, s);
        break;
    case NVME_CR_ASQ_LOW:
        DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
        break;
    case NVME_CR_ASQ_HI:
        DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
        break;
    case NVME_CR_ACQ_LOW:
        DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
        break;
    case NVME_CR_ACQ_HI:
        DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
        break;
    default:
        DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
    }
}
2713 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t offset, int size, uint64_t value)
{
    uint32_t ccreg;
2718 if (offset >= NVME_DOORBELL_OFFSET) {
2719 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
        uint64_t idx = belloffset / 8; /* doorbell size = 2 * sizeof(uint32_t) */
2721 int is_sq = (belloffset % 8) < 4;
2723 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
            WPRINTF("guest attempted an overflow write offset "
                "0x%lx, val 0x%lx in %s",
                offset, value, __func__);
            return;
        }

        pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
        return;
    }
2734 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2735 offset, size, value);
    if (size != 4) {
        WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
            "val 0x%lx) to bar0 in %s",
            size, offset, value, __func__);
        /* TODO: shutdown device */
        return;
    }
2745 pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2747 pthread_mutex_lock(&sc->mtx);
    switch (offset) {
    case NVME_CR_CAP_LOW:
    case NVME_CR_CAP_HI:
        /* readonly */
        break;
    case NVME_CR_VS:
        /* readonly */
        break;
    case NVME_CR_INTMS:
        /* MSI-X, so ignore */
        break;
    case NVME_CR_INTMC:
        /* MSI-X, so ignore */
        break;
    case NVME_CR_CC:
        ccreg = (uint32_t)value;
        DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
            "iocqes %u",
            __func__,
            NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
            NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
            NVME_CC_GET_IOCQES(ccreg));
2773 if (NVME_CC_GET_SHN(ccreg)) {
2774 /* perform shutdown - flush out data to backend */
2775 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2776 NVME_CSTS_REG_SHST_SHIFT);
2777 sc->regs.csts |= NVME_SHST_COMPLETE <<
                NVME_CSTS_REG_SHST_SHIFT;
        }
        if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
            if (NVME_CC_GET_EN(ccreg) == 0)
                /* transition 1->0 causes controller reset */
                pci_nvme_reset_locked(sc);
            else
                pci_nvme_init_controller(ctx, sc);
        }
2788 /* Insert the iocqes, iosqes and en bits from the write */
2789 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2790 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2791 if (NVME_CC_GET_EN(ccreg) == 0) {
2792 /* Insert the ams, mps and css bit fields */
2793 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2794 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2795 sc->regs.csts &= ~NVME_CSTS_RDY;
2796 } else if (sc->pending_ios == 0) {
            sc->regs.csts |= NVME_CSTS_RDY;
        }
        break;
    case NVME_CR_CSTS:
        break;
    case NVME_CR_NSSR:
        /* ignore writes; don't support subsystem reset */
        break;
    case NVME_CR_AQA:
        sc->regs.aqa = (uint32_t)value;
        break;
    case NVME_CR_ASQ_LOW:
        sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
            (0xFFFFF000 & value);
        break;
    case NVME_CR_ASQ_HI:
        sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
            (value << 32);
        break;
    case NVME_CR_ACQ_LOW:
        sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
            (0xFFFFF000 & value);
        break;
    case NVME_CR_ACQ_HI:
        sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
            (value << 32);
        break;
    default:
        DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
            __func__, offset, value, size);
    }
    pthread_mutex_unlock(&sc->mtx);
}
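/*
 * Doorbell decode example (assuming the 4-byte stride of CAP.DSTRD == 0,
 * with doorbells beginning at NVME_DOORBELL_OFFSET): a 4-byte write to
 * BAR0 offset NVME_DOORBELL_OFFSET + 0x8 yields belloffset 0x8, so
 * idx = 1 and is_sq is true: the tail doorbell of I/O submission queue 1.
 * belloffset 0xc decodes to idx 1 with is_sq false: the head doorbell of
 * completion queue 1.
 */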
2832 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2833 int baridx, uint64_t offset, int size, uint64_t value)
2835 struct pci_nvme_softc* sc = pi->pi_arg;
2837 if (baridx == pci_msix_table_bar(pi) ||
2838 baridx == pci_msix_pba_bar(pi)) {
2839 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2840 " value 0x%lx", baridx, offset, size, value);
        pci_emul_msix_twrite(pi, offset, size, value);
        return;
    }

    switch (baridx) {
    case 0:
        pci_nvme_write_bar_0(ctx, sc, offset, size, value);
        break;

    default:
        DPRINTF("%s unknown baridx %d, val 0x%lx",
            __func__, baridx, value);
    }
}
2857 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
    uint64_t offset, int size)
{
    uint64_t value;
2862 pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2864 if (offset < NVME_DOORBELL_OFFSET) {
2865 void *p = &(sc->regs);
2866 pthread_mutex_lock(&sc->mtx);
2867 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2868 pthread_mutex_unlock(&sc->mtx);
    } else {
        value = 0;
        WPRINTF("pci_nvme: read invalid offset %ld", offset);
    }
    switch (size) {
    case 1:
        value &= 0xFF;
        break;
    case 2:
        value &= 0xFFFF;
        break;
    case 4:
        value &= 0xFFFFFFFF;
        break;
    }
    DPRINTF("nvme-read offset 0x%lx, size %d -> value 0x%x",
        offset, size, (uint32_t)value);

    return (value);
}
2895 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2896 uint64_t offset, int size)
2898 struct pci_nvme_softc* sc = pi->pi_arg;
2900 if (baridx == pci_msix_table_bar(pi) ||
2901 baridx == pci_msix_pba_bar(pi)) {
2902 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2903 baridx, offset, size);
        return pci_emul_msix_tread(pi, offset, size);
    }

    switch (baridx) {
    case 0:
        return pci_nvme_read_bar_0(sc, offset, size);

    default:
        DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
    }

    return (0);
}
2920 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
{
    char bident[sizeof("XX:X:X")];
    const char *value;
    int sectsz = 0;
2926 sc->max_queues = NVME_QUEUES;
2927 sc->max_qentries = NVME_MAX_QENTRIES;
2928 sc->ioslots = NVME_IOSLOTS;
2929 sc->num_squeues = sc->max_queues;
2930 sc->num_cqueues = sc->max_queues;
2931 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2933 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2934 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
    value = get_config_value_node(nvl, "maxq");
    if (value != NULL)
        sc->max_queues = atoi(value);
2939 value = get_config_value_node(nvl, "qsz");
2940 if (value != NULL) {
2941 sc->max_qentries = atoi(value);
        if (sc->max_qentries <= 0) {
            EPRINTLN("nvme: Invalid qsz option %d",
                sc->max_qentries);
            return (-1);
        }
    }
2948 value = get_config_value_node(nvl, "ioslots");
2949 if (value != NULL) {
2950 sc->ioslots = atoi(value);
        if (sc->ioslots <= 0) {
            EPRINTLN("Invalid ioslots option %d", sc->ioslots);
            return (-1);
        }
    }
    value = get_config_value_node(nvl, "sectsz");
    if (value != NULL)
        sectsz = atoi(value);
2959 value = get_config_value_node(nvl, "ser");
2960 if (value != NULL) {
        /*
         * This field indicates the Product Serial Number in
         * 7-bit ASCII; unused bytes should be space characters.
         */
2966 cpywithpad((char *)sc->ctrldata.sn,
2967 sizeof(sc->ctrldata.sn), value, ' ');
    value = get_config_value_node(nvl, "eui64");
    if (value != NULL)
        sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
2972 value = get_config_value_node(nvl, "dsm");
2973 if (value != NULL) {
2974 if (strcmp(value, "auto") == 0)
2975 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2976 else if (strcmp(value, "enable") == 0)
2977 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2978 else if (strcmp(value, "disable") == 0)
2979 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2982 value = get_config_value_node(nvl, "ram");
2983 if (value != NULL) {
2984 uint64_t sz = strtoull(value, NULL, 10);
2986 sc->nvstore.type = NVME_STOR_RAM;
2987 sc->nvstore.size = sz * 1024 * 1024;
2988 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2989 sc->nvstore.sectsz = 4096;
2990 sc->nvstore.sectsz_bits = 12;
        if (sc->nvstore.ctx == NULL) {
            EPRINTLN("nvme: Unable to allocate RAM");
            return (-1);
        }
    } else {
2996 snprintf(bident, sizeof(bident), "%d:%d",
2997 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2998 sc->nvstore.ctx = blockif_open(nvl, bident);
        if (sc->nvstore.ctx == NULL) {
            EPRINTLN("nvme: Could not open backing file: %s",
                strerror(errno));
            return (-1);
        }
        sc->nvstore.type = NVME_STOR_BLOCKIF;
        sc->nvstore.size = blockif_size(sc->nvstore.ctx);
    }
3008 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3009 sc->nvstore.sectsz = sectsz;
3010 else if (sc->nvstore.type != NVME_STOR_RAM)
3011 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
3012 for (sc->nvstore.sectsz_bits = 9;
3013 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3014 sc->nvstore.sectsz_bits++);
3016 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
        sc->max_queues = NVME_QUEUES;

    return (0);
}
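/*
 * Worked example of the sectsz_bits loop above: with sectsz == 4096
 * the loop advances from 9 until (1 << 12) >= 4096, leaving
 * sectsz_bits == 12; with the 512-byte default it stays at 9.
 */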
3023 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
3025 struct pci_nvme_softc *sc;
3026 struct pci_nvme_blockstore *nvstore;
3027 struct nvme_namespace_data *nd;
    sc = arg;
    nvstore = &sc->nvstore;
    nd = &sc->nsdata;
3033 nvstore->size = new_size;
3034 pci_nvme_init_nsdata_size(nvstore, nd);
3036 /* Add changed NSID to list */
3037 sc->ns_log.ns[0] = 1;
3038 sc->ns_log.ns[1] = 0;
    pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
        PCI_NVME_AE_INFO_NS_ATTR_CHANGED);
}
3045 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3047 struct pci_nvme_softc *sc;
    uint32_t pci_membar_sz;
    int error;

    error = 0;
    sc = calloc(1, sizeof(struct pci_nvme_softc));
    pi->pi_arg = sc;
    sc->nsc_pi = pi;

    error = pci_nvme_parse_config(sc, nvl);
    if (error < 0)
        goto done;
3063 STAILQ_INIT(&sc->ioreqs_free);
3064 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3065 for (int i = 0; i < sc->ioslots; i++) {
3066 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3069 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3070 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3071 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3072 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3073 pci_set_cfgdata8(pi, PCIR_PROGIF,
3074 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
    /*
     * Allocate size of NVMe registers + doorbell space for all queues.
     *
3079 * The specification requires a minimum memory I/O window size of 16K.
     * The Windows driver will refuse to start a device with a smaller
     * size.
     */
3083 pci_membar_sz = sizeof(struct nvme_registers) +
3084 2 * sizeof(uint32_t) * (sc->max_queues + 1);
3085 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3087 DPRINTF("nvme membar size: %u", pci_membar_sz);
    error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
    if (error) {
        WPRINTF("%s pci alloc mem bar failed", __func__);
        goto done;
    }
    error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
    if (error) {
        WPRINTF("%s pci add msixcap failed", __func__);
        goto done;
    }
    error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
    if (error) {
        WPRINTF("%s pci add Express capability failed", __func__);
        goto done;
    }
3107 pthread_mutex_init(&sc->mtx, NULL);
3108 sem_init(&sc->iosemlock, 0, sc->ioslots);
3109 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3111 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
    /*
     * Controller data depends on Namespace data so initialize Namespace
     * data before Controller data.
     */
3116 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3117 pci_nvme_init_ctrldata(sc);
3118 pci_nvme_init_logpages(sc);
3119 pci_nvme_init_features(sc);
3121 pci_nvme_aer_init(sc);
3122 pci_nvme_aen_init(sc);
    pci_lintr_request(pi);

done:
    return (error);
}
3133 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
{
    char *cp, *ram;

    if (opts == NULL)
        return (0);

    if (strncmp(opts, "ram=", 4) == 0) {
        cp = strchr(opts, ',');
        if (cp == NULL) {
            set_config_value_node(nvl, "ram", opts + 4);
            return (0);
        }
        ram = strndup(opts + 4, cp - opts - 4);
        set_config_value_node(nvl, "ram", ram);
        free(ram);
3149 return (pci_parse_legacy_config(nvl, cp + 1));
    }
    return (blockif_legacy_config(nvl, opts));
}
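/*
 * Legacy option example (hypothetical values): the string
 * "ram=256,ser=NVME0001" stores "256" under the "ram" node and hands
 * "ser=NVME0001" on to pci_parse_legacy_config(); any option string not
 * beginning with "ram=" is forwarded whole to blockif_legacy_config().
 */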
struct pci_devemu pci_de_nvme = {
    .pe_emu = "nvme",
    .pe_init = pci_nvme_init,
3157 .pe_legacy_config = pci_nvme_legacy_config,
3158 .pe_barwrite = pci_nvme_write,
    .pe_barread = pci_nvme_read
};
3161 PCI_EMUL_SET(pci_de_nvme);