2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2017 Shunsuke Mie
5 * Copyright (c) 2018 Leon Dang
6 * Copyright (c) 2020 Chuck Tuffli
8 * Function crc16 Copyright (c) 2017, Fedor Uporov
9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * bhyve PCIe-NVMe device emulation.
37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
44 * maxq = max number of queues
45 * qsz = max elements in each queue
46 * ioslots = max number of concurrent io requests
47 * sectsz = sector size (defaults to blockif sector size)
48 * ser = serial number (20-chars max)
49 * eui64 = IEEE Extended Unique Identifier (8 byte value)
* dsm = DataSet Management support. Option is one of: auto, enable, disable
55 - create async event for smart and log
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
68 #include <semaphore.h>
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
80 #include <dev/nvme/nvme.h>
/* Debug output: DPRINTF is compiled in but gated by the runtime flag below. */
static int nvme_debug = 0;
#define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define NVME_MSIX_BAR 4 /* BAR index used for the MSI-X tables */

#define NVME_IOSLOTS 8 /* default number of concurrent IO request slots */

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN (1 << 14)

#define NVME_QUEUES 16 /* default cap on IO SQ/CQ pairs */
#define NVME_MAX_QENTRIES 2048 /* cap on entries per queue */
/* Memory Page size Minimum reported in CAP register */
#define NVME_MPSMIN 0
/* MPSMIN converted to bytes */
#define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN))

/* Number of PRP entries that fit in one PRP List page */
#define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t))
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1)
#define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS 0xffff
#define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS)
/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero) ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one) ((one) - 1)

/*
 * Encode number of SQ's and CQ's for Set/Get Features:
 * zero-based SQ count in bits 15:0, zero-based CQ count in bits 31:16.
 *
 * Fully parenthesized and with the stray trailing semicolon removed so the
 * macro expands safely inside larger expressions (the original form only
 * worked as a complete statement/initializer and leaked the low-precedence
 * '|' / '<<' mix to the caller).
 */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	(((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	  ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)))

#define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell)
/* BAR0 register offsets (NVMe spec, "Controller Registers") */
enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,		/* Controller Capabilities, low dword */
	NVME_CR_CAP_HI = 0x04,		/* Controller Capabilities, high dword */
	NVME_CR_INTMS = 0x0c,		/* Interrupt Mask Set */
	NVME_CR_INTMC = 0x10,		/* Interrupt Mask Clear */
	NVME_CR_ASQ_LOW = 0x28,		/* Admin SQ base address, low dword */
	NVME_CR_ASQ_HI = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,		/* Admin CQ base address, low dword */
	NVME_CR_ACQ_HI = 0x34,

/* CDW11 fields of the Create IO SQ/CQ admin commands */
enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC = 0x0001,	/* Physically Contiguous */
	NVME_CMD_CDW11_IEN = 0x0002,	/* Interrupts ENabled */
	NVME_CMD_CDW11_IV = 0xFFFF0000,	/* Interrupt Vector */
/* Completion-queue interrupt state flags (see intr_en usage below) */
#define NVME_CQ_INTEN	0x01
#define NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;	/* guest CQ memory mapped via vm_map_gpa */
	uint16_t tail; /* nvme progress */
	uint16_t head; /* guest progress */

struct nvme_submission_queue {
	struct nvme_command *qbase;	/* guest SQ memory mapped via vm_map_gpa */
	uint16_t head; /* nvme progress */
	uint16_t tail; /* guest progress */
	uint16_t cqid; /* completion queue id */
/* Backing-storage flavor for the emulated namespace */
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,		/* backed by a blockif device/file */

/* Describes the backing store of the (single) namespace */
struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	uint32_t sectsz_bits;		/* log2 of the sector size */
	uint32_t deallocate:1;		/* backing store supports delete/trim */

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 */
#define MDTS_PAD_SIZE \
	NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
/* Per-request state for an in-flight guest IO */
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;	/* linkage on the free list */
	struct nvme_submission_queue *nvme_sq;	/* SQ the command came from */

	/* command information */
	/* NOTE(review): appears to track the previously-mapped guest
	 * address during the PRP walk — confirm against full source */
	uint64_t prev_gpaddr;

	struct blockif_req io_req;
	/* extra iovec slots so blockif can carry MDTS-sized transfers */
	struct iovec iovpadding[MDTS_PAD_SIZE];

/* How Dataset Management support is advertised/forced (dsm= option) */
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
struct pci_nvme_softc;
struct nvme_feature_obj;

/* Handler invoked for Set Features / Get Features on a given FID */
typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

/* Per-Feature-Identifier state and handlers */
struct nvme_feature_obj {
	bool namespace_specific;	/* feature value is per-namespace */

/* Feature table size: one slot per defined Feature Identifier */
#define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

/* One host-submitted Asynchronous Event Request awaiting completion */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t cid;	/* Command ID of the submitted AER */
/* Per-device emulation state */
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;	/* owning PCI device instance */

	struct nvme_registers regs;	/* emulated BAR0 register file */

	struct nvme_namespace_data nsdata;	/* Identify Namespace data */
	struct nvme_controller_data ctrldata;	/* Identify Controller data */
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;	/* backing store */

	uint16_t max_qentries;	/* max entries per queue */
	uint32_t max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t num_cqueues;
	uint32_t num_squeues;
	bool num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t pending_ios;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];	/* Set/Get Features table */

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t read_data_units;
	__uint128_t write_data_units;
	__uint128_t read_commands;
	__uint128_t write_commands;
	uint32_t read_dunits_remainder;
	uint32_t write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;	/* outstanding AER's, FIFO order */
/* IO request pool management (defined later in the file) */
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

/* CC bits the host may write at any time */
#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

/* presumably: CC bits writable only while the controller is Not ENabled
 * (EN=0) — confirm against the register-write handler */
#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)	/* Phase Tag bit */
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

/* Feature handlers (defined later in the file) */
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
/*
 * Copy src into the fixed-width field dst, filling any remaining space
 * with the pad character. The result is NOT NUL-terminated — NVMe
 * identify strings are space-padded fixed-size fields.
 */
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
/*
 * Set the Status Code Type and Status Code fields of a completion status
 * word, preserving all other bits (notably the Phase Tag).
 */
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;

/* Convenience wrapper: set a Generic Command Status (SCT 0) code */
pci_nvme_status_genc(uint16_t *status, uint16_t code)
	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);

	sc->num_squeues = nsq;

	/* +1 slot for the Admin SQ (queue id 0) */
	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);

	struct nvme_submission_queue *sq = sc->submit_queues;

	for (i = 0; i < sc->num_squeues; i++)
		pthread_mutex_init(&sq[i].mtx, NULL);

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);

	sc->num_cqueues = ncq;

	/* +1 slot for the Admin CQ (queue id 0) */
	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);

	struct nvme_completion_queue *cq = sc->compl_queues;

	for (i = 0; i < sc->num_cqueues; i++)
		pthread_mutex_init(&cq[i].mtx, NULL);
/* Populate the Identify Controller data structure with static values */
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* Model number and firmware revision: space-padded ASCII fields */
	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;	/* NVMe v1.3 */

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states support */

	/* Warning Composite Temperature Threshold */

	/* SQ entries are 64 bytes (2^6), CQ entries 16 bytes (2^4) */
	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	/* Advertise DSM in ONCS according to the dsm= option / backing store */
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;

	cd->power_state[0].mp = 10;
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 *
 * Table-driven CRC-16 (reflected, polynomial 0x8005); crc is the running
 * value, allowing the checksum to be computed incrementally.
 */
crc16(uint16_t crc, const void *buffer, unsigned int len)
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040

	/* one table lookup per input byte */
	crc = (((crc >> 8) & 0xffU) ^
	    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
/*
 * Populate the Identify Namespace data structure for namespace nsid from
 * the backing store (capacity, LBA format, EUI-64).
 */
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;

	/* DSM/deallocate only if the backing store can actually delete */
	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		uint64_t eui64 = nvstore->eui64;

		/* Derive a stable identifier from VM name and PCI b/s/f */
		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));

	/* Low 16 bits carry the namespace id */
	nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);

	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
/* Reset the Error/Health/Firmware log pages and SMART accounting */
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = 310;	/* Kelvin (~37C) */
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;
/* Install Set/Get Features handlers into the FID-indexed table */
pci_nvme_init_features(struct pci_nvme_softc *sc)
	/* FID 0 is reserved: reject both Set and Get */
	sc->feat[0].set = nvme_feature_invalid_cb;
	sc->feat[0].get = nvme_feature_invalid_cb;

	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
	    nvme_feature_iv_config;
	/* Get is not supported for these predictable-latency features */
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
	    nvme_feature_invalid_cb;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
	    nvme_feature_invalid_cb;
/* Initialize the list of outstanding Async Event Requests */
pci_nvme_aer_init(struct pci_nvme_softc *sc)
	STAILQ_INIT(&sc->aer_list);

/* Free all outstanding AER's and re-initialize the list */
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
	struct pci_nvme_aer *aer = NULL;

	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);

	pci_nvme_aer_init(sc);

/* True if at least one host AER is available to complete */
pci_nvme_aer_available(struct pci_nvme_softc *sc)
	return (!STAILQ_EMPTY(&sc->aer_list));
/* True if the host already has the advertised maximum of AER's queued */
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero based value while aer_count is one's based */
	return (sc->aer_count == (cd->aerl + 1));

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
	struct pci_nvme_aer *aer = NULL;

	if (pci_nvme_aer_limit_reached(sc))

	aer = calloc(1, sizeof(struct pci_nvme_aer));

	/* Save the Command ID for use in the completion message */
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
	struct pci_nvme_aer *aer = NULL;

	/* Pop the oldest AER, preserving FIFO completion order */
	aer = STAILQ_FIRST(&sc->aer_list);
	STAILQ_REMOVE_HEAD(&sc->aer_list, link);
/* Reset controller state to power-on defaults; caller holds sc->mtx */
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
	DPRINTF("%s", __func__);

	/*
	 * CAP: MQES (zero-based max queue entries), CQR (contiguous queues
	 * required), TO=60 (timeout; spec units are 500 ms, i.e. 30 s)
	 */
	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	/* Clear all queue state; guest must recreate queues after reset */
	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;

	sc->num_q_is_set = false;

	/* Drop any outstanding Async Event Requests */
	pci_nvme_aer_destroy(sc);

/* Locked wrapper around pci_nvme_reset_locked() */
pci_nvme_reset(struct pci_nvme_softc *sc)
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
/*
 * Map the Admin SQ/CQ from guest memory using the AQA/ASQ/ACQ register
 * values written by the guest (called when the controller is enabled).
 */
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
	DPRINTF("%s", __func__);

	/* AQA.ASQS is zero-based; +1 for actual entry count */
	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	/* AQA.ACQS is zero-based; +1 for actual entry count */
	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
/*
 * Copy up to two pages of data between a host buffer b and guest memory
 * described by a PRP1/PRP2 pair, in the direction given by dir.
 */
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
	/* Two PRP entries cover at most two 4K pages */
	if (len > (8 * 1024)) {

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);

	if (dir == NVME_COPY_TO_PRP)

	/* Remainder (if any) comes from the page at prp2 */
	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);

	if (dir == NVME_COPY_TO_PRP)
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	/* Report current SQ head so the guest can reclaim SQ slots */
	cqe->sqhd = sq->head;
	/* Status written last: phase change signals a new valid entry */
	cqe->status = status;

	/* Advance the tail, wrapping at queue size */
	if (cq->tail >= cq->size) {

	pthread_mutex_unlock(&cq->mtx);
/* Admin command: Delete IO Submission Queue (qid in CDW10[15:0]) */
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	/* qid 0 is the Admin SQ and may not be deleted; qid must exist */
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Admin command: Create IO Submission Queue. Only physically contiguous
 * queues are supported; validates qid, size, and the associated CQ.
 */
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		/* qid must be a valid, not-yet-created IO queue id */
		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);

		nsq = &sc->submit_queues[qid];
		/* QSIZE in CDW10[31:16] is zero-based */
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);

		/* CQID in CDW11[31:16] must reference an existing CQ */
		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		/* Map the guest's queue memory into the host */
		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
/* Admin command: Delete IO Completion Queue (qid in CDW10[15:0]) */
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	/* qid 0 is the Admin CQ and may not be deleted; qid must exist */
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Admin command: Create IO Completion Queue. Only physically contiguous
 * queues are supported; validates qid, interrupt vector, and size.
 */
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	/* qid must be a valid, not-yet-created IO queue id */
	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);

	ncq = &sc->compl_queues[qid];
	/* IEN is bit 1 of CDW11; shift down to NVME_CQ_INTEN (0x01) */
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);

	/* QSIZE in CDW10[31:16] is zero-based */
	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);

	/*
	 * NOTE(review): this maps the CQ using sizeof(struct nvme_command)
	 * (64 bytes/entry) rather than sizeof(struct nvme_completion)
	 * (16 bytes/entry), over-mapping guest memory by 4x — confirm
	 * intent / fix upstream.
	 */
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Admin command: Get Log Page. Supports the Error, Health, and Firmware
 * log pages; the Health page is refreshed from SMART accounting first.
 */
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	uint8_t logpage = command->cdw10 & 0xFF;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
	case NVME_LOG_HEALTH_INFORMATION:
		/* Refresh the log page from the live SMART counters */
		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
	default:
		DPRINTF("%s get log page %x command not supported",

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
/*
 * Admin command: Identify. CNS value in CDW10[7:0] selects Namespace (0),
 * Controller (1), Active NS list (2), or NS Identification Descriptors (3).
 */
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);
		/* single namespace: NSID 1 only */
		((uint32_t *)dest)[0] = 1;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);

		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);

	compl->status = status;
/* Map a Feature Identifier to a human-readable name for debug output */
nvme_fid_to_name(uint8_t fid)
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
	case NVME_FEAT_TIMESTAMP:
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
/*
 * Feature callback for unsupported Feature Identifiers: unconditionally
 * fails the command with Invalid Field in Command.
 */
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
/*
 * Interrupt Vector Configuration (Feature 09h) handler.
 * cdw11 layout: bits 15:0 = Interrupt Vector (IV), bit 16 = Coalescing
 * Disable (CD). The request succeeds only if the vector is in range and is
 * actually assigned to one of the completion queues.
 */
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
	uint32_t cdw11 = command->cdw11;
	/* Default to failure; overwritten below when the request validates */
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);
	if (iv > (sc->max_queues + 1)) {
	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Number of Queues (Feature 07h) handler. Per the NVMe spec this feature
 * may only be set once after a Controller Reset; the requested submission
 * and completion queue counts are clamped to the configured maximum.
 */
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);

	/* NSQR is in cdw11 bits 15:0 (zero-based); 0xffff is illegal */
	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		sc->num_squeues = sc->max_queues;

	/* NCQR is in cdw11 bits 31:16 (zero-based); 0xffff is illegal */
	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		sc->num_cqueues = sc->max_queues;

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
/*
 * Admin command: Set Features. Validates the Feature Identifier, checks
 * namespace-specificity, dispatches to the feature's set() callback, and
 * latches cdw11 into the feature table on success.
 */
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	feat = &sc->feat[fid];

	/* Non-NS-specific features must use NSID 0 or the global NSID */
	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);

	/* Assume success; the callback may overwrite the status */
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	feat->set(sc, feat, command, compl);

	/* Remember the value only if the callback accepted it */
	if (compl->status == NVME_SC_SUCCESS)
		feat->cdw11 = command->cdw11;
/*
 * Admin command: Get Features. Validates the Feature Identifier, invokes
 * the feature's get() callback (if any), and returns the saved cdw11 value
 * in Dword 0 of the completion on success.
 */
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	/* Assume success; the callback may overwrite the status */
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	feat->get(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
/*
 * Admin command: Format NVM. Only supports SES=User Data Erase, LBA
 * format 0, and no end-to-end Protection Information. RAM-backed storage
 * is erased by reallocating the buffer; blockif-backed storage issues an
 * asynchronous delete (TRIM) of the entire device.
 */
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	uint8_t ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_INVALID_FORMAT);

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		/* calloc() zeroes, which is the "erased" state */
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		struct pci_nvme_ioreq *req;

		req = pci_nvme_get_ioreq(sc);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		WPRINTF("%s: unable to allocate IO req", __func__);
		/* Admin commands complete on the admin SQ (queue 0) */
		req->nvme_sq = &sc->submit_queues[0];
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		/* Delete (TRIM) the whole backing store */
		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		pci_nvme_release_ioreq(sc, req);
/*
 * Admin command: Abort. Currently a no-op that always reports Success
 * (NVMe permits a controller to complete Abort without aborting anything).
 */
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Admin command: Asynchronous Event Request. Queues the request (by CID)
 * for later completion when an event fires; if accepted, no completion is
 * posted now (status left as NVME_NO_STATUS).
 */
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);

	/*
	 * Raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	compl->status = NVME_NO_STATUS;
/*
 * Process all pending entries on the Admin Submission Queue (queue 0) in
 * response to a doorbell write. Each command is dispatched by opcode; a
 * completion entry is posted for every command that produced a valid
 * status, then MSI-X vector 0 is raised if the CQ is non-empty.
 */
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	/* Admin SQ/CQ are always queue 0 */
	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	/* Consume entries until the head catches up with the doorbell tail */
	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			/* No firmware slots are emulated */
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			/* Only legal if the Format capability is advertised in OACS */
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
			compl.status = NVME_NO_STATUS;
			nvme_opc_format_nvm(sc, cmd, &compl);
			DPRINTF("0x%x command is not implemented",
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		sqhead = (sqhead + 1) % sq->size;

		/* Commands completing asynchronously leave status invalid */
		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],

	DPRINTF("setting sqhead %u", sqhead);

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
1657 * Update the Write and Read statistics reported in SMART data
1659 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up.
1660 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
/*
 * Update the SMART read/write statistics for a completed I/O command.
 * Command counts include failures; data-unit counts accumulate only for
 * successful transfers, in 1000-block (512-byte) units via a remainder.
 */
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
	pthread_mutex_lock(&sc->mtx);
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);

	pthread_mutex_unlock(&sc->mtx);
1698 * Check if the combination of Starting LBA (slba) and Number of Logical
1699 * Blocks (nlb) exceeds the range of the underlying storage.
1701 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1702 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1706 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1709 size_t offset, bytes;
1711 /* Overflow check of multiplying Starting LBA by the sector size */
1712 if (slba >> (64 - nvstore->sectsz_bits))
1715 offset = slba << nvstore->sectsz_bits;
1716 bytes = nlb << nvstore->sectsz_bits;
1718 /* Overflow check of Number of Logical Blocks */
1719 if ((nvstore->size - offset) < bytes)
/*
 * Append a guest-physical region to the request's blockif iovec, merging
 * with the previous entry when the region is physically contiguous with it
 * (tracked via prev_gpaddr/prev_size) to minimize the iovec count.
 * Fails when NVME_MAX_IOVEC entries are already in use.
 */
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		/* Re-map the merged region at its enlarged size */
		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
		iovidx = req->io_req.br_iovcnt;
		/* First entry establishes the blockif offset */
		req->io_req.br_offset = lba;
		req->io_req.br_resid = 0;
		req->io_req.br_param = req;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
/*
 * Post a completion entry to the CQ associated with the given SQ and raise
 * the CQ's MSI-X vector if interrupts are enabled for that queue.
 */
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
/*
 * Return an I/O request structure to the free list and release its slot in
 * the ioslots semaphore. Also completes a deferred controller-ready
 * transition once the last in-flight I/O drains after an enable/reset.
 */
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
	req->nvme_sq = NULL;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
1823 static struct pci_nvme_ioreq *
1824 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1826 struct pci_nvme_ioreq *req = NULL;;
1828 sem_wait(&sc->iosemlock);
1829 pthread_mutex_lock(&sc->mtx);
1831 req = STAILQ_FIRST(&sc->ioreqs_free);
1832 assert(req != NULL);
1833 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1839 pthread_mutex_unlock(&sc->mtx);
1841 req->io_req.br_iovcnt = 0;
1842 req->io_req.br_offset = 0;
1843 req->io_req.br_resid = 0;
1844 req->io_req.br_param = req;
1845 req->prev_gpaddr = 0;
/*
 * blockif completion callback for I/O commands: translate the blockif
 * errno into an NVMe status, post the completion, update SMART stats, and
 * release the request.
 */
pci_nvme_io_done(struct blockif_req *br, int err)
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
1871 * Implements the Flush command. The specification states:
1872 * If a volatile write cache is not present, Flush commands complete
1873 * successfully and have no effect
1874 * in the description of the Volatile Write Cache (VWC) field of the Identify
1875 * Controller data. Therefore, set status to Success if the command is
1876 * not supported (i.e. RAM or as indicated by the blockif).
/*
 * I/O command: Flush. RAM-backed storage has no volatile cache, so the
 * command completes immediately with Success; blockif-backed storage
 * issues an asynchronous flush (completion arrives via pci_nvme_io_done).
 */
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
/*
 * Perform a Read or Write against RAM-backed storage by copying between
 * the guest PRP regions and the backing buffer. Synchronous; returns an
 * NVMe generic status code.
 */
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;

	/* Reads copy backing store -> PRPs; writes copy PRPs -> backing store */
	dir = NVME_COPY_TO_PRP;
	dir = NVME_COPY_FROM_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
/*
 * Perform a Read or Write against blockif-backed storage. Builds the
 * request's iovec from the PRP entries (PRP1 for the first partial page,
 * then PRP2 either directly, or as a pointer to a PRP list for transfers
 * larger than one page) and submits it asynchronously. Returns
 * NVME_NO_STATUS while the I/O is in flight; completion is posted by
 * pci_nvme_io_done.
 */
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
	uint16_t status = NVME_NO_STATUS;

	/* PRP1 may start mid-page; only map up to the page boundary */
	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	} else if (bytes <= PAGE_SIZE) {
		/* Remainder fits in one page: PRP2 is a plain data pointer */
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		/* Last entry in list points to the next list */
		if (prp_list == last) {
			uint64_t prp = *prp_list;

			prp_list = paddr_guest2host(vmctx, prp,
			    PAGE_SIZE - (prp % PAGE_SIZE));
			last = prp_list + (NVME_PRP2_ITEMS - 1);

		size = MIN(bytes, PAGE_SIZE);

		if (pci_nvme_append_iov_req(sc, req, *prp_list,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);

	req->io_req.br_callback = pci_nvme_io_done;
	err = blockif_write(nvstore->ctx, &req->io_req);
	err = blockif_read(nvstore->ctx, &req->io_req);
	pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
/*
 * I/O command: Read or Write. Validates the LBA range and transfer size
 * (MDTS), then dispatches to the RAM or blockif back end. Updates SMART
 * statistics for synchronous completions.
 */
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
	uint64_t lba, nblocks, bytes;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	/* SLBA spans cdw10 (low) and cdw11 (high); NLB is zero-based */
	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);

	bytes = nblocks << nvstore->sectsz_bits;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);

	offset = lba << nvstore->sectsz_bits;

	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

	/* NVME_NO_STATUS means the blockif I/O is still in flight */
	if (*status == NVME_NO_STATUS)

	pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
/*
 * blockif completion state machine for multi-range Dataset Management
 * deallocate (TRIM) requests. prev_gpaddr is repurposed as the index of
 * the current range and prev_size as the total number of ranges; each
 * completion either submits the next range's delete or finishes the
 * command.
 */
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;

	pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		/* Last range finished */
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
		struct iovec *iov = req->io_req.br_iov;

		/* Advance to the next stored range */
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);

	pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
	    req->cid, 0, status);
	pci_nvme_release_ioreq(sc, req);
/*
 * I/O command: Dataset Management. Only the Deallocate (TRIM) attribute is
 * acted upon. Ranges are copied from guest memory, validated, and then
 * either deleted in a single blockif request (common single-range case) or
 * iterated via the pci_nvme_dealloc_sm state machine (multi-range case,
 * with the ranges staged in the request's iovec).
 */
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
	struct nvme_dsm_range *range;
	uint32_t nr, r, non_zero, dr;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);

	/* Number of Ranges is zero-based, cdw10 bits 7:0 */
	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);

	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		if (range[r].length != 0)

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);

		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		req->io_req.br_callback = pci_nvme_io_done;
		struct iovec *iov = req->io_req.br_iov;

		for (r = 0, dr = 0; r <= nr; r++) {
			offset = range[r].starting_lba << sectsz_bits;
			bytes = range[r].length << sectsz_bits;
			if ((nvstore->size - offset) < bytes) {
				pci_nvme_status_genc(status,
				    NVME_SC_LBA_OUT_OF_RANGE);
			iov[dr].iov_base = (void *)offset;
			iov[dr].iov_len = bytes;
		req->io_req.br_callback = pci_nvme_dealloc_sm;

		/*
		 * Use prev_gpaddr to track the current entry and
		 * prev_size to track the number of entries
		 */
		req->prev_gpaddr = 0;
		req->prev_size = dr;

		err = blockif_delete(nvstore->ctx, &req->io_req);
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
/*
 * Process all pending entries on I/O submission queue idx in response to a
 * doorbell write. Validates the NSID, allocates an I/O request, and
 * dispatches by opcode. Commands that complete synchronously post their
 * completion here and release the request; asynchronous ones ("pending")
 * complete later via the blockif callback.
 */
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
	struct nvme_submission_queue *sq;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			/* DNR: retrying the same NSID cannot succeed */
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;

		req = pci_nvme_get_ioreq(sc);
		pci_nvme_status_genc(&status,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		WPRINTF("%s: unable to allocate IO req", __func__);
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
		case NVME_OPC_WRITE:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		/* Synchronous completion: post it and free the request */
		pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
		pci_nvme_release_ioreq(sc, req);

	pthread_mutex_unlock(&sq->mtx);
/*
 * Dispatch a doorbell register write. SQ doorbells record the new tail and
 * kick command processing (admin queue for idx 0, I/O queues otherwise);
 * CQ doorbells just record the guest's new head. Out-of-range queue
 * indices are logged and ignored.
 */
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t idx, int is_sq, uint64_t value)
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (idx > sc->num_squeues) {
		WPRINTF("%s queue index %lu overflow from "
		    __func__, idx, sc->num_squeues);

	atomic_store_short(&sc->submit_queues[idx].tail,
	pci_nvme_handle_admin_cmd(sc, value);

	/* submission queue; handle new entries in SQ */
	if (idx > sc->num_squeues) {
		WPRINTF("%s SQ index %lu overflow from "
		    __func__, idx, sc->num_squeues);
	pci_nvme_handle_io_cmd(sc, (uint16_t)idx);

	if (idx > sc->num_cqueues) {
		WPRINTF("%s queue index %lu overflow from "
		    __func__, idx, sc->num_cqueues);

	atomic_store_short(&sc->compl_queues[idx].head,
/*
 * Debug helper: log which BAR 0 controller register is being accessed and
 * whether it's a read or a write.
 */
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
	const char *s = iswrite ? "WRITE" : "READ";
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		DPRINTF("%s %s NVME_CR_VS", func, s);
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		DPRINTF("%s %s NVME_CR_CC", func, s);
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		DPRINTF("%s %s NVME_CR_AQA", func, s);
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
/*
 * Handle a guest write to BAR 0. Writes at or above the doorbell region
 * are routed to pci_nvme_handle_doorbell; controller register writes are
 * applied under the softc mutex, including CC enable/shutdown handling
 * (controller reset on 1->0 EN transitions, init on 0->1).
 */
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t offset, int size, uint64_t value)
	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
	    "val 0x%lx) to bar0 in %s",
	    size, offset, value, __func__);
	/* TODO: shutdown device */

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* MSI-X, so ignore */
		/* MSI-X, so ignore */
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
				pci_nvme_init_controller(ctx, sc);

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			/* RDY asserted immediately when no I/O is in flight */
			sc->regs.csts |= NVME_CSTS_RDY;
		/* ignore writes; don't support subsystem reset */
		sc->regs.aqa = (uint32_t)value;
	case NVME_CR_ASQ_LOW:
		/* Queue bases are 4KiB-aligned; low 12 bits discarded */
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	pthread_mutex_unlock(&sc->mtx);
/*
 * PCI BAR write entry point: route MSI-X table/PBA accesses to the MSI-X
 * emulation and BAR 0 accesses to pci_nvme_write_bar_0.
 */
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);

	pci_nvme_write_bar_0(ctx, sc, offset, size, value);

	DPRINTF("%s unknown baridx %d, val 0x%lx",
	    __func__, baridx, value);
/*
 * Handle a guest read from BAR 0: registers below the doorbell region are
 * copied straight out of the shadow register block under the softc mutex;
 * other offsets are invalid and logged.
 */
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
    uint64_t offset, int size)
	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
		WPRINTF("pci_nvme: read invalid offset %ld", offset);

	/* Truncate to the access size */
	value &= 0xFFFFFFFF;

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);
/*
 * PCI BAR read entry point: route MSI-X table/PBA accesses to the MSI-X
 * emulation and BAR 0 accesses to pci_nvme_read_bar_0.
 */
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);

	return pci_nvme_read_bar_0(sc, offset, size);

	DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
/*
 * Parse the comma-separated device options (see the usage block at the top
 * of the file: maxq, qsz, ioslots, sectsz, ser, eui64, dsm, ram, or a
 * backing file path as the first option). Fills in the softc defaults
 * first, then validates the final configuration.
 */
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;

	/* Defaults, possibly overridden below */
	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;

	uopt = strdup(opts);
	/* Default serial number derived from the PCI slot/function */
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts = strtok(NULL, ",")) {

		/* Split "key=value" in place */
		if ((config = strchr(xopts, '=')) != NULL)

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			/* RAM-backed store: size in MiB, fixed 4K sectors */
			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			/* First option with no known key: backing file path */
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
			EPRINTLN("Invalid option %s", xopts);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
	/* Only 512/4096/8192 sector sizes may be forced via "sectsz" */
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	/* Derive log2(sector size) for shift-based LBA math */
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
/*
 * pci_nvme_init - device-instance constructor (registered as pe_init).
 *
 * Allocates and populates the softc, parses the user options, sets up the
 * free I/O-request list, programs PCI config space, allocates BAR 0 and the
 * MSI-X/PCIe capabilities, and initializes controller/namespace/log-page/
 * feature state.  Error-path bodies (the returns after each WPRINTF and the
 * cleanup after a failed parse) are elided from this excerpt -- confirm
 * against the full source.
 */
2725 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2727 struct pci_nvme_softc *sc;
2728 uint32_t pci_membar_sz;
2733 sc = calloc(1, sizeof(struct pci_nvme_softc));
2737 error = pci_nvme_parse_opts(sc, opts);
/*
 * Pre-allocate all I/O request slots onto a free list; requests are
 * taken from and returned to this list at runtime.
 * NOTE(review): this calloc() result is not visibly checked before the
 * loop dereferences it -- confirm whether a check was elided.
 */
2743 STAILQ_INIT(&sc->ioreqs_free);
2744 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2745 for (int i = 0; i < sc->ioslots; i++) {
2746 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
/* FreeBSD's emulated-device vendor/device IDs and NVMe class codes. */
2749 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2750 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2751 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2752 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2753 pci_set_cfgdata8(pi, PCIR_PROGIF,
2754 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2757 * Allocate size of NVMe registers + doorbell space for all queues.
2759 * The specification requires a minimum memory I/O window size of 16K.
2760 * The Windows driver will refuse to start a device with a smaller
/* One submission + one completion doorbell per queue pair (admin + I/O). */
2763 pci_membar_sz = sizeof(struct nvme_registers) +
2764 2 * sizeof(uint32_t) * (sc->max_queues + 1);
2765 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2767 DPRINTF("nvme membar size: %u", pci_membar_sz);
2769 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2771 WPRINTF("%s pci alloc mem bar failed", __func__);
/* One MSI-X vector per I/O queue plus one for the admin queue. */
2775 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2777 WPRINTF("%s pci add msixcap failed", __func__);
2781 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2783 WPRINTF("%s pci add Express capability failed", __func__);
/* Softc lock plus a counting semaphore limiting in-flight I/O to ioslots. */
2787 pthread_mutex_init(&sc->mtx, NULL);
2788 sem_init(&sc->iosemlock, 0, sc->ioslots);
2790 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2792 * Controller data depends on Namespace data so initialize Namespace
2795 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2796 pci_nvme_init_ctrldata(sc);
2797 pci_nvme_init_logpages(sc);
2798 pci_nvme_init_features(sc);
2800 pci_nvme_aer_init(sc);
/* Legacy INTx, in addition to MSI-X, for guests that need it. */
2804 pci_lintr_request(pi);
/*
 * Emulation descriptor: hooks this NVMe model into bhyve's PCI device
 * framework.  PCI_EMUL_SET() registers it in the linker set scanned at
 * startup, making it selectable on the command line.
 */
2811 struct pci_devemu pci_de_nvme = {
2813 .pe_init = pci_nvme_init,
2814 .pe_barwrite = pci_nvme_write,
2815 .pe_barread = pci_nvme_read
2817 PCI_EMUL_SET(pci_de_nvme);