/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20 chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 */

/* TODO:
 *  - create async event for smart and log
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>
static int nvme_debug = 0;
#define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define NVME_MSIX_BAR       4

#define NVME_IOSLOTS        8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN (1 << 14)

#define NVME_QUEUES         16
#define NVME_MAX_QENTRIES   2048
/* Memory Page Size Minimum reported in CAP register */
#define NVME_MPSMIN         0
/* MPSMIN converted to bytes */
#define NVME_MPSMIN_BYTES   (1 << (12 + NVME_MPSMIN))

#define NVME_PRP2_ITEMS     (PAGE_SIZE/sizeof(uint64_t))
#define NVME_MDTS           9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define NVME_MAX_IOVEC      ((1 << NVME_MDTS) + 1)
#define NVME_MAX_DATA_SIZE  ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS      0xffff
#define NVME_COMPLETION_VALID(c)    ((c).status != NVME_NO_STATUS)

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)     ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)     ((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
    ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
     (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
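/*
 * Worked example of the encoding above: with num_squeues = 4 and
 * num_cqueues = 4 (one-based), the macro yields
 * (3 & 0xffff) | ((3 & 0xffff) << 16) = 0x00030003, i.e. the zero-based
 * NSQA/NCQA counts packed into the lower/upper 16 bits.
 */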
#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
    NVME_CR_CAP_LOW = 0x00,
    NVME_CR_CAP_HI  = 0x04,
    NVME_CR_VS      = 0x08,
    NVME_CR_INTMS   = 0x0c,
    NVME_CR_INTMC   = 0x10,
    NVME_CR_CC      = 0x14,
    NVME_CR_CSTS    = 0x1c,
    NVME_CR_NSSR    = 0x20,
    NVME_CR_AQA     = 0x24,
    NVME_CR_ASQ_LOW = 0x28,
    NVME_CR_ASQ_HI  = 0x2c,
    NVME_CR_ACQ_LOW = 0x30,
    NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
    NVME_CMD_CDW11_PC  = 0x0001,
    NVME_CMD_CDW11_IEN = 0x0002,
    NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define NVME_CQ_INTEN   0x01
#define NVME_CQ_INTCOAL 0x02

struct nvme_completion_queue {
    struct nvme_completion *qbase;
    pthread_mutex_t mtx;
    uint32_t size;
    uint16_t tail;      /* nvme progress */
    uint16_t head;      /* guest progress */
    uint16_t intr_vec;
    uint32_t intr_en;
};

struct nvme_submission_queue {
    struct nvme_command *qbase;
    pthread_mutex_t mtx;
    uint32_t size;
    uint16_t head;      /* nvme progress */
    uint16_t tail;      /* guest progress */
    uint16_t cqid;      /* completion queue id */
    int qpriority;
};

enum nvme_storage_type {
    NVME_STOR_BLOCKIF = 0,
    NVME_STOR_RAM,
};

struct pci_nvme_blockstore {
    enum nvme_storage_type type;
    void     *ctx;
    uint64_t size;
    uint64_t eui64;
    uint32_t sectsz;
    uint32_t sectsz_bits;
    uint32_t deallocate:1;
};
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer Size (MDTS) and given the number
 * of default iovec's in a struct blockif_req.
 *
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 */
#define MDTS_PAD_SIZE \
    (NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
     NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
     0)

struct pci_nvme_ioreq {
    struct pci_nvme_softc *sc;
    STAILQ_ENTRY(pci_nvme_ioreq) link;
    struct nvme_submission_queue *nvme_sq;
    uint16_t sqid;

    /* command information */
    uint16_t opc;
    uint16_t cid;
    uint32_t nsid;

    uint64_t prev_gpaddr;
    size_t   prev_size;
    size_t   bytes;

    struct blockif_req io_req;

    struct iovec iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
    /* Dataset Management bit in ONCS reflects backing storage capability */
    NVME_DATASET_MANAGEMENT_AUTO,
    /* Unconditionally set Dataset Management bit in ONCS */
    NVME_DATASET_MANAGEMENT_ENABLE,
    /* Unconditionally clear Dataset Management bit in ONCS */
    NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
    uint32_t cdw11;
    nvme_feature_cb set;
    nvme_feature_cb get;
    bool namespace_specific;
};

#define NVME_FID_MAX    (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

struct pci_nvme_aer {
    STAILQ_ENTRY(pci_nvme_aer) link;
    uint16_t cid;   /* Command ID of the submitted AER */
};
struct pci_nvme_softc {
    struct pci_devinst *nsc_pi;

    pthread_mutex_t mtx;

    struct nvme_registers regs;

    struct nvme_namespace_data  nsdata;
    struct nvme_controller_data ctrldata;
    struct nvme_error_information_entry err_log;
    struct nvme_health_information_page health_log;
    struct nvme_firmware_page fw_log;

    struct pci_nvme_blockstore nvstore;

    uint16_t max_qentries;  /* max entries per queue */
    uint32_t max_queues;    /* max number of IO SQ's or CQ's */
    uint32_t num_cqueues;
    uint32_t num_squeues;
    bool     num_q_is_set;  /* Has host set Number of Queues */

    struct pci_nvme_ioreq *ioreqs;
    STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
    uint32_t pending_ios;
    sem_t    iosemlock;

    /*
     * Memory mapped Submission and Completion queues
     * Each array includes both Admin and IO queues
     */
    struct nvme_completion_queue *compl_queues;
    struct nvme_submission_queue *submit_queues;

    struct nvme_feature_obj feat[NVME_FID_MAX];

    enum nvme_dsm_type dataset_management;

    /* Accounting for SMART data */
    __uint128_t read_data_units;
    __uint128_t write_data_units;
    __uint128_t read_commands;
    __uint128_t write_commands;
    uint32_t    read_dunits_remainder;
    uint32_t    write_dunits_remainder;

    STAILQ_HEAD(, pci_nvme_aer) aer_list;
    uint32_t    aer_count;
};
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
    ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
    ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
    ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
    ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
    ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
    ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
     (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
     (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
    ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
     (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
     (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
    ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
    ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
     (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
    NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
    size_t len;

    len = strnlen(src, dst_size);
    memset(dst, pad, dst_size);
    memcpy(dst, src, len);
}

static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

    *status &= ~NVME_STATUS_MASK;
    *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
        (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

    pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
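/*
 * Worked example (illustrative, based on the bit fields used above): with
 * SCT = 0x1 (Command Specific) and SC = 0x01, the status word becomes
 * (0x1 << 9) | (0x01 << 1) = 0x202, leaving the DNR, M, and Phase bits
 * untouched.
 */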
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
    uint32_t i;

    /*
     * Allocate and initialize the Submission Queues
     */
    if (nsq > NVME_QUEUES) {
        WPRINTF("%s: clamping number of SQ from %u to %u",
            __func__, nsq, NVME_QUEUES);
        nsq = NVME_QUEUES;
    }

    sc->num_squeues = nsq;

    sc->submit_queues = calloc(sc->num_squeues + 1,
        sizeof(struct nvme_submission_queue));
    if (sc->submit_queues == NULL) {
        WPRINTF("%s: SQ allocation failed", __func__);
        sc->num_squeues = 0;
    } else {
        struct nvme_submission_queue *sq = sc->submit_queues;

        for (i = 0; i < sc->num_squeues; i++)
            pthread_mutex_init(&sq[i].mtx, NULL);
    }

    /*
     * Allocate and initialize the Completion Queues
     */
    if (ncq > NVME_QUEUES) {
        WPRINTF("%s: clamping number of CQ from %u to %u",
            __func__, ncq, NVME_QUEUES);
        ncq = NVME_QUEUES;
    }

    sc->num_cqueues = ncq;

    sc->compl_queues = calloc(sc->num_cqueues + 1,
        sizeof(struct nvme_completion_queue));
    if (sc->compl_queues == NULL) {
        WPRINTF("%s: CQ allocation failed", __func__);
        sc->num_cqueues = 0;
    } else {
        struct nvme_completion_queue *cq = sc->compl_queues;

        for (i = 0; i < sc->num_cqueues; i++)
            pthread_mutex_init(&cq[i].mtx, NULL);
    }
}
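/*
 * Note on the "+ 1" in the calloc calls above: index 0 of each array is the
 * Admin queue, so a device with N IO queues needs N + 1 array entries. For
 * example, nsq = 16 allocates entries 0 (Admin SQ) through 16 (IO SQ 16).
 */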
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
    struct nvme_controller_data *cd = &sc->ctrldata;

    cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
    cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

    /* Num of submission commands that we can handle at a time (2^rab) */
    cd->rab = 4;

    cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */

    cd->ver = 0x00010300;

    cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

    /* Advertise 1, Read-only firmware slot */
    cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
        (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
    cd->lpa = 0;    /* TODO: support some simple things like SMART */
    cd->elpe = 0;   /* max error log page entries */
    cd->npss = 1;   /* number of power states supported */

    /* Warning Composite Temperature Threshold */
    cd->wctemp = 0x0157;

    cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
        (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
    cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
        (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
    cd->nn = 1;     /* number of namespaces */

    switch (sc->dataset_management) {
    case NVME_DATASET_MANAGEMENT_AUTO:
        if (sc->nvstore.deallocate)
            cd->oncs |= NVME_ONCS_DSM;
        break;
    case NVME_DATASET_MANAGEMENT_ENABLE:
        cd->oncs |= NVME_ONCS_DSM;
        break;
    default:
        break;
    }

    cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
    const unsigned char *cp = buffer;
    /* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
    static uint16_t const crc16_table[256] = {
        0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
        0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
        0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
        0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
        0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
        0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
        0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
        0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
        0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
        0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
        0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
        0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
        0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
        0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
        0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
        0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
        0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
        0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
        0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
        0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
        0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
        0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
        0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
        0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
        0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
        0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
        0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
        0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
        0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
        0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
        0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
        0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
    };

    while (len--)
        crc = (((crc >> 8) & 0xffU) ^
            crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
    return crc;
}
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

    /* Get capacity and block size information from backing store */
    nd->nsze = nvstore->size / nvstore->sectsz;
    nd->ncap = nd->nsze;
    nd->nuse = nd->nsze;

    if (nvstore->type == NVME_STOR_BLOCKIF)
        nvstore->deallocate = blockif_candelete(nvstore->ctx);

    nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */

    /* Create an EUI-64 if user did not provide one */
    if (nvstore->eui64 == 0) {
        char *data = NULL;
        uint64_t eui64 = nvstore->eui64;

        asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
            sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
        if (data != NULL) {
            eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
            free(data);
        }
        nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
    }
    be64enc(nd->eui64, nvstore->eui64);

    /* LBA data-sz = 2^lbads */
    nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
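/*
 * Example of the math above (a sketch): a 1 GiB backing store with 512 byte
 * sectors yields nsze = 2097152 blocks and lbaf[0].LBADS = 9 (2^9 = 512).
 * The generated EUI-64 packs the FreeBSD OUI, a CRC-16 of the device's PCI
 * address, and the namespace ID in the low 16 bits.
 */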
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

    memset(&sc->err_log, 0, sizeof(sc->err_log));
    memset(&sc->health_log, 0, sizeof(sc->health_log));
    memset(&sc->fw_log, 0, sizeof(sc->fw_log));

    /* Set read/write remainder to round up according to spec */
    sc->read_dunits_remainder = 999;
    sc->write_dunits_remainder = 999;

    /* Set nominal Health values checked by implementations */
    sc->health_log.temperature = 310;
    sc->health_log.available_spare = 100;
    sc->health_log.available_spare_threshold = 10;
}
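/*
 * Note: the SMART composite temperature is reported in Kelvin, so the
 * nominal value of 310 above corresponds to roughly 37 degrees Celsius.
 */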
static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{

    sc->feat[0].set = nvme_feature_invalid_cb;
    sc->feat[0].get = nvme_feature_invalid_cb;

    sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
    sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
    sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
    sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
        nvme_feature_iv_config;
    sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
        nvme_feature_invalid_cb;
    sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
        nvme_feature_invalid_cb;
}
static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

    STAILQ_INIT(&sc->aer_list);
    sc->aer_count = 0;
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
    struct pci_nvme_aer *aer = NULL;

    while (!STAILQ_EMPTY(&sc->aer_list)) {
        aer = STAILQ_FIRST(&sc->aer_list);
        STAILQ_REMOVE_HEAD(&sc->aer_list, link);
        free(aer);
    }

    pci_nvme_aer_init(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

    return (!STAILQ_EMPTY(&sc->aer_list));
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
    struct nvme_controller_data *cd = &sc->ctrldata;

    /* AERL is a zero-based value while aer_count is one-based */
    return (sc->aer_count == (cd->aerl + 1));
}
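/*
 * Example of the limit check above: with an aerl of 0 (zero-based), a single
 * outstanding AER is permitted, so the limit is reached as soon as
 * aer_count == 1.
 */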
/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AERs
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
    struct pci_nvme_aer *aer = NULL;

    if (pci_nvme_aer_limit_reached(sc))
        return (-1);

    aer = calloc(1, sizeof(struct pci_nvme_aer));
    if (aer == NULL)
        return (-1);

    sc->aer_count++;

    /* Save the Command ID for use in the completion message */
    aer->cid = cid;
    STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);

    return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AERs exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
    struct pci_nvme_aer *aer = NULL;

    aer = STAILQ_FIRST(&sc->aer_list);
    if (aer != NULL) {
        STAILQ_REMOVE_HEAD(&sc->aer_list, link);
        sc->aer_count--;
    }

    return (aer);
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
    int i;

    DPRINTF("%s", __func__);

    sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
        (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
        (60 << NVME_CAP_LO_REG_TO_SHIFT);

    sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

    sc->regs.vs = 0x00010300;   /* NVMe v1.3 */

    sc->regs.cc = 0;
    sc->regs.csts = 0;

    assert(sc->submit_queues != NULL);

    for (i = 0; i < sc->num_squeues + 1; i++) {
        sc->submit_queues[i].qbase = NULL;
        sc->submit_queues[i].size = 0;
        sc->submit_queues[i].cqid = 0;
        sc->submit_queues[i].tail = 0;
        sc->submit_queues[i].head = 0;
    }

    assert(sc->compl_queues != NULL);

    for (i = 0; i < sc->num_cqueues + 1; i++) {
        sc->compl_queues[i].qbase = NULL;
        sc->compl_queues[i].size = 0;
        sc->compl_queues[i].tail = 0;
        sc->compl_queues[i].head = 0;
    }

    sc->num_q_is_set = false;

    pci_nvme_aer_destroy(sc);
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{

    pthread_mutex_lock(&sc->mtx);
    pci_nvme_reset_locked(sc);
    pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
    uint16_t acqs, asqs;

    DPRINTF("%s", __func__);

    asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
    sc->submit_queues[0].size = asqs;
    sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
        sizeof(struct nvme_command) * asqs);

    DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
        __func__, sc->regs.asq, sc->submit_queues[0].qbase);

    acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
        NVME_AQA_REG_ACQS_MASK) + 1;
    sc->compl_queues[0].size = acqs;
    sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
        sizeof(struct nvme_completion) * acqs);
    sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

    DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
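/*
 * Example of the AQA decoding above (per the NVMe register layout): if the
 * guest writes AQA = 0x001f001f, then ASQS = ACQS = 0x1f, and the zero-based
 * convention makes both Admin queues 32 entries deep.
 */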
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
    uint8_t *p;
    size_t bytes;

    if (len > (8 * 1024)) {
        return (-1);
    }

    /* Copy from the start of prp1 to the end of the physical page */
    bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
    bytes = MIN(bytes, len);

    p = vm_map_gpa(ctx, prp1, bytes);
    if (p == NULL)
        return (-1);

    if (dir == NVME_COPY_TO_PRP)
        memcpy(p, b, bytes);
    else
        memcpy(b, p, bytes);

    len -= bytes;
    if (len == 0)
        return (0);

    len = MIN(len, PAGE_SIZE);

    p = vm_map_gpa(ctx, prp2, len);
    if (p == NULL)
        return (-1);

    if (dir == NVME_COPY_TO_PRP)
        memcpy(p, b + bytes, len);
    else
        memcpy(b + bytes, p, len);

    return (0);
}
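/*
 * PRP recap for the helper above: PRP1 may point into the middle of a page,
 * and PRP2 (as used here) is a second page pointer rather than a PRP list,
 * which is why this helper caps transfers at 8 KiB (two 4 KiB pages).
 */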
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
    struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
    struct nvme_completion *cqe;

    assert(cq->qbase != NULL);

    pthread_mutex_lock(&cq->mtx);

    cqe = &cq->qbase[cq->tail];

    /* Flip the phase bit */
    status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

    cqe->cdw0 = cdw0;
    cqe->sqhd = sq->head;
    cqe->sqid = sqid;
    cqe->cid = cid;
    cqe->status = status;

    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
    }

    pthread_mutex_unlock(&cq->mtx);
}
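/*
 * Phase bit example for the update above: on the first pass through the CQ
 * the controller writes entries with P = 1; after the tail wraps to 0, new
 * entries flip to P = 0. This is how the host distinguishes fresh
 * completions from stale ones without a producer index register.
 */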
static void
nvme_opc_delete_io_sq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
    uint16_t qid = command->cdw10 & 0xffff;

    DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
    if (qid == 0 || qid > sc->num_squeues ||
        (sc->submit_queues[qid].qbase == NULL)) {
        WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
            __func__, qid, sc->num_squeues);
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_QUEUE_IDENTIFIER);
        return;
    }

    sc->submit_queues[qid].qbase = NULL;
    sc->submit_queues[qid].cqid = 0;
    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_sq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
    if (command->cdw11 & NVME_CMD_CDW11_PC) {
        uint16_t qid = command->cdw10 & 0xffff;
        struct nvme_submission_queue *nsq;

        if ((qid == 0) || (qid > sc->num_squeues) ||
            (sc->submit_queues[qid].qbase != NULL)) {
            WPRINTF("%s queue index %u > num_squeues %u",
                __func__, qid, sc->num_squeues);
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_QUEUE_IDENTIFIER);
            return;
        }

        nsq = &sc->submit_queues[qid];
        nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
        DPRINTF("%s size=%u (max=%u)", __func__, nsq->size,
            sc->max_qentries);
        if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
            /*
             * Queues must specify at least two entries
             * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
             * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
             */
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
            return;
        }
        nsq->head = nsq->tail = 0;

        nsq->cqid = (command->cdw11 >> 16) & 0xffff;
        if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_QUEUE_IDENTIFIER);
            return;
        }

        if (sc->compl_queues[nsq->cqid].qbase == NULL) {
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_COMPLETION_QUEUE_INVALID);
            return;
        }

        nsq->qpriority = (command->cdw11 >> 1) & 0x03;

        nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
            sizeof(struct nvme_command) * (size_t)nsq->size);

        DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
            qid, nsq->size, nsq->qbase, nsq->cqid);

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        DPRINTF("%s completed creating IOSQ qid %u",
            __func__, qid);
    } else {
        /*
         * Guest sent non-contiguous submission queue request.
         * This setting is unsupported by this emulation.
         */
        WPRINTF("%s unsupported non-contig (list-based) "
            "create i/o submission queue", __func__);

        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
    }
}
static void
nvme_opc_delete_io_cq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
    uint16_t qid = command->cdw10 & 0xffff;
    uint16_t sqid;

    DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
    if (qid == 0 || qid > sc->num_cqueues ||
        (sc->compl_queues[qid].qbase == NULL)) {
        WPRINTF("%s queue index %u / num_cqueues %u",
            __func__, qid, sc->num_cqueues);
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_QUEUE_IDENTIFIER);
        return;
    }

    /* Deleting an Active CQ is an error */
    for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
        if (sc->submit_queues[sqid].cqid == qid) {
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_QUEUE_DELETION);
            return;
        }

    sc->compl_queues[qid].qbase = NULL;
    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_cq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
    struct nvme_completion_queue *ncq;
    uint16_t qid = command->cdw10 & 0xffff;

    /* Only support Physically Contiguous queues */
    if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
        WPRINTF("%s unsupported non-contig (list-based) "
            "create i/o completion queue",
            __func__);

        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return;
    }

    if ((qid == 0) || (qid > sc->num_cqueues) ||
        (sc->compl_queues[qid].qbase != NULL)) {
        WPRINTF("%s queue index %u > num_cqueues %u",
            __func__, qid, sc->num_cqueues);
        pci_nvme_status_tc(&compl->status,
            NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_QUEUE_IDENTIFIER);
        return;
    }

    ncq = &sc->compl_queues[qid];
    ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
    ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
    if (ncq->intr_vec > (sc->max_queues + 1)) {
        pci_nvme_status_tc(&compl->status,
            NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_INTERRUPT_VECTOR);
        return;
    }

    ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
    if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
        /*
         * Queues must specify at least two entries
         * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
         * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
         */
        pci_nvme_status_tc(&compl->status,
            NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
        return;
    }
    ncq->head = ncq->tail = 0;
    ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
        command->prp1,
        sizeof(struct nvme_completion) * (size_t)ncq->size);

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_get_log_page(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
    uint32_t logsize = 0;
    uint8_t logpage = command->cdw10 & 0xFF;

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

    /*
     * Command specifies the number of dwords to return in fields NUMDU
     * and NUMDL. This is a zero-based value.
     */
    logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
    logsize *= sizeof(uint32_t);

    DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
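    /*
     * Worked example of the size math above: for a host requesting 512
     * dwords, NUMD (zero-based, split across NUMDL in CDW10[31:16] and
     * NUMDU in CDW11[15:0]) is 511, so logsize = (511 + 1) * 4 = 2048 bytes.
     */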
    switch (logpage) {
    case NVME_LOG_ERROR:
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->err_log,
            MIN(logsize, sizeof(sc->err_log)),
            NVME_COPY_TO_PRP);
        break;
    case NVME_LOG_HEALTH_INFORMATION:
        pthread_mutex_lock(&sc->mtx);
        memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
            sizeof(sc->health_log.data_units_read));
        memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
            sizeof(sc->health_log.data_units_written));
        memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
            sizeof(sc->health_log.host_read_commands));
        memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
            sizeof(sc->health_log.host_write_commands));
        pthread_mutex_unlock(&sc->mtx);

        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->health_log,
            MIN(logsize, sizeof(sc->health_log)),
            NVME_COPY_TO_PRP);
        break;
    case NVME_LOG_FIRMWARE_SLOT:
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->fw_log,
            MIN(logsize, sizeof(sc->fw_log)),
            NVME_COPY_TO_PRP);
        break;
    default:
        DPRINTF("%s get log page %x command not supported",
            __func__, logpage);

        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_LOG_PAGE);
    }
}
static void
nvme_opc_identify(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
    void *dest;
    uint16_t status;

    DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
        command->cdw10 & 0xFF, command->nsid);

    pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

    switch (command->cdw10 & 0xFF) {
    case 0x00: /* return Identify Namespace data structure */
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
            NVME_COPY_TO_PRP);
        break;
    case 0x01: /* return Identify Controller data structure */
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->ctrldata,
            sizeof(sc->ctrldata),
            NVME_COPY_TO_PRP);
        break;
    case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
        dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
            sizeof(uint32_t) * 1024);
        /* All unused entries shall be zero */
        bzero(dest, sizeof(uint32_t) * 1024);
        ((uint32_t *)dest)[0] = 1;
        break;
    case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
        if (command->nsid != 1) {
            pci_nvme_status_genc(&status,
                NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
            break;
        }
        dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
            sizeof(uint32_t) * 1024);
        /* All bytes after the descriptor shall be zero */
        bzero(dest, sizeof(uint32_t) * 1024);

        /* Return NIDT=1 (i.e. EUI64) descriptor */
        ((uint8_t *)dest)[0] = 1;
        ((uint8_t *)dest)[1] = sizeof(uint64_t);
        bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
        break;
    default:
        DPRINTF("%s unsupported identify command requested 0x%x",
            __func__, command->cdw10 & 0xFF);
        pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
        break;
    }

    compl->status = status;
}
static const char *
nvme_fid_to_name(uint8_t fid)
{
    const char *name;

    switch (fid) {
    case NVME_FEAT_ARBITRATION:
        name = "Arbitration";
        break;
    case NVME_FEAT_POWER_MANAGEMENT:
        name = "Power Management";
        break;
    case NVME_FEAT_LBA_RANGE_TYPE:
        name = "LBA Range Type";
        break;
    case NVME_FEAT_TEMPERATURE_THRESHOLD:
        name = "Temperature Threshold";
        break;
    case NVME_FEAT_ERROR_RECOVERY:
        name = "Error Recovery";
        break;
    case NVME_FEAT_VOLATILE_WRITE_CACHE:
        name = "Volatile Write Cache";
        break;
    case NVME_FEAT_NUMBER_OF_QUEUES:
        name = "Number of Queues";
        break;
    case NVME_FEAT_INTERRUPT_COALESCING:
        name = "Interrupt Coalescing";
        break;
    case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
        name = "Interrupt Vector Configuration";
        break;
    case NVME_FEAT_WRITE_ATOMICITY:
        name = "Write Atomicity Normal";
        break;
    case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
        name = "Asynchronous Event Configuration";
        break;
    case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
        name = "Autonomous Power State Transition";
        break;
    case NVME_FEAT_HOST_MEMORY_BUFFER:
        name = "Host Memory Buffer";
        break;
    case NVME_FEAT_TIMESTAMP:
        name = "Timestamp";
        break;
    case NVME_FEAT_KEEP_ALIVE_TIMER:
        name = "Keep Alive Timer";
        break;
    case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
        name = "Host Controlled Thermal Management";
        break;
    case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
        name = "Non-Operation Power State Config";
        break;
    case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
        name = "Read Recovery Level Config";
        break;
    case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
        name = "Predictable Latency Mode Config";
        break;
    case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
        name = "Predictable Latency Mode Window";
        break;
    case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
        name = "LBA Status Information Report Interval";
        break;
    case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
        name = "Host Behavior Support";
        break;
    case NVME_FEAT_SANITIZE_CONFIG:
        name = "Sanitize Config";
        break;
    case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
        name = "Endurance Group Event Configuration";
        break;
    case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
        name = "Software Progress Marker";
        break;
    case NVME_FEAT_HOST_IDENTIFIER:
        name = "Host Identifier";
        break;
    case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
        name = "Reservation Notification Mask";
        break;
    case NVME_FEAT_RESERVATION_PERSISTENCE:
        name = "Reservation Persistence";
        break;
    case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
        name = "Namespace Write Protection Config";
        break;
    default:
        name = "Unknown";
        break;
    }

    return (name);
}
static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{

    pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
    uint32_t i;
    uint32_t cdw11 = command->cdw11;
    uint16_t iv;
    bool cd;

    /* Start from Invalid Field; overwritten on success below */
    pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    iv = cdw11 & 0xffff;
    cd = cdw11 & (1 << 16);

    if (iv > (sc->max_queues + 1)) {
        return;
    }

    /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
    if ((iv == 0) && !cd)
        return;

    /* Requested Interrupt Vector must be used by a CQ */
    for (i = 0; i < sc->num_cqueues + 1; i++) {
        if (sc->compl_queues[i].intr_vec == iv) {
            pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        }
    }
}
static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
    uint16_t nqr;   /* Number of Queues Requested */

    if (sc->num_q_is_set) {
        WPRINTF("%s: Number of Queues already set", __func__);
        pci_nvme_status_genc(&compl->status,
            NVME_SC_COMMAND_SEQUENCE_ERROR);
        return;
    }

    nqr = command->cdw11 & 0xFFFF;
    if (nqr == 0xffff) {
        WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return;
    }

    sc->num_squeues = ONE_BASED(nqr);
    if (sc->num_squeues > sc->max_queues) {
        DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
            sc->max_queues);
        sc->num_squeues = sc->max_queues;
    }

    nqr = (command->cdw11 >> 16) & 0xFFFF;
    if (nqr == 0xffff) {
        WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return;
    }

    sc->num_cqueues = ONE_BASED(nqr);
    if (sc->num_cqueues > sc->max_queues) {
        DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
            sc->max_queues);
        sc->num_cqueues = sc->max_queues;
    }

    /* Patch the command value which will be saved on callback's return */
    command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
    compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

    sc->num_q_is_set = true;
}
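/*
 * Why both fields are written above: the emulation saves command->cdw11 as
 * the feature's value after the callback returns, while compl->cdw0 is what
 * Set Features reports back to the host, i.e. the queue counts actually
 * allocated (possibly clamped) rather than the counts requested.
 */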
1376 sc->num_q_is_set = true;
1380 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1381 struct nvme_completion *compl)
1383 struct nvme_feature_obj *feat;
1384 uint32_t nsid = command->nsid;
1385 uint8_t fid = command->cdw10 & 0xFF;
1387 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1389 if (fid >= NVME_FID_MAX) {
1390 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1391 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1394 feat = &sc->feat[fid];
1396 if (!feat->namespace_specific &&
1397 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1398 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1399 NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1404 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1407 feat->set(sc, feat, command, compl);
1409 if (compl->status == NVME_SC_SUCCESS)
1410 feat->cdw11 = command->cdw11;
static void
nvme_opc_get_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
    struct nvme_feature_obj *feat;
    uint8_t fid = command->cdw10 & 0xFF;

    DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid,
        nvme_fid_to_name(fid));

    if (fid >= NVME_FID_MAX) {
        DPRINTF("%s invalid feature 0x%x", __func__, fid);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return;
    }

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

    feat = &sc->feat[fid];
    if (feat->get) {
        feat->get(sc, feat, command, compl);
    }

    if (compl->status == NVME_SC_SUCCESS) {
        compl->cdw0 = feat->cdw11;
    }
}
static void
nvme_opc_format_nvm(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
    uint8_t ses, lbaf, pi;

    /* Only supports Secure Erase Setting - User Data Erase */
    ses = (command->cdw10 >> 9) & 0x7;
    if (ses > 0x1) {
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return;
    }

    /* Only supports a single LBA Format */
    lbaf = command->cdw10 & 0xf;
    if (lbaf != 0) {
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_FORMAT);
        return;
    }

    /* Doesn't support Protection Information */
    pi = (command->cdw10 >> 5) & 0x7;
    if (pi != 0) {
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return;
    }

    if (sc->nvstore.type == NVME_STOR_RAM) {
        if (sc->nvstore.ctx)
            free(sc->nvstore.ctx);
        sc->nvstore.ctx = calloc(1, sc->nvstore.size);
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
    } else {
        struct pci_nvme_ioreq *req;
        int err;

        req = pci_nvme_get_ioreq(sc);
        if (req == NULL) {
            pci_nvme_status_genc(&compl->status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
            WPRINTF("%s: unable to allocate IO req", __func__);
            return;
        }
        req->nvme_sq = &sc->submit_queues[0];
        req->sqid = 0;
        req->opc = command->opc;
        req->cid = command->cid;
        req->nsid = command->nsid;

        req->io_req.br_offset = 0;
        req->io_req.br_resid = sc->nvstore.size;
        req->io_req.br_callback = pci_nvme_io_done;

        err = blockif_delete(sc->nvstore.ctx, &req->io_req);
        if (err) {
            pci_nvme_status_genc(&compl->status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
            pci_nvme_release_ioreq(sc, req);
        }
    }
}
static void
nvme_opc_abort(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{

    DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

    /* TODO: search for the command ID and abort it */

    compl->cdw0 = 1;
    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}

static void
nvme_opc_async_event_req(struct pci_nvme_softc *sc,
    struct nvme_command *command, struct nvme_completion *compl)
{

    DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

    /* Don't exceed the Async Event Request Limit (AERL). */
    if (pci_nvme_aer_limit_reached(sc)) {
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
        return;
    }

    if (pci_nvme_aer_add(sc, command->cid)) {
        pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
            NVME_SC_INTERNAL_DEVICE_ERROR);
        return;
    }

    /*
     * Raise events when they happen based on the Set Features cmd.
     * These events happen async, so only set completion successful if
     * there is an event reflective of the request to get event.
     */
    compl->status = NVME_NO_STATUS;
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc *sc, uint64_t value)
{
    struct nvme_completion compl;
    struct nvme_command *cmd;
    struct nvme_submission_queue *sq;
    struct nvme_completion_queue *cq;
    uint16_t sqhead;

    DPRINTF("%s index %u", __func__, (uint32_t)value);

    sq = &sc->submit_queues[0];
    cq = &sc->compl_queues[0];

    pthread_mutex_lock(&sq->mtx);

    sqhead = sq->head;
    DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

    while (sqhead != atomic_load_acq_short(&sq->tail)) {
        cmd = &(sq->qbase)[sqhead];
        compl.cdw0 = 0;
        compl.status = 0;

        switch (cmd->opc) {
        case NVME_OPC_DELETE_IO_SQ:
            DPRINTF("%s command DELETE_IO_SQ", __func__);
            nvme_opc_delete_io_sq(sc, cmd, &compl);
            break;
        case NVME_OPC_CREATE_IO_SQ:
            DPRINTF("%s command CREATE_IO_SQ", __func__);
            nvme_opc_create_io_sq(sc, cmd, &compl);
            break;
        case NVME_OPC_DELETE_IO_CQ:
            DPRINTF("%s command DELETE_IO_CQ", __func__);
            nvme_opc_delete_io_cq(sc, cmd, &compl);
            break;
        case NVME_OPC_CREATE_IO_CQ:
            DPRINTF("%s command CREATE_IO_CQ", __func__);
            nvme_opc_create_io_cq(sc, cmd, &compl);
            break;
        case NVME_OPC_GET_LOG_PAGE:
            DPRINTF("%s command GET_LOG_PAGE", __func__);
            nvme_opc_get_log_page(sc, cmd, &compl);
            break;
        case NVME_OPC_IDENTIFY:
            DPRINTF("%s command IDENTIFY", __func__);
            nvme_opc_identify(sc, cmd, &compl);
            break;
        case NVME_OPC_ABORT:
            DPRINTF("%s command ABORT", __func__);
            nvme_opc_abort(sc, cmd, &compl);
            break;
        case NVME_OPC_SET_FEATURES:
            DPRINTF("%s command SET_FEATURES", __func__);
            nvme_opc_set_features(sc, cmd, &compl);
            break;
        case NVME_OPC_GET_FEATURES:
            DPRINTF("%s command GET_FEATURES", __func__);
            nvme_opc_get_features(sc, cmd, &compl);
            break;
        case NVME_OPC_FIRMWARE_ACTIVATE:
            DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
            pci_nvme_status_tc(&compl.status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_FIRMWARE_SLOT);
            break;
        case NVME_OPC_ASYNC_EVENT_REQUEST:
            DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
            nvme_opc_async_event_req(sc, cmd, &compl);
            break;
        case NVME_OPC_FORMAT_NVM:
            DPRINTF("%s command FORMAT_NVM", __func__);
            if ((sc->ctrldata.oacs &
                (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
                pci_nvme_status_genc(&compl.status,
                    NVME_SC_INVALID_OPCODE);
                break;
            }
            compl.status = NVME_NO_STATUS;
            nvme_opc_format_nvm(sc, cmd, &compl);
            break;
        default:
            DPRINTF("0x%x command is not implemented",
                cmd->opc);
            pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
            break;
        }
        sqhead = (sqhead + 1) % sq->size;

        if (NVME_COMPLETION_VALID(compl)) {
            pci_nvme_cq_update(sc, &sc->compl_queues[0],
                compl.cdw0,
                cmd->cid,
                0,      /* SQID */
                compl.status);
        }
    }

    DPRINTF("setting sqhead %u", sqhead);
    sq->head = sqhead;

    if (cq->head != cq->tail)
        pci_generate_msix(sc->nsc_pi, 0);

    pthread_mutex_unlock(&sq->mtx);
}
/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
 * E.g. 1 data unit covers 1 - 1,000 512 byte blocks, and 3 data units cover
 * 2,001 - 3,000 512 byte blocks. Rounding up is achieved by initializing the
 * remainder to 999.
 */
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

    pthread_mutex_lock(&sc->mtx);
    switch (opc) {
    case NVME_OPC_WRITE:
        sc->write_commands++;
        if (status != NVME_SC_SUCCESS)
            break;
        sc->write_dunits_remainder += (bytes / 512);
        while (sc->write_dunits_remainder >= 1000) {
            sc->write_data_units++;
            sc->write_dunits_remainder -= 1000;
        }
        break;
    case NVME_OPC_READ:
        sc->read_commands++;
        if (status != NVME_SC_SUCCESS)
            break;
        sc->read_dunits_remainder += (bytes / 512);
        while (sc->read_dunits_remainder >= 1000) {
            sc->read_data_units++;
            sc->read_dunits_remainder -= 1000;
        }
        break;
    default:
        DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
        break;
    }
    pthread_mutex_unlock(&sc->mtx);
}
/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
 */
static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nlb)
{
    size_t offset, bytes;

    /* Overflow check of multiplying Starting LBA by the sector size */
    if (slba >> (64 - nvstore->sectsz_bits))
        return (true);

    offset = slba << nvstore->sectsz_bits;
    bytes = nlb << nvstore->sectsz_bits;

    /* Overflow check of Number of Logical Blocks */
    if ((nvstore->size - offset) < bytes)
        return (true);

    return (false);
}
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
    int iovidx;

    if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
        return (-1);
    }

    /* concatenate contig block-iovs to minimize number of iovs */
    if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
        iovidx = req->io_req.br_iovcnt - 1;

        req->io_req.br_iov[iovidx].iov_base =
            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                req->prev_gpaddr, size);

        req->prev_size += size;
        req->io_req.br_resid += size;

        req->io_req.br_iov[iovidx].iov_len = req->prev_size;
    } else {
        iovidx = req->io_req.br_iovcnt;
        if (iovidx == 0) {
            req->io_req.br_offset = lba;
            req->io_req.br_resid = 0;
            req->io_req.br_param = req;
        }

        req->io_req.br_iov[iovidx].iov_base =
            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                gpaddr, size);

        req->io_req.br_iov[iovidx].iov_len = size;

        req->prev_gpaddr = gpaddr;
        req->prev_size = size;
        req->io_req.br_resid += size;

        req->io_req.br_iovcnt++;
    }

    return (0);
}
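/*
 * Concatenation example for the helper above (hypothetical addresses): a
 * guest transfer described by PRP entries 0x10000 and 0x11000 (physically
 * contiguous 4 KiB pages) collapses into a single 8 KiB iovec, while a gap
 * between pages starts a new iovec entry.
 */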
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
{
    struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

    DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
        __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
        NVME_STATUS_GET_SC(status));

    pci_nvme_cq_update(sc, cq, cdw0, cid, sqid, status);

    if (cq->head != cq->tail) {
        if (cq->intr_en & NVME_CQ_INTEN) {
            pci_generate_msix(sc->nsc_pi, cq->intr_vec);
        } else {
            DPRINTF("%s: CQ%u interrupt disabled",
                __func__, sq->cqid);
        }
    }
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
    req->sc = NULL;
    req->nvme_sq = NULL;
    req->sqid = 0;

    pthread_mutex_lock(&sc->mtx);

    STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
    sc->pending_ios--;

    /* when no more IO pending, can set to ready if device reset/enabled */
    if (sc->pending_ios == 0 &&
        NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
        sc->regs.csts |= NVME_CSTS_RDY;

    pthread_mutex_unlock(&sc->mtx);

    sem_post(&sc->iosemlock);
}
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
    struct pci_nvme_ioreq *req = NULL;

    sem_wait(&sc->iosemlock);
    pthread_mutex_lock(&sc->mtx);

    req = STAILQ_FIRST(&sc->ioreqs_free);
    assert(req != NULL);
    STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

    req->sc = sc;

    sc->pending_ios++;

    pthread_mutex_unlock(&sc->mtx);

    req->io_req.br_iovcnt = 0;
    req->io_req.br_offset = 0;
    req->io_req.br_resid = 0;
    req->io_req.br_param = req;
    req->prev_gpaddr = 0;
    req->prev_size = 0;

    return (req);
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
    struct pci_nvme_ioreq *req = br->br_param;
    struct nvme_submission_queue *sq = req->nvme_sq;
    uint16_t code, status;

    DPRINTF("%s error %d %s", __func__, err, strerror(err));

    /* TODO return correct error */
    code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
    pci_nvme_status_genc(&status, code);

    pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
    pci_nvme_stats_write_read_update(req->sc, req->opc,
        req->bytes, status);
    pci_nvme_release_ioreq(req->sc, req);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
    bool pending = false;

    if (nvstore->type == NVME_STOR_RAM) {
        pci_nvme_status_genc(status, NVME_SC_SUCCESS);
    } else {
        int err;

        req->io_req.br_callback = pci_nvme_io_done;

        err = blockif_flush(nvstore->ctx, &req->io_req);
        switch (err) {
        case 0:
            pending = true;
            break;
        case EOPNOTSUPP:
            pci_nvme_status_genc(status, NVME_SC_SUCCESS);
            break;
        default:
            pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
        }
    }

    return (pending);
}
static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
    uint8_t *buf = nvstore->ctx;
    enum nvme_copy_dir dir;
    uint16_t status;

    if (is_write)
        dir = NVME_COPY_TO_PRP;
    else
        dir = NVME_COPY_FROM_PRP;

    if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
        buf + offset, bytes, dir))
        pci_nvme_status_genc(&status,
            NVME_SC_DATA_TRANSFER_ERROR);
    else
        pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

    return (status);
}
static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
    uint64_t size;
    int err;
    uint16_t status = NVME_NO_STATUS;

    size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
    if (pci_nvme_append_iov_req(sc, req, prp1,
        size, is_write, offset)) {
        pci_nvme_status_genc(&status,
            NVME_SC_DATA_TRANSFER_ERROR);
        goto out;
    }

    offset += size;
    bytes -= size;

    if (bytes == 0) {
        ;
    } else if (bytes <= PAGE_SIZE) {
        size = bytes;
        if (pci_nvme_append_iov_req(sc, req, prp2,
            size, is_write, offset)) {
            pci_nvme_status_genc(&status,
                NVME_SC_DATA_TRANSFER_ERROR);
            goto out;
        }
    } else {
        void *vmctx = sc->nsc_pi->pi_vmctx;
        uint64_t *prp_list = &prp2;
        uint64_t *last = prp_list;

        /* PRP2 is pointer to a physical region page list */
        while (bytes) {
            /* Last entry in list points to the next list */
            if (prp_list == last) {
                uint64_t prp = *prp_list;

                prp_list = paddr_guest2host(vmctx, prp,
                    PAGE_SIZE - (prp % PAGE_SIZE));
                last = prp_list + (NVME_PRP2_ITEMS - 1);
            }

            size = MIN(bytes, PAGE_SIZE);

            if (pci_nvme_append_iov_req(sc, req, *prp_list,
                size, is_write, offset)) {
                pci_nvme_status_genc(&status,
                    NVME_SC_DATA_TRANSFER_ERROR);
                goto out;
            }

            offset += size;
            bytes -= size;

            prp_list++;
        }
    }

    req->io_req.br_callback = pci_nvme_io_done;
    if (is_write)
        err = blockif_write(nvstore->ctx, &req->io_req);
    else
        err = blockif_read(nvstore->ctx, &req->io_req);

    if (err)
        pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
    return (status);
}
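/*
 * PRP list walk recap for the loop above: for transfers larger than two
 * pages, PRP2 points at a guest page full of 64-bit page addresses
 * (NVME_PRP2_ITEMS of them), and the final slot of each list page chains to
 * the next list page. E.g. a 16 KiB read uses PRP1 for the first page plus
 * a list of three entries.
 */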
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
    uint64_t lba, nblocks, bytes = 0;
    size_t offset;
    bool is_write = cmd->opc == NVME_OPC_WRITE;
    bool pending = false;

    lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
    nblocks = (cmd->cdw12 & 0xFFFF) + 1;
    if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
        WPRINTF("%s command would exceed LBA range", __func__);
        pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
        goto out;
    }

    bytes = nblocks << nvstore->sectsz_bits;
    if (bytes > NVME_MAX_DATA_SIZE) {
        WPRINTF("%s command would exceed MDTS", __func__);
        pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
        goto out;
    }

    offset = lba << nvstore->sectsz_bits;

    req->bytes = bytes;
    req->io_req.br_offset = lba;

    /* PRP bits 1:0 must be zero */
    cmd->prp1 &= ~0x3UL;
    cmd->prp2 &= ~0x3UL;

    if (nvstore->type == NVME_STOR_RAM) {
        *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
            cmd->prp2, offset, bytes, is_write);
    } else {
        *status = nvme_write_read_blockif(sc, nvstore, req,
            cmd->prp1, cmd->prp2, offset, bytes, is_write);

        if (*status == NVME_NO_STATUS)
            pending = true;
    }
out:
    if (!pending)
        pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

    return (pending);
}
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
    struct pci_nvme_ioreq *req = br->br_param;
    struct pci_nvme_softc *sc = req->sc;
    bool done = true;
    uint16_t status;

    if (err) {
        pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
    } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
        pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
    } else {
        struct iovec *iov = req->io_req.br_iov;

        req->prev_gpaddr++;
        iov += req->prev_gpaddr;

        /* The iov_* values already include the sector size */
        req->io_req.br_offset = (off_t)iov->iov_base;
        req->io_req.br_resid = iov->iov_len;
        if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
            pci_nvme_status_genc(&status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
        } else
            done = false;
    }

    if (done) {
        pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
            req->cid, 0, status);
        pci_nvme_release_ioreq(sc, req);
    }
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
    struct nvme_dsm_range *range = NULL;
    uint32_t nr, r, non_zero, dr;
    int err;
    bool pending = false;

    if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
        pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
        goto out;
    }

    nr = cmd->cdw10 & 0xff;

    /* copy locally because a range entry could straddle PRPs */
    range = calloc(1, NVME_MAX_DSM_TRIM);
    if (range == NULL) {
        pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
        goto out;
    }
    nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
        (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

    /* Check for invalid ranges and the number of non-zero lengths */
    non_zero = 0;
    for (r = 0; r <= nr; r++) {
        if (pci_nvme_out_of_range(nvstore,
            range[r].starting_lba, range[r].length)) {
            pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
            goto out;
        }
        if (range[r].length != 0)
            non_zero++;
    }

    if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
        size_t offset, bytes;
        int sectsz_bits = sc->nvstore.sectsz_bits;

        /*
         * DSM calls are advisory only, and compliant controllers
         * may choose to take no actions (i.e. return Success).
         */
        if (!nvstore->deallocate) {
            pci_nvme_status_genc(status, NVME_SC_SUCCESS);
            goto out;
        }

        /* If all ranges have a zero length, return Success */
        if (non_zero == 0) {
            pci_nvme_status_genc(status, NVME_SC_SUCCESS);
            goto out;
        }

        if (req == NULL) {
            pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
            goto out;
        }

        offset = range[0].starting_lba << sectsz_bits;
        bytes = range[0].length << sectsz_bits;

        /*
         * If the request is for more than a single range, store
         * the ranges in the br_iov. Optimize for the common case
         * of a single range.
         *
         * Note that NVMe Number of Ranges is a zero based value
         */
        req->io_req.br_iovcnt = 0;
        req->io_req.br_offset = offset;
        req->io_req.br_resid = bytes;

        if (nr == 0) {
            req->io_req.br_callback = pci_nvme_io_done;
        } else {
            struct iovec *iov = req->io_req.br_iov;

            for (r = 0, dr = 0; r <= nr; r++) {
                offset = range[r].starting_lba << sectsz_bits;
                bytes = range[r].length << sectsz_bits;
                if (bytes == 0)
                    continue;

                if ((nvstore->size - offset) < bytes) {
                    pci_nvme_status_genc(status,
                        NVME_SC_LBA_OUT_OF_RANGE);
                    goto out;
                }
                iov[dr].iov_base = (void *)offset;
                iov[dr].iov_len = bytes;
                dr++;
            }
            req->io_req.br_callback = pci_nvme_dealloc_sm;

            /*
             * Use prev_gpaddr to track the current entry and
             * prev_size to track the number of entries
             */
            req->prev_gpaddr = 0;
            req->prev_size = dr;
        }

        err = blockif_delete(nvstore->ctx, &req->io_req);
        if (err)
            pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
        else
            pending = true;
    }
out:
    free(range);
    return (pending);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc *sc, uint16_t idx)
{
    struct nvme_submission_queue *sq;
    uint16_t status;
    uint16_t sqhead;

    /* handle all submissions up to sq->tail index */
    sq = &sc->submit_queues[idx];

    pthread_mutex_lock(&sq->mtx);

    sqhead = sq->head;
    DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
        idx, sqhead, sq->tail, sq->qbase);

    while (sqhead != atomic_load_acq_short(&sq->tail)) {
        struct nvme_command *cmd;
        struct pci_nvme_ioreq *req;
        uint32_t nsid;
        bool pending;

        pending = false;
        req = NULL;
        status = 0;

        cmd = &sq->qbase[sqhead];
        sqhead = (sqhead + 1) % sq->size;

        nsid = le32toh(cmd->nsid);
        if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
            pci_nvme_status_genc(&status,
                NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
            status |=
                NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
            goto complete;
        }

        req = pci_nvme_get_ioreq(sc);
        if (req == NULL) {
            pci_nvme_status_genc(&status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
            WPRINTF("%s: unable to allocate IO req", __func__);
            goto complete;
        }
        req->nvme_sq = sq;
        req->sqid = idx;
        req->opc = cmd->opc;
        req->cid = cmd->cid;
        req->nsid = cmd->nsid;

        switch (cmd->opc) {
        case NVME_OPC_FLUSH:
            pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
                req, &status);
            break;
        case NVME_OPC_WRITE:
        case NVME_OPC_READ:
            pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
                req, &status);
            break;
        case NVME_OPC_WRITE_ZEROES:
            /* TODO: write zeroes
            WPRINTF("%s write zeroes lba 0x%lx blocks %u",
                __func__, lba, cmd->cdw12 & 0xFFFF); */
            pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
            break;
        case NVME_OPC_DATASET_MANAGEMENT:
            pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
                req, &status);
            break;
        default:
            WPRINTF("%s unhandled io command 0x%x",
                __func__, cmd->opc);
            pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
        }
complete:
        if (!pending) {
            pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                status);
            if (req != NULL)
                pci_nvme_release_ioreq(sc, req);
        }
    }

    sq->head = sqhead;

    pthread_mutex_unlock(&sq->mtx);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0)
			pci_nvme_handle_admin_cmd(sc, value);
		else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
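
/* Trace BAR0 register accesses by name; active only when nvme_debug is set */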
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
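
/*
 * Handle a guest write to BAR0. Writes at or above NVME_DOORBELL_OFFSET
 * ring a queue doorbell; anything below is a controller register write,
 * which must be exactly 4 bytes wide.
 */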
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* doorbell size = 2 * sizeof(uint32_t) */
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);
	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}
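
/* BAR write entry point: dispatch MSI-X table/PBA accesses vs. BAR0 */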
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);
		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;
	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
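
/*
 * Handle a guest read of BAR0. Only the register file below
 * NVME_DOORBELL_OFFSET is readable; reads beyond it return zero.
 */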
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	/* Mask the value down to the requested access width */
	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);
		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);
	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
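
/*
 * Parse the comma-separated option string for this device instance. The
 * first option that is not a recognized keyword is treated as the blockif
 * backing store path, e.g. (values hypothetical):
 *   -s 4,nvme,/path/to/disk.img,maxq=4,qsz=256,ioslots=16,ser=SN123456
 * Returns 0 on success and -1 on any parse or allocation failure.
 */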
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
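
/*
 * Device model initialization: parse options, allocate the ioreq pool,
 * program PCI config space, size BAR0 (registers plus doorbells), and
 * add the MSI-X and PCI Express capabilities before the initial reset.
 */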
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
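
/* Register this emulation under the "nvme" device name */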
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);