/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = Dataset Management support. Option is one of: auto, enable, disable
 *
 * TODO:
 *  - create async event for smart and log
 */
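/*
 * Example invocation (illustrative only; the device path and serial
 * number below are hypothetical):
 *
 *  bhyve ... -s 4,nvme,/dev/zvol/tank/vm0,maxq=4,qsz=512,ioslots=16,ser=NVME0001,dsm=auto ...
 */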
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>
static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
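/* With NVME_MPSMIN == 0, the minimum memory page size is 1 << 12 == 4096 bytes. */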
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))

/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)
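/*
 * For example, a host writing 3 into a zero-based queue-size field is
 * requesting ONE_BASED(3) == 4 entries; converting back, ZERO_BASED(4) == 3.
 */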
/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
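/*
 * Worked example: with num_squeues == 4 and num_cqueues == 2, this
 * encodes to (3 & 0xffff) | ((1 & 0xffff) << 16) == 0x00010003.
 */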
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail;	/* nvme progress */
	uint16_t	head;	/* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head;	/* nvme progress */
	uint16_t	tail;	/* guest progress */
	uint16_t	cqid;	/* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint64_t	eui64;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint32_t	deallocate:1;
};
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer Size (MDTS) and given the number
 * of default iovecs in a struct blockif_req.
 *
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
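/*
 * Illustrative arithmetic: if NVME_MAX_IOVEC evaluated to 129 and
 * BLOCKIF_IOV_MAX to 128, MDTS_PAD_SIZE would be 1; if the blockif
 * default already covers NVME_MAX_IOVEC, no padding is added.
 */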
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	uint32_t	aer_count;
};
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}
static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;

	return (crc);
}
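/*
 * Illustrative check value: this table implements the reflected
 * CRC-16/ARC variant, for which crc16(0, "123456789", 9) is expected
 * to yield 0xBB3D.
 */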
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = 310;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;
}
static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{

	sc->feat[0].set = nvme_feature_invalid_cb;
	sc->feat[0].get = nvme_feature_invalid_cb;

	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
	    nvme_feature_iv_config;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
	    nvme_feature_invalid_cb;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
	    nvme_feature_invalid_cb;
}
static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}

	pci_nvme_aer_init(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (!STAILQ_EMPTY(&sc->aer_list));
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1));
}
/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 *
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	if (pci_nvme_aer_limit_reached(sc))
		return (-1);

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	sc->aer_count++;

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL)
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);

	return (aer);
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;
	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
static void
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return;
		}

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return;
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent a non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
}
static void
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return;
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return;
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return;
	}

	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_completion) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
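	/*
	 * Worked example (illustrative): a host requesting 64 dwords sets
	 * NUMDL to 63 (zero-based); (63 + 1) * sizeof(uint32_t) == 256 bytes.
	 */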
	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}
}
static void
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
}
static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operational Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}
static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}
static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}
static void
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	feat = &sc->feat[fid];

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return;
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS)
		feat->cdw11 = command->cdw11;
}

static void
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get)
		feat->get(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
	}
}
static void
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return;
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return;
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
	}
}
static void
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}

static void
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		return;
	}

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		return;
	}

	/*
	 * Raise events when they happen based on the Set Features cmd.
	 * These events happen asynchronously, so only set the completion
	 * to successful if there is an event reflecting the request.
	 */
	compl->status = NVME_NO_STATUS;
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status,
				    NVME_SC_INVALID_OPCODE);
				break;
			}
			compl.status = NVME_NO_STATUS;
			nvme_opc_format_nvm(sc, cmd, &compl);
			break;
		default:
			DPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}
/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512-byte blocks, rounded up;
 * e.g. 1 data unit covers 1 - 1,000 512-byte blocks, and 3 data units cover
 * 2,001 - 3,000. Rounding up is achieved by initializing the remainder to 999.
 */
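/*
 * Illustrative arithmetic: the remainder starts at 999, so the first
 * 512-byte block transferred pushes it to 1,000 and immediately counts
 * as a full data unit; any partial thousand therefore rounds up.
 */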
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}
/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
 */
static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nlb)
{
	size_t	offset, bytes;

	/* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nlb << nvstore->sectsz_bits;

	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size - offset) < bytes)
		return (true);

	return (false);
}
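/*
 * Illustrative check: with 512-byte sectors (sectsz_bits == 9), any
 * slba >= (1ULL << 55) would overflow the byte-offset calculation
 * slba << 9, so such values are rejected before shifting.
 */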
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO is pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return (req);
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}
static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

	if (is_write)
		dir = NVME_COPY_TO_PRP;
	else
		dir = NVME_COPY_FROM_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}
static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes  -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes  -= size;

			prp_list++;
		}
	}

	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
	return (status);
}
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes = 0;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	bytes = nblocks << nvstore->sectsz_bits;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	offset = lba << nvstore->sectsz_bits;

	req->bytes = bytes;
	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}

out:
	if (!pending)
		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

	return (pending);
}
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range;
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		return (pending);
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		return (pending);
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no action (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero-based value.
		 */
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0, dr = 0; r <= nr; r++) {
				offset = range[r].starting_lba << sectsz_bits;
				bytes = range[r].length << sectsz_bits;
				if (bytes == 0)
					continue;

				if ((nvstore->size - offset) < bytes) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				iov[dr].iov_base = (void *)offset;
				iov[dr].iov_len = bytes;
				dr++;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = dr;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;
	}
out:
	free(range);
	return (pending);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t idx, int is_sq, uint64_t value)
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (idx > sc->num_squeues) {
		WPRINTF("%s queue index %lu overflow from "
		    "guest (max %u)",
		    __func__, idx, sc->num_squeues);

	atomic_store_short(&sc->submit_queues[idx].tail,
	    (uint16_t)value);

	pci_nvme_handle_admin_cmd(sc, value);

	/* submission queue; handle new entries in SQ */
	if (idx > sc->num_squeues) {
		WPRINTF("%s SQ index %lu overflow from "
		    "guest (max %u)",
		    __func__, idx, sc->num_squeues);

	pci_nvme_handle_io_cmd(sc, (uint16_t)idx);

	if (idx > sc->num_cqueues) {
		WPRINTF("%s queue index %lu overflow from "
		    "guest (max %u)",
		    __func__, idx, sc->num_cqueues);

	atomic_store_short(&sc->compl_queues[idx].head,
	    (uint16_t)value);
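/*
 * Note on the doorbell dispatch above: an SQ tail doorbell for queue 0
 * (the admin queue) is handled by pci_nvme_handle_admin_cmd(), while
 * nonzero queue indices go to pci_nvme_handle_io_cmd(). A CQ head
 * doorbell only records the guest's new head pointer, since completions
 * are posted from the I/O completion paths.
 */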
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
	const char *s = iswrite ? "WRITE" : "READ";

	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		DPRINTF("%s %s NVME_CR_VS", func, s);
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		DPRINTF("%s %s NVME_CR_CC", func, s);
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		DPRINTF("%s %s NVME_CR_AQA", func, s);
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);

pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t offset, int size, uint64_t value)
	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* doorbell size = 2 * sizeof(uint32_t) */
		int is_sq = (belloffset % 8) < 4;
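		/*
		 * Worked example of the decode above (doorbell stride
		 * CAP.DSTRD = 0, i.e. 4-byte doorbells starting at
		 * NVME_DOORBELL_OFFSET): belloffset 0x08 gives idx = 1,
		 * is_sq = 1 (SQ 1 tail doorbell), while belloffset 0x0c
		 * gives idx = 1, is_sq = 0 (CQ 1 head doorbell).
		 */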
		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* MSI-X, so ignore */
		/* MSI-X, so ignore */
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;

		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
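		/*
		 * Summary of the CC.EN handling above:
		 *
		 *   EN 0->1: pci_nvme_init_controller() and, once no I/O
		 *            is pending, CSTS.RDY = 1
		 *   EN 1->0: pci_nvme_reset_locked() and CSTS.RDY = 0
		 */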
		/* ignore writes; don't support subsystem reset */

		sc->regs.aqa = (uint32_t)value;

	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
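		/*
		 * Worked example: the guest programs the 64-bit admin SQ
		 * base in two 32-bit halves. Writing 0x12345678 to
		 * ASQ_LOW keeps only the 4KB-aligned bits (0x12345678 &
		 * 0xFFFFF000 == 0x12345000), and a subsequent write of
		 * 0x1 to ASQ_HI leaves sc->regs.asq == 0x0000000112345000.
		 */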
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);

	pthread_mutex_unlock(&sc->mtx);

pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);
		pci_emul_msix_twrite(pi, offset, size, value);

		pci_nvme_write_bar_0(ctx, sc, offset, size, value);

		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);

static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
    uint64_t offset, int size)
	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);

	WPRINTF("pci_nvme: read invalid offset %ld", offset);

	value &= 0xFFFFFFFF;

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);
		return pci_emul_msix_tread(pi, offset, size);

		return pci_nvme_read_bar_0(sc, offset, size);

	DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;

	uopt = strdup(opts);

	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII; unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");

		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");

			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);

			EPRINTLN("Invalid option %s", xopts);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");

	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);
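	/*
	 * Example: sectsz = 4096 leaves the loop above at
	 * sectsz_bits = 12 (1 << 12 == 4096), which is what lets
	 * "lba << sectsz_bits" convert an LBA into a byte offset
	 * elsewhere in this file.
	 */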
	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");

	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
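/*
 * Illustrative device line accepted by the parser above (the backing path
 * is just an example):
 *
 *   -s 4,nvme,/path/to/backing-file,maxq=4,qsz=256,ioslots=16,sectsz=512
 *
 * The first option matching none of the keywords is treated as the blockif
 * backing store; the rest override the defaults assigned at the top of
 * pci_nvme_parse_opts().
 */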
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;

	sc = calloc(1, sizeof(struct pci_nvme_softc));

	error = pci_nvme_parse_opts(sc, opts);

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * window.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
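	/*
	 * Worked example: with the default max_queues of 16, this is
	 * sizeof(struct nvme_registers) plus 2 * 4 bytes of doorbell
	 * space for each of the 17 queue pairs (admin + 16 I/O), well
	 * under NVME_MMIO_SPACE_MIN, so the BAR is rounded up to 16K.
	 */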
	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error)
		WPRINTF("%s pci alloc mem bar failed", __func__);

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error)
		WPRINTF("%s pci add msixcap failed", __func__);

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error)
		WPRINTF("%s pci add Express capability failed", __func__);

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data before Controller data.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);

	pci_lintr_request(pi);

struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);
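/*
 * PCI_EMUL_SET() places pci_de_nvme in the linker set of device emulations
 * that bhyve scans at startup; the pe_emu name is what a
 * "-s <slot>,nvme,..." option is matched against.
 */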