/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * params:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 */

/* TODO:
 *  - create async event for SMART and log
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>

static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)
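/*
 * Example (illustrative): because size bits 13:4 must read back as zero,
 * the smallest power-of-two region BAR0 can claim is 1 << 14 = 16 KiB,
 * which also leaves room for the doorbell registers that begin at offset
 * 0x1000 past the controller registers.
 */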
#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0

/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
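/*
 * Example (illustrative): with NVME_MPSMIN of 0, the minimum memory page
 * size is 1 << (12 + 0) = 4096 bytes, i.e. CAP.MPSMIN advertises 4 KiB
 * pages, and the MDTS-based limits below scale in units of this page size.
 */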
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
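/*
 * Example (illustrative): with num_squeues = 4 and num_cqueues = 2, the
 * macro above encodes ZERO_BASED(4) = 3 in bits 15:0 and ZERO_BASED(2) = 1
 * in bits 31:16, yielding 0x00010003 for the Number of Queues feature.
 */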
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	uint32_t	size;
	uint16_t	tail;	/* nvme progress */
	uint16_t	head;	/* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;

	pthread_mutex_t	mtx;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	uint32_t	size;
	uint16_t	head;	/* nvme progress */
	uint16_t	tail;	/* guest progress */
	uint16_t	cqid;	/* completion queue id */
	int		qpriority;

	pthread_mutex_t	mtx;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint64_t	eui64;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 *
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
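/*
 * Example (illustrative, values assumed): if NVME_MDTS were 9, NVME_MAX_IOVEC
 * would be (1 << 9) + 1 = 513; with a BLOCKIF_IOV_MAX of 128, MDTS_PAD_SIZE
 * pads the request with 513 - 128 = 385 extra iovec entries so a maximum
 * sized transfer always fits.
 */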
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;

	STAILQ_ENTRY(pci_nvme_ioreq) link;

	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	uint32_t	aer_count;
};

static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{
	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{
	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return (crc);
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{
	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0;	/* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}

		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);
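	/*
	 * Example (illustrative): the generated EUI-64 packs the FreeBSD OUI
	 * into the high bytes, a CRC-16 of "<vmname><bus><slot><func>" below
	 * that, and the 16-bit NSID in the low two bytes, giving each
	 * namespace of each emulated controller a stable, unique identifier.
	 */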
	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	sc->feat[0].set = nvme_feature_invalid_cb;
	sc->feat[0].get = nvme_feature_invalid_cb;

	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
	    nvme_feature_iv_config;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
	    nvme_feature_invalid_cb;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
	    nvme_feature_invalid_cb;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{
	STAILQ_INIT(&sc->aer_list);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}

	pci_nvme_aer_init(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{
	return (!STAILQ_EMPTY(&sc->aer_list));
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	if (pci_nvme_aer_limit_reached(sc))
		return (-1);

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL)
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);

	return (aer);
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;
	len -= bytes;
	if (len == 0)
		return (0);

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}

static void
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}

static void
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return;
		}

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return;
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent a non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
}

static void
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return;
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}

static void
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return;
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return;
	}
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
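	/*
	 * Example (illustrative): a host requesting a 512 byte log page sets
	 * the zero-based dword count to 127; the computation above yields
	 * (127 + 1) * sizeof(uint32_t) = 512 bytes.
	 */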
	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		((uint32_t *)dest)[1] = 0;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}

static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}

static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}

static void
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	feat = &sc->feat[fid];

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return;
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS)
		feat->cdw11 = command->cdw11;
}

static void
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
	}
}

static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		return (1);
	}

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		return (1);
	}

	/*
	 * Raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	compl->status = NVME_NO_STATUS;

	return (0);
}

static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
				break;
			}
			compl.status = NVME_NO_STATUS;
			nvme_opc_format_nvm(sc, cmd, &compl);
			break;
		default:
			DPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
			break;
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}

/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines "data unit" as thousands of 512 byte blocks and is rounded up.
 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
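 *
 * Example (illustrative): a 4 KiB write adds 8 to the remainder; because
 * the remainder starts at 999, the first such write carries it past 1000
 * and immediately counts one full data unit, matching the spec's round-up
 * requirement.
 */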
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{
	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}

/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
 */
static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nlb)
{
	size_t	offset, bytes;

	/* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nlb << nvstore->sectsz_bits;

	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size - offset) < bytes)
		return (true);

	return (false);
}
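/*
 * Example (illustrative): with 512 byte sectors (sectsz_bits = 9), any
 * slba >= 2^55 would wrap "slba << 9" in 64 bits; "slba >> (64 - 9)" is
 * non-zero exactly for those values, so the first check rejects the
 * request before the shift can overflow.
 */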
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/* concatenate contig block-iovs to minimize number of iovs */
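	/*
	 * Example (illustrative): two guest-contiguous 4 KiB PRP entries at
	 * 0x1000 and 0x2000 arrive as successive calls; for the second call
	 * prev_gpaddr + prev_size == gpaddr, so the code below extends the
	 * previous iovec to 8 KiB instead of consuming a new entry.
	 */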
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return (req);
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}

/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}

static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

	if (is_write)
		dir = NVME_COPY_TO_PRP;
	else
		dir = NVME_COPY_FROM_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}

static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes  -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if (prp_list == last) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes  -= size;

			prp_list++;
		}
	}

	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
	return (status);
}
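/*
 * Example (illustrative): with 4 KiB pages, a 3-page transfer uses PRP1 for
 * the first page and treats PRP2 as a guest page holding a list of two more
 * PRP entries; a 2-page transfer uses PRP2 directly, so only transfers
 * larger than two pages take the prp_list walk above.
 */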
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	bytes = nblocks << nvstore->sectsz_bits;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	offset = lba << nvstore->sectsz_bits;

	req->bytes = bytes;
	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}
out:
	if (!pending)
		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

	return (pending);
}

static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}

static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range = NULL;
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		goto out;
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
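		 *
		 * Example (illustrative): the Number of Ranges in cdw10 is
		 * zero-based, so nr == 0 means one range; a two-range TRIM
		 * arrives with nr == 1 and is walked one range at a time by
		 * pci_nvme_dealloc_sm.
		 */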
2168 req->io_req.br_iovcnt = 0;
2169 req->io_req.br_offset = offset;
2170 req->io_req.br_resid = bytes;
2173 req->io_req.br_callback = pci_nvme_io_done;
2175 struct iovec *iov = req->io_req.br_iov;
2177 for (r = 0, dr = 0; r <= nr; r++) {
2178 offset = range[r].starting_lba << sectsz_bits;
2179 bytes = range[r].length << sectsz_bits;
2183 if ((nvstore->size - offset) < bytes) {
2184 pci_nvme_status_genc(status,
2185 NVME_SC_LBA_OUT_OF_RANGE);
2188 iov[dr].iov_base = (void *)offset;
2189 iov[dr].iov_len = bytes;
2192 req->io_req.br_callback = pci_nvme_dealloc_sm;
2195 * Use prev_gpaddr to track the current entry and
2196 * prev_size to track the number of entries
2198 req->prev_gpaddr = 0;
2199 req->prev_size = dr;
2202 err = blockif_delete(nvstore->ctx, &req->io_req);
2204 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2214 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2216 struct nvme_submission_queue *sq;
2220 /* handle all submissions up to sq->tail index */
2221 sq = &sc->submit_queues[idx];
2223 pthread_mutex_lock(&sq->mtx);
2226 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2227 idx, sqhead, sq->tail, sq->qbase);
2229 while (sqhead != atomic_load_acq_short(&sq->tail)) {
2230 struct nvme_command *cmd;
2231 struct pci_nvme_ioreq *req;
2239 cmd = &sq->qbase[sqhead];
2240 sqhead = (sqhead + 1) % sq->size;
2242 nsid = le32toh(cmd->nsid);
2243 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2244 pci_nvme_status_genc(&status,
2245 NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2247 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2251 req = pci_nvme_get_ioreq(sc);
2253 pci_nvme_status_genc(&status,
2254 NVME_SC_INTERNAL_DEVICE_ERROR);
2255 WPRINTF("%s: unable to allocate IO req", __func__);
2260 req->opc = cmd->opc;
2261 req->cid = cmd->cid;
2262 req->nsid = cmd->nsid;
2265 case NVME_OPC_FLUSH:
2266 pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2269 case NVME_OPC_WRITE:
2271 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2274 case NVME_OPC_WRITE_ZEROES:
2275 /* TODO: write zeroes
2276 WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2277 __func__, lba, cmd->cdw12 & 0xFFFF); */
2278 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2280 case NVME_OPC_DATASET_MANAGEMENT:
2281 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2285 WPRINTF("%s unhandled io command 0x%x",
2286 __func__, cmd->opc);
2287 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2291 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2294 pci_nvme_release_ioreq(sc, req);
2300 pthread_mutex_unlock(&sq->mtx);
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
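
/* Debug helper: log which BAR0 controller register is being accessed. */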
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
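
/*
 * Handle a guest write to BAR0. Offsets at or above NVME_DOORBELL_OFFSET
 * fall in the doorbell region: each queue pair owns 8 bytes there, the
 * SQ tail doorbell in the low 4 bytes and the CQ head doorbell in the
 * high 4 bytes. For example, a write at NVME_DOORBELL_OFFSET + 12 gives
 * belloffset 12, so idx = 12 / 8 = 1 and belloffset % 8 = 4, i.e. the
 * CQ head doorbell of queue pair 1. All other offsets are controller
 * register writes and must be 4 bytes wide.
 */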
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* doorbell size = 2 * 4 bytes */
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u", __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}
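
/*
 * BAR write dispatch: MSI-X table/PBA accesses are forwarded to the
 * generic PCI emulation; everything else is expected on BAR0.
 */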
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    "value 0x%lx", baridx, offset, size, value);
		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;
	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
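
/*
 * Handle a guest read of BAR0. Register reads are served directly from
 * the shadow register file under the softc mutex; reads of the doorbell
 * region are invalid and return zero. The result is masked to the access
 * size because memcpy() only fills the low "size" bytes of value.
 */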
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF("nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
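
/* BAR read dispatch: mirror of pci_nvme_write() for the read path. */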
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);
		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);
	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
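
/*
 * Parse the comma-separated option string. Each "name=value" token is
 * split in place at the '='. An unrecognized token is accepted only in
 * the first position, where it names the blockif backing device.
 * Returns 0 on success and -1 on any parse or open failure.
 */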
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	if (opts == NULL) {
		EPRINTLN("backing store not specified");
		return (-1);
	}

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII; unused bytes should be space characters.
			 * Ref: NVMe v1.3c.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management =
				    NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management =
				    NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management =
				    NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}

	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++)
		;

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
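
/*
 * Device model init: allocate the softc and I/O request slots, set up
 * PCI config space and the 64-bit memory BAR, add MSI-X and PCI Express
 * capabilities, and initialize queue, namespace, and controller state.
 */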
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
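
/*
 * Register this device model with the bhyve PCI emulation framework
 * under the name given by pe_emu.
 */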
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);