/*
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent I/O requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20 chars max)
 *  eui64   = IEEE Extended Unique Identifier (8-byte value)
 *  dsm     = Dataset Management support. Option is one of auto, enable, or disable
 *
 * TODO:
 *    - create async event for SMART and log
 */
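/*
 * Example invocation (the slot number and backing file below are
 * illustrative, not defaults from this file):
 *
 *   bhyve ... -s 4,nvme,/tmp/nvme.img,maxq=4,qsz=512,ioslots=4,ser=NVME0001
 *
 * Any option may be omitted, in which case the defaults below apply.
 */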
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>
static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Maximum Dataset Management range payload copied locally (in bytes) */
#define	NVME_MAX_DSM_TRIM	4096

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)
/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
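/*
 * Worked example (values chosen for illustration): with num_squeues = 4 and
 * num_cqueues = 2, the macro yields (3 & 0xffff) | ((1 & 0xffff) << 16),
 * i.e. 0x00010003 -- the zero-based SQ count in the low word and the
 * zero-based CQ count in the high word, per the Number of Queues feature.
 */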
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02
struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail;	/* nvme progress */
	uint16_t	head;	/* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head;	/* nvme progress */
	uint16_t	tail;	/* guest progress */
	uint16_t	cqid;	/* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 *
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
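/*
 * Worked example (illustrative values, not taken from this file): with an
 * MDTS exponent of 9 and BLOCKIF_IOV_MAX of 128, NVME_MAX_IOVEC is
 * (1 << 9) + 1 = 513, so MDTS_PAD_SIZE reserves 513 - 128 = 385 extra
 * iovec entries; if NVME_MAX_IOVEC fit within BLOCKIF_IOV_MAX, no padding
 * would be reserved at all.
 */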
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	uint32_t	aer_count;
};
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0;	/* NLBAF is a 0's based value (i.e. 1 LBA Format) */

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;
}
static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{

	sc->feat[0].set = nvme_feature_invalid_cb;
	sc->feat[0].get = nvme_feature_invalid_cb;

	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
	    nvme_feature_iv_config;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
	    nvme_feature_invalid_cb;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
	    nvme_feature_invalid_cb;
}
static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}

	pci_nvme_aer_init(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (!STAILQ_EMPTY(&sc->aer_list));
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1));
}
/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 *
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	if (pci_nvme_aer_limit_reached(sc))
		return (-1);

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	sc->aer_count++;

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}

	return (aer);
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{

	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL)
		return (-1);

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;
	len -= bytes;
	if (len == 0)
		return (0);

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL)
		return (-1);

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
static void
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return;
		}

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return;
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent a non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contiguous (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
}
static void
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return;
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contiguous (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return;
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return;
	}

	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint32_t logsize = 0;
	uint8_t logpage = command->cdw10 & 0xFF;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
		break;
	}
}
static void
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}

		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
}
static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operational Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}
static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}
static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}
static void
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
	struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	feat = &sc->feat[fid];

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return;
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS)
		feat->cdw11 = command->cdw11;
}

static void
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get)
		feat->get(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
	}
}
static void
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return;
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return;
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
	}
}
static void
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{

	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}

static void
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{

	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		return;
	}

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		return;
	}

	/*
	 * Events are raised asynchronously, based on the Set Features
	 * command. Only post a completion for this request when an event
	 * reflecting it actually occurs, so mark it as having no status
	 * for now.
	 */
	compl->status = NVME_NO_STATUS;
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
			} else {
				compl.status = NVME_NO_STATUS;
				nvme_opc_format_nvm(sc, cmd, &compl);
			}
			break;
		default:
			DPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
			break;
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}
/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines "data units" as thousands of 512 byte blocks, rounded up
 * (e.g. 1 data unit covers 1 - 1,000 512 byte blocks, and 3 data units
 * cover 2,001 - 3,000 512 byte blocks). Rounding up is achieved by
 * initializing the remainder to 999.
 */
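/*
 * Worked example (illustrative numbers): starting from the 999 bias, a
 * successful 1024-sector (512 KiB) write adds 1024 to
 * write_dunits_remainder, so the loop below credits two data units and
 * leaves a remainder of 23; further writes don't add another unit until
 * the remainder again reaches 1000.
 */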
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}
/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflows.
 */
static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nlb)
{
	size_t	offset, bytes;

	/* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nlb << nvstore->sectsz_bits;

	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size - offset) < bytes)
		return (true);

	return (false);
}
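/*
 * For example (illustrative values): with 512 byte sectors
 * (sectsz_bits == 9), an slba with any of its uppermost 9 bits set would
 * overflow the 64-bit byte offset; that is exactly what the
 * slba >> (64 - sectsz_bits) test above rejects before shifting.
 */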
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		        req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		        gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}
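/*
 * Illustrative example of the concatenation above: PRP entries for guest
 * physical addresses 0x1000 and 0x2000, each of PAGE_SIZE (4096) bytes,
 * collapse into a single 8 KiB iovec because the second chunk begins where
 * the first ended; a discontiguous address (e.g. 0x5000) would start a new
 * iovec entry instead.
 */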
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}
static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

	if (is_write)
		dir = NVME_COPY_TO_PRP;
	else
		dir = NVME_COPY_FROM_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}
static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err = 0;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes  -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if (prp_list == last) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes  -= size;

			prp_list++;
		}
	}

	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
	return (status);
}
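/*
 * Sketch of the PRP handling above (illustrative sizes): a transfer of up
 * to two pages uses PRP1 for the first (possibly unaligned) chunk and PRP2
 * as a direct pointer to the second. Anything larger treats PRP2 as a
 * pointer to a list of page-sized entries; with 4 KiB pages a list holds
 * NVME_PRP2_ITEMS (512) entries, the last of which chains to the next list
 * when the transfer needs more entries than one page can describe.
 */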
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes = 0;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	bytes = nblocks << nvstore->sectsz_bits;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	offset = lba << nvstore->sectsz_bits;

	req->bytes = bytes;
	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}
out:
	if (!pending)
		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

	return (pending);
}
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range;
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		return (pending);
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		return (pending);
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0, dr = 0; r <= nr; r++) {
				offset = range[r].starting_lba << sectsz_bits;
				bytes = range[r].length << sectsz_bits;
				if (bytes == 0)
					continue;

				if ((nvstore->size - offset) < bytes) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				iov[dr].iov_base = (void *)offset;
				iov[dr].iov_len = bytes;
				dr++;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = dr;
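			/*
			 * Illustrative walk of the state machine set up
			 * above: with three non-zero ranges, dr is 3; the
			 * initial blockif_delete() below covers iov[0], and
			 * each pci_nvme_dealloc_sm() callback advances
			 * prev_gpaddr to issue iov[1] and iov[2] before
			 * completing the command.
			 */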
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;
	}
out:
	free(range);
	return (pending);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
			break;
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}

static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0)
			pci_nvme_handle_admin_cmd(sc, value);
		else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
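
/*
 * Clarifying note on the bounds checks above (not upstream text):
 * num_squeues/num_cqueues count the I/O queues only, so valid doorbell
 * indexes run 0..num_squeues inclusive, with index 0 reserved for the
 * admin queue. E.g. with num_squeues = 16, SQ doorbell indexes 0-16
 * are accepted and 17 and above are rejected.
 */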

static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}

static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		/* doorbells are two 4-byte regs per queue: SQ tail, CQ head */
		uint64_t idx = belloffset / 8;
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}
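
/*
 * Worked example for the doorbell decode above (hypothetical access):
 * a 4-byte guest write to BAR0 offset NVME_DOORBELL_OFFSET + 0x14
 * gives belloffset = 0x14, so idx = 0x14 / 8 = 2, and
 * belloffset % 8 = 4 makes is_sq = 0: the CQ head doorbell for
 * completion queue 2. A write to NVME_DOORBELL_OFFSET + 0x10 would
 * instead hit the SQ tail doorbell for submission queue 2.
 */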

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);
		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;
	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}

static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
    uint64_t offset, int size)
{
	uint64_t value = 0;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);

		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else
		WPRINTF("pci_nvme: read invalid offset %ld", offset);

	/* Registers are at most 8 bytes; mask down to the access size */
	if (size < 8)
		value &= (1UL << (size * 8)) - 1;

	DPRINTF("nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
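
/*
 * Illustrative note (hypothetical access): a 4-byte guest read of
 * NVME_CR_CAP_HI copies bytes 4-7 of sc->regs into the 64-bit scratch
 * value, since the register file is memcpy'd at the requested offset;
 * the masking above then truncates the result to the access size.
 */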

static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);
		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);
	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}

static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			/* &xopts[4] is the byte after "ram\0", i.e. config */
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management =
				    NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management =
				    NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management =
				    NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
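
/*
 * Example option strings accepted by the parser above (the device
 * paths are hypothetical):
 *
 *	-s 3,nvme,/dev/zvol/tank/nvme0,maxq=4,qsz=512,ser=SN123456
 *	-s 3,nvme,ram=1024,ioslots=16,dsm=enable
 *
 * The first comma-separated token without a recognized "key=" prefix
 * (optidx == 0) is treated as the blockif backing store path; "ram=N"
 * instead allocates an N MiB RAM-backed namespace with 4096-byte
 * sectors.
 */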

static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of
	 * 16K. The Windows driver will refuse to start a device with a
	 * smaller size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);

	pci_lintr_request(pi);

done:
	return (error);
}

struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);
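
/*
 * Example bhyve invocation attaching this emulation (VM name and
 * backing path are hypothetical):
 *
 *	bhyve -c 1 -m 1G -s 0,hostbridge \
 *	    -s 3,nvme,/dev/zvol/tank/disk0,maxq=4 \
 *	    -s 31,lpc -l com1,stdio vmname
 *
 * PCI_EMUL_SET places pci_de_nvme in the device emulation linker set,
 * so the "nvme" device name resolves to pci_nvme_init at VM creation.
 */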