2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2017 Shunsuke Mie
5 * Copyright (c) 2018 Leon Dang
6 * Copyright (c) 2020 Chuck Tuffli
8 * Function crc16 Copyright (c) 2017, Fedor Uporov
9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * bhyve PCIe-NVMe device emulation.
37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
44 * maxq = max number of queues
45 * qsz = max elements in each queue
46 * ioslots = max number of concurrent io requests
47 * sectsz = sector size (defaults to blockif sector size)
48 * ser = serial number (20-chars max)
49 * eui64 = IEEE Extended Unique Identifier (8 byte value)
50 * dsm = Dataset Management support. Option is one of auto, enable, disable
55 - create async event for SMART and log
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
68 #include <semaphore.h>
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
80 #include <dev/nvme/nvme.h>
88 static int nvme_debug = 0;
89 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
90 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
92 /* defaults; can be overridden */
93 #define NVME_MSIX_BAR 4
95 #define NVME_IOSLOTS 8
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN (1 << 14)
100 #define NVME_QUEUES 16
101 #define NVME_MAX_QENTRIES 2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define NVME_MPSMIN 0
104 /* MPSMIN converted to bytes */
105 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN))
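/*
 * Worked example: with NVME_MPSMIN of 0, the minimum memory page size is
 * 1 << (12 + 0) = 4096 bytes, matching the host page size on amd64.
 */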
107 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t))
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1)
111 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
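/*
 * Worked example (illustrative value, since the NVME_MDTS definition is not
 * shown here): if NVME_MDTS were 9, a single command could carry up to
 * (1 << 9) * 4096 bytes = 2 MiB of data and would need at most
 * (1 << 9) + 1 = 513 page descriptors.
 */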
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS 0xffff
115 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS)
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero) ((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one) ((one) - 1)
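/*
 * Example: a host requesting 4 IO queues writes the zero-based value 3;
 * ONE_BASED(3) == 4 internally, and ZERO_BASED(4) == 3 when reporting back.
 */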
124 /* Encode number of SQ's and CQ's for Set/Get Features */
125 #define NVME_FEATURE_NUM_QUEUES(sc) \
126 ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
127 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
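/*
 * Example: with num_squeues == 8 and num_cqueues == 8, this encodes
 * (7 & 0xffff) | ((7 & 0xffff) << 16) == 0x00070007 for CDW0/CDW11.
 */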
129 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell)
131 enum nvme_controller_register_offsets {
132 NVME_CR_CAP_LOW = 0x00,
133 NVME_CR_CAP_HI = 0x04,
135 NVME_CR_INTMS = 0x0c,
136 NVME_CR_INTMC = 0x10,
141 NVME_CR_ASQ_LOW = 0x28,
142 NVME_CR_ASQ_HI = 0x2c,
143 NVME_CR_ACQ_LOW = 0x30,
144 NVME_CR_ACQ_HI = 0x34,
147 enum nvme_cmd_cdw11 {
148 NVME_CMD_CDW11_PC = 0x0001,
149 NVME_CMD_CDW11_IEN = 0x0002,
150 NVME_CMD_CDW11_IV = 0xFFFF0000,
158 #define NVME_CQ_INTEN 0x01
159 #define NVME_CQ_INTCOAL 0x02
161 struct nvme_completion_queue {
162 struct nvme_completion *qbase;
165 uint16_t tail; /* nvme progress */
166 uint16_t head; /* guest progress */
171 struct nvme_submission_queue {
172 struct nvme_command *qbase;
175 uint16_t head; /* nvme progress */
176 uint16_t tail; /* guest progress */
177 uint16_t cqid; /* completion queue id */
181 enum nvme_storage_type {
182 NVME_STOR_BLOCKIF = 0,
186 struct pci_nvme_blockstore {
187 enum nvme_storage_type type;
191 uint32_t sectsz_bits;
193 uint32_t deallocate:1;
197 * Calculate the number of additional page descriptors for guest IO requests
198 * based on the advertised Max Data Transfer Size (MDTS) and given the number of
199 * default iovecs in a struct blockif_req.
201 * Note the + 1 allows for the initial descriptor to not be page aligned.
203 #define MDTS_PAD_SIZE \
204 NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206 0
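/*
 * Example (assuming BLOCKIF_IOV_MAX is 128 and NVME_MAX_IOVEC is 513; both
 * definitions live outside this excerpt): MDTS_PAD_SIZE evaluates to 385,
 * the count of extra iovec entries appended to struct pci_nvme_ioreq below.
 */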
208 struct pci_nvme_ioreq {
209 struct pci_nvme_softc *sc;
210 STAILQ_ENTRY(pci_nvme_ioreq) link;
211 struct nvme_submission_queue *nvme_sq;
214 /* command information */
219 uint64_t prev_gpaddr;
223 struct blockif_req io_req;
225 struct iovec iovpadding[MDTS_PAD_SIZE];
229 /* Dataset Management bit in ONCS reflects backing storage capability */
230 NVME_DATASET_MANAGEMENT_AUTO,
231 /* Unconditionally set Dataset Management bit in ONCS */
232 NVME_DATASET_MANAGEMENT_ENABLE,
233 /* Unconditionally clear Dataset Management bit in ONCS */
234 NVME_DATASET_MANAGEMENT_DISABLE,
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241 struct nvme_feature_obj *,
242 struct nvme_command *,
243 struct nvme_completion *);
245 struct nvme_feature_obj {
249 bool namespace_specific;
252 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
254 struct pci_nvme_aer {
255 STAILQ_ENTRY(pci_nvme_aer) link;
256 uint16_t cid; /* Command ID of the submitted AER */
259 struct pci_nvme_softc {
260 struct pci_devinst *nsc_pi;
264 struct nvme_registers regs;
266 struct nvme_namespace_data nsdata;
267 struct nvme_controller_data ctrldata;
268 struct nvme_error_information_entry err_log;
269 struct nvme_health_information_page health_log;
270 struct nvme_firmware_page fw_log;
272 struct pci_nvme_blockstore nvstore;
274 uint16_t max_qentries; /* max entries per queue */
275 uint32_t max_queues; /* max number of IO SQ's or CQ's */
276 uint32_t num_cqueues;
277 uint32_t num_squeues;
278 bool num_q_is_set; /* Has host set Number of Queues */
280 struct pci_nvme_ioreq *ioreqs;
281 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
282 uint32_t pending_ios;
287 * Memory mapped Submission and Completion queues
288 * Each array includes both Admin and IO queues
290 struct nvme_completion_queue *compl_queues;
291 struct nvme_submission_queue *submit_queues;
293 struct nvme_feature_obj feat[NVME_FID_MAX];
295 enum nvme_dsm_type dataset_management;
297 /* Accounting for SMART data */
298 __uint128_t read_data_units;
299 __uint128_t write_data_units;
300 __uint128_t read_commands;
301 __uint128_t write_commands;
302 uint32_t read_dunits_remainder;
303 uint32_t write_dunits_remainder;
305 STAILQ_HEAD(, pci_nvme_aer) aer_list;
310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
312 static void pci_nvme_io_done(struct blockif_req *, int);
314 /* Controller Configuration utils */
315 #define NVME_CC_GET_EN(cc) \
316 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
317 #define NVME_CC_GET_CSS(cc) \
318 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
319 #define NVME_CC_GET_SHN(cc) \
320 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
321 #define NVME_CC_GET_IOSQES(cc) \
322 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
323 #define NVME_CC_GET_IOCQES(cc) \
324 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
326 #define NVME_CC_WRITE_MASK \
327 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
328 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
329 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
331 #define NVME_CC_NEN_WRITE_MASK \
332 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
333 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
334 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
336 /* Controller Status utils */
337 #define NVME_CSTS_GET_RDY(sts) \
338 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
340 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT)
342 /* Completion Queue status word utils */
343 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT)
344 #define NVME_STATUS_MASK \
345 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
346 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
348 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
349 NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
352 struct nvme_feature_obj *,
353 struct nvme_command *,
354 struct nvme_completion *);
355 static void nvme_feature_num_queues(struct pci_nvme_softc *,
356 struct nvme_feature_obj *,
357 struct nvme_command *,
358 struct nvme_completion *);
361 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
365 len = strnlen(src, dst_size);
366 memset(dst, pad, dst_size);
367 memcpy(dst, src, len);
371 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
374 *status &= ~NVME_STATUS_MASK;
375 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
376 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
380 pci_nvme_status_genc(uint16_t *status, uint16_t code)
383 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
387 * Initialize the requested number of IO Submission and Completion Queues.
388 * Admin queues are allocated implicitly.
391 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
396 * Allocate and initialize the Submission Queues
398 if (nsq > NVME_QUEUES) {
399 WPRINTF("%s: clamping number of SQ from %u to %u",
400 __func__, nsq, NVME_QUEUES);
404 sc->num_squeues = nsq;
406 sc->submit_queues = calloc(sc->num_squeues + 1,
407 sizeof(struct nvme_submission_queue));
408 if (sc->submit_queues == NULL) {
409 WPRINTF("%s: SQ allocation failed", __func__);
412 struct nvme_submission_queue *sq = sc->submit_queues;
414 for (i = 0; i < sc->num_squeues; i++)
415 pthread_mutex_init(&sq[i].mtx, NULL);
419 * Allocate and initialize the Completion Queues
421 if (ncq > NVME_QUEUES) {
422 WPRINTF("%s: clamping number of CQ from %u to %u",
423 __func__, ncq, NVME_QUEUES);
427 sc->num_cqueues = ncq;
429 sc->compl_queues = calloc(sc->num_cqueues + 1,
430 sizeof(struct nvme_completion_queue));
431 if (sc->compl_queues == NULL) {
432 WPRINTF("%s: CQ allocation failed", __func__);
435 struct nvme_completion_queue *cq = sc->compl_queues;
437 for (i = 0; i < sc->num_cqueues; i++)
438 pthread_mutex_init(&cq[i].mtx, NULL);
443 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
445 struct nvme_controller_data *cd = &sc->ctrldata;
450 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
451 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
453 /* Num of submission commands that we can handle at a time (2^rab) */
463 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */
465 cd->ver = 0x00010300;
467 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
471 /* Advertise 1, Read-only firmware slot */
472 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
473 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
474 cd->lpa = 0; /* TODO: support some simple things like SMART */
475 cd->elpe = 0; /* max error log page entries */
476 cd->npss = 1; /* number of power states supported */
478 /* Warning Composite Temperature Threshold */
481 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
482 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
483 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
484 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
485 cd->nn = 1; /* number of namespaces */
488 switch (sc->dataset_management) {
489 case NVME_DATASET_MANAGEMENT_AUTO:
490 if (sc->nvstore.deallocate)
491 cd->oncs |= NVME_ONCS_DSM;
493 case NVME_DATASET_MANAGEMENT_ENABLE:
494 cd->oncs |= NVME_ONCS_DSM;
502 cd->power_state[0].mp = 10;
506 * Calculate the CRC-16 of the given buffer
507 * See copyright attribution at top of file
510 crc16(uint16_t crc, const void *buffer, unsigned int len)
512 const unsigned char *cp = buffer;
513 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
514 static uint16_t const crc16_table[256] = {
515 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
516 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
517 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
518 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
519 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
520 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
521 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
522 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
523 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
524 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
525 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
526 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
527 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
528 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
529 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
530 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
531 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
532 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
533 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
534 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
535 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
536 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
537 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
538 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
539 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
540 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
541 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
542 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
543 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
544 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
545 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
546 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
550 crc = (((crc >> 8) & 0xffU) ^
551 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
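/*
 * This is the reflected CRC-16 with polynomial 0x8005 (CRC-16/ARC):
 * crc16(0, "123456789", 9) should yield the standard check value 0xBB3D.
 */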
556 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
557 struct nvme_namespace_data *nd, uint32_t nsid,
558 struct pci_nvme_blockstore *nvstore)
561 /* Get capacity and block size information from backing store */
562 nd->nsze = nvstore->size / nvstore->sectsz;
566 if (nvstore->type == NVME_STOR_BLOCKIF)
567 nvstore->deallocate = blockif_candelete(nvstore->ctx);
569 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
572 /* Create an EUI-64 if user did not provide one */
573 if (nvstore->eui64 == 0) {
575 uint64_t eui64 = nvstore->eui64;
577 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
578 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
581 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
584 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
586 be64enc(nd->eui64, nvstore->eui64);
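/*
 * Resulting EUI-64 layout: the upper 48 bits combine the FreeBSD OUI prefix
 * with a CRC-16 of the VM name and PCI bus/slot/function, and the low
 * 16 bits carry the NSID, e.g. ...0001 for namespace 1.
 */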
588 /* LBA data-sz = 2^lbads */
589 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
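/*
 * Example: a 512 byte sector gives sectsz_bits == 9, so LBADS == 9 and
 * the advertised LBA data size is 1 << 9 == 512 bytes.
 */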
593 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
596 memset(&sc->err_log, 0, sizeof(sc->err_log));
597 memset(&sc->health_log, 0, sizeof(sc->health_log));
598 memset(&sc->fw_log, 0, sizeof(sc->fw_log));
600 /* Set read/write remainder to round up according to spec */
601 sc->read_dunits_remainder = 999;
602 sc->write_dunits_remainder = 999;
606 pci_nvme_init_features(struct pci_nvme_softc *sc)
609 sc->feat[0].set = nvme_feature_invalid_cb;
610 sc->feat[0].get = nvme_feature_invalid_cb;
612 sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
613 sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
614 sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
618 pci_nvme_aer_init(struct pci_nvme_softc *sc)
621 STAILQ_INIT(&sc->aer_list);
626 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
628 struct pci_nvme_aer *aer = NULL;
630 while (!STAILQ_EMPTY(&sc->aer_list)) {
631 aer = STAILQ_FIRST(&sc->aer_list);
632 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
636 pci_nvme_aer_init(sc);
640 pci_nvme_aer_available(struct pci_nvme_softc *sc)
643 return (!STAILQ_EMPTY(&sc->aer_list));
647 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
649 struct nvme_controller_data *cd = &sc->ctrldata;
651 /* AERL is a zero-based value while aer_count is one-based */
652 return (sc->aer_count == (cd->aerl + 1));
656 * Add an Async Event Request
658 * Stores an AER to be returned later if the Controller needs to notify the
659 * host of an event.
660 * Note that while the NVMe spec doesn't require Controllers to return AERs
661 * in order, this implementation does preserve the order.
664 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
666 struct pci_nvme_aer *aer = NULL;
668 if (pci_nvme_aer_limit_reached(sc))
671 aer = calloc(1, sizeof(struct pci_nvme_aer));
677 /* Save the Command ID for use in the completion message */
679 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
685 * Get an Async Event Request structure
687 * Returns a pointer to an AER previously submitted by the host or NULL if
688 * no AERs exist. Caller is responsible for freeing the returned struct.
690 static struct pci_nvme_aer *
691 pci_nvme_aer_get(struct pci_nvme_softc *sc)
693 struct pci_nvme_aer *aer = NULL;
695 aer = STAILQ_FIRST(&sc->aer_list);
697 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
705 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
709 DPRINTF("%s", __func__);
711 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
712 (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
713 (60 << NVME_CAP_LO_REG_TO_SHIFT);
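/*
 * Example: with max_qentries of 2048, MQES reads as the zero-based 2047;
 * CQR == 1 requires physically contiguous queues, and TO == 60 advertises
 * a 30 second worst-case ready timeout (CAP.TO is in 500 ms units).
 */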
715 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
717 sc->regs.vs = 0x00010300; /* NVMe v1.3 */
722 assert(sc->submit_queues != NULL);
724 for (i = 0; i < sc->num_squeues + 1; i++) {
725 sc->submit_queues[i].qbase = NULL;
726 sc->submit_queues[i].size = 0;
727 sc->submit_queues[i].cqid = 0;
728 sc->submit_queues[i].tail = 0;
729 sc->submit_queues[i].head = 0;
732 assert(sc->compl_queues != NULL);
734 for (i = 0; i < sc->num_cqueues + 1; i++) {
735 sc->compl_queues[i].qbase = NULL;
736 sc->compl_queues[i].size = 0;
737 sc->compl_queues[i].tail = 0;
738 sc->compl_queues[i].head = 0;
741 sc->num_q_is_set = false;
743 pci_nvme_aer_destroy(sc);
747 pci_nvme_reset(struct pci_nvme_softc *sc)
749 pthread_mutex_lock(&sc->mtx);
750 pci_nvme_reset_locked(sc);
751 pthread_mutex_unlock(&sc->mtx);
755 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
759 DPRINTF("%s", __func__);
761 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
762 sc->submit_queues[0].size = asqs;
763 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
764 sizeof(struct nvme_command) * asqs);
766 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
767 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
769 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
770 NVME_AQA_REG_ACQS_MASK) + 1;
771 sc->compl_queues[0].size = acqs;
772 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
773 sizeof(struct nvme_completion) * acqs);
774 sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
776 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
777 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
781 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
782 size_t len, enum nvme_copy_dir dir)
787 if (len > (8 * 1024)) {
791 /* Copy from the start of prp1 to the end of the physical page */
792 bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
793 bytes = MIN(bytes, len);
795 p = vm_map_gpa(ctx, prp1, bytes);
800 if (dir == NVME_COPY_TO_PRP)
812 len = MIN(len, PAGE_SIZE);
814 p = vm_map_gpa(ctx, prp2, len);
819 if (dir == NVME_COPY_TO_PRP)
828 * Write a Completion Queue Entry update
830 * Write the completion and update the doorbell value
833 pci_nvme_cq_update(struct pci_nvme_softc *sc,
834 struct nvme_completion_queue *cq,
840 struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
841 struct nvme_completion *cqe;
843 assert(cq->qbase != NULL);
845 pthread_mutex_lock(&cq->mtx);
847 cqe = &cq->qbase[cq->tail];
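/*
 * The Phase Tag of each newly posted entry is inverted relative to the
 * stale entry already in this slot, so after a queue wrap the host can
 * still tell fresh completions from ones it has already consumed.
 */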
849 /* Flip the phase bit */
850 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
853 cqe->sqhd = sq->head;
856 cqe->status = status;
859 if (cq->tail >= cq->size) {
863 pthread_mutex_unlock(&cq->mtx);
867 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
868 struct nvme_completion* compl)
870 uint16_t qid = command->cdw10 & 0xffff;
872 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
873 if (qid == 0 || qid > sc->num_squeues ||
874 (sc->submit_queues[qid].qbase == NULL)) {
875 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
876 __func__, qid, sc->num_squeues);
877 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
878 NVME_SC_INVALID_QUEUE_IDENTIFIER);
882 sc->submit_queues[qid].qbase = NULL;
883 sc->submit_queues[qid].cqid = 0;
884 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
889 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
890 struct nvme_completion* compl)
892 if (command->cdw11 & NVME_CMD_CDW11_PC) {
893 uint16_t qid = command->cdw10 & 0xffff;
894 struct nvme_submission_queue *nsq;
896 if ((qid == 0) || (qid > sc->num_squeues) ||
897 (sc->submit_queues[qid].qbase != NULL)) {
898 WPRINTF("%s queue index %u > num_squeues %u",
899 __func__, qid, sc->num_squeues);
900 pci_nvme_status_tc(&compl->status,
901 NVME_SCT_COMMAND_SPECIFIC,
902 NVME_SC_INVALID_QUEUE_IDENTIFIER);
906 nsq = &sc->submit_queues[qid];
907 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
908 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
909 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
911 * Queues must specify at least two entries
912 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
913 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
915 pci_nvme_status_tc(&compl->status,
916 NVME_SCT_COMMAND_SPECIFIC,
917 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
921 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
922 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
923 pci_nvme_status_tc(&compl->status,
924 NVME_SCT_COMMAND_SPECIFIC,
925 NVME_SC_INVALID_QUEUE_IDENTIFIER);
929 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
930 pci_nvme_status_tc(&compl->status,
931 NVME_SCT_COMMAND_SPECIFIC,
932 NVME_SC_COMPLETION_QUEUE_INVALID);
936 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
938 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
939 sizeof(struct nvme_command) * (size_t)nsq->size);
941 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
942 qid, nsq->size, nsq->qbase, nsq->cqid);
944 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
946 DPRINTF("%s completed creating IOSQ qid %u",
950 * Guest sent a non-contiguous submission queue request.
951 * This setting is unsupported by this emulation.
953 WPRINTF("%s unsupported non-contig (list-based) "
954 "create i/o submission queue", __func__);
956 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
962 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
963 struct nvme_completion* compl)
965 uint16_t qid = command->cdw10 & 0xffff;
968 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
969 if (qid == 0 || qid > sc->num_cqueues ||
970 (sc->compl_queues[qid].qbase == NULL)) {
971 WPRINTF("%s queue index %u / num_cqueues %u",
972 __func__, qid, sc->num_cqueues);
973 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
974 NVME_SC_INVALID_QUEUE_IDENTIFIER);
978 /* Deleting an Active CQ is an error */
979 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
980 if (sc->submit_queues[sqid].cqid == qid) {
981 pci_nvme_status_tc(&compl->status,
982 NVME_SCT_COMMAND_SPECIFIC,
983 NVME_SC_INVALID_QUEUE_DELETION);
987 sc->compl_queues[qid].qbase = NULL;
988 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
993 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
994 struct nvme_completion* compl)
996 struct nvme_completion_queue *ncq;
997 uint16_t qid = command->cdw10 & 0xffff;
999 /* Only support Physically Contiguous queues */
1000 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1001 WPRINTF("%s unsupported non-contig (list-based) "
1002 "create i/o completion queue",
1005 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1009 if ((qid == 0) || (qid > sc->num_cqueues) ||
1010 (sc->compl_queues[qid].qbase != NULL)) {
1011 WPRINTF("%s queue index %u > num_cqueues %u",
1012 __func__, qid, sc->num_cqueues);
1013 pci_nvme_status_tc(&compl->status,
1014 NVME_SCT_COMMAND_SPECIFIC,
1015 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1019 ncq = &sc->compl_queues[qid];
1020 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1021 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1022 if (ncq->intr_vec > (sc->max_queues + 1)) {
1023 pci_nvme_status_tc(&compl->status,
1024 NVME_SCT_COMMAND_SPECIFIC,
1025 NVME_SC_INVALID_INTERRUPT_VECTOR);
1029 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1030 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
1032 * Queues must specify at least two entries
1033 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1034 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1036 pci_nvme_status_tc(&compl->status,
1037 NVME_SCT_COMMAND_SPECIFIC,
1038 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1041 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1043 sizeof(struct nvme_completion) * (size_t)ncq->size);
1045 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1052 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1053 struct nvme_completion* compl)
1056 uint8_t logpage = command->cdw10 & 0xFF;
1058 DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1060 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1063 * Command specifies the number of dwords to return in fields NUMDU
1064 * and NUMDL. This is a zero-based value.
1066 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1067 logsize *= sizeof(uint32_t);
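/*
 * Example: NUMDU == 0 and NUMDL == 0 means one dword, i.e. a logsize of
 * 4 bytes; a host reading a full 512 byte log page sets NUMDL to 127
 * (128 dwords, zero-based).
 */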
1070 case NVME_LOG_ERROR:
1071 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1072 command->prp2, (uint8_t *)&sc->err_log,
1073 MIN(logsize, sizeof(sc->err_log)),
1076 case NVME_LOG_HEALTH_INFORMATION:
1077 pthread_mutex_lock(&sc->mtx);
1078 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1079 sizeof(sc->health_log.data_units_read));
1080 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1081 sizeof(sc->health_log.data_units_written));
1082 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1083 sizeof(sc->health_log.host_read_commands));
1084 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1085 sizeof(sc->health_log.host_write_commands));
1086 pthread_mutex_unlock(&sc->mtx);
1088 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1089 command->prp2, (uint8_t *)&sc->health_log,
1090 MIN(logsize, sizeof(sc->health_log)),
1093 case NVME_LOG_FIRMWARE_SLOT:
1094 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1095 command->prp2, (uint8_t *)&sc->fw_log,
1096 MIN(logsize, sizeof(sc->fw_log)),
1100 DPRINTF("%s get log page %x command not supported",
1103 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1104 NVME_SC_INVALID_LOG_PAGE);
1111 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1112 struct nvme_completion* compl)
1117 DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1118 command->cdw10 & 0xFF, command->nsid);
1120 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1122 switch (command->cdw10 & 0xFF) {
1123 case 0x00: /* return Identify Namespace data structure */
1124 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1125 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1128 case 0x01: /* return Identify Controller data structure */
1129 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1130 command->prp2, (uint8_t *)&sc->ctrldata,
1131 sizeof(sc->ctrldata),
1134 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1135 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1136 sizeof(uint32_t) * 1024);
1137 ((uint32_t *)dest)[0] = 1;
1138 ((uint32_t *)dest)[1] = 0;
1140 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1141 if (command->nsid != 1) {
1142 pci_nvme_status_genc(&status,
1143 NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1146 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1147 sizeof(uint32_t) * 1024);
1148 /* All bytes after the descriptor shall be zero */
1149 bzero(dest, sizeof(uint32_t) * 1024);
1151 /* Return NIDT=1 (i.e. EUI64) descriptor */
1152 ((uint8_t *)dest)[0] = 1;
1153 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1154 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1157 DPRINTF("%s unsupported identify command requested 0x%x",
1158 __func__, command->cdw10 & 0xFF);
1159 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1163 compl->status = status;
1168 nvme_fid_to_name(uint8_t fid)
1173 case NVME_FEAT_ARBITRATION:
1174 name = "Arbitration";
1176 case NVME_FEAT_POWER_MANAGEMENT:
1177 name = "Power Management";
1179 case NVME_FEAT_LBA_RANGE_TYPE:
1180 name = "LBA Range Type";
1182 case NVME_FEAT_TEMPERATURE_THRESHOLD:
1183 name = "Temperature Threshold";
1185 case NVME_FEAT_ERROR_RECOVERY:
1186 name = "Error Recovery";
1188 case NVME_FEAT_VOLATILE_WRITE_CACHE:
1189 name = "Volatile Write Cache";
1191 case NVME_FEAT_NUMBER_OF_QUEUES:
1192 name = "Number of Queues";
1194 case NVME_FEAT_INTERRUPT_COALESCING:
1195 name = "Interrupt Coalescing";
1197 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1198 name = "Interrupt Vector Configuration";
1200 case NVME_FEAT_WRITE_ATOMICITY:
1201 name = "Write Atomicity Normal";
1203 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1204 name = "Asynchronous Event Configuration";
1206 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1207 name = "Autonomous Power State Transition";
1209 case NVME_FEAT_HOST_MEMORY_BUFFER:
1210 name = "Host Memory Buffer";
1212 case NVME_FEAT_TIMESTAMP:
1215 case NVME_FEAT_KEEP_ALIVE_TIMER:
1216 name = "Keep Alive Timer";
1218 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1219 name = "Host Controlled Thermal Management";
1221 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1222 name = "Non-Operation Power State Config";
1224 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1225 name = "Read Recovery Level Config";
1227 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1228 name = "Predictable Latency Mode Config";
1230 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1231 name = "Predictable Latency Mode Window";
1233 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1234 name = "LBA Status Information Report Interval";
1236 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1237 name = "Host Behavior Support";
1239 case NVME_FEAT_SANITIZE_CONFIG:
1240 name = "Sanitize Config";
1242 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1243 name = "Endurance Group Event Configuration";
1245 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1246 name = "Software Progress Marker";
1248 case NVME_FEAT_HOST_IDENTIFIER:
1249 name = "Host Identifier";
1251 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1252 name = "Reservation Notification Mask";
1254 case NVME_FEAT_RESERVATION_PERSISTENCE:
1255 name = "Reservation Persistence";
1257 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1258 name = "Namespace Write Protection Config";
1269 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1270 struct nvme_feature_obj *feat,
1271 struct nvme_command *command,
1272 struct nvme_completion *compl)
1275 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1279 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1280 struct nvme_feature_obj *feat,
1281 struct nvme_command *command,
1282 struct nvme_completion *compl)
1284 uint16_t nqr; /* Number of Queues Requested */
1286 if (sc->num_q_is_set) {
1287 WPRINTF("%s: Number of Queues already set", __func__);
1288 pci_nvme_status_genc(&compl->status,
1289 NVME_SC_COMMAND_SEQUENCE_ERROR);
1293 nqr = command->cdw11 & 0xFFFF;
1294 if (nqr == 0xffff) {
1295 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1296 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1300 sc->num_squeues = ONE_BASED(nqr);
1301 if (sc->num_squeues > sc->max_queues) {
1302 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1304 sc->num_squeues = sc->max_queues;
1307 nqr = (command->cdw11 >> 16) & 0xFFFF;
1308 if (nqr == 0xffff) {
1309 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1310 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1314 sc->num_cqueues = ONE_BASED(nqr);
1315 if (sc->num_cqueues > sc->max_queues) {
1316 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1318 sc->num_cqueues = sc->max_queues;
1321 /* Patch the command value which will be saved on callback's return */
1322 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1323 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1325 sc->num_q_is_set = true;
1329 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1330 struct nvme_completion *compl)
1332 struct nvme_feature_obj *feat;
1333 uint32_t nsid = command->nsid;
1334 uint8_t fid = command->cdw10 & 0xFF;
1336 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1338 if (fid >= NVME_FID_MAX) {
1339 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1340 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1343 feat = &sc->feat[fid];
1345 if (!feat->namespace_specific &&
1346 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1347 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1348 NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1353 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1356 feat->set(sc, feat, command, compl);
1358 if (compl->status == NVME_SC_SUCCESS)
1359 feat->cdw11 = command->cdw11;
1365 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1366 struct nvme_completion* compl)
1368 struct nvme_feature_obj *feat;
1369 uint8_t fid = command->cdw10 & 0xFF;
1371 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1373 if (fid >= NVME_FID_MAX) {
1374 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1375 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1380 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1382 feat = &sc->feat[fid];
1384 feat->get(sc, feat, command, compl);
1387 if (compl->status == NVME_SC_SUCCESS) {
1388 compl->cdw0 = feat->cdw11;
1395 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1396 struct nvme_completion* compl)
1398 uint8_t ses, lbaf, pi;
1400 /* Only supports Secure Erase Setting - User Data Erase */
1401 ses = (command->cdw10 >> 9) & 0x7;
1403 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1407 /* Only supports a single LBA Format */
1408 lbaf = command->cdw10 & 0xf;
1410 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1411 NVME_SC_INVALID_FORMAT);
1415 /* Doesn't support Protection Information */
1416 pi = (command->cdw10 >> 5) & 0x7;
1418 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1422 if (sc->nvstore.type == NVME_STOR_RAM) {
1423 if (sc->nvstore.ctx)
1424 free(sc->nvstore.ctx);
1425 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1426 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1428 struct pci_nvme_ioreq *req;
1431 req = pci_nvme_get_ioreq(sc);
1433 pci_nvme_status_genc(&compl->status,
1434 NVME_SC_INTERNAL_DEVICE_ERROR);
1435 WPRINTF("%s: unable to allocate IO req", __func__);
1438 req->nvme_sq = &sc->submit_queues[0];
1440 req->opc = command->opc;
1441 req->cid = command->cid;
1442 req->nsid = command->nsid;
1444 req->io_req.br_offset = 0;
1445 req->io_req.br_resid = sc->nvstore.size;
1446 req->io_req.br_callback = pci_nvme_io_done;
1448 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1450 pci_nvme_status_genc(&compl->status,
1451 NVME_SC_INTERNAL_DEVICE_ERROR);
1452 pci_nvme_release_ioreq(sc, req);
1460 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1461 struct nvme_completion* compl)
1463 DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1464 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1466 /* TODO: search for the command ID and abort it */
1469 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1474 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1475 struct nvme_command* command, struct nvme_completion* compl)
1477 DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1479 /* Don't exceed the Async Event Request Limit (AERL). */
1480 if (pci_nvme_aer_limit_reached(sc)) {
1481 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1482 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1486 if (pci_nvme_aer_add(sc, command->cid)) {
1487 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1488 NVME_SC_INTERNAL_DEVICE_ERROR);
1493 * Raise events when they happen based on the Set Features cmd.
1494 * These events happen asynchronously, so only mark the completion
1495 * successful if there is an event matching the request.
1497 compl->status = NVME_NO_STATUS;
1503 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1505 struct nvme_completion compl;
1506 struct nvme_command *cmd;
1507 struct nvme_submission_queue *sq;
1508 struct nvme_completion_queue *cq;
1511 DPRINTF("%s index %u", __func__, (uint32_t)value);
1513 sq = &sc->submit_queues[0];
1514 cq = &sc->compl_queues[0];
1516 pthread_mutex_lock(&sq->mtx);
1519 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1521 while (sqhead != atomic_load_acq_short(&sq->tail)) {
1522 cmd = &(sq->qbase)[sqhead];
1527 case NVME_OPC_DELETE_IO_SQ:
1528 DPRINTF("%s command DELETE_IO_SQ", __func__);
1529 nvme_opc_delete_io_sq(sc, cmd, &compl);
1531 case NVME_OPC_CREATE_IO_SQ:
1532 DPRINTF("%s command CREATE_IO_SQ", __func__);
1533 nvme_opc_create_io_sq(sc, cmd, &compl);
1535 case NVME_OPC_DELETE_IO_CQ:
1536 DPRINTF("%s command DELETE_IO_CQ", __func__);
1537 nvme_opc_delete_io_cq(sc, cmd, &compl);
1539 case NVME_OPC_CREATE_IO_CQ:
1540 DPRINTF("%s command CREATE_IO_CQ", __func__);
1541 nvme_opc_create_io_cq(sc, cmd, &compl);
1543 case NVME_OPC_GET_LOG_PAGE:
1544 DPRINTF("%s command GET_LOG_PAGE", __func__);
1545 nvme_opc_get_log_page(sc, cmd, &compl);
1547 case NVME_OPC_IDENTIFY:
1548 DPRINTF("%s command IDENTIFY", __func__);
1549 nvme_opc_identify(sc, cmd, &compl);
1551 case NVME_OPC_ABORT:
1552 DPRINTF("%s command ABORT", __func__);
1553 nvme_opc_abort(sc, cmd, &compl);
1555 case NVME_OPC_SET_FEATURES:
1556 DPRINTF("%s command SET_FEATURES", __func__);
1557 nvme_opc_set_features(sc, cmd, &compl);
1559 case NVME_OPC_GET_FEATURES:
1560 DPRINTF("%s command GET_FEATURES", __func__);
1561 nvme_opc_get_features(sc, cmd, &compl);
1563 case NVME_OPC_FIRMWARE_ACTIVATE:
1564 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1565 pci_nvme_status_tc(&compl.status,
1566 NVME_SCT_COMMAND_SPECIFIC,
1567 NVME_SC_INVALID_FIRMWARE_SLOT);
1569 case NVME_OPC_ASYNC_EVENT_REQUEST:
1570 DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1571 nvme_opc_async_event_req(sc, cmd, &compl);
1573 case NVME_OPC_FORMAT_NVM:
1574 DPRINTF("%s command FORMAT_NVM", __func__);
1575 if ((sc->ctrldata.oacs &
1576 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1577 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1579 compl.status = NVME_NO_STATUS;
1580 nvme_opc_format_nvm(sc, cmd, &compl);
1583 DPRINTF("0x%x command is not implemented",
1585 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1587 sqhead = (sqhead + 1) % sq->size;
1589 if (NVME_COMPLETION_VALID(compl)) {
1590 pci_nvme_cq_update(sc, &sc->compl_queues[0],
1598 DPRINTF("setting sqhead %u", sqhead);
1601 if (cq->head != cq->tail)
1602 pci_generate_msix(sc->nsc_pi, 0);
1604 pthread_mutex_unlock(&sq->mtx);
1608 * Update the Write and Read statistics reported in SMART data
1610 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1611 * E.g. 1 data unit is 1 - 1,000 512 byte blocks; 3 data units are 2,001 - 3,000
1612 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1615 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1616 size_t bytes, uint16_t status)
1619 pthread_mutex_lock(&sc->mtx);
1621 case NVME_OPC_WRITE:
1622 sc->write_commands++;
1623 if (status != NVME_SC_SUCCESS)
1625 sc->write_dunits_remainder += (bytes / 512);
1626 while (sc->write_dunits_remainder >= 1000) {
1627 sc->write_data_units++;
1628 sc->write_dunits_remainder -= 1000;
1632 sc->read_commands++;
1633 if (status != NVME_SC_SUCCESS)
1635 sc->read_dunits_remainder += (bytes / 512);
1636 while (sc->read_dunits_remainder >= 1000) {
1637 sc->read_data_units++;
1638 sc->read_dunits_remainder -= 1000;
1642 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1645 pthread_mutex_unlock(&sc->mtx);
1649 * Check if the combination of Starting LBA (slba) and Number of Logical
1650 * Blocks (nlb) exceeds the range of the underlying storage.
1652 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1653 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1654 * overflow.
1657 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1660 size_t offset, bytes;
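/*
 * Example: with sectsz_bits == 9, any slba of 2^55 or larger would wrap
 * the uint64_t byte offset, so the shift test below rejects it before
 * the offset is computed.
 */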
1662 /* Overflow check of multiplying Starting LBA by the sector size */
1663 if (slba >> (64 - nvstore->sectsz_bits))
1666 offset = slba << nvstore->sectsz_bits;
1667 bytes = nlb << nvstore->sectsz_bits;
1669 /* Overflow check of Number of Logical Blocks */
1670 if ((nvstore->size - offset) < bytes)
1677 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1678 uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1685 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1689 /* concatenate contig block-iovs to minimize number of iovs */
1690 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1691 iovidx = req->io_req.br_iovcnt - 1;
1693 req->io_req.br_iov[iovidx].iov_base =
1694 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1695 req->prev_gpaddr, size);
1697 req->prev_size += size;
1698 req->io_req.br_resid += size;
1700 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1702 iovidx = req->io_req.br_iovcnt;
1704 req->io_req.br_offset = lba;
1705 req->io_req.br_resid = 0;
1706 req->io_req.br_param = req;
1709 req->io_req.br_iov[iovidx].iov_base =
1710 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1713 req->io_req.br_iov[iovidx].iov_len = size;
1715 req->prev_gpaddr = gpaddr;
1716 req->prev_size = size;
1717 req->io_req.br_resid += size;
1719 req->io_req.br_iovcnt++;
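/*
 * Example of the coalescing above: two 4 KiB PRP entries that map
 * guest-contiguous addresses are merged into a single 8 KiB iovec,
 * leaving room for more PRP entries within NVME_MAX_IOVEC.
 */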
1726 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1727 struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1728 uint32_t cdw0, uint16_t status)
1730 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1732 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1733 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1734 NVME_STATUS_GET_SC(status));
1736 pci_nvme_cq_update(sc, cq,
1742 if (cq->head != cq->tail) {
1743 if (cq->intr_en & NVME_CQ_INTEN) {
1744 pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1746 DPRINTF("%s: CQ%u interrupt disabled",
1747 __func__, sq->cqid);
1753 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1756 req->nvme_sq = NULL;
1759 pthread_mutex_lock(&sc->mtx);
1761 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1764 /* When no more IO is pending, set Ready if the device was reset/enabled */
1765 if (sc->pending_ios == 0 &&
1766 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1767 sc->regs.csts |= NVME_CSTS_RDY;
1769 pthread_mutex_unlock(&sc->mtx);
1771 sem_post(&sc->iosemlock);
1774 static struct pci_nvme_ioreq *
1775 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1777 struct pci_nvme_ioreq *req = NULL;
1779 sem_wait(&sc->iosemlock);
1780 pthread_mutex_lock(&sc->mtx);
1782 req = STAILQ_FIRST(&sc->ioreqs_free);
1783 assert(req != NULL);
1784 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1790 pthread_mutex_unlock(&sc->mtx);
1792 req->io_req.br_iovcnt = 0;
1793 req->io_req.br_offset = 0;
1794 req->io_req.br_resid = 0;
1795 req->io_req.br_param = req;
1796 req->prev_gpaddr = 0;
1803 pci_nvme_io_done(struct blockif_req *br, int err)
1805 struct pci_nvme_ioreq *req = br->br_param;
1806 struct nvme_submission_queue *sq = req->nvme_sq;
1807 uint16_t code, status;
1809 DPRINTF("%s error %d %s", __func__, err, strerror(err));
1811 /* TODO return correct error */
1812 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1813 pci_nvme_status_genc(&status, code);
1815 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1816 pci_nvme_stats_write_read_update(req->sc, req->opc,
1817 req->bytes, status);
1818 pci_nvme_release_ioreq(req->sc, req);
1822 * Implements the Flush command. The specification states:
1823 * If a volatile write cache is not present, Flush commands complete
1824 * successfully and have no effect
1825 * in the description of the Volatile Write Cache (VWC) field of the Identify
1826 * Controller data. Therefore, set status to Success if the command is
1827 * not supported (i.e. RAM or as indicated by the blockif).
1830 nvme_opc_flush(struct pci_nvme_softc *sc,
1831 struct nvme_command *cmd,
1832 struct pci_nvme_blockstore *nvstore,
1833 struct pci_nvme_ioreq *req,
1836 bool pending = false;
1838 if (nvstore->type == NVME_STOR_RAM) {
1839 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1843 req->io_req.br_callback = pci_nvme_io_done;
1845 err = blockif_flush(nvstore->ctx, &req->io_req);
1851 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1854 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1862 nvme_write_read_ram(struct pci_nvme_softc *sc,
1863 struct pci_nvme_blockstore *nvstore,
1864 uint64_t prp1, uint64_t prp2,
1865 size_t offset, uint64_t bytes,
1868 uint8_t *buf = nvstore->ctx;
1869 enum nvme_copy_dir dir;
1873 dir = NVME_COPY_TO_PRP;
1875 dir = NVME_COPY_FROM_PRP;
1877 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1878 buf + offset, bytes, dir))
1879 pci_nvme_status_genc(&status,
1880 NVME_SC_DATA_TRANSFER_ERROR);
1882 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1888 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1889 struct pci_nvme_blockstore *nvstore,
1890 struct pci_nvme_ioreq *req,
1891 uint64_t prp1, uint64_t prp2,
1892 size_t offset, uint64_t bytes,
1897 uint16_t status = NVME_NO_STATUS;
1899 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1900 if (pci_nvme_append_iov_req(sc, req, prp1,
1901 size, is_write, offset)) {
1902 pci_nvme_status_genc(&status,
1903 NVME_SC_DATA_TRANSFER_ERROR);
1912 } else if (bytes <= PAGE_SIZE) {
1914 if (pci_nvme_append_iov_req(sc, req, prp2,
1915 size, is_write, offset)) {
1916 pci_nvme_status_genc(&status,
1917 NVME_SC_DATA_TRANSFER_ERROR);
1921 void *vmctx = sc->nsc_pi->pi_vmctx;
1922 uint64_t *prp_list = &prp2;
1923 uint64_t *last = prp_list;
1925 /* PRP2 is a pointer to a Physical Region Page list */
1927 /* Last entry in list points to the next list */
1928 if (prp_list == last) {
1929 uint64_t prp = *prp_list;
1931 prp_list = paddr_guest2host(vmctx, prp,
1932 PAGE_SIZE - (prp % PAGE_SIZE));
1933 last = prp_list + (NVME_PRP2_ITEMS - 1);
1936 size = MIN(bytes, PAGE_SIZE);
1938 if (pci_nvme_append_iov_req(sc, req, *prp_list,
1939 size, is_write, offset)) {
1940 pci_nvme_status_genc(&status,
1941 NVME_SC_DATA_TRANSFER_ERROR);
1951 req->io_req.br_callback = pci_nvme_io_done;
1953 err = blockif_write(nvstore->ctx, &req->io_req);
1955 err = blockif_read(nvstore->ctx, &req->io_req);
1958 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
1964 nvme_opc_write_read(struct pci_nvme_softc *sc,
1965 struct nvme_command *cmd,
1966 struct pci_nvme_blockstore *nvstore,
1967 struct pci_nvme_ioreq *req,
1970 uint64_t lba, nblocks, bytes;
1972 bool is_write = cmd->opc == NVME_OPC_WRITE;
1973 bool pending = false;
1975 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1976 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1977 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
1978 WPRINTF("%s command would exceed LBA range", __func__);
1979 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
1983 bytes = nblocks << nvstore->sectsz_bits;
1984 if (bytes > NVME_MAX_DATA_SIZE) {
1985 WPRINTF("%s command would exceed MDTS", __func__);
1986 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
1990 offset = lba << nvstore->sectsz_bits;
1993 req->io_req.br_offset = lba;
1995 /* PRP bits 1:0 must be zero */
1996 cmd->prp1 &= ~0x3UL;
1997 cmd->prp2 &= ~0x3UL;
1999 if (nvstore->type == NVME_STOR_RAM) {
2000 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2001 cmd->prp2, offset, bytes, is_write);
2003 *status = nvme_write_read_blockif(sc, nvstore, req,
2004 cmd->prp1, cmd->prp2, offset, bytes, is_write);
2006 if (*status == NVME_NO_STATUS)
2011 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2017 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2019 struct pci_nvme_ioreq *req = br->br_param;
2020 struct pci_nvme_softc *sc = req->sc;
2025 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2026 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2027 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2029 struct iovec *iov = req->io_req.br_iov;
2032 iov += req->prev_gpaddr;
2034 /* The iov_* values already include the sector size */
2035 req->io_req.br_offset = (off_t)iov->iov_base;
2036 req->io_req.br_resid = iov->iov_len;
2037 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2038 pci_nvme_status_genc(&status,
2039 NVME_SC_INTERNAL_DEVICE_ERROR);
2045 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2046 req->cid, 0, status);
2047 pci_nvme_release_ioreq(sc, req);
2052 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2053 struct nvme_command *cmd,
2054 struct pci_nvme_blockstore *nvstore,
2055 struct pci_nvme_ioreq *req,
2059 bool pending = false;
2061 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2062 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2066 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2067 struct nvme_dsm_range *range;
2068 size_t offset, bytes;
2070 int sectsz_bits = sc->nvstore.sectsz_bits;
2073 * DSM calls are advisory only, and compliant controllers
2074 * may choose to take no actions (i.e. return Success).
2076 if (!nvstore->deallocate) {
2077 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2082 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2086 /* copy locally because a range entry could straddle PRPs */
2087 range = calloc(1, NVME_MAX_DSM_TRIM);
2088 if (range == NULL) {
2089 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2092 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2093 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2095 if (pci_nvme_out_of_range(nvstore, range[0].starting_lba,
2097 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2100 offset = range[0].starting_lba << sectsz_bits;
2101 bytes = range[0].length << sectsz_bits;
2104 * If the request is for more than a single range, store
2105 * the ranges in the br_iov. Optimize for the common case
2106 * of a single range.
2108 * Note that NVMe Number of Ranges is a zero based value
2110 nr = cmd->cdw10 & 0xff;
2112 req->io_req.br_iovcnt = 0;
2113 req->io_req.br_offset = offset;
2114 req->io_req.br_resid = bytes;
2117 req->io_req.br_callback = pci_nvme_io_done;
2119 struct iovec *iov = req->io_req.br_iov;
2121 for (r = 0; r <= nr; r++) {
2122 if (pci_nvme_out_of_range(nvstore, range[r].starting_lba,
2124 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2127 offset = range[r].starting_lba << sectsz_bits;
2128 bytes = range[r].length << sectsz_bits;
2129 if ((nvstore->size - offset) < bytes) {
2130 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2133 iov[r].iov_base = (void *)offset;
2134 iov[r].iov_len = bytes;
2136 req->io_req.br_callback = pci_nvme_dealloc_sm;
2139 * Use prev_gpaddr to track the current entry and
2140 * prev_size to track the number of entries
2142 req->prev_gpaddr = 0;
2146 err = blockif_delete(nvstore->ctx, &req->io_req);
2148 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2159 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2161 struct nvme_submission_queue *sq;
2165 /* handle all submissions up to sq->tail index */
2166 sq = &sc->submit_queues[idx];
2168 pthread_mutex_lock(&sq->mtx);
2171 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2172 idx, sqhead, sq->tail, sq->qbase);
2174 while (sqhead != atomic_load_acq_short(&sq->tail)) {
2175 struct nvme_command *cmd;
2176 struct pci_nvme_ioreq *req;
2184 cmd = &sq->qbase[sqhead];
2185 sqhead = (sqhead + 1) % sq->size;
2187 nsid = le32toh(cmd->nsid);
2188 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2189 pci_nvme_status_genc(&status,
2190 NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2192 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2196 req = pci_nvme_get_ioreq(sc);
2198 pci_nvme_status_genc(&status,
2199 NVME_SC_INTERNAL_DEVICE_ERROR);
2200 WPRINTF("%s: unable to allocate IO req", __func__);
2205 req->opc = cmd->opc;
2206 req->cid = cmd->cid;
2207 req->nsid = cmd->nsid;
2210 case NVME_OPC_FLUSH:
2211 pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2214 case NVME_OPC_WRITE:
2216 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2219 case NVME_OPC_WRITE_ZEROES:
2220 /* TODO: write zeroes
2221 WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2222 __func__, lba, cmd->cdw12 & 0xFFFF); */
2223 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2225 case NVME_OPC_DATASET_MANAGEMENT:
2226 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2230 WPRINTF("%s unhandled io command 0x%x",
2231 __func__, cmd->opc);
2232 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2236 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2239 pci_nvme_release_ioreq(sc, req);
2245 pthread_mutex_unlock(&sq->mtx);
2249 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2250 uint64_t idx, int is_sq, uint64_t value)
2252 DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2253 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2256 if (idx > sc->num_squeues) {
2257 WPRINTF("%s queue index %lu overflow from "
2259 __func__, idx, sc->num_squeues);
2263 atomic_store_short(&sc->submit_queues[idx].tail,
2267 pci_nvme_handle_admin_cmd(sc, value);
2269 /* submission queue; handle new entries in SQ */
2270 if (idx > sc->num_squeues) {
2271 WPRINTF("%s SQ index %lu overflow from "
2273 __func__, idx, sc->num_squeues);
2276 pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2279 if (idx > sc->num_cqueues) {
2280 WPRINTF("%s queue index %lu overflow from "
2282 __func__, idx, sc->num_cqueues);
2286 atomic_store_short(&sc->compl_queues[idx].head,
2292 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2294 const char *s = iswrite ? "WRITE" : "READ";
2297 case NVME_CR_CAP_LOW:
2298 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2300 case NVME_CR_CAP_HI:
2301 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2304 DPRINTF("%s %s NVME_CR_VS", func, s);
2307 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2310 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2313 DPRINTF("%s %s NVME_CR_CC", func, s);
2316 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2319 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2322 DPRINTF("%s %s NVME_CR_AQA", func, s);
2324 case NVME_CR_ASQ_LOW:
2325 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2327 case NVME_CR_ASQ_HI:
2328 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2330 case NVME_CR_ACQ_LOW:
2331 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2333 case NVME_CR_ACQ_HI:
2334 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2337 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2343 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2344 uint64_t offset, int size, uint64_t value)
2348 if (offset >= NVME_DOORBELL_OFFSET) {
2349 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2350 uint64_t idx = belloffset / 8; /* doorbell pair size = 2 * sizeof(uint32_t) */
2351 int is_sq = (belloffset % 8) < 4;
2353 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2354 WPRINTF("guest attempted an overflow write offset "
2355 "0x%lx, val 0x%lx in %s",
2356 offset, value, __func__);
2360 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2364 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2365 offset, size, value);
2368 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2369 "val 0x%lx) to bar0 in %s",
2370 size, offset, value, __func__);
2371 /* TODO: shutdown device */
2375 pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2377 pthread_mutex_lock(&sc->mtx);
2380 case NVME_CR_CAP_LOW:
2381 case NVME_CR_CAP_HI:
2388 /* MSI-X, so ignore */
2391 /* MSI-X, so ignore */
2394 ccreg = (uint32_t)value;
2396 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2399 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2400 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2401 NVME_CC_GET_IOCQES(ccreg));
2403 if (NVME_CC_GET_SHN(ccreg)) {
2404 /* perform shutdown - flush out data to backend */
2405 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2406 NVME_CSTS_REG_SHST_SHIFT);
2407 sc->regs.csts |= NVME_SHST_COMPLETE <<
2408 NVME_CSTS_REG_SHST_SHIFT;
2410 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2411 if (NVME_CC_GET_EN(ccreg) == 0)
2412 /* transition 1-> causes controller reset */
2413 pci_nvme_reset_locked(sc);
2415 pci_nvme_init_controller(ctx, sc);
2418 /* Insert the iocqes, iosqes and en bits from the write */
2419 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2420 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2421 if (NVME_CC_GET_EN(ccreg) == 0) {
2422 /* Insert the ams, mps and css bit fields */
2423 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2424 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2425 sc->regs.csts &= ~NVME_CSTS_RDY;
2426 } else if (sc->pending_ios == 0) {
2427 sc->regs.csts |= NVME_CSTS_RDY;
2433 /* ignore writes; don't support subsystem reset */
2436 sc->regs.aqa = (uint32_t)value;
2438 case NVME_CR_ASQ_LOW:
2439 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2440 (0xFFFFF000 & value);
2442 case NVME_CR_ASQ_HI:
2443 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2446 case NVME_CR_ACQ_LOW:
2447 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2448 (0xFFFFF000 & value);
2450 case NVME_CR_ACQ_HI:
2451 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2455 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2456 __func__, offset, value, size);
2458 pthread_mutex_unlock(&sc->mtx);
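
/*
 * Worked example (illustrative): the admin queue base registers must be
 * page aligned, so the low-dword writes above mask with 0xFFFFF000 to clear
 * bits 11:0. A guest write of 0x12345678 to NVME_CR_ASQ_LOW is therefore
 * stored as 0x12345000.
 */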
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);
		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;
	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
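
/*
 * Illustrative note: register reads above are satisfied by copying directly
 * out of sc->regs, which relies on struct nvme_registers matching the
 * controller register file byte-for-byte. For example, a 4-byte read at
 * offset 0x1c (CSTS in the NVMe register map) returns the live csts value.
 */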
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);
		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);
	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 * Ref: NVMe v1.3c.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
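
/*
 * Worked example (illustrative): with the default max_queues of 16, the
 * BAR0 sizing above yields sizeof(struct nvme_registers) plus
 * 2 * 4 * 17 == 136 bytes of doorbell space, which MAX() then rounds up to
 * NVME_MMIO_SPACE_MIN (16K) to satisfy the spec and the Windows driver.
 */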
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);