/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 */

/* TODO:
 *  - create async event for smart and log
 */
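
/*
 * Example invocation (illustrative only; the slot number and disk image
 * path below are hypothetical):
 *
 *   bhyve ... -s 4,nvme,/var/tmp/nvme.img,maxq=4,qsz=512,ioslots=16 ...
 *
 * This creates one controller backed by /var/tmp/nvme.img with four IO
 * submission/completion queue pairs of 512 entries each and at most 16
 * IO requests in flight.
 */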
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "debug.h"
#include "pci_emul.h"
static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
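
/*
 * Worked example of the limits above, assuming NVME_MDTS = 9 and
 * NVME_MPSMIN = 0 (4KiB pages): the largest transfer is
 * (1 << 9) * 4KiB = 2MiB, described by up to 512 page descriptors plus
 * one extra iovec for an unaligned initial descriptor (513 total).
 */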
/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)
/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
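
/*
 * For example, with num_squeues = 4 and num_cqueues = 4 the macro above
 * yields 0x00030003: the zero-based CQ count in bits 31:16 and the
 * zero-based SQ count in bits 15:0, the format Set/Get Features expects.
 */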
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02
struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 *
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;
};
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{
	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{
	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;	/* NVMe v1.3 */

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	sc->feat[0].set = nvme_feature_invalid_cb;
	sc->feat[0].get = nvme_feature_invalid_cb;

	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;
}
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
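
/*
 * Sketch of the flow that reaches the function below (standard NVMe
 * driver bring-up, nothing bhyve-specific): the guest programs AQA, ASQ,
 * and ACQ, then sets CC.EN and polls CSTS.RDY. The CC write handler calls
 * pci_nvme_init_controller() at that point to map the admin queues the
 * guest described.
 */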
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
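
/*
 * Copy data between a host buffer and a guest PRP1/PRP2 pair. PRP1
 * covers from its starting offset to the end of that physical page;
 * PRP2, when the transfer continues, names a second page. Transfers
 * larger than 8KiB (which would require a PRP list) are rejected, so
 * this helper serves admin data structures only, not guest IO.
 */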
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL)
		return (-1);

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;
	len -= bytes;
	if (len == 0)
		return (0);

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL)
		return (-1);

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-contig submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}
static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_completion) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}
static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
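
	/*
	 * E.g. NUMDL = 0x3ff in cdw10 with NUMDU = 0 requests 1024 dwords,
	 * so logsize becomes 4096 bytes; each case below further clamps the
	 * copy to the size of the emulated log structure.
	 */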
	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		/* TODO: present some smart info */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		((uint32_t *)dest)[1] = 0;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}

		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}
static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operational Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}
static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}
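
/*
 * Number of Queues (Feature ID 0x07): cdw11 carries the zero-based
 * Number of Submission Queues Requested (NSQR) in bits 15:0 and the
 * zero-based Number of Completion Queues Requested (NCQR) in bits 31:16.
 * E.g. cdw11 = 0x00070007 asks for 8 SQs and 8 CQs; the handler below
 * clamps the request and answers in the same format.
 */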
static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}
static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
	struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}
	feat = &sc->feat[fid];

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS)
		feat->cdw11 = command->cdw11;

	return (0);
}
static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get)
		feat->get(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
	}

	return (0);
}
static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	return (1);
}
static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/*
	 * TODO: raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
	return (1);
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			/* XXX don't care, unhandled for now
			nvme_opc_async_event_req(sc, cmd, &compl);
			*/
			compl.status = NVME_NO_STATUS;
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status,
				    NVME_SC_INVALID_OPCODE);
			} else {
				compl.status = NVME_NO_STATUS;
				nvme_opc_format_nvm(sc, cmd, &compl);
			}
			break;
		default:
			DPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req == NULL)
		return (-1);

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}
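
/*
 * Reserve an ioreq from the free list, blocking on the iosemlock
 * semaphore until a slot is free. The semaphore is initialized to
 * ioslots in pci_nvme_init(), so at most that many guest IO requests
 * are in flight at once; pci_nvme_release_ioreq() returns the slot and
 * posts the semaphore.
 */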
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_release_ioreq(req->sc, req);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}
static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

	/* A guest write moves data out of the PRPs into the backing RAM */
	if (is_write)
		dir = NVME_COPY_FROM_PRP;
	else
		dir = NVME_COPY_TO_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}
static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes  -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
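		/*
		 * Each list is one page of 8-byte entries (NVME_PRP2_ITEMS ==
		 * 512 with 4KiB pages) whose final entry chains to the next
		 * list. E.g. a maximal 2MiB transfer needs 513 descriptors
		 * and therefore spills into a second list.
		 */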
		while (bytes) {
			/* Last entry in list points to the next list */
			if (prp_list == last) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes  -= size;

			prp_list++;
		}
	}
	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
	return (status);
}
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;

	bytes = nblocks * nvstore->sectsz;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	offset = lba * nvstore->sectsz;
	if ((offset + bytes) > nvstore->size) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}
out:
	return (pending);
}
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		struct nvme_dsm_range *range;
		uint32_t nr, r;
		int sectsz = sc->nvstore.sectsz;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		/* copy locally because a range entry could straddle PRPs */
		range = calloc(1, NVME_MAX_DSM_TRIM);
		if (range == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
		    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		nr = cmd->cdw10 & 0xff;
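
		/*
		 * Each nvme_dsm_range holds { context attributes, length in
		 * LBAs, starting LBA }. E.g. a request to deallocate LBAs
		 * 0-255 arrives as a single range with nr == 0 and
		 * range[0] = { .length = 256, .starting_lba = 0 }.
		 */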
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = range[0].starting_lba * sectsz;
		req->io_req.br_resid = range[0].length * sectsz;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0; r <= nr; r++) {
				iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
				iov[r].iov_len = range[r].length * sectsz;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = r;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;

		free(range);
	}
out:
	return (pending);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;
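
		/*
		 * With CAP.DSTRD == 0, doorbells are 4 bytes wide and come
		 * in SQ-tail/CQ-head pairs: SQ n at NVME_DOORBELL_OFFSET +
		 * 8n, CQ n at NVME_DOORBELL_OFFSET + 8n + 4. E.g. a write
		 * to BAR0 offset 0x1008 rings the tail doorbell of IO SQ 1.
		 */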
		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}

	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(config, NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * window.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);