/*
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 */
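/*
 * Example invocation (illustrative only; the disk image path, PCI slot, and
 * option values below are placeholders, not values mandated by this code):
 *
 *   bhyve ... -s 4,nvme,/var/tmp/nvme-disk.img,maxq=4,qsz=256,ioslots=16,ser=BHYVE001 ...
 */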
/*
 * TODO:
 *  - create async event for smart and log
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>
static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MAX_BLOCKIOVS	512
#define	NVME_MAX_DSM_TRIM	4096

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
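/*
 * Commands that do not produce an immediate completion (e.g. an Async Event
 * Request held for later) set their local completion status to
 * NVME_NO_STATUS; pci_nvme_handle_admin_cmd() checks NVME_COMPLETION_VALID()
 * before posting a CQ entry so no bogus completion is written.
 */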
/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)
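/*
 * For example, a Create I/O SQ command whose CDW10 QSIZE field is 255
 * describes a 256-entry queue: ONE_BASED(255) == 256.
 */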
/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
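/*
 * Doorbell registers begin at BAR0 offset 0x1000 (SQ0TDBL in the NVMe spec).
 * With the default doorbell stride (CAP.DSTRD == 0), as emulated here, each
 * queue pair owns 8 bytes: a 4-byte SQ tail doorbell followed by a 4-byte CQ
 * head doorbell.
 */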
enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;

	pthread_mutex_t	mtx;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;

	pthread_mutex_t	mtx;
};
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;

	/*
	 * lock if all iovs consumed (big IO);
	 * complete transaction before continuing
	 */
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;

	struct blockif_req io_req;

	/* pad to fit up to 512 page descriptors from guest IO request */
	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS - BLOCKIF_IOV_MAX];
};
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	/* controller features */
	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
	uint32_t	async_ev_config;         /* 0x0B: async event config */

	enum nvme_dsm_type dataset_management;
};
static void pci_nvme_io_partial(struct blockif_req *br, int err);
/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->acl = 2;
	cd->aerl = 4;

	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;

	return crc;
}
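/*
 * Usage sketch (illustrative values): pci_nvme_init_nsdata() below seeds the
 * CRC with 0 and runs it over a generated name string, e.g.
 *
 *	uint16_t sum = crc16(0, "vmname123", 9);
 *
 * to synthesize the low bits of an EUI-64.
 */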
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}

		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
static void
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return;
		}

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return;
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
}
static void
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return;
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return;
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return;
	}
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_completion) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
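	/*
	 * Example: a guest requesting 1024 dwords sets NUMDL to 0x3ff, so
	 * ((0 << 16) | 0x3ff) + 1 == 0x400 dwords == 4096 bytes.
	 */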
	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		/* TODO: present some smart info */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}
}
static void
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		((uint32_t *)dest)[1] = 0;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}

		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
}
static void
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
}
static void
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	int feature = command->cdw10 & 0xFF;
	uint32_t iv;

	DPRINTF("%s feature 0x%x", __func__, feature);

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF(" arbitration 0x%x", command->cdw11);
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF(" power management 0x%x", command->cdw11);
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF(" lba range 0x%x", command->cdw11);
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF(" temperature threshold 0x%x", command->cdw11);
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF(" error recovery 0x%x", command->cdw11);
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF(" volatile write cache 0x%x", command->cdw11);
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		nvme_set_feature_queues(sc, command, compl);
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF(" interrupt coalescing 0x%x", command->cdw11);

		/* in uS */
		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		iv = command->cdw11 & 0xFFFF;

		DPRINTF(" interrupt vector configuration 0x%x",
		    command->cdw11);

		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
			if (sc->compl_queues[i].intr_vec == iv) {
				if (command->cdw11 & (1 << 16))
					sc->compl_queues[i].intr_en |=
					    NVME_CQ_INTCOAL;
				else
					sc->compl_queues[i].intr_en &=
					    ~NVME_CQ_INTCOAL;
			}
		}
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF(" write atomicity 0x%x", command->cdw11);
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF(" async event configuration 0x%x",
		    command->cdw11);
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF(" software progress marker 0x%x",
		    command->cdw11);
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF(" autonomous power state transition 0x%x",
		    command->cdw11);
		break;
	default:
		WPRINTF("%s invalid feature", __func__);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	int feature = command->cdw10 & 0xFF;

	DPRINTF("%s feature 0x%x", __func__, feature);

	compl->cdw0 = 0;

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF(" arbitration");
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF(" power management");
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF(" lba range");
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF(" temperature threshold");
		switch ((command->cdw11 >> 20) & 0x3) {
		case 0:
			/* Over temp threshold */
			compl->cdw0 = 0xFFFF;
			break;
		case 1:
			/* Under temp threshold */
			compl->cdw0 = 0;
			break;
		default:
			WPRINTF(" invalid threshold type select");
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			return;
		}
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF(" error recovery");
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF(" volatile write cache");
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

		DPRINTF(" number of queues (submit %u, completion %u)",
		    compl->cdw0 & 0xFFFF,
		    (compl->cdw0 >> 16) & 0xFFFF);
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF(" interrupt coalescing");
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		DPRINTF(" interrupt vector configuration");
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF(" write atomicity");
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF(" async event configuration");
		/* Get Features returns the current setting in CDW0 */
		compl->cdw0 = sc->async_ev_config;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF(" software progress marker");
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF(" autonomous power state transition");
		break;
	default:
		WPRINTF("%s invalid feature 0x%x", __func__, feature);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}

static void
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/*
	 * TODO: raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			/* XXX don't care, unhandled for now
			nvme_opc_async_event_req(sc, cmd, &compl);
			*/
			compl.status = NVME_NO_STATUS;
			break;
		default:
			WPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req != NULL) {
		/* concatenate contig block-iovs to minimize number of iovs */
		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
			iovidx = req->io_req.br_iovcnt - 1;

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			        req->prev_gpaddr, size);

			req->prev_size += size;
			req->io_req.br_resid += size;

			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
		} else {
			pthread_mutex_lock(&req->mtx);

			iovidx = req->io_req.br_iovcnt;
			if (iovidx == NVME_MAX_BLOCKIOVS) {
				int err = 0;

				DPRINTF("large I/O, doing partial req");

				iovidx = 0;
				req->io_req.br_iovcnt = 0;

				req->io_req.br_callback = pci_nvme_io_partial;

				if (!do_write)
					err = blockif_read(sc->nvstore.ctx,
					    &req->io_req);
				else
					err = blockif_write(sc->nvstore.ctx,
					    &req->io_req);

				/* wait until req completes before cont */
				if (err == 0)
					pthread_cond_wait(&req->cv, &req->mtx);

				req->io_req.br_offset = lba;
				req->io_req.br_resid = 0;
				req->io_req.br_param = req;
			}

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			        gpaddr, size);

			req->io_req.br_iov[iovidx].iov_len = size;

			req->prev_gpaddr = gpaddr;
			req->prev_size = size;
			req->io_req.br_resid += size;

			req->io_req.br_iovcnt++;

			pthread_mutex_unlock(&req->mtx);
		}
	} else {
		/* RAM buffer: read/write directly */
		void *p = sc->nvstore.ctx;
		void *gptr;

		if ((lba + size) > sc->nvstore.size) {
			WPRINTF("%s write would overflow RAM", __func__);
			return (-1);
		}

		p = (void *)((uintptr_t)p + (uintptr_t)lba);
		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
		if (do_write)
			memcpy(p, gptr, size);
		else
			memcpy(gptr, p, size);
	}
	return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	pthread_cond_signal(&req->cv);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes;
	uint64_t size;
	size_t offset;
	int err;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;

	offset = lba * nvstore->sectsz;
	bytes  = nblocks * nvstore->sectsz;

	if ((offset + bytes) > nvstore->size) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}
	req->io_req.br_offset = offset;
	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		uint8_t *buf = nvstore->ctx;
		enum nvme_copy_dir dir;

		if (!is_write)
			dir = NVME_COPY_TO_PRP;
		else
			dir = NVME_COPY_FROM_PRP;

		if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
		    buf + offset, bytes, dir))
			pci_nvme_status_genc(status,
			    NVME_SC_DATA_TRANSFER_ERROR);
		else
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);

		goto out;
	}

	size = MIN(PAGE_SIZE - (cmd->prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, cmd->prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes  -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, cmd->prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &cmd->prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if (prp_list == last) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
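				/*
				 * Note: prp_list starts out pointing at PRP2
				 * itself, so the first pass through this block
				 * maps the first page of the guest's PRP list;
				 * thereafter, whenever the walk reaches 'last',
				 * that entry chains to the next list page.
				 */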
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes  -= size;

			prp_list++;
		}
	}

	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(status, NVME_SC_DATA_TRANSFER_ERROR);
	else
		pending = true;

out:
	return (pending);
}
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		struct nvme_dsm_range *range;
		uint32_t nr, r;
		int sectsz = sc->nvstore.sectsz;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		/* copy locally because a range entry could straddle PRPs */
		range = calloc(1, NVME_MAX_DSM_TRIM);
		if (range == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
		    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		nr = cmd->cdw10 & 0xff;

		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = range[0].starting_lba * sectsz;
		req->io_req.br_resid = range[0].length * sectsz;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0; r <= nr; r++) {
				iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
				iov[r].iov_len = range[r].length * sectsz;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = r;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;

		free(range);
	}
out:
	return (pending);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	pthread_mutex_unlock(&sq->mtx);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;
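		/*
		 * For example, a guest write to BAR0 offset 0x1008 decodes
		 * as belloffset 8 -> idx 1, is_sq true: the SQ1 tail doorbell.
		 */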
		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}
	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
	int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 * Ref: NVMe v1.3c.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);
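	/* e.g. a 4096-byte sector size yields sectsz_bits == 12 */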
	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
	}
	sc->intr_coales_aggr_thresh = 1;

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
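	/*
	 * For example, with the default 16 queue pairs this works out to
	 * sizeof(struct nvme_registers) plus 2 * 4 * 17 bytes of doorbells,
	 * well under the 16 KiB minimum, so NVME_MMIO_SPACE_MIN wins.
	 */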
	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);