/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent I/O requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20 characters max)
 *  eui64   = IEEE Extended Unique Identifier (8-byte value)
 *  dsm     = Dataset Management support; one of auto, enable, disable
 *
 * TODO:
 *  - create async event for SMART and log
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>
static int nvme_debug = 0;
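
/* DPRINTF is wrapped in do/while(0) so it stays safe in unbraced if/else */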
#define	DPRINTF(fmt, args...)	do {				\
	if (nvme_debug)						\
		PRINTLN(fmt, ##args);				\
} while (0)
#define	WPRINTF(fmt, args...)	PRINTLN(fmt, ##args)
/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

#define	NVME_PRP2_ITEMS		(PAGE_SIZE / sizeof(uint64_t))
#define	NVME_MAX_BLOCKIOVS	512

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)
/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
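
/*
 * The value above follows the NVMe Number of Queues feature layout: the
 * number of I/O Submission Queues allocated (zero-based) in bits 15:0 and
 * the number of I/O Completion Queues allocated in bits 31:16 of CDW0.
 */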
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail;	/* nvme progress */
	uint16_t	head;	/* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head;	/* nvme progress */
	uint16_t	tail;	/* guest progress */
	uint16_t	cqid;	/* completion queue id */
	int		qpriority;
};
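
/*
 * Queue index ownership: the guest advances a Submission Queue's tail via
 * the SQ tail doorbell while this emulation consumes entries from the head;
 * for Completion Queues the emulation produces at the tail and the guest
 * retires entries via the CQ head doorbell.
 */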
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;

	/*
	 * lock if all iovs consumed (big IO);
	 * complete transaction before continuing
	 */
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;

	struct blockif_req io_req;

	/* pad to fit up to 512 page descriptors from guest IO request */
	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS - BLOCKIF_IOV_MAX];
};
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	/* controller features */
	uint32_t	intr_coales_aggr_time;	 /* 0x08: uS to delay intr */
	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
	uint32_t	async_ev_config;	 /* 0x0B: async event config */

	enum nvme_dsm_type dataset_management;
};
static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM \
	(NVME_CTRLR_DATA_ONCS_DSM_MASK << NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}
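
	/* MP (Maximum Power) is in NVMe units of 0.01 W; 10 == 0.1 W */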
	cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return (crc);
}
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}

		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
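
	/*
	 * A generated EUI-64 thus carries the FreeBSD OUI plus a CRC-16 of
	 * the VM identity in its upper bytes and the namespace id in the
	 * low 16 bits, keeping it unique per namespace.
	 */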
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	int i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
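
	/*
	 * CAP notes: MQES is the zero-based maximum queue entry count,
	 * CQR == 1 requires physically contiguous queues, and TO is the
	 * enable timeout in 500 ms units (60 == 30 seconds).
	 */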
	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024))
		return (-1);

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL)
		return (-1);

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;
	len -= bytes;
	if (len == 0)
		return (0);

	/* The remainder (at most one page) is described by prp2 */
	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL)
		return (-1);

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
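
	/*
	 * Inverting the Phase Tag on each pass through the queue lets the
	 * guest distinguish newly posted completions from stale entries
	 * without a separate producer index register.
	 */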
	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
static int
nvme_opc_delete_io_sq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_sq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);
		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}
static int
nvme_opc_delete_io_cq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_cq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_completion_queue *ncq;

		if ((qid == 0) || (qid > sc->num_cqueues)) {
			WPRINTF("%s queue index %u > num_cqueues %u",
			    __func__, qid, sc->num_cqueues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		ncq = &sc->compl_queues[qid];
		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
		    command->prp1,
		    sizeof(struct nvme_completion) * (size_t)ncq->size);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		/*
		 * Non-contig completion queue unsupported.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		/* 0x12 = Invalid Use of Controller Memory Buffer */
		pci_nvme_status_genc(&compl->status, 0x12);
	}
	return (1);
}
static int
nvme_opc_get_log_page(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		/* TODO: present some smart info */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
static int
nvme_opc_identify(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		((uint32_t *)dest)[1] = 0;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}
static int
nvme_set_feature_queues(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (-1);
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (-1);
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	return (0);
}
static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	int feature = command->cdw10 & 0xFF;
	uint32_t iv;

	DPRINTF("%s feature 0x%x", __func__, feature);
	compl->cdw0 = 0;

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF(" arbitration 0x%x", command->cdw11);
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF(" power management 0x%x", command->cdw11);
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF(" lba range 0x%x", command->cdw11);
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF(" temperature threshold 0x%x", command->cdw11);
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF(" error recovery 0x%x", command->cdw11);
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF(" volatile write cache 0x%x", command->cdw11);
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		nvme_set_feature_queues(sc, command, compl);
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF(" interrupt coalescing 0x%x", command->cdw11);

		/* in uS */
		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF) * 100;

		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		iv = command->cdw11 & 0xFFFF;

		DPRINTF(" interrupt vector configuration 0x%x",
		    command->cdw11);

		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
			if (sc->compl_queues[i].intr_vec == iv) {
				if (command->cdw11 & (1 << 16))
					sc->compl_queues[i].intr_en |=
					    NVME_CQ_INTCOAL;
				else
					sc->compl_queues[i].intr_en &=
					    ~NVME_CQ_INTCOAL;
			}
		}
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF(" write atomicity 0x%x", command->cdw11);
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF(" async event configuration 0x%x",
		    command->cdw11);
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF(" software progress marker 0x%x",
		    command->cdw11);
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF(" autonomous power state transition 0x%x",
		    command->cdw11);
		break;
	default:
		WPRINTF("%s invalid feature", __func__);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_get_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	int feature = command->cdw10 & 0xFF;

	DPRINTF("%s feature 0x%x", __func__, feature);

	compl->cdw0 = 0;

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF(" arbitration");
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF(" power management");
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF(" lba range");
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF(" temperature threshold");
		switch ((command->cdw11 >> 20) & 0x3) {
		case 0:
			/* Over temp threshold */
			compl->cdw0 = 0xFFFF;
			break;
		case 1:
			/* Under temp threshold */
			compl->cdw0 = 0;
			break;
		default:
			WPRINTF(" invalid threshold type select");
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			return (1);
		}
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF(" error recovery");
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF(" volatile write cache");
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

		DPRINTF(" number of queues (submit %u, completion %u)",
		    compl->cdw0 & 0xFFFF,
		    (compl->cdw0 >> 16) & 0xFFFF);
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF(" interrupt coalescing");
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		DPRINTF(" interrupt vector configuration");
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF(" write atomicity");
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF(" async event configuration");
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF(" software progress marker");
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF(" autonomous power state transition");
		break;
	default:
		WPRINTF("%s invalid feature 0x%x", __func__, feature);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_abort(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc *sc,
    struct nvme_command *command, struct nvme_completion *compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/*
	 * TODO: raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
	return (1);
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc *sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			/* XXX don't care, unhandled for now
			nvme_opc_async_event_req(sc, cmd, &compl);
			*/
			compl.status = NVME_NO_STATUS;
			break;
		default:
			WPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;
	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req != NULL) {
		/* concatenate contig block-iovs to minimize number of iovs */
		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
			iovidx = req->io_req.br_iovcnt - 1;

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			    req->prev_gpaddr, size);

			req->prev_size += size;
			req->io_req.br_resid += size;

			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
		} else {
			pthread_mutex_lock(&req->mtx);

			iovidx = req->io_req.br_iovcnt;
			if (iovidx == NVME_MAX_BLOCKIOVS) {
				int err = 0;

				DPRINTF("large I/O, doing partial req");

				iovidx = 0;
				req->io_req.br_iovcnt = 0;

				req->io_req.br_callback = pci_nvme_io_partial;

				if (!do_write)
					err = blockif_read(sc->nvstore.ctx,
					    &req->io_req);
				else
					err = blockif_write(sc->nvstore.ctx,
					    &req->io_req);

				/* wait until req completes before cont */
				if (err == 0)
					pthread_cond_wait(&req->cv, &req->mtx);

				req->io_req.br_offset = lba;
				req->io_req.br_resid = 0;
				req->io_req.br_param = req;
			}

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			    gpaddr, size);

			req->io_req.br_iov[iovidx].iov_len = size;

			req->prev_gpaddr = gpaddr;
			req->prev_size = size;
			req->io_req.br_resid += size;

			req->io_req.br_iovcnt++;

			pthread_mutex_unlock(&req->mtx);
		}
	} else {
		/* RAM buffer: read/write directly */
		void *p = sc->nvstore.ctx;
		void *gptr;

		if ((lba + size) > sc->nvstore.size) {
			WPRINTF("%s write would overflow RAM", __func__);
			return (-1);
		}

		p = (void *)((uintptr_t)p + (uintptr_t)lba);
		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
		if (do_write)
			memcpy(p, gptr, size);
		else
			memcpy(gptr, p, size);
	}
	return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return (req);
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	pthread_cond_signal(&req->cv);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes;
	size_t offset, size;
	int err;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;

	offset = lba * nvstore->sectsz;
	bytes = nblocks * nvstore->sectsz;

	if ((offset + bytes) > nvstore->size) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		uint8_t *buf = nvstore->ctx;
		enum nvme_copy_dir dir;

		if (is_write)
			dir = NVME_COPY_TO_PRP;
		else
			dir = NVME_COPY_FROM_PRP;

		if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
		    buf + offset, bytes, dir))
			pci_nvme_status_genc(status,
			    NVME_SC_DATA_TRANSFER_ERROR);
		else
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);

		goto out;
	}

	size = MIN(PAGE_SIZE - (cmd->prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, cmd->prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, cmd->prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &cmd->prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if (prp_list == last) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}
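
			/*
			 * A PRP list page holds NVME_PRP2_ITEMS
			 * (PAGE_SIZE / 8, i.e. 512 on 4 KiB pages) entries;
			 * when the transfer needs more, the final entry
			 * chains to the next list page.
			 */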
			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes -= size;

			prp_list++;
		}
	}

	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(status, NVME_SC_DATA_TRANSFER_ERROR);
	else
		pending = true;

out:
	return (pending);
}
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		struct nvme_dsm_range *range;
		uint32_t nr, r;
		int sectsz = sc->nvstore.sectsz;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		/* copy locally because a range entry could straddle PRPs */
		range = calloc(1, NVME_MAX_DSM_TRIM);
		if (range == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
		    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		nr = cmd->cdw10 & 0xff;

		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = range[0].starting_lba * sectsz;
		req->io_req.br_resid = range[0].length * sectsz;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0; r <= nr; r++) {
				iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
				iov[r].iov_len = range[r].length * sectsz;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = r;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;

		free(range);
	}
out:
	return (pending);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc *sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc *sc,
    uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc *sc,
    uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* doorbell size = 2*int */
		int is_sq = (belloffset % 8) < 4;
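
		/*
		 * With the default doorbell stride (CAP.DSTRD == 0) each
		 * queue pair owns 8 bytes of doorbell space: the SQ tail
		 * doorbell at offset 0 and the CQ head doorbell at offset 4.
		 */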
		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}

	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc *sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;
	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
static uint64_t
pci_nvme_read_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc *sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);
	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);
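
	/* The loop above computes sectsz_bits = log2(sectsz) for LBADS */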

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
	}
	sc->intr_coales_aggr_thresh = 1;

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
struct pci_devemu pci_de_nvme = {
	.pe_emu		= "nvme",
	.pe_init	= pci_nvme_init,
	.pe_barwrite	= pci_nvme_write,
	.pe_barread	= pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);