/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 */

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */
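/*
 * Example invocations (illustrative only; the image paths, slot numbers,
 * and option values below are assumptions, not taken from this source):
 *
 *   bhyve ... -s 4,nvme,/path/to/disk.img,maxq=4,qsz=512,ioslots=16 ...
 *   bhyve ... -s 4,nvme,ram=1024,ser=NVME0001,dsm=enable ...
 */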
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/errno.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "debug.h"
#include "pci_emul.h"
static int nvme_debug = 0;
#define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
/* defaults; can be overridden */
#define NVME_MSIX_BAR		4

#define NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

#define NVME_QUEUES		16
#define NVME_MAX_QENTRIES	2048

#define NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define NVME_MAX_BLOCKIOVS	512

/* Maximum size of a Dataset Management payload copied from the guest */
#define NVME_MAX_DSM_TRIM	4096

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one) - 1)
/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
    ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
     ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
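/*
 * Worked example for NVME_FEATURE_NUM_QUEUES (illustrative): with
 * num_squeues = 4 and num_cqueues = 4, the zero-based counts are 3 and 3,
 * so the encoded completion dword is (3 & 0xffff) | ((3 & 0xffff) << 16),
 * i.e. 0x00030003 (NCQA in bits 31:16, NSQA in bits 15:0).
 */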
#define NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
    NVME_CR_CAP_LOW = 0x00,
    NVME_CR_CAP_HI  = 0x04,
    NVME_CR_VS      = 0x08,
    NVME_CR_INTMS   = 0x0c,
    NVME_CR_INTMC   = 0x10,
    NVME_CR_CC      = 0x14,
    NVME_CR_CSTS    = 0x1c,
    NVME_CR_NSSR    = 0x20,
    NVME_CR_AQA     = 0x24,
    NVME_CR_ASQ_LOW = 0x28,
    NVME_CR_ASQ_HI  = 0x2c,
    NVME_CR_ACQ_LOW = 0x30,
    NVME_CR_ACQ_HI  = 0x34,
};
enum nvme_cmd_cdw11 {
    NVME_CMD_CDW11_PC  = 0x0001,
    NVME_CMD_CDW11_IEN = 0x0002,
    NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
    NVME_COPY_TO_PRP,
    NVME_COPY_FROM_PRP,
};

#define NVME_CQ_INTEN	0x01
#define NVME_CQ_INTCOAL	0x02
struct nvme_completion_queue {
    struct nvme_completion *qbase;
    uint32_t	size;
    uint16_t	tail; /* nvme progress */
    uint16_t	head; /* guest progress */
    uint16_t	intr_vec;
    uint32_t	intr_en;
    pthread_mutex_t	mtx;
};

struct nvme_submission_queue {
    struct nvme_command *qbase;
    uint32_t	size;
    uint16_t	head; /* nvme progress */
    uint16_t	tail; /* guest progress */
    uint16_t	cqid; /* completion queue id */
    int		qpriority;
    pthread_mutex_t	mtx;
};
enum nvme_storage_type {
    NVME_STOR_BLOCKIF = 0,
    NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
    enum nvme_storage_type type;
    void		*ctx;
    uint64_t	size;
    uint32_t	sectsz;
    uint32_t	sectsz_bits;
    uint64_t	eui64;
    uint32_t	deallocate:1;
};
struct pci_nvme_ioreq {
    struct pci_nvme_softc *sc;
    STAILQ_ENTRY(pci_nvme_ioreq) link;
    struct nvme_submission_queue *nvme_sq;
    uint16_t	sqid;

    /* command information */
    uint16_t	opc;
    uint16_t	cid;
    uint32_t	nsid;

    uint64_t	prev_gpaddr;
    size_t	prev_size;

    /*
     * lock if all iovs consumed (big IO);
     * complete transaction before continuing
     */
    pthread_mutex_t	mtx;
    pthread_cond_t	cv;

    struct blockif_req io_req;

    /* pad to fit up to 512 page descriptors from guest IO request */
    struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};
enum nvme_dsm_type {
    /* Dataset Management bit in ONCS reflects backing storage capability */
    NVME_DATASET_MANAGEMENT_AUTO,
    /* Unconditionally set Dataset Management bit in ONCS */
    NVME_DATASET_MANAGEMENT_ENABLE,
    /* Unconditionally clear Dataset Management bit in ONCS */
    NVME_DATASET_MANAGEMENT_DISABLE,
};
struct pci_nvme_softc {
    struct pci_devinst *nsc_pi;

    pthread_mutex_t	mtx;

    struct nvme_registers regs;

    struct nvme_namespace_data  nsdata;
    struct nvme_controller_data ctrldata;
    struct nvme_error_information_entry err_log;
    struct nvme_health_information_page health_log;
    struct nvme_firmware_page fw_log;

    struct pci_nvme_blockstore nvstore;

    uint16_t	max_qentries;	/* max entries per queue */
    uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
    uint32_t	num_cqueues;
    uint32_t	num_squeues;

    struct pci_nvme_ioreq *ioreqs;
    STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
    uint32_t	pending_ios;
    uint32_t	ioslots;
    sem_t	iosemlock;

    /*
     * Memory mapped Submission and Completion queues
     * Each array includes both Admin and IO queues
     */
    struct nvme_completion_queue *compl_queues;
    struct nvme_submission_queue *submit_queues;

    /* controller features */
    uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
    uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
    uint32_t	async_ev_config;         /* 0x0B: async event config */

    enum nvme_dsm_type dataset_management;
};
static void pci_nvme_io_partial(struct blockif_req *br, int err);
/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
    ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
    ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
    ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
    ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
    ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
    ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
     (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
     (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
    ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
     (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
     (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
    ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
    ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
     (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
    NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
    size_t len;

    len = strnlen(src, dst_size);
    memset(dst, pad, dst_size);
    memcpy(dst, src, len);
}
static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

    *status &= ~NVME_STATUS_MASK;
    *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
        (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}
static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

    pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
static void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

    if (prev)
        *status &= ~NVME_STATUS_P;
    else
        *status |= NVME_STATUS_P;
}
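/*
 * Example of the Phase Tag protocol (per the NVMe spec): a freshly created
 * completion queue is zero-filled, so the host expects P=1 for entries
 * written on the first pass through the queue. Each time the tail wraps,
 * the value the controller writes inverts; pci_nvme_toggle_phase() produces
 * the inverse of the phase previously stored in the slot being overwritten.
 */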
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
    uint32_t i;

    /*
     * Allocate and initialize the Submission Queues
     */
    if (nsq > NVME_QUEUES) {
        WPRINTF("%s: clamping number of SQ from %u to %u",
            __func__, nsq, NVME_QUEUES);
        nsq = NVME_QUEUES;
    }

    sc->num_squeues = nsq;

    sc->submit_queues = calloc(sc->num_squeues + 1,
        sizeof(struct nvme_submission_queue));
    if (sc->submit_queues == NULL) {
        WPRINTF("%s: SQ allocation failed", __func__);
        sc->num_squeues = 0;
    } else {
        struct nvme_submission_queue *sq = sc->submit_queues;

        for (i = 0; i < sc->num_squeues; i++)
            pthread_mutex_init(&sq[i].mtx, NULL);
    }

    /*
     * Allocate and initialize the Completion Queues
     */
    if (ncq > NVME_QUEUES) {
        WPRINTF("%s: clamping number of CQ from %u to %u",
            __func__, ncq, NVME_QUEUES);
        ncq = NVME_QUEUES;
    }

    sc->num_cqueues = ncq;

    sc->compl_queues = calloc(sc->num_cqueues + 1,
        sizeof(struct nvme_completion_queue));
    if (sc->compl_queues == NULL) {
        WPRINTF("%s: CQ allocation failed", __func__);
        sc->num_cqueues = 0;
    } else {
        struct nvme_completion_queue *cq = sc->compl_queues;

        for (i = 0; i < sc->num_cqueues; i++)
            pthread_mutex_init(&cq[i].mtx, NULL);
    }
}
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
    struct nvme_controller_data *cd = &sc->ctrldata;

    cd->vid = 0xFB5D;
    cd->ssvid = 0x0000;

    cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
    cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

    /* Num of submission commands that we can handle at a time (2^rab) */
    cd->rab = 4;

    /* FreeBSD OUI */
    cd->ieee[0] = 0x58;
    cd->ieee[1] = 0x9c;
    cd->ieee[2] = 0xfc;

    cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

    cd->ver = 0x00010300;

    cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
    cd->acl = 2;
    cd->aerl = 4;

    cd->lpa = 0;	/* TODO: support some simple things like SMART */
    cd->elpe = 0;	/* max error log page entries */
    cd->npss = 1;	/* number of power states supported */

    /* Warning Composite Temperature Threshold */
    cd->wctemp = 0x0157;

    cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
        (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
    cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
        (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
    cd->nn = 1;	/* number of namespaces */

    cd->oncs = 0;
    switch (sc->dataset_management) {
    case NVME_DATASET_MANAGEMENT_AUTO:
        if (sc->nvstore.deallocate)
            cd->oncs |= NVME_ONCS_DSM;
        break;
    case NVME_DATASET_MANAGEMENT_ENABLE:
        cd->oncs |= NVME_ONCS_DSM;
        break;
    default:
        break;
    }

    cd->fna = 0x03;

    cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
    const unsigned char *cp = buffer;
    /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
    static uint16_t const crc16_table[256] = {
        0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
        0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
        0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
        0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
        0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
        0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
        0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
        0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
        0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
        0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
        0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
        0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
        0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
        0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
        0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
        0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
        0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
        0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
        0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
        0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
        0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
        0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
        0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
        0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
        0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
        0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
        0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
        0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
        0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
        0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
        0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
        0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
    };

    while (len--)
        crc = (((crc >> 8) & 0xffU) ^
            crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
    return crc;
}
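/*
 * Usage sketch (mirrors the EUI-64 generation below; the string is an
 * arbitrary example, not from this source):
 *
 *   uint16_t sum = crc16(0, "bhyve-vm010", strlen("bhyve-vm010"));
 *
 * The 16-bit result is folded into the low bits of OUI_FREEBSD_NVME_LOW to
 * synthesize a unique-ish EUI-64 when the user does not supply one.
 */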
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

    /* Get capacity and block size information from backing store */
    nd->nsze = nvstore->size / nvstore->sectsz;
    nd->ncap = nd->nsze;
    nd->nuse = nd->nsze;

    if (nvstore->type == NVME_STOR_BLOCKIF)
        nvstore->deallocate = blockif_candelete(nvstore->ctx);

    nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
    nd->flbas = 0;

    /* Create an EUI-64 if user did not provide one */
    if (nvstore->eui64 == 0) {
        char *data = NULL;
        uint64_t eui64 = nvstore->eui64;

        asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
            sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

        if (data != NULL) {
            eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
            free(data);
        }
        nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
    }
    be64enc(nd->eui64, nvstore->eui64);

    /* LBA data-sz = 2^lbads */
    nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

    memset(&sc->err_log, 0, sizeof(sc->err_log));
    memset(&sc->health_log, 0, sizeof(sc->health_log));
    memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
    uint32_t i;

    DPRINTF("%s", __func__);

    sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
        (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
        (60 << NVME_CAP_LO_REG_TO_SHIFT);

    sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

    sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

    sc->regs.cc = 0;
    sc->regs.csts = 0;

    assert(sc->submit_queues != NULL);

    for (i = 0; i < sc->num_squeues + 1; i++) {
        sc->submit_queues[i].qbase = NULL;
        sc->submit_queues[i].size = 0;
        sc->submit_queues[i].cqid = 0;
        sc->submit_queues[i].tail = 0;
        sc->submit_queues[i].head = 0;
    }

    assert(sc->compl_queues != NULL);

    for (i = 0; i < sc->num_cqueues + 1; i++) {
        sc->compl_queues[i].qbase = NULL;
        sc->compl_queues[i].size = 0;
        sc->compl_queues[i].tail = 0;
        sc->compl_queues[i].head = 0;
    }
}
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
    pthread_mutex_lock(&sc->mtx);
    pci_nvme_reset_locked(sc);
    pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
    uint16_t acqs, asqs;

    DPRINTF("%s", __func__);

    asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
    sc->submit_queues[0].size = asqs;
    sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
        sizeof(struct nvme_command) * asqs);

    DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
        __func__, sc->regs.asq, sc->submit_queues[0].qbase);

    acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
        NVME_AQA_REG_ACQS_MASK) + 1;
    sc->compl_queues[0].size = acqs;
    sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
        sizeof(struct nvme_completion) * acqs);
    DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
    uint8_t *p;
    size_t bytes;

    if (len > (8 * 1024)) {
        return (-1);
    }

    /* Copy from the start of prp1 to the end of the physical page */
    bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
    bytes = MIN(bytes, len);

    p = vm_map_gpa(ctx, prp1, bytes);
    if (p == NULL) {
        return (-1);
    }

    if (dir == NVME_COPY_TO_PRP)
        memcpy(p, b, bytes);
    else
        memcpy(b, p, bytes);

    b += bytes;

    len -= bytes;
    if (len == 0) {
        return (0);
    }

    len = MIN(len, PAGE_SIZE);

    p = vm_map_gpa(ctx, prp2, len);
    if (p == NULL) {
        return (-1);
    }

    if (dir == NVME_COPY_TO_PRP)
        memcpy(p, b, len);
    else
        memcpy(b, p, len);

    return (0);
}
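/*
 * Worked example (illustrative addresses): for a 6KiB transfer with
 * prp1 = 0x201800 on a 4KiB-page system, the first copy covers
 * PAGE_SIZE - 0x800 = 2KiB from prp1 to the end of that page, and the
 * remaining 4KiB is copied to/from the page addressed by prp2. Transfers
 * needing more than two pages (over 8KiB here) would require a PRP list,
 * which this helper deliberately rejects.
 */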
static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    uint16_t qid = command->cdw10 & 0xffff;

    DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
    if (qid == 0 || qid > sc->num_squeues) {
        WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
            __func__, qid, sc->num_squeues);
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_QUEUE_IDENTIFIER);
        return (1);
    }

    sc->submit_queues[qid].qbase = NULL;
    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
    return (1);
}
static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    if (command->cdw11 & NVME_CMD_CDW11_PC) {
        uint16_t qid = command->cdw10 & 0xffff;
        struct nvme_submission_queue *nsq;

        if ((qid == 0) || (qid > sc->num_squeues)) {
            WPRINTF("%s queue index %u > num_squeues %u",
                __func__, qid, sc->num_squeues);
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_QUEUE_IDENTIFIER);
            return (1);
        }

        nsq = &sc->submit_queues[qid];
        nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

        nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
            sizeof(struct nvme_command) * (size_t)nsq->size);
        nsq->cqid = (command->cdw11 >> 16) & 0xffff;
        nsq->qpriority = (command->cdw11 >> 1) & 0x03;

        DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
            qid, nsq->size, nsq->qbase, nsq->cqid);

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        DPRINTF("%s completed creating IOSQ qid %u",
            __func__, qid);
    } else {
        /*
         * Guest sent non-cont submission queue request.
         * This setting is unsupported by this emulation.
         */
        WPRINTF("%s unsupported non-contig (list-based) "
            "create i/o submission queue", __func__);

        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
    }
    return (1);
}
static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    uint16_t qid = command->cdw10 & 0xffff;

    DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
    if (qid == 0 || qid > sc->num_cqueues) {
        WPRINTF("%s queue index %u / num_cqueues %u",
            __func__, qid, sc->num_cqueues);
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_QUEUE_IDENTIFIER);
        return (1);
    }

    sc->compl_queues[qid].qbase = NULL;
    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
    return (1);
}
static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    if (command->cdw11 & NVME_CMD_CDW11_PC) {
        uint16_t qid = command->cdw10 & 0xffff;
        struct nvme_completion_queue *ncq;

        if ((qid == 0) || (qid > sc->num_cqueues)) {
            WPRINTF("%s queue index %u > num_cqueues %u",
                __func__, qid, sc->num_cqueues);
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_QUEUE_IDENTIFIER);
            return (1);
        }

        ncq = &sc->compl_queues[qid];
        ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
        ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
        ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

        ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
            command->prp1,
            sizeof(struct nvme_command) * (size_t)ncq->size);

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
    } else {
        /*
         * Non-contig completion queue unsupported.
         */
        WPRINTF("%s unsupported non-contig (list-based) "
            "create i/o completion queue",
            __func__);

        /* 0x12 = Invalid Use of Controller Memory Buffer */
        pci_nvme_status_genc(&compl->status, 0x12);
    }

    return (1);
}
static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
    uint8_t logpage = command->cdw10 & 0xFF;

    DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

    switch (logpage) {
    case NVME_LOG_ERROR:
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->err_log, logsize,
            NVME_COPY_TO_PRP);
        break;
    case NVME_LOG_HEALTH_INFORMATION:
        /* TODO: present some smart info */
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->health_log, logsize,
            NVME_COPY_TO_PRP);
        break;
    case NVME_LOG_FIRMWARE_SLOT:
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->fw_log, logsize,
            NVME_COPY_TO_PRP);
        break;
    default:
        WPRINTF("%s get log page %x command not supported",
            __func__, logpage);

        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_LOG_PAGE);
    }

    return (1);
}
static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    void *dest;

    DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
        command->cdw10 & 0xFF, command->nsid);

    switch (command->cdw10 & 0xFF) {
    case 0x00: /* return Identify Namespace data structure */
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
            NVME_COPY_TO_PRP);
        break;
    case 0x01: /* return Identify Controller data structure */
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->ctrldata,
            sizeof(sc->ctrldata),
            NVME_COPY_TO_PRP);
        break;
    case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
        dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
            sizeof(uint32_t) * 1024);
        ((uint32_t *)dest)[0] = 1;
        ((uint32_t *)dest)[1] = 0;
        break;
    case 0x11:
        pci_nvme_status_genc(&compl->status,
            NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
        return (1);
    case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
    case 0x10:
    case 0x12:
    case 0x13:
    case 0x14:
    case 0x15:
    default:
        DPRINTF("%s unsupported identify command requested 0x%x",
            __func__, command->cdw10 & 0xFF);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return (1);
    }

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
    return (1);
}
static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    uint16_t nqr;	/* Number of Queues Requested */

    nqr = command->cdw11 & 0xFFFF;
    if (nqr == 0xffff) {
        WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return (-1);
    }

    sc->num_squeues = ONE_BASED(nqr);
    if (sc->num_squeues > sc->max_queues) {
        DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
            sc->max_queues);
        sc->num_squeues = sc->max_queues;
    }

    nqr = (command->cdw11 >> 16) & 0xFFFF;
    if (nqr == 0xffff) {
        WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return (-1);
    }

    sc->num_cqueues = ONE_BASED(nqr);
    if (sc->num_cqueues > sc->max_queues) {
        DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
            sc->max_queues);
        sc->num_cqueues = sc->max_queues;
    }

    compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

    return (0);
}
static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    int feature = command->cdw10 & 0xFF;
    uint32_t iv;

    DPRINTF("%s feature 0x%x", __func__, feature);
    compl->cdw0 = 0;

    switch (feature) {
    case NVME_FEAT_ARBITRATION:
        DPRINTF(" arbitration 0x%x", command->cdw11);
        break;
    case NVME_FEAT_POWER_MANAGEMENT:
        DPRINTF(" power management 0x%x", command->cdw11);
        break;
    case NVME_FEAT_LBA_RANGE_TYPE:
        DPRINTF(" lba range 0x%x", command->cdw11);
        break;
    case NVME_FEAT_TEMPERATURE_THRESHOLD:
        DPRINTF(" temperature threshold 0x%x", command->cdw11);
        break;
    case NVME_FEAT_ERROR_RECOVERY:
        DPRINTF(" error recovery 0x%x", command->cdw11);
        break;
    case NVME_FEAT_VOLATILE_WRITE_CACHE:
        DPRINTF(" volatile write cache 0x%x", command->cdw11);
        break;
    case NVME_FEAT_NUMBER_OF_QUEUES:
        nvme_set_feature_queues(sc, command, compl);
        break;
    case NVME_FEAT_INTERRUPT_COALESCING:
        DPRINTF(" interrupt coalescing 0x%x", command->cdw11);

        /* in uS */
        sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

        sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
        break;
    case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
        iv = command->cdw11 & 0xFFFF;

        DPRINTF(" interrupt vector configuration 0x%x",
            command->cdw11);

        for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
            if (sc->compl_queues[i].intr_vec == iv) {
                if (command->cdw11 & (1 << 16))
                    sc->compl_queues[i].intr_en |=
                        NVME_CQ_INTCOAL;
                else
                    sc->compl_queues[i].intr_en &=
                        ~NVME_CQ_INTCOAL;
            }
        }
        break;
    case NVME_FEAT_WRITE_ATOMICITY:
        DPRINTF(" write atomicity 0x%x", command->cdw11);
        break;
    case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
        DPRINTF(" async event configuration 0x%x",
            command->cdw11);
        sc->async_ev_config = command->cdw11;
        break;
    case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
        DPRINTF(" software progress marker 0x%x",
            command->cdw11);
        break;
    case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
        DPRINTF(" autonomous power state transition 0x%x",
            command->cdw11);
        break;
    default:
        WPRINTF("%s invalid feature", __func__);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return (1);
    }

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
    return (1);
}
static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    int feature = command->cdw10 & 0xFF;

    DPRINTF("%s feature 0x%x", __func__, feature);

    compl->cdw0 = 0;

    switch (feature) {
    case NVME_FEAT_ARBITRATION:
        DPRINTF(" arbitration");
        break;
    case NVME_FEAT_POWER_MANAGEMENT:
        DPRINTF(" power management");
        break;
    case NVME_FEAT_LBA_RANGE_TYPE:
        DPRINTF(" lba range");
        break;
    case NVME_FEAT_TEMPERATURE_THRESHOLD:
        DPRINTF(" temperature threshold");
        switch ((command->cdw11 >> 20) & 0x3) {
        case 0:
            /* Over temp threshold */
            compl->cdw0 = 0xFFFF;
            break;
        case 1:
            /* Under temp threshold */
            compl->cdw0 = 0;
            break;
        default:
            WPRINTF(" invalid threshold type select");
            pci_nvme_status_genc(&compl->status,
                NVME_SC_INVALID_FIELD);
            return (1);
        }
        break;
    case NVME_FEAT_ERROR_RECOVERY:
        DPRINTF(" error recovery");
        break;
    case NVME_FEAT_VOLATILE_WRITE_CACHE:
        DPRINTF(" volatile write cache");
        break;
    case NVME_FEAT_NUMBER_OF_QUEUES:
        compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

        DPRINTF(" number of queues (submit %u, completion %u)",
            compl->cdw0 & 0xFFFF,
            (compl->cdw0 >> 16) & 0xFFFF);

        break;
    case NVME_FEAT_INTERRUPT_COALESCING:
        DPRINTF(" interrupt coalescing");
        break;
    case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
        DPRINTF(" interrupt vector configuration");
        break;
    case NVME_FEAT_WRITE_ATOMICITY:
        DPRINTF(" write atomicity");
        break;
    case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
        DPRINTF(" async event configuration");
        sc->async_ev_config = command->cdw11;
        break;
    case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
        DPRINTF(" software progress marker");
        break;
    case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
        DPRINTF(" autonomous power state transition");
        break;
    default:
        WPRINTF("%s invalid feature 0x%x", __func__, feature);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        return (1);
    }

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
    return (1);
}
static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
    DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

    /* TODO: search for the command ID and abort it */

    compl->cdw0 = 1;
    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
    return (1);
}
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)
{
    DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

    /*
     * TODO: raise events when they happen based on the Set Features cmd.
     * These events happen async, so only set completion successful if
     * there is an event reflective of the request to get event.
     */
    pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
        NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
    return (0);
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
    struct nvme_completion compl;
    struct nvme_command *cmd;
    struct nvme_submission_queue *sq;
    struct nvme_completion_queue *cq;
    uint16_t sqhead;

    DPRINTF("%s index %u", __func__, (uint32_t)value);

    sq = &sc->submit_queues[0];
    cq = &sc->compl_queues[0];

    pthread_mutex_lock(&sq->mtx);

    sqhead = sq->head;
    DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

    while (sqhead != atomic_load_acq_short(&sq->tail)) {
        cmd = &(sq->qbase)[sqhead];
        compl.cdw0 = 0;
        compl.status = 0;

        switch (cmd->opc) {
        case NVME_OPC_DELETE_IO_SQ:
            DPRINTF("%s command DELETE_IO_SQ", __func__);
            nvme_opc_delete_io_sq(sc, cmd, &compl);
            break;
        case NVME_OPC_CREATE_IO_SQ:
            DPRINTF("%s command CREATE_IO_SQ", __func__);
            nvme_opc_create_io_sq(sc, cmd, &compl);
            break;
        case NVME_OPC_DELETE_IO_CQ:
            DPRINTF("%s command DELETE_IO_CQ", __func__);
            nvme_opc_delete_io_cq(sc, cmd, &compl);
            break;
        case NVME_OPC_CREATE_IO_CQ:
            DPRINTF("%s command CREATE_IO_CQ", __func__);
            nvme_opc_create_io_cq(sc, cmd, &compl);
            break;
        case NVME_OPC_GET_LOG_PAGE:
            DPRINTF("%s command GET_LOG_PAGE", __func__);
            nvme_opc_get_log_page(sc, cmd, &compl);
            break;
        case NVME_OPC_IDENTIFY:
            DPRINTF("%s command IDENTIFY", __func__);
            nvme_opc_identify(sc, cmd, &compl);
            break;
        case NVME_OPC_ABORT:
            DPRINTF("%s command ABORT", __func__);
            nvme_opc_abort(sc, cmd, &compl);
            break;
        case NVME_OPC_SET_FEATURES:
            DPRINTF("%s command SET_FEATURES", __func__);
            nvme_opc_set_features(sc, cmd, &compl);
            break;
        case NVME_OPC_GET_FEATURES:
            DPRINTF("%s command GET_FEATURES", __func__);
            nvme_opc_get_features(sc, cmd, &compl);
            break;
        case NVME_OPC_ASYNC_EVENT_REQUEST:
            DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
            /* XXX dont care, unhandled for now
            nvme_opc_async_event_req(sc, cmd, &compl);
            */
            compl.status = NVME_NO_STATUS;
            break;
        default:
            WPRINTF("0x%x command is not implemented",
                cmd->opc);
            pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
        }
        sqhead = (sqhead + 1) % sq->size;

        if (NVME_COMPLETION_VALID(compl)) {
            struct nvme_completion *cp;
            int phase;

            pthread_mutex_lock(&cq->mtx);

            cp = &(cq->qbase)[cq->tail];
            cp->cdw0 = compl.cdw0;
            cp->sqid = 0;
            cp->sqhd = sqhead;
            cp->cid = cmd->cid;

            phase = NVME_STATUS_GET_P(cp->status);
            cp->status = compl.status;
            pci_nvme_toggle_phase(&cp->status, phase);

            cq->tail = (cq->tail + 1) % cq->size;

            pthread_mutex_unlock(&cq->mtx);
        }
    }

    DPRINTF("setting sqhead %u", sqhead);
    sq->head = sqhead;

    if (cq->head != cq->tail)
        pci_generate_msix(sc->nsc_pi, 0);

    pthread_mutex_unlock(&sq->mtx);
}
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
    int iovidx;

    if (req != NULL) {
        /* concatenate contig block-iovs to minimize number of iovs */
        if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
            iovidx = req->io_req.br_iovcnt - 1;

            req->io_req.br_iov[iovidx].iov_base =
                paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                    req->prev_gpaddr, size);

            req->prev_size += size;
            req->io_req.br_resid += size;

            req->io_req.br_iov[iovidx].iov_len = req->prev_size;
        } else {
            pthread_mutex_lock(&req->mtx);

            iovidx = req->io_req.br_iovcnt;
            if (iovidx == NVME_MAX_BLOCKIOVS) {
                int err = 0;

                DPRINTF("large I/O, doing partial req");

                iovidx = 0;
                req->io_req.br_iovcnt = 0;

                req->io_req.br_callback = pci_nvme_io_partial;

                if (!do_write)
                    err = blockif_read(sc->nvstore.ctx,
                        &req->io_req);
                else
                    err = blockif_write(sc->nvstore.ctx,
                        &req->io_req);

                /* wait until req completes before cont */
                if (err == 0)
                    pthread_cond_wait(&req->cv, &req->mtx);

                req->io_req.br_offset = lba;
                req->io_req.br_resid = 0;
                req->io_req.br_param = req;
            }

            req->io_req.br_iov[iovidx].iov_base =
                paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                    gpaddr, size);

            req->io_req.br_iov[iovidx].iov_len = size;

            req->prev_gpaddr = gpaddr;
            req->prev_size = size;
            req->io_req.br_resid += size;

            req->io_req.br_iovcnt++;

            pthread_mutex_unlock(&req->mtx);
        }
    } else {
        /* RAM buffer: read/write directly */
        void *p = sc->nvstore.ctx;
        void *gptr;

        if ((lba + size) > sc->nvstore.size) {
            WPRINTF("%s write would overflow RAM", __func__);
            return (-1);
        }

        p = (void *)((uintptr_t)p + (uintptr_t)lba);
        gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
        if (do_write)
            memcpy(p, gptr, size);
        else
            memcpy(gptr, p, size);
    }
    return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
{
    struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
    struct nvme_completion *compl;
    int phase;

    DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
        __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
        NVME_STATUS_GET_SC(status));

    pthread_mutex_lock(&cq->mtx);

    assert(cq->qbase != NULL);

    compl = &cq->qbase[cq->tail];

    compl->cdw0 = cdw0;
    compl->sqid = sqid;
    compl->sqhd = sq->head;
    compl->cid = cid;

    phase = NVME_STATUS_GET_P(compl->status);
    compl->status = status;
    pci_nvme_toggle_phase(&compl->status, phase);

    cq->tail = (cq->tail + 1) % cq->size;

    pthread_mutex_unlock(&cq->mtx);

    if (cq->head != cq->tail) {
        if (cq->intr_en & NVME_CQ_INTEN) {
            pci_generate_msix(sc->nsc_pi, cq->intr_vec);
        } else {
            DPRINTF("%s: CQ%u interrupt disabled",
                __func__, sq->cqid);
        }
    }
}
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
    req->sc = NULL;
    req->nvme_sq = NULL;
    req->sqid = 0;

    pthread_mutex_lock(&sc->mtx);

    STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
    sc->pending_ios--;

    /* when no more IO pending, can set to ready if device reset/enabled */
    if (sc->pending_ios == 0 &&
        NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
        sc->regs.csts |= NVME_CSTS_RDY;

    pthread_mutex_unlock(&sc->mtx);

    sem_post(&sc->iosemlock);
}
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
    struct pci_nvme_ioreq *req = NULL;

    sem_wait(&sc->iosemlock);
    pthread_mutex_lock(&sc->mtx);

    req = STAILQ_FIRST(&sc->ioreqs_free);
    assert(req != NULL);
    STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

    req->sc = sc;

    sc->pending_ios++;

    pthread_mutex_unlock(&sc->mtx);

    req->io_req.br_iovcnt = 0;
    req->io_req.br_offset = 0;
    req->io_req.br_resid = 0;
    req->io_req.br_param = req;
    req->prev_gpaddr = 0;
    req->prev_size = 0;

    return req;
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
    struct pci_nvme_ioreq *req = br->br_param;
    struct nvme_submission_queue *sq = req->nvme_sq;
    uint16_t code, status;

    DPRINTF("%s error %d %s", __func__, err, strerror(err));

    /* TODO return correct error */
    code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
    pci_nvme_status_genc(&status, code);

    pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
    pci_nvme_release_ioreq(req->sc, req);
}
static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
    struct pci_nvme_ioreq *req = br->br_param;

    DPRINTF("%s error %d %s", __func__, err, strerror(err));

    pthread_cond_signal(&req->cv);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
    bool pending = false;

    if (nvstore->type == NVME_STOR_RAM) {
        pci_nvme_status_genc(status, NVME_SC_SUCCESS);
    } else {
        int err;

        req->io_req.br_callback = pci_nvme_io_done;

        err = blockif_flush(nvstore->ctx, &req->io_req);
        switch (err) {
        case 0:
            pending = true;
            break;
        case EOPNOTSUPP:
            pci_nvme_status_genc(status, NVME_SC_SUCCESS);
            break;
        default:
            pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
        }
    }

    return (pending);
}
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
    uint64_t lba, nblocks, bytes;
    size_t offset;
    bool is_write = cmd->opc == NVME_OPC_WRITE;
    bool pending = false;

    lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
    nblocks = (cmd->cdw12 & 0xFFFF) + 1;

    offset = lba * nvstore->sectsz;
    bytes = nblocks * nvstore->sectsz;

    if ((offset + bytes) > nvstore->size) {
        WPRINTF("%s command would exceed LBA range", __func__);
        pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
        goto out;
    }

    req->io_req.br_offset = lba;

    /* PRP bits 1:0 must be zero */
    cmd->prp1 &= ~0x3UL;
    cmd->prp2 &= ~0x3UL;

    if (nvstore->type == NVME_STOR_RAM) {
        uint8_t *buf = nvstore->ctx;
        enum nvme_copy_dir dir;

        if (is_write)
            dir = NVME_COPY_TO_PRP;
        else
            dir = NVME_COPY_FROM_PRP;

        if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
            buf + offset, bytes, dir))
            pci_nvme_status_genc(status,
                NVME_SC_DATA_TRANSFER_ERROR);
        else
            pci_nvme_status_genc(status, NVME_SC_SUCCESS);
    } else {
        uint64_t size;
        int err;

        size = MIN(PAGE_SIZE - (cmd->prp1 % PAGE_SIZE), bytes);
        if (pci_nvme_append_iov_req(sc, req, cmd->prp1,
            size, is_write, offset)) {
            pci_nvme_status_genc(status,
                NVME_SC_DATA_TRANSFER_ERROR);
            goto out;
        }

        offset += size;
        bytes -= size;

        if (bytes == 0) {
            ;
        } else if (bytes <= PAGE_SIZE) {
            size = bytes;
            if (pci_nvme_append_iov_req(sc, req, cmd->prp2,
                size, is_write, offset)) {
                pci_nvme_status_genc(status,
                    NVME_SC_DATA_TRANSFER_ERROR);
                goto out;
            }
        } else {
            void *vmctx = sc->nsc_pi->pi_vmctx;
            uint64_t *prp_list = &cmd->prp2;
            uint64_t *last = prp_list;

            /* PRP2 is pointer to a physical region page list */
            while (bytes) {
                /* Last entry in list points to the next list */
                if (prp_list == last) {
                    uint64_t prp = *prp_list;

                    prp_list = paddr_guest2host(vmctx, prp,
                        PAGE_SIZE - (prp % PAGE_SIZE));
                    last = prp_list + (NVME_PRP2_ITEMS - 1);
                }

                size = MIN(bytes, PAGE_SIZE);

                if (pci_nvme_append_iov_req(sc, req, *prp_list,
                    size, is_write, offset)) {
                    pci_nvme_status_genc(status,
                        NVME_SC_DATA_TRANSFER_ERROR);
                    goto out;
                }

                offset += size;
                bytes -= size;

                prp_list++;
            }
        }
        req->io_req.br_callback = pci_nvme_io_done;

        if (is_write)
            err = blockif_write(nvstore->ctx, &req->io_req);
        else
            err = blockif_read(nvstore->ctx, &req->io_req);

        if (err)
            pci_nvme_status_genc(status, NVME_SC_DATA_TRANSFER_ERROR);
        else
            pending = true;
    }
out:
    return (pending);
}
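/*
 * Worked example of the PRP-list walk above (illustrative): a 16KiB read
 * with a page-aligned prp1 consumes one page via prp1, leaving 12KiB. Since
 * that exceeds PAGE_SIZE, prp2 holds the guest-physical address of a PRP
 * list; the loop maps that list page and then appends one PAGE_SIZE iov per
 * list entry. Had the transfer needed more than NVME_PRP2_ITEMS entries,
 * the final slot of each list page would chain to the next list page.
 */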
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
    struct pci_nvme_ioreq *req = br->br_param;
    struct pci_nvme_softc *sc = req->sc;
    bool done = true;
    uint16_t status;

    if (err) {
        pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
    } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
        pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
    } else {
        struct iovec *iov = req->io_req.br_iov;

        req->prev_gpaddr++;
        iov += req->prev_gpaddr;

        /* The iov_* values already include the sector size */
        req->io_req.br_offset = (off_t)iov->iov_base;
        req->io_req.br_resid = iov->iov_len;
        if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
            pci_nvme_status_genc(&status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
        } else
            done = false;
    }

    if (done) {
        pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
            req->cid, 0, status);
        pci_nvme_release_ioreq(sc, req);
    }
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
    int r, err;
    bool pending = false;

    if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
        pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
        goto out;
    }

    if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
        struct nvme_dsm_range *range;
        uint32_t nr;
        int sectsz = sc->nvstore.sectsz;

        /*
         * DSM calls are advisory only, and compliant controllers
         * may choose to take no actions (i.e. return Success).
         */
        if (!nvstore->deallocate) {
            pci_nvme_status_genc(status, NVME_SC_SUCCESS);
            goto out;
        }

        if (req == NULL) {
            pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
            goto out;
        }

        /* copy locally because a range entry could straddle PRPs */
        range = calloc(1, NVME_MAX_DSM_TRIM);
        if (range == NULL) {
            pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
            goto out;
        }
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
            (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

        /*
         * If the request is for more than a single range, store
         * the ranges in the br_iov. Optimize for the common case
         * of a single range.
         *
         * Note that NVMe Number of Ranges is a zero based value
         */
        nr = cmd->cdw10 & 0xff;

        req->io_req.br_iovcnt = 0;
        req->io_req.br_offset = range[0].starting_lba * sectsz;
        req->io_req.br_resid = range[0].length * sectsz;

        if (nr == 0) {
            req->io_req.br_callback = pci_nvme_io_done;
        } else {
            struct iovec *iov = req->io_req.br_iov;

            for (r = 0; r <= nr; r++) {
                iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
                iov[r].iov_len = range[r].length * sectsz;
            }
            req->io_req.br_callback = pci_nvme_dealloc_sm;

            /*
             * Use prev_gpaddr to track the current entry and
             * prev_size to track the number of entries
             */
            req->prev_gpaddr = 0;
            req->prev_size = r;
        }

        err = blockif_delete(nvstore->ctx, &req->io_req);
        if (err)
            pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
        else
            pending = true;

        free(range);
    }
out:
    return (pending);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
    struct nvme_submission_queue *sq;
    uint16_t status;
    uint16_t sqhead;

    /* handle all submissions up to sq->tail index */
    sq = &sc->submit_queues[idx];

    pthread_mutex_lock(&sq->mtx);

    sqhead = sq->head;
    DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
        idx, sqhead, sq->tail, sq->qbase);

    while (sqhead != atomic_load_acq_short(&sq->tail)) {
        struct nvme_command *cmd;
        struct pci_nvme_ioreq *req;
        uint32_t nsid;
        bool pending;

        pending = false;
        req = NULL;
        status = 0;

        cmd = &sq->qbase[sqhead];
        sqhead = (sqhead + 1) % sq->size;

        nsid = le32toh(cmd->nsid);
        if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
            pci_nvme_status_genc(&status,
                NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
            status |=
                NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
            goto complete;
        }

        req = pci_nvme_get_ioreq(sc);
        if (req == NULL) {
            pci_nvme_status_genc(&status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
            WPRINTF("%s: unable to allocate IO req", __func__);
            goto complete;
        }
        req->nvme_sq = sq;
        req->sqid = idx;
        req->opc = cmd->opc;
        req->cid = cmd->cid;
        req->nsid = cmd->nsid;

        switch (cmd->opc) {
        case NVME_OPC_FLUSH:
            pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
                req, &status);
            break;
        case NVME_OPC_WRITE:
        case NVME_OPC_READ:
            pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
                req, &status);
            break;
        case NVME_OPC_WRITE_ZEROES:
            /* TODO: write zeroes
            WPRINTF("%s write zeroes lba 0x%lx blocks %u",
                __func__, lba, cmd->cdw12 & 0xFFFF); */
            pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
            break;
        case NVME_OPC_DATASET_MANAGEMENT:
            pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
                req, &status);
            break;
        default:
            WPRINTF("%s unhandled io command 0x%x",
                __func__, cmd->opc);
            pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
        }
complete:
        if (!pending) {
            pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                status);
            if (req != NULL)
                pci_nvme_release_ioreq(sc, req);
        }
    }

    sq->head = sqhead;

    pthread_mutex_unlock(&sq->mtx);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t idx, int is_sq, uint64_t value)
{
    DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

    if (is_sq) {
        if (idx > sc->num_squeues) {
            WPRINTF("%s queue index %lu overflow from "
                "guest (max %u)",
                __func__, idx, sc->num_squeues);
            return;
        }

        atomic_store_short(&sc->submit_queues[idx].tail,
            (uint16_t)value);

        if (idx == 0) {
            pci_nvme_handle_admin_cmd(sc, value);
        } else {
            /* submission queue; handle new entries in SQ */
            if (idx > sc->num_squeues) {
                WPRINTF("%s SQ index %lu overflow from "
                    "guest (max %u)",
                    __func__, idx, sc->num_squeues);
                return;
            }
            pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
        }
    } else {
        if (idx > sc->num_cqueues) {
            WPRINTF("%s queue index %lu overflow from "
                "guest (max %u)",
                __func__, idx, sc->num_cqueues);
            return;
        }

        atomic_store_short(&sc->compl_queues[idx].head,
            (uint16_t)value);
    }
}
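/*
 * Doorbell layout example (matches the decoding in pci_nvme_write_bar_0
 * below): each queue pair owns an 8-byte doorbell slot starting at
 * NVME_DOORBELL_OFFSET; bytes 0-3 are the SQ tail, bytes 4-7 the CQ head.
 * So a write at NVME_DOORBELL_OFFSET + 0x14 decodes to belloffset 0x14,
 * idx = 0x14 / 8 = 2, and (0x14 % 8) >= 4, i.e. the CQ head doorbell of
 * queue 2.
 */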
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
    const char *s = iswrite ? "WRITE" : "READ";

    switch (offset) {
    case NVME_CR_CAP_LOW:
        DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
        break;
    case NVME_CR_CAP_HI:
        DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
        break;
    case NVME_CR_VS:
        DPRINTF("%s %s NVME_CR_VS", func, s);
        break;
    case NVME_CR_INTMS:
        DPRINTF("%s %s NVME_CR_INTMS", func, s);
        break;
    case NVME_CR_INTMC:
        DPRINTF("%s %s NVME_CR_INTMC", func, s);
        break;
    case NVME_CR_CC:
        DPRINTF("%s %s NVME_CR_CC", func, s);
        break;
    case NVME_CR_CSTS:
        DPRINTF("%s %s NVME_CR_CSTS", func, s);
        break;
    case NVME_CR_NSSR:
        DPRINTF("%s %s NVME_CR_NSSR", func, s);
        break;
    case NVME_CR_AQA:
        DPRINTF("%s %s NVME_CR_AQA", func, s);
        break;
    case NVME_CR_ASQ_LOW:
        DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
        break;
    case NVME_CR_ASQ_HI:
        DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
        break;
    case NVME_CR_ACQ_LOW:
        DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
        break;
    case NVME_CR_ACQ_HI:
        DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
        break;
    default:
        DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
    }
}
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t offset, int size, uint64_t value)
{
    uint32_t ccreg;

    if (offset >= NVME_DOORBELL_OFFSET) {
        uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
        uint64_t idx = belloffset / 8; /* door bell size = 2*int */
        int is_sq = (belloffset % 8) < 4;

        if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
            WPRINTF("guest attempted an overflow write offset "
                "0x%lx, val 0x%lx in %s",
                offset, value, __func__);
            return;
        }

        pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
        return;
    }

    DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
        offset, size, value);

    if (size != 4) {
        WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
            "val 0x%lx) to bar0 in %s",
            size, offset, value, __func__);
        /* TODO: shutdown device */
        return;
    }

    pci_nvme_bar0_reg_dumps(__func__, offset, 1);

    pthread_mutex_lock(&sc->mtx);

    switch (offset) {
    case NVME_CR_CAP_LOW:
    case NVME_CR_CAP_HI:
        /* readonly */
        break;
    case NVME_CR_VS:
        /* readonly */
        break;
    case NVME_CR_INTMS:
        /* MSI-X, so ignore */
        break;
    case NVME_CR_INTMC:
        /* MSI-X, so ignore */
        break;
    case NVME_CR_CC:
        ccreg = (uint32_t)value;

        DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
            "iocqes %u",
            __func__,
            NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
            NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
            NVME_CC_GET_IOCQES(ccreg));

        if (NVME_CC_GET_SHN(ccreg)) {
            /* perform shutdown - flush out data to backend */
            sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
                NVME_CSTS_REG_SHST_SHIFT);
            sc->regs.csts |= NVME_SHST_COMPLETE <<
                NVME_CSTS_REG_SHST_SHIFT;
        }
        if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
            if (NVME_CC_GET_EN(ccreg) == 0)
                /* transition 1->0 causes controller reset */
                pci_nvme_reset_locked(sc);
            else
                pci_nvme_init_controller(ctx, sc);
        }

        /* Insert the iocqes, iosqes and en bits from the write */
        sc->regs.cc &= ~NVME_CC_WRITE_MASK;
        sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
        if (NVME_CC_GET_EN(ccreg) == 0) {
            /* Insert the ams, mps and css bit fields */
            sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
            sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
            sc->regs.csts &= ~NVME_CSTS_RDY;
        } else if (sc->pending_ios == 0) {
            sc->regs.csts |= NVME_CSTS_RDY;
        }
        break;
    case NVME_CR_CSTS:
        break;
    case NVME_CR_NSSR:
        /* ignore writes; don't support subsystem reset */
        break;
    case NVME_CR_AQA:
        sc->regs.aqa = (uint32_t)value;
        break;
    case NVME_CR_ASQ_LOW:
        sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
            (0xFFFFF000 & value);
        break;
    case NVME_CR_ASQ_HI:
        sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
            (value << 32);
        break;
    case NVME_CR_ACQ_LOW:
        sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
            (0xFFFFF000 & value);
        break;
    case NVME_CR_ACQ_HI:
        sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
            (value << 32);
        break;
    default:
        DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
            __func__, offset, value, size);
    }
    pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
    struct pci_nvme_softc* sc = pi->pi_arg;

    if (baridx == pci_msix_table_bar(pi) ||
        baridx == pci_msix_pba_bar(pi)) {
        DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
            " value 0x%lx", baridx, offset, size, value);

        pci_emul_msix_twrite(pi, offset, size, value);
        return;
    }

    switch (baridx) {
    case 0:
        pci_nvme_write_bar_0(ctx, sc, offset, size, value);
        break;

    default:
        DPRINTF("%s unknown baridx %d, val 0x%lx",
            __func__, baridx, value);
    }
}
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
    uint64_t offset, int size)
{
    uint64_t value;

    pci_nvme_bar0_reg_dumps(__func__, offset, 0);

    if (offset < NVME_DOORBELL_OFFSET) {
        void *p = &(sc->regs);
        pthread_mutex_lock(&sc->mtx);
        memcpy(&value, (void *)((uintptr_t)p + offset), size);
        pthread_mutex_unlock(&sc->mtx);
    } else {
        value = 0;
        WPRINTF("pci_nvme: read invalid offset %ld", offset);
    }

    switch (size) {
    case 1:
        value &= 0xFF;
        break;
    case 2:
        value &= 0xFFFF;
        break;
    case 4:
        value &= 0xFFFFFFFF;
        break;
    }

    DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
        offset, size, (uint32_t)value);

    return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
    struct pci_nvme_softc* sc = pi->pi_arg;

    if (baridx == pci_msix_table_bar(pi) ||
        baridx == pci_msix_pba_bar(pi)) {
        DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
            baridx, offset, size);

        return pci_emul_msix_tread(pi, offset, size);
    }

    switch (baridx) {
    case 0:
        return pci_nvme_read_bar_0(sc, offset, size);

    default:
        DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
    }

    return (0);
}
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
    char bident[sizeof("XX:X:X")];
    char *uopt, *xopts, *config;
    uint32_t sectsz;
    int optidx;

    sc->max_queues = NVME_QUEUES;
    sc->max_qentries = NVME_MAX_QENTRIES;
    sc->ioslots = NVME_IOSLOTS;
    sc->num_squeues = sc->max_queues;
    sc->num_cqueues = sc->max_queues;
    sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
    sectsz = 0;

    uopt = strdup(opts);
    optidx = 0;
    snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
        "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
    for (xopts = strtok(uopt, ",");
         xopts != NULL;
         xopts = strtok(NULL, ",")) {

        if ((config = strchr(xopts, '=')) != NULL)
            *config++ = '\0';

        if (!strcmp("maxq", xopts)) {
            sc->max_queues = atoi(config);
        } else if (!strcmp("qsz", xopts)) {
            sc->max_qentries = atoi(config);
        } else if (!strcmp("ioslots", xopts)) {
            sc->ioslots = atoi(config);
        } else if (!strcmp("sectsz", xopts)) {
            sectsz = atoi(config);
        } else if (!strcmp("ser", xopts)) {
            /*
             * This field indicates the Product Serial Number in
             * 7-bit ASCII, unused bytes should be space characters.
             * Ref: NVMe v1.3c.
             */
            cpywithpad((char *)sc->ctrldata.sn,
                sizeof(sc->ctrldata.sn), config, ' ');
        } else if (!strcmp("ram", xopts)) {
            uint64_t sz = strtoull(&xopts[4], NULL, 10);

            sc->nvstore.type = NVME_STOR_RAM;
            sc->nvstore.size = sz * 1024 * 1024;
            sc->nvstore.ctx = calloc(1, sc->nvstore.size);
            sc->nvstore.sectsz = 4096;
            sc->nvstore.sectsz_bits = 12;
            if (sc->nvstore.ctx == NULL) {
                perror("Unable to allocate RAM");
                free(uopt);
                return (-1);
            }
        } else if (!strcmp("eui64", xopts)) {
            sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
        } else if (!strcmp("dsm", xopts)) {
            if (!strcmp("auto", config))
                sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
            else if (!strcmp("enable", config))
                sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
            else if (!strcmp("disable", config))
                sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
        } else if (optidx == 0) {
            snprintf(bident, sizeof(bident), "%d:%d",
                sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
            sc->nvstore.ctx = blockif_open(xopts, bident);
            if (sc->nvstore.ctx == NULL) {
                perror("Could not open backing file");
                free(uopt);
                return (-1);
            }
            sc->nvstore.type = NVME_STOR_BLOCKIF;
            sc->nvstore.size = blockif_size(sc->nvstore.ctx);
        } else {
            EPRINTLN("Invalid option %s", xopts);
            free(uopt);
            return (-1);
        }

        optidx++;
    }
    free(uopt);

    if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
        EPRINTLN("backing store not specified");
        return (-1);
    }
    if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
        sc->nvstore.sectsz = sectsz;
    else if (sc->nvstore.type != NVME_STOR_RAM)
        sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
    for (sc->nvstore.sectsz_bits = 9;
         (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
         sc->nvstore.sectsz_bits++);

    if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
        sc->max_queues = NVME_QUEUES;

    if (sc->max_qentries <= 0) {
        EPRINTLN("Invalid qsz option");
        return (-1);
    }
    if (sc->ioslots <= 0) {
        EPRINTLN("Invalid ioslots option");
        return (-1);
    }

    return (0);
}
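/*
 * Example of the option handling above (illustrative): sectsz=4096 yields
 * sc->nvstore.sectsz_bits = 12, since the for loop searches for the
 * smallest power of two that reaches sectsz (1 << 12 == 4096). A blockif
 * backend reporting 512-byte sectors would instead end at 9.
 */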
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
    struct pci_nvme_softc *sc;
    uint32_t pci_membar_sz;
    int error;

    error = 0;

    sc = calloc(1, sizeof(struct pci_nvme_softc));
    pi->pi_arg = sc;
    sc->nsc_pi = pi;

    error = pci_nvme_parse_opts(sc, opts);
    if (error < 0)
        goto done;
    else
        error = 0;

    STAILQ_INIT(&sc->ioreqs_free);
    sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
    for (int i = 0; i < sc->ioslots; i++) {
        STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
        pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
        pthread_cond_init(&sc->ioreqs[i].cv, NULL);
    }
    sc->intr_coales_aggr_thresh = 1;

    pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
    pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
    pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
    pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
    pci_set_cfgdata8(pi, PCIR_PROGIF,
        PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

    /*
     * Allocate size of NVMe registers + doorbell space for all queues.
     *
     * The specification requires a minimum memory I/O window size of 16K.
     * The Windows driver will refuse to start a device with a smaller
     * size.
     */
    pci_membar_sz = sizeof(struct nvme_registers) +
        2 * sizeof(uint32_t) * (sc->max_queues + 1);
    pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
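    /*
     * Size example (illustrative): with the default maxq of 16, the
     * doorbell area needs 2 * sizeof(uint32_t) * 17 = 136 bytes beyond
     * the fixed registers, well under NVME_MMIO_SPACE_MIN (16KiB), so
     * the BAR is rounded up to 16KiB here.
     */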
    DPRINTF("nvme membar size: %u", pci_membar_sz);

    error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
    if (error) {
        WPRINTF("%s pci alloc mem bar failed", __func__);
        goto done;
    }

    error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
    if (error) {
        WPRINTF("%s pci add msixcap failed", __func__);
        goto done;
    }

    error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
    if (error) {
        WPRINTF("%s pci add Express capability failed", __func__);
        goto done;
    }

    pthread_mutex_init(&sc->mtx, NULL);
    sem_init(&sc->iosemlock, 0, sc->ioslots);

    pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
    /*
     * Controller data depends on Namespace data so initialize Namespace
     * data first.
     */
    pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
    pci_nvme_init_ctrldata(sc);
    pci_nvme_init_logpages(sc);

    pci_nvme_reset(sc);

    pci_lintr_request(pi);

done:
    return (error);
}
struct pci_devemu pci_de_nvme = {
    .pe_emu =	"nvme",
    .pe_init =	pci_nvme_init,
    .pe_barwrite =	pci_nvme_write,
    .pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);