/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 */

/* TODO:
 *  - create async event for smart and log
 */
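/*
 * Illustrative invocation (a sketch only; the slot number, backing path, and
 * serial number below are examples, not defaults):
 *
 *   bhyve ... -s 4,nvme,/dev/zvol/zroot/nvmedisk,maxq=8,qsz=1024,ser=SN123456 vm0
 *
 * This attaches one emulated NVMe controller at PCI slot 4, backed by the
 * given zvol and advertising up to 8 IO queue pairs of 1024 entries each.
 */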
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "debug.h"
#include "pci_emul.h"
static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

/* Memory Page Size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
/* Max Data Transfer Size is 2^NVME_MDTS * CAP.MPSMIN pages */
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* Size of the buffer used to copy in Dataset Management ranges */
#define	NVME_MAX_DSM_TRIM	4096

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
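/*
 * Per the NVMe spec, the Number of Queues feature encodes the Number of IO
 * Submission Queues Requested (NSQR) in bits 15:0 and the Number of IO
 * Completion Queues Requested (NCQR) in bits 31:16, both zero-based. E.g.
 * with num_squeues = 4 and num_cqueues = 2, the macro above yields
 * 0x00010003.
 */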
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};
enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};
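/*
 * These decode CDW11 of the Create IO Completion Queue command: PC
 * (Physically Contiguous) is bit 0, IEN (Interrupts Enabled) is bit 1,
 * and IV (Interrupt Vector) occupies bits 31:16.
 */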
enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02
struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer Size (MDTS) and given the number
 * of default iovec's in a struct blockif_req.
 *
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
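/*
 * Worked example (the BLOCKIF_IOV_MAX value is illustrative): with
 * NVME_MDTS = 9, NVME_MAX_IOVEC is (1 << 9) + 1 = 513 descriptors. If
 * BLOCKIF_IOV_MAX were 128, MDTS_PAD_SIZE would be 385, i.e. the extra
 * iovec entries that must live in iovpadding[] beyond those built into
 * struct blockif_req.
 */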
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;

	STAILQ_ENTRY(pci_nvme_ioreq) link;

	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool namespace_specific;
};

#define NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;
};
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);
/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}
static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->power_state[0].mp = 10;
}
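/*
 * Note on the values above: SQES/CQES are log2 of the queue entry sizes, so
 * 6 advertises the standard 64-byte Submission Queue entry and 4 the
 * standard 16-byte Completion Queue entry. MP in the power state descriptor
 * is in centiwatts, so 10 reports a 0.10 W maximum power draw.
 */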
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;

	return crc;
}
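/*
 * Callers seed the CRC with 0 and feed raw bytes, e.g.
 *
 *   uint16_t sum = crc16(0, buf, buflen);
 *
 * as done below when synthesizing an EUI-64. This is the reflected,
 * table-driven form (commonly catalogued as CRC-16/ARC), matching the
 * ext2_crc16() routine it was obtained from.
 */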
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}

		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);
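	/*
	 * Layout of the generated EUI-64: the FreeBSD OUI combined with a
	 * CRC-16 of "<vmname><bus><slot><func>" fills the upper 48 bits,
	 * and the low 16 bits carry the namespace ID, keeping IDs unique
	 * per namespace within a single VM instance.
	 */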

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;
}
static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{

	sc->feat[0].set = nvme_feature_invalid_cb;
	sc->feat[0].get = nvme_feature_invalid_cb;

	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	int i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;
}
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL)
		return (-1);

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;
	len -= bytes;
	if (len == 0)
		return (0);

	/* Any remainder (at most one more page) comes from prp2 */
	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL)
		return (-1);

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
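/*
 * The Phase Tag (P) flip above is what lets the guest detect new entries:
 * the bit is inverted on every pass through the Completion Queue, so a
 * host polling the queue can distinguish a freshly posted completion from
 * a stale entry without reading any device register.
 */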
static void
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return;
		}

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return;
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
}
static void
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return;
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return;
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return;
	}
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	/* Log the request only after logsize is known */
	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}
}
static void
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* Single namespace (NSID 1); a zero entry terminates the list */
		((uint32_t *)dest)[0] = 1;
		((uint32_t *)dest)[1] = 0;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}

		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
}
static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operational Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}
static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}
static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}
static void
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	feat = &sc->feat[fid];

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return;
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS)
		feat->cdw11 = command->cdw11;
}
static void
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get)
		feat->get(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
	}
}
static void
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return;
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return;
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
	}
}
static void
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
static void
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/*
	 * TODO: raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			/* XXX don't care, unhandled for now
			nvme_opc_async_event_req(sc, cmd, &compl);
			*/
			compl.status = NVME_NO_STATUS;
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
			} else {
				compl.status = NVME_NO_STATUS;
				nvme_opc_format_nvm(sc, cmd, &compl);
			}
			break;
		default:
			DPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
			break;
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}
/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
 */
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}
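/*
 * Worked example of the rounding: one successful 1,500-block (768,000 byte)
 * write takes write_dunits_remainder from its initial 999 to 2,499, so the
 * loop above credits 2 data units and leaves a remainder of 499 - matching
 * the spec's ceiling of 1,500 blocks / 1,000.
 */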
/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
 */
static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nblocks)
{
	size_t	offset, bytes;

	/* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nblocks << nvstore->sectsz_bits;

	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size - offset) < bytes)
		return (true);

	return (false);
}
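/*
 * Example of the shift-based overflow check: with 512-byte sectors
 * (sectsz_bits = 9), any slba >= 2^55 would lose high bits when converted
 * to a 64-bit byte offset, so a nonzero (slba >> 55) rejects the request
 * before the shift can wrap.
 */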
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req == NULL)
		return (-1);

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO is pending, set the device ready if it was reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}
static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

	/* A guest read copies data to the PRPs; a write copies from them */
	if (!is_write)
		dir = NVME_COPY_TO_PRP;
	else
		dir = NVME_COPY_FROM_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}
static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes  -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if (prp_list == last) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes  -= size;

			prp_list++;
		}
	}

	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
	return (status);
}
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes = 0;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	bytes = nblocks << nvstore->sectsz_bits;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	offset = lba << nvstore->sectsz_bits;

	req->bytes = bytes;
	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}
out:
	if (!pending)
		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

	return (pending);
}
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		return (pending);
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		struct nvme_dsm_range *range;
		size_t offset, bytes;
		int err;
		uint32_t nr, r;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			return (pending);
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			return (pending);
		}

		/* copy locally because a range entry could straddle PRPs */
		range = calloc(1, NVME_MAX_DSM_TRIM);
		if (range == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			return (pending);
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
		    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

		if (pci_nvme_out_of_range(nvstore, range[0].starting_lba,
		    range[0].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		nr = cmd->cdw10 & 0xff;

		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0; r <= nr; r++) {
				if (pci_nvme_out_of_range(nvstore,
				    range[r].starting_lba, range[r].length)) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				offset = range[r].starting_lba << sectsz_bits;
				bytes = range[r].length << sectsz_bits;
				if ((nvstore->size - offset) < bytes) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				iov[r].iov_base = (void *)offset;
				iov[r].iov_len = bytes;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = r;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;
out:
		free(range);
	}

	return (pending);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;
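		/*
		 * Doorbell layout (8-byte stride, i.e. CAP.DSTRD = 0): the
		 * 4-byte SQ tail doorbell for queue N lives at
		 * NVME_DOORBELL_OFFSET + 8*N, immediately followed by the
		 * 4-byte CQ head doorbell for queue N at offset +4.
		 */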
		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
    uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
	case 2:
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);
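	/* E.g. sectsz = 4096 leaves the loop with sectsz_bits = 12 (2^12 = 4096) */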

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * window.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
struct pci_devemu pci_de_nvme = {
	.pe_emu		= "nvme",
	.pe_init	= pci_nvme_init,
	.pe_barwrite	= pci_nvme_write,
	.pe_barread	= pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);