/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
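 *
 * Example invocation (the device path and option values are illustrative
 * only, not defaults):
 *
 *   bhyve ... -s 4,nvme,/path/to/disk.img,maxq=4,qsz=512,ioslots=16,\
 *       sectsz=512,ser=NVME0001 ...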
 *
 * TODO:
 *    - create async event for smart and log
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "debug.h"
#include "pci_emul.h"

static int nvme_debug = 0;
#define	DPRINTF(params) if (nvme_debug) PRINTLN params
#define	WPRINTF(params) PRINTLN params

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MAX_BLOCKIOVS	512

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	  ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)))
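
/*
 * For example, 4 submission and 2 completion queues are encoded as
 * 0x00010003: both counts are reported zero-based, with the completion
 * queue count in the upper 16 bits.
 */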

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
	pthread_mutex_t	mtx;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		busy; /* queue is being processed */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
};

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	struct pci_nvme_ioreq *next;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;

	/*
	 * lock if all iovs consumed (big IO);
	 * complete transaction before continuing
	 */
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;

	struct blockif_req io_req;

	/* pad to fit up to 512 page descriptors from guest IO request */
	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;

	struct pci_nvme_ioreq *ioreqs;
	struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	/* controller features */
	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
	uint32_t	async_ev_config;         /* 0x0B: async event config */
};

static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
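
/*
 * The Phase Tag bit tells the guest which completion entries are new: the
 * controller inverts it on each pass through the completion queue, which
 * lets the host distinguish fresh entries from stale ones left over from
 * the previous pass.
 */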
static void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

	if (prev)
		*status &= ~NVME_STATUS_P;
	else
		*status |= NVME_STATUS_P;
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */
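
	/* Maximum Power (MP) is in centiwatts, so 10 reports 0.1 W */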
	cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    uint64_t eui64)
{

	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	/* Get LBA and backstore information from backing store */
	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */

	/* Create an EUI-64 if user did not provide one */
	if (eui64 == 0) {
		char *data = NULL;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
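	/* e.g. sectsz_bits is 9 for 512-byte sectors and 12 for 4096 */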
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	DPRINTF(("%s", __func__));

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	sc->num_cqueues = sc->num_squeues = sc->max_queues;
	if (sc->submit_queues != NULL) {
		for (int i = 0; i < sc->num_squeues + 1; i++) {
			/*
			 * The Admin Submission Queue is at index 0.
			 * It must not be changed at reset otherwise the
			 * emulation will be out of sync with the guest.
			 */
			if (i != 0) {
				sc->submit_queues[i].qbase = NULL;
				sc->submit_queues[i].size = 0;
				sc->submit_queues[i].cqid = 0;
			}
			sc->submit_queues[i].tail = 0;
			sc->submit_queues[i].head = 0;
			sc->submit_queues[i].busy = 0;
		}
	} else
		sc->submit_queues = calloc(sc->num_squeues + 1,
		    sizeof(struct nvme_submission_queue));

	if (sc->compl_queues != NULL) {
		for (int i = 0; i < sc->num_cqueues + 1; i++) {
			/* See Admin Submission Queue note above */
			if (i != 0) {
				sc->compl_queues[i].qbase = NULL;
				sc->compl_queues[i].size = 0;
			}
			sc->compl_queues[i].tail = 0;
			sc->compl_queues[i].head = 0;
		}
	} else {
		sc->compl_queues = calloc(sc->num_cqueues + 1,
		    sizeof(struct nvme_completion_queue));

		for (int i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
	}
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF(("%s", __func__));

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase));

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}
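
/*
 * A PRP (Physical Region Page) entry pair describes at most two pages of
 * guest memory: prp1 may point anywhere within the first page, while prp2,
 * when more data remains, points to the second page.  Longer transfers
 * require a PRP list, which this helper does not handle; hence the
 * two-page (8KB) cap below.
 */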
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
    size_t len)
{
	uint8_t *dst;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	dst = vm_map_gpa(ctx, prp1, bytes);
	if (dst == NULL) {
		return (-1);
	}

	memcpy(dst, src, bytes);

	src += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	dst = vm_map_gpa(ctx, prp2, len);
	if (dst == NULL) {
		return (-1);
	}

	memcpy(dst, src, len);

	return (0);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
	if (qid == 0 || qid > sc->num_squeues) {
		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues)) {
			WPRINTF(("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);
		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid));

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF(("%s completed creating IOSQ qid %u",
		    __func__, qid));
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF(("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__));

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
	if (qid == 0 || qid > sc->num_cqueues) {
		WPRINTF(("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_completion_queue *ncq;

		if ((qid == 0) || (qid > sc->num_cqueues)) {
			WPRINTF(("%s queue index %u > num_cqueues %u",
			    __func__, qid, sc->num_cqueues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		ncq = &sc->compl_queues[qid];
		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
		    command->prp1,
		    sizeof(struct nvme_completion) * (size_t)ncq->size);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		/*
		 * Non-contig completion queue unsupported.
		 */
		WPRINTF(("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__));

		/* 0x12 = Invalid Use of Controller Memory Buffer */
		pci_nvme_status_genc(&compl->status, 0x12);
	}

	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	/* NUMD is a zero-based dword count, so the length is (NUMD+1)*4 */
	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4;
	uint8_t logpage = command->cdw10 & 0xFF;

	DPRINTF(("%s log page %u len %u", __func__, logpage, logsize));

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log, logsize);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		/* TODO: present some smart info */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log, logsize);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log, logsize);
		break;
	default:
		WPRINTF(("%s get log page %x command not supported",
		    __func__, logpage));

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;

	DPRINTF(("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid));

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata));
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		((uint32_t *)dest)[1] = 0;
		break;
	case 0x11:
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
		return (1);
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
	case 0x10:
	case 0x12:
	case 0x13:
	case 0x14:
	case 0x15:
	default:
		DPRINTF(("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}

static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (-1);
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues));
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (-1);
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues));
		sc->num_cqueues = sc->max_queues;
	}

	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	return (0);
}

static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	int feature = command->cdw10 & 0xFF;
	uint32_t iv;

	DPRINTF(("%s feature 0x%x", __func__, feature));
	compl->cdw0 = 0;

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF((" arbitration 0x%x", command->cdw11));
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF((" power management 0x%x", command->cdw11));
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF((" lba range 0x%x", command->cdw11));
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF((" temperature threshold 0x%x", command->cdw11));
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF((" error recovery 0x%x", command->cdw11));
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF((" volatile write cache 0x%x", command->cdw11));
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		nvme_set_feature_queues(sc, command, compl);
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF((" interrupt coalescing 0x%x", command->cdw11));

		/* in uS */
		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		iv = command->cdw11 & 0xFFFF;

		DPRINTF((" interrupt vector configuration 0x%x",
		    command->cdw11));

		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
			if (sc->compl_queues[i].intr_vec == iv) {
				if (command->cdw11 & (1 << 16))
					sc->compl_queues[i].intr_en |=
					    NVME_CQ_INTCOAL;
				else
					sc->compl_queues[i].intr_en &=
					    ~NVME_CQ_INTCOAL;
			}
		}
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF((" write atomicity 0x%x", command->cdw11));
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF((" async event configuration 0x%x",
		    command->cdw11));
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF((" software progress marker 0x%x",
		    command->cdw11));
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF((" autonomous power state transition 0x%x",
		    command->cdw11));
		break;
	default:
		WPRINTF(("%s invalid feature", __func__));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	int feature = command->cdw10 & 0xFF;

	DPRINTF(("%s feature 0x%x", __func__, feature));

	compl->cdw0 = 0;

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF((" arbitration"));
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF((" power management"));
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF((" lba range"));
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF((" temperature threshold"));
		switch ((command->cdw11 >> 20) & 0x3) {
		case 0:
			/* Over temp threshold */
			compl->cdw0 = 0xFFFF;
			break;
		case 1:
			/* Under temp threshold */
			compl->cdw0 = 0;
			break;
		default:
			WPRINTF((" invalid threshold type select"));
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			return (1);
		}
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF((" error recovery"));
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF((" volatile write cache"));
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

		DPRINTF((" number of queues (submit %u, completion %u)",
		    compl->cdw0 & 0xFFFF,
		    (compl->cdw0 >> 16) & 0xFFFF));
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF((" interrupt coalescing"));
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		DPRINTF((" interrupt vector configuration"));
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF((" write atomicity"));
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF((" async event configuration"));
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF((" software progress marker"));
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF((" autonomous power state transition"));
		break;
	default:
		WPRINTF(("%s invalid feature 0x%x", __func__, feature));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	DPRINTF(("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF(("%s async event request 0x%x", __func__, command->cdw11));

	/*
	 * TODO: raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
	return (0);
}

static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	int do_intr = 0;
	uint16_t sqhead;

	DPRINTF(("%s index %u", __func__, (uint32_t)value));

	sq = &sc->submit_queues[0];

	sqhead = atomic_load_acq_short(&sq->head);

	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF(("%s SQ busy, head %u, tail %u",
		    __func__, sqhead, sq->tail));
		return;
	}

	DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail));

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF(("%s command DELETE_IO_SQ", __func__));
			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF(("%s command CREATE_IO_SQ", __func__));
			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF(("%s command DELETE_IO_CQ", __func__));
			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF(("%s command CREATE_IO_CQ", __func__));
			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF(("%s command GET_LOG_PAGE", __func__));
			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF(("%s command IDENTIFY", __func__));
			do_intr |= nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF(("%s command ABORT", __func__));
			do_intr |= nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF(("%s command SET_FEATURES", __func__));
			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF(("%s command GET_FEATURES", __func__));
			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF(("%s command ASYNC_EVENT_REQ", __func__));
			/* XXX don't care, unhandled for now
			do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
			*/
			compl.status = NVME_NO_STATUS;
			break;
		default:
			WPRINTF(("0x%x command is not implemented",
			    cmd->opc));
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
			do_intr |= 1;
		}

		if (NVME_COMPLETION_VALID(compl)) {
			struct nvme_completion *cp;
			int phase;

			cq = &sc->compl_queues[0];

			cp = &(cq->qbase)[cq->tail];
			cp->cdw0 = compl.cdw0;
			cp->sqid = 0;
			cp->sqhd = sqhead;
			cp->cid = cmd->cid;

			phase = NVME_STATUS_GET_P(cp->status);
			cp->status = compl.status;
			pci_nvme_toggle_phase(&cp->status, phase);

			cq->tail = (cq->tail + 1) % cq->size;
		}
		sqhead = (sqhead + 1) % sq->size;
	}

	DPRINTF(("setting sqhead %u", sqhead));
	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);
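
	/* admin command completions always signal MSI-X vector 0 */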
	if (do_intr)
		pci_generate_msix(sc->nsc_pi, 0);
}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req != NULL) {
		/* concatenate contig block-iovs to minimize number of iovs */
		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
			iovidx = req->io_req.br_iovcnt - 1;

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			        req->prev_gpaddr, size);

			req->prev_size += size;
			req->io_req.br_resid += size;

			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
		} else {
			pthread_mutex_lock(&req->mtx);

			iovidx = req->io_req.br_iovcnt;
			if (iovidx == NVME_MAX_BLOCKIOVS) {
				int err = 0;

				DPRINTF(("large I/O, doing partial req"));

				iovidx = 0;
				req->io_req.br_iovcnt = 0;

				req->io_req.br_callback = pci_nvme_io_partial;

				if (!do_write)
					err = blockif_read(sc->nvstore.ctx,
					    &req->io_req);
				else
					err = blockif_write(sc->nvstore.ctx,
					    &req->io_req);

				/* wait until req completes before cont */
				if (err == 0)
					pthread_cond_wait(&req->cv, &req->mtx);

				req->io_req.br_offset = lba;
				req->io_req.br_resid = 0;
				req->io_req.br_param = req;
			}

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			        gpaddr, size);

			req->io_req.br_iov[iovidx].iov_len = size;

			req->prev_gpaddr = gpaddr;
			req->prev_size = size;
			req->io_req.br_resid += size;

			req->io_req.br_iovcnt++;

			pthread_mutex_unlock(&req->mtx);
		}
	} else {
		/* RAM buffer: read/write directly */
		void *p = sc->nvstore.ctx;
		void *gptr;

		if ((lba + size) > sc->nvstore.size) {
			WPRINTF(("%s write would overflow RAM", __func__));
			return (-1);
		}

		p = (void *)((uintptr_t)p + (uintptr_t)lba);
		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
		if (do_write)
			memcpy(p, gptr, size);
		else
			memcpy(gptr, p, size);
	}
	return (0);
}

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status, int ignore_busy)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
	struct nvme_completion *compl;
	int do_intr = 0;
	int phase;

	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status)));

	pthread_mutex_lock(&cq->mtx);

	assert(cq->qbase != NULL);

	compl = &cq->qbase[cq->tail];

	compl->cdw0 = cdw0;
	compl->sqhd = atomic_load_acq_short(&sq->head);
	compl->sqid = sqid;
	compl->cid = cid;

	/* toggle phase */
	phase = NVME_STATUS_GET_P(compl->status);
	compl->status = status;
	pci_nvme_toggle_phase(&compl->status, phase);

	cq->tail = (cq->tail + 1) % cq->size;

	if (cq->intr_en & NVME_CQ_INTEN)
		do_intr = 1;

	pthread_mutex_unlock(&cq->mtx);

	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
		if (do_intr)
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	req->next = sc->ioreqs_free;
	sc->ioreqs_free = req;
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = sc->ioreqs_free;
	assert(req != NULL);

	sc->ioreqs_free = req->next;
	req->next = NULL;
	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
	pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;

	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
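
	/* wake the submitter blocked in pci_nvme_append_iov_req() */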
	pthread_cond_signal(&req->cv);
}

static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;
	int err;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF(("%s sqid %u busy", __func__, idx));
		return;
	}

	sqhead = atomic_load_acq_short(&sq->head);

	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase));

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req = NULL;
		uint64_t lba;
		uint64_t nblocks, bytes, size, cpsz;

		/* TODO: support scatter gather list handling */

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;

		if (cmd->opc == NVME_OPC_FLUSH) {
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		} else if (cmd->opc == 0x08) {
			/* TODO: write zeroes */
			WPRINTF(("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF));
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		}

		nblocks = (cmd->cdw12 & 0xFFFF) + 1;

		bytes = nblocks * sc->nvstore.sectsz;

		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
			req = pci_nvme_get_ioreq(sc);
			req->nvme_sq = sq;
			req->sqid = idx;
		}

		/*
		 * If data starts mid-page and flows into the next page, then
		 * increase page count
		 */

		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
		    "(%lu-bytes)",
		    sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
		    cmd->opc == NVME_OPC_WRITE ?
		        "WRITE" : "READ",
		    lba, nblocks, bytes));

		cmd->prp1 &= ~(0x03UL);
		cmd->prp2 &= ~(0x03UL);

		DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2));

		size = bytes;
		lba *= sc->nvstore.sectsz;

		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
		if (cpsz > bytes)
			cpsz = bytes;

		if (req != NULL) {
			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
			    cmd->cdw10;
			req->opc = cmd->opc;
			req->cid = cmd->cid;
			req->nsid = cmd->nsid;
		}

		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
		    cmd->opc == NVME_OPC_WRITE, lba);
		lba += cpsz;
		size -= cpsz;

		if (size == 0)
			goto iodone;

		if (size <= PAGE_SIZE) {
			/* prp2 is second (and final) page in transfer */

			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
			    size,
			    cmd->opc == NVME_OPC_WRITE,
			    lba);
		} else {
			uint64_t *prp_list = NULL;
			int i;

			/* prp2 is pointer to a physical region page list */
			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
			    cmd->prp2, PAGE_SIZE);

			i = 0;
			while (size != 0) {
				cpsz = MIN(size, PAGE_SIZE);

				/*
				 * Move to linked physical region page list
				 * in the last item.
				 */
				if (i == (NVME_PRP2_ITEMS-1) &&
				    size > PAGE_SIZE) {
					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
					prp_list = paddr_guest2host(
					    sc->nsc_pi->pi_vmctx,
					    prp_list[i], PAGE_SIZE);
					i = 0;
				}
				if (prp_list[i] == 0) {
					WPRINTF(("PRP2[%d] = 0 !!!", i));
					err = 1;
					break;
				}

				err = pci_nvme_append_iov_req(sc, req,
				    prp_list[i], cpsz,
				    cmd->opc == NVME_OPC_WRITE, lba);
				if (err)
					break;

				lba += cpsz;
				size -= cpsz;
				i++;
			}
		}

iodone:
		if (sc->nvstore.type == NVME_STOR_RAM) {
			uint16_t code, status;

			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
			    NVME_SC_SUCCESS;
			pci_nvme_status_genc(&status, code);

			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		}

		if (err)
			goto do_error;

		req->io_req.br_callback = pci_nvme_io_done;

		err = 0;
		switch (cmd->opc) {
		case NVME_OPC_READ:
			err = blockif_read(sc->nvstore.ctx, &req->io_req);
			break;
		case NVME_OPC_WRITE:
			err = blockif_write(sc->nvstore.ctx, &req->io_req);
			break;
		default:
			WPRINTF(("%s unhandled io command 0x%x",
			    __func__, cmd->opc));
			err = 1;
		}

do_error:
		if (err) {
			uint16_t status;

			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);

			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);
}

static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));

	if (is_sq) {
		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF(("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues));
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF(("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues));
			return;
		}

		sc->compl_queues[idx].head = (uint16_t)value;
	}
}

static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s));
		break;
	case NVME_CR_CAP_HI:
		DPRINTF(("%s %s NVME_CR_CAP_HI", func, s));
		break;
	case NVME_CR_VS:
		DPRINTF(("%s %s NVME_CR_VS", func, s));
		break;
	case NVME_CR_INTMS:
		DPRINTF(("%s %s NVME_CR_INTMS", func, s));
		break;
	case NVME_CR_INTMC:
		DPRINTF(("%s %s NVME_CR_INTMC", func, s));
		break;
	case NVME_CR_CC:
		DPRINTF(("%s %s NVME_CR_CC", func, s));
		break;
	case NVME_CR_CSTS:
		DPRINTF(("%s %s NVME_CR_CSTS", func, s));
		break;
	case NVME_CR_NSSR:
		DPRINTF(("%s %s NVME_CR_NSSR", func, s));
		break;
	case NVME_CR_AQA:
		DPRINTF(("%s %s NVME_CR_AQA", func, s));
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s));
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s));
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s));
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s));
		break;
	default:
		DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset));
	}
}

static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
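		/*
		 * Doorbells come in pairs of 32-bit registers: submission
		 * queue y's tail doorbell is at byte offset 8*y from the
		 * doorbell base, completion queue y's head doorbell at
		 * 8*y + 4 (this device reports CAP.DSTRD = 0, i.e. a
		 * 4-byte stride per doorbell).
		 */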
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF(("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__));
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value));

	if (size != 4) {
		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__));
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg)));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size));
	}
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value));

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF(("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value));
	}
}

static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF(("pci_nvme: read invalid offset %ld", offset));
	}

	switch (size) {
	case 1:
	case 2:
	case 4:
		value &= 0xFFFFFFFF;
		break;
	case 8:
		break;
	}

	DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value));

	return (value);
}

static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size));

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF(("unknown bar %d, 0x%lx", baridx, offset));
	}

	return (0);
}

static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);
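	/* e.g. a 4096 byte sector size yields sectsz_bits == 12 */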

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}

static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		if (i < (sc->ioslots-1))
			sc->ioreqs[i].next = &sc->ioreqs[i+1];
		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
	}
	sc->ioreqs_free = sc->ioreqs;
	sc->intr_coales_aggr_thresh = 1;

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF(("nvme membar size: %u", pci_membar_sz));

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF(("%s pci alloc mem bar failed", __func__));
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF(("%s pci add msixcap failed", __func__));
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF(("%s pci add Express capability failed", __func__));
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
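	/*
	 * The semaphore is initialized to the number of preallocated ioreqs;
	 * pci_nvme_get_ioreq() waits on it, so guest I/O beyond "ioslots"
	 * concurrent requests blocks until a slot is released.
	 */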
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_reset(sc);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
	pci_nvme_init_logpages(sc);

	pci_lintr_request(pi);

done:
	return (error);
}

struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);