2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2017 Shunsuke Mie
5 * Copyright (c) 2018 Leon Dang
7 * Function crc16 Copyright (c) 2017, Fedor Uporov
8 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * bhyve PCIe-NVMe device emulation.
36 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
43 * maxq = max number of queues
44 * qsz = max elements in each queue
45 * ioslots = max number of concurrent io requests
46 * sectsz = sector size (defaults to blockif sector size)
47 * ser = serial number (20-chars max)
48 * eui64 = IEEE Extended Unique Identifier (8 byte value)
53 - create async event for smart and log
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
60 #include <sys/types.h>
61 #include <net/ieee_oui.h>
65 #include <semaphore.h>
73 #include <machine/atomic.h>
74 #include <machine/vmm.h>
77 #include <dev/nvme/nvme.h>
/*
 * NOTE(review): this chunk is a lossily-sampled copy of bhyve's pci_nvme.c —
 * original line numbers are baked into the text and many lines are missing,
 * so several definitions below are visibly incomplete.  Comments only added.
 */
/* Debug tracing: DPRINTF is compiled in but gated at runtime by nvme_debug;
 * WPRINTF (warnings) always prints. */
84 static int nvme_debug = 0;
85 #define DPRINTF(params) if (nvme_debug) printf params
86 #define WPRINTF(params) printf params
88 /* defaults; can be overridden */
89 #define NVME_MSIX_BAR 4
91 #define NVME_IOSLOTS 8
93 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
94 #define NVME_MMIO_SPACE_MIN (1 << 14)
96 #define NVME_QUEUES 16
97 #define NVME_MAX_QENTRIES 2048
/* Number of PRP entries that fit in one page-sized PRP list. */
99 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t))
100 #define NVME_MAX_BLOCKIOVS 512
102 /* This is a synthetic status code to indicate there is no status */
103 #define NVME_NO_STATUS 0xffff
104 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS)
108 /* Convert a zero-based value into a one-based value */
109 #define ONE_BASED(zero) ((zero) + 1)
110 /* Convert a one-based value into a zero-based value */
111 #define ZERO_BASED(one) ((one) - 1)
113 /* Encode number of SQ's and CQ's for Set/Get Features */
/* NOTE(review): macro body ends with a stray ';' and the expansion is not
 * fully parenthesized — callers must use it only as a full RHS expression. */
114 #define NVME_FEATURE_NUM_QUEUES(sc) \
115 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
116 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
118 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell)
/* Byte offsets of NVMe controller registers within BAR0.
 * NOTE(review): several enumerators (e.g. CC, CSTS, AQA) are missing from
 * this sampled view, as is the enum's closing brace. */
120 enum nvme_controller_register_offsets {
121 NVME_CR_CAP_LOW = 0x00,
122 NVME_CR_CAP_HI = 0x04,
124 NVME_CR_INTMS = 0x0c,
125 NVME_CR_INTMC = 0x10,
130 NVME_CR_ASQ_LOW = 0x28,
131 NVME_CR_ASQ_HI = 0x2c,
132 NVME_CR_ACQ_LOW = 0x30,
133 NVME_CR_ACQ_HI = 0x34,
/* CDW11 bit fields shared by the Create I/O SQ/CQ admin commands:
 * PC = physically contiguous, IEN = interrupts enabled, IV = interrupt vector. */
136 enum nvme_cmd_cdw11 {
137 NVME_CMD_CDW11_PC = 0x0001,
138 NVME_CMD_CDW11_IEN = 0x0002,
139 NVME_CMD_CDW11_IV = 0xFFFF0000,
/* Flags stored in nvme_completion_queue.intr_en. */
142 #define NVME_CQ_INTEN 0x01
143 #define NVME_CQ_INTCOAL 0x02
/*
 * Guest-visible completion queue state.  qbase points into guest memory
 * (mapped via vm_map_gpa); the emulation produces at 'tail', the guest
 * consumes up to 'head'.  NOTE(review): members such as size, intr_vec,
 * intr_en and mtx are referenced elsewhere in this file but their
 * declarations are missing from this sampled view.
 */
145 struct nvme_completion_queue {
146 struct nvme_completion *qbase;
148 uint16_t tail; /* nvme progress */
149 uint16_t head; /* guest progress */
/*
 * Guest-visible submission queue state: the guest produces at 'tail'
 * (doorbell write), the emulation consumes from 'head'.  'busy' is used
 * with atomic_testandset_int as a single-processor guard.
 */
155 struct nvme_submission_queue {
156 struct nvme_command *qbase;
158 uint16_t head; /* nvme progress */
159 uint16_t tail; /* guest progress */
160 uint16_t cqid; /* completion queue id */
161 int busy; /* queue is being processed */
/* Backing-storage flavors; RAM-disk variant implied by the code at L630
 * but its enumerator is missing from this view. */
165 enum nvme_storage_type {
166 NVME_STOR_BLOCKIF = 0,
/* Describes the namespace's backing store (blockif ctx or RAM buffer). */
170 struct pci_nvme_blockstore {
171 enum nvme_storage_type type;
175 uint32_t sectsz_bits;
/* Per-request state for an in-flight guest I/O; embeds the blockif request
 * and pads its iov array up to NVME_MAX_BLOCKIOVS entries. */
179 struct pci_nvme_ioreq {
180 struct pci_nvme_softc *sc;
181 struct pci_nvme_ioreq *next;
182 struct nvme_submission_queue *nvme_sq;
185 /* command information */
190 uint64_t prev_gpaddr;
194 * lock if all iovs consumed (big IO);
195 * complete transaction before continuing
200 struct blockif_req io_req;
202 /* pad to fit up to 512 page descriptors from guest IO request */
203 struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
/*
 * Per-device soft state for the emulated NVMe controller.
 * NOTE(review): members referenced elsewhere (mtx, iosemlock, ioslots,
 * qpriority fields, etc.) are missing from this sampled view.
 */
206 struct pci_nvme_softc {
207 struct pci_devinst *nsc_pi;
/* Shadow copies of the controller register file and Identify data
 * returned to the guest. */
211 struct nvme_registers regs;
213 struct nvme_namespace_data nsdata;
214 struct nvme_controller_data ctrldata;
215 struct nvme_error_information_entry err_log;
216 struct nvme_health_information_page health_log;
217 struct nvme_firmware_page fw_log;
219 struct pci_nvme_blockstore nvstore;
221 uint16_t max_qentries; /* max entries per queue */
222 uint32_t max_queues; /* max number of IO SQ's or CQ's */
223 uint32_t num_cqueues;
224 uint32_t num_squeues;
/* Pre-allocated ioreq pool; ioreqs_free is a singly-linked free list
 * protected by the softc mutex. */
226 struct pci_nvme_ioreq *ioreqs;
227 struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
228 uint32_t pending_ios;
233 * Memory mapped Submission and Completion queues
234 * Each array includes both Admin and IO queues
236 struct nvme_completion_queue *compl_queues;
237 struct nvme_submission_queue *submit_queues;
239 /* controller features */
240 uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */
241 uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
242 uint32_t async_ev_config; /* 0x0B: async event config */
/* Forward declaration: partial-I/O completion callback used when a request
 * exceeds NVME_MAX_BLOCKIOVS iovs and must be issued in pieces. */
246 static void pci_nvme_io_partial(struct blockif_req *br, int err);
248 /* Controller Configuration utils */
249 #define NVME_CC_GET_EN(cc) \
250 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
251 #define NVME_CC_GET_CSS(cc) \
252 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
253 #define NVME_CC_GET_SHN(cc) \
254 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
255 #define NVME_CC_GET_IOSQES(cc) \
256 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
257 #define NVME_CC_GET_IOCQES(cc) \
258 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
/* CC bits the guest may change at any time. */
260 #define NVME_CC_WRITE_MASK \
261 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
262 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
263 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
/* CC bits writable only while the controller is Not ENabled. */
265 #define NVME_CC_NEN_WRITE_MASK \
266 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
267 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
268 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
270 /* Controller Status utils */
271 #define NVME_CSTS_GET_RDY(sts) \
272 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
274 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT)
276 /* Completion Queue status word utils */
277 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT)
/* Mask covering the SCT and SC fields (excludes the phase bit). */
278 #define NVME_STATUS_MASK \
279 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
280 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
/*
 * Copy 'src' into the fixed-size field 'dst', padding any remaining bytes
 * with 'pad' (used for space-padded Identify strings such as mn/fr/sn).
 * The result is NOT NUL-terminated if src fills the field.
 * NOTE(review): return type, braces and the 'len' declaration are missing
 * from this sampled view.
 */
283 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
287 len = strnlen(src, dst_size);
288 memset(dst, pad, dst_size);
289 memcpy(dst, src, len);
/*
 * Set the Status Code Type and Status Code fields of a completion status
 * word, preserving the phase bit.
 */
293 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
296 *status &= ~NVME_STATUS_MASK;
297 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
298 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
/* Convenience wrapper: set a Generic Command status code. */
302 pci_nvme_status_genc(uint16_t *status, uint16_t code)
305 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
/*
 * Write the phase bit as the inverse of 'prev' so the guest can detect a
 * newly posted completion entry.  NOTE(review): the branch selecting
 * between the two assignments is missing from this sampled view.
 */
309 pci_nvme_toggle_phase(uint16_t *status, int prev)
313 *status &= ~NVME_STATUS_P;
315 *status |= NVME_STATUS_P;
/*
 * Populate the Identify Controller data structure advertised to the guest:
 * model/firmware strings, NVMe version 1.3, queue entry sizes, a single
 * namespace, and one power state.  NOTE(review): many assignments (vid,
 * oncs, serial number, etc.) are missing from this sampled view.
 */
319 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
321 struct nvme_controller_data *cd = &sc->ctrldata;
326 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
327 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
329 /* Num of submission commands that we can handle at a time (2^rab) */
339 cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */
341 cd->ver = 0x00010300;
/* Advertise Format NVM support in Optional Admin Command Support. */
343 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
347 cd->lpa = 0; /* TODO: support some simple things like SMART */
348 cd->elpe = 0; /* max error log page entries */
349 cd->npss = 1; /* number of power states support */
351 /* Warning Composite Temperature Threshold */
/* SQ entries are 2^6 = 64 bytes; CQ entries are 2^4 = 16 bytes. */
354 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
355 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
356 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
357 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
358 cd->nn = 1; /* number of namespaces */
/* Single power state drawing 0.10 W (mp is in centiwatts). */
362 cd->power_state[0].mp = 10;
366 * Calculate the CRC-16 of the given buffer
367 * See copyright attribution at top of file
/* Table-driven CRC-16 (poly 0x8005), one byte per step; 'crc' is the
 * running value, allowing incremental use.  NOTE(review): the return type,
 * braces, and the while-loop header consuming 'len' are missing from this
 * sampled view — only the per-byte update expression survives. */
370 crc16(uint16_t crc, const void *buffer, unsigned int len)
372 const unsigned char *cp = buffer;
373 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
374 static uint16_t const crc16_table[256] = {
375 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
376 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
377 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
378 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
379 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
380 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
381 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
382 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
383 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
384 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
385 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
386 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
387 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
388 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
389 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
390 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
391 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
392 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
393 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
394 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
395 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
396 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
397 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
398 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
399 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
400 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
401 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
402 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
403 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
404 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
405 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
406 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
/* Per-byte update: xor in the next byte, index the table, fold the high
 * byte of the previous CRC back in. */
410 crc = (((crc >> 8) & 0xffU) ^
411 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
/*
 * Populate the Identify Namespace structure: namespace size in sectors,
 * a single LBA format derived from the backing store's sector size, and a
 * synthesized EUI-64 (FreeBSD OUI + CRC-16 of "vmname/bus/slot/func" + nsid)
 * when the user did not supply one.  NOTE(review): several lines (ncap/nuse
 * assignments, the eui64/data declarations, free(data), braces) are missing
 * from this sampled view.
 */
416 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
417 struct nvme_namespace_data *nd, uint32_t nsid,
421 nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
425 /* Get LBA and backstore information from backing store */
426 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
429 /* Create an EUI-64 if user did not provide one */
/* NOTE(review): asprintf result is not checked here (missing lines may
 * have handled it) — on failure 'data' would be undefined. */
433 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
434 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
437 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
440 eui64 = (eui64 << 16) | (nsid & 0xffff);
/* Store big-endian per the EUI-64 convention. */
442 be64enc(nd->eui64, eui64);
444 /* LBA data-sz = 2^lbads */
445 nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
/*
 * Zero the Error Information, SMART/Health, and Firmware Slot log pages;
 * the Get Log Page handler returns these buffers verbatim to the guest.
 */
449 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
452 memset(&sc->err_log, 0, sizeof(sc->err_log));
453 memset(&sc->health_log, 0, sizeof(sc->health_log));
454 memset(&sc->fw_log, 0, sizeof(sc->fw_log));
/*
 * Reset controller state with sc->mtx held: rebuild CAP/VS registers,
 * restore num_squeues/num_cqueues to the maximum, and clear (or lazily
 * allocate) the per-queue arrays.  The Admin queue (index 0) state is
 * deliberately preserved so the emulation stays in sync with the guest.
 * NOTE(review): the if-conditions skipping index 0, else-branches, and
 * closing braces are missing from this sampled view.
 */
458 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
460 DPRINTF(("%s\r\n", __func__));
/* CAP: max queue entries, contiguous queues required, 30s timeout units. */
462 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
463 (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
464 (60 << NVME_CAP_LO_REG_TO_SHIFT);
466 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
468 sc->regs.vs = 0x00010300; /* NVMe v1.3 */
473 sc->num_cqueues = sc->num_squeues = sc->max_queues;
474 if (sc->submit_queues != NULL) {
/* NOTE(review): loop variable is int but compared against the unsigned
 * num_squeues + 1 — fine for realistic queue counts. */
475 for (int i = 0; i < sc->num_squeues + 1; i++) {
477 * The Admin Submission Queue is at index 0.
478 * It must not be changed at reset otherwise the
479 * emulation will be out of sync with the guest.
482 sc->submit_queues[i].qbase = NULL;
483 sc->submit_queues[i].size = 0;
484 sc->submit_queues[i].cqid = 0;
486 sc->submit_queues[i].tail = 0;
487 sc->submit_queues[i].head = 0;
488 sc->submit_queues[i].busy = 0;
/* First reset: allocate zeroed queue arrays (+1 for the Admin queue). */
491 sc->submit_queues = calloc(sc->num_squeues + 1,
492 sizeof(struct nvme_submission_queue));
494 if (sc->compl_queues != NULL) {
495 for (int i = 0; i < sc->num_cqueues + 1; i++) {
496 /* See Admin Submission Queue note above */
498 sc->compl_queues[i].qbase = NULL;
499 sc->compl_queues[i].size = 0;
502 sc->compl_queues[i].tail = 0;
503 sc->compl_queues[i].head = 0;
/* NOTE(review): calloc results are not checked before use below. */
506 sc->compl_queues = calloc(sc->num_cqueues + 1,
507 sizeof(struct nvme_completion_queue));
509 for (int i = 0; i < sc->num_cqueues + 1; i++)
510 pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
/* Public reset entry point: serializes pci_nvme_reset_locked() under the
 * softc mutex. */
515 pci_nvme_reset(struct pci_nvme_softc *sc)
517 pthread_mutex_lock(&sc->mtx);
518 pci_nvme_reset_locked(sc);
519 pthread_mutex_unlock(&sc->mtx);
/*
 * Called when the guest enables the controller (CC.EN): map the Admin SQ
 * and CQ into host memory using the guest-programmed AQA/ASQ/ACQ registers.
 * AQA sizes are zero-based, hence the +1.
 * NOTE(review): vm_map_gpa() results are used unchecked; the asqs/acqs
 * declarations and braces are missing from this sampled view.
 */
523 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
527 DPRINTF(("%s\r\n", __func__));
529 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
530 sc->submit_queues[0].size = asqs;
531 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
532 sizeof(struct nvme_command) * asqs);
534 DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
535 __func__, sc->regs.asq, sc->submit_queues[0].qbase));
537 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
538 NVME_AQA_REG_ACQS_MASK) + 1;
539 sc->compl_queues[0].size = acqs;
540 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
541 sizeof(struct nvme_completion) * acqs);
542 DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
543 __func__, sc->regs.acq, sc->compl_queues[0].qbase));
/*
 * Copy 'len' bytes from 'src' into guest memory described by a PRP pair:
 * prp1 covers from its offset to the end of that page, prp2 covers the
 * remainder (so at most two pages; lengths above 8 KiB are rejected).
 * NOTE(review): the error-return statements, the len/src adjustment between
 * the two copies, and the PRP-list (>2 page) handling are missing from this
 * sampled view.
 */
547 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
553 if (len > (8 * 1024)) {
557 /* Copy from the start of prp1 to the end of the physical page */
558 bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
559 bytes = MIN(bytes, len);
561 dst = vm_map_gpa(ctx, prp1, bytes);
566 memcpy(dst, src, bytes);
/* Second page via prp2, capped at one page. */
575 len = MIN(len, PAGE_SIZE);
577 dst = vm_map_gpa(ctx, prp2, len);
582 memcpy(dst, src, len);
/*
 * Admin opcode: Delete I/O Submission Queue (CDW10[15:0] = qid).
 * Rejects qid 0 (Admin SQ) and out-of-range ids with Invalid Queue
 * Identifier; otherwise detaches the guest mapping by clearing qbase.
 * NOTE(review): the return statements/braces are missing from this view.
 */
588 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
589 struct nvme_completion* compl)
591 uint16_t qid = command->cdw10 & 0xffff;
593 DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
594 if (qid == 0 || qid > sc->num_squeues) {
595 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
596 __func__, qid, sc->num_squeues));
597 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
598 NVME_SC_INVALID_QUEUE_IDENTIFIER);
602 sc->submit_queues[qid].qbase = NULL;
603 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Admin opcode: Create I/O Submission Queue.  Only physically-contiguous
 * queues are supported (CDW11.PC); the queue base is mapped from prp1,
 * size from CDW10[31:16] (zero-based), target CQ from CDW11[31:16].
 * NOTE(review): no validation that nsq->size <= max_qentries or that cqid
 * refers to an existing CQ is visible here; vm_map_gpa() is unchecked.
 */
608 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
609 struct nvme_completion* compl)
611 if (command->cdw11 & NVME_CMD_CDW11_PC) {
612 uint16_t qid = command->cdw10 & 0xffff;
613 struct nvme_submission_queue *nsq;
615 if ((qid == 0) || (qid > sc->num_squeues)) {
616 WPRINTF(("%s queue index %u > num_squeues %u\r\n",
617 __func__, qid, sc->num_squeues));
618 pci_nvme_status_tc(&compl->status,
619 NVME_SCT_COMMAND_SPECIFIC,
620 NVME_SC_INVALID_QUEUE_IDENTIFIER);
624 nsq = &sc->submit_queues[qid];
625 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
627 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
628 sizeof(struct nvme_command) * (size_t)nsq->size);
629 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
/* CDW11[2:1] = queue priority (used with weighted round robin). */
630 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
632 DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
633 qid, nsq->size, nsq->qbase, nsq->cqid));
635 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
637 DPRINTF(("%s completed creating IOSQ qid %u\r\n",
641 * Guest sent non-cont submission queue request.
642 * This setting is unsupported by this emulation.
644 WPRINTF(("%s unsupported non-contig (list-based) "
645 "create i/o submission queue\r\n", __func__));
647 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
/*
 * Admin opcode: Delete I/O Completion Queue (CDW10[15:0] = qid).
 * Mirrors nvme_opc_delete_io_sq; clearing qbase detaches the mapping.
 * NOTE(review): no check that SQs still reference this CQ is visible here.
 */
653 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
654 struct nvme_completion* compl)
656 uint16_t qid = command->cdw10 & 0xffff;
658 DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
659 if (qid == 0 || qid > sc->num_cqueues) {
660 WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
661 __func__, qid, sc->num_cqueues));
662 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
663 NVME_SC_INVALID_QUEUE_IDENTIFIER);
667 sc->compl_queues[qid].qbase = NULL;
668 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Admin opcode: Create I/O Completion Queue.  Only physically-contiguous
 * queues are supported; interrupt enable comes from CDW11.IEN and the MSI-X
 * vector from CDW11[31:16].
 * NOTE(review): BUG — the vm_map_gpa() size uses sizeof(struct nvme_command)
 * (64 bytes) although CQ entries are struct nvme_completion (16 bytes);
 * this over-maps guest memory by 4x.  Cannot be fixed in place because
 * surrounding lines are missing from this sampled view.
 */
673 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
674 struct nvme_completion* compl)
676 if (command->cdw11 & NVME_CMD_CDW11_PC) {
677 uint16_t qid = command->cdw10 & 0xffff;
678 struct nvme_completion_queue *ncq;
680 if ((qid == 0) || (qid > sc->num_cqueues)) {
681 WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
682 __func__, qid, sc->num_cqueues));
683 pci_nvme_status_tc(&compl->status,
684 NVME_SCT_COMMAND_SPECIFIC,
685 NVME_SC_INVALID_QUEUE_IDENTIFIER);
689 ncq = &sc->compl_queues[qid];
/* IEN is bit 1, so the shift yields 0 or 1 (== NVME_CQ_INTEN). */
690 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
691 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
692 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
694 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
696 sizeof(struct nvme_command) * (size_t)ncq->size);
698 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
701 * Non-contig completion queue unsupported.
703 WPRINTF(("%s unsupported non-contig (list-based) "
704 "create i/o completion queue\r\n",
/* NOTE(review): 0x12 is a raw status code rather than a named constant. */
707 /* 0x12 = Invalid Use of Controller Memory Buffer */
708 pci_nvme_status_genc(&compl->status, 0x12);
/*
 * Admin opcode: Get Log Page.  Copies the requested (zeroed) log buffer to
 * the guest via the PRP pair.  CDW10[27:16] is NUMD (zero-based dword
 * count); NOTE(review): the size is computed as (NUMD+1)*2 here — NUMD is
 * in dwords, so *4 would be expected; verify against the NVMe spec.
 * NOTE(review): the switch statement line, break statements and closing
 * braces are missing from this sampled view, and logsize may exceed the
 * 8 KiB limit enforced by nvme_prp_memcpy (whose result is unchecked).
 */
715 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
716 struct nvme_completion* compl)
718 uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
719 uint8_t logpage = command->cdw10 & 0xFF;
721 DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
723 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
727 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
728 command->prp2, (uint8_t *)&sc->err_log, logsize);
730 case NVME_LOG_HEALTH_INFORMATION:
731 /* TODO: present some smart info */
732 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
733 command->prp2, (uint8_t *)&sc->health_log, logsize);
735 case NVME_LOG_FIRMWARE_SLOT:
736 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
737 command->prp2, (uint8_t *)&sc->fw_log, logsize);
740 WPRINTF(("%s get log page %x command not supported\r\n",
743 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
744 NVME_SC_INVALID_LOG_PAGE);
/*
 * Admin opcode: Identify.  CNS value in CDW10[7:0] selects the returned
 * structure: 0x00 namespace data, 0x01 controller data, 0x02 active NSID
 * list (only NSID 1 exists), 0x03 NSID descriptor list.
 * NOTE(review): break statements, the 0x03 body, and the nsid validation
 * branch structure are missing from this sampled view; nvme_prp_memcpy and
 * vm_map_gpa results are unchecked.
 */
751 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
752 struct nvme_completion* compl)
756 DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
757 command->cdw10 & 0xFF, command->nsid));
759 switch (command->cdw10 & 0xFF) {
760 case 0x00: /* return Identify Namespace data structure */
761 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
762 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
764 case 0x01: /* return Identify Controller data structure */
765 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
766 command->prp2, (uint8_t *)&sc->ctrldata,
767 sizeof(sc->ctrldata));
769 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
770 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
771 sizeof(uint32_t) * 1024);
/* Single namespace: list is {1, 0-terminated}. */
772 ((uint32_t *)dest)[0] = 1;
773 ((uint32_t *)dest)[1] = 0;
776 pci_nvme_status_genc(&compl->status,
777 NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
779 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
786 DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
787 __func__, command->cdw10 & 0xFF));
788 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
792 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Set Features / Number of Queues: negotiate the I/O SQ and CQ counts.
 * NSQR is CDW11[15:0], NCQR is CDW11[31:16], both zero-based; requests
 * above max_queues are clamped, and the granted (zero-based) counts are
 * returned in CDW0.  NOTE(review): the 0xFFFF-reject conditions preceding
 * each WPRINTF, the returns, and braces are missing from this sampled view.
 */
797 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
798 struct nvme_completion* compl)
800 uint16_t nqr; /* Number of Queues Requested */
802 nqr = command->cdw11 & 0xFFFF;
804 WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
805 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
809 sc->num_squeues = ONE_BASED(nqr);
810 if (sc->num_squeues > sc->max_queues) {
811 DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
813 sc->num_squeues = sc->max_queues;
816 nqr = (command->cdw11 >> 16) & 0xFFFF;
818 WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
819 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
823 sc->num_cqueues = ONE_BASED(nqr);
824 if (sc->num_cqueues > sc->max_queues) {
825 DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
827 sc->num_cqueues = sc->max_queues;
830 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
/*
 * Admin opcode: Set Features.  Most features are accepted and merely
 * logged; Number of Queues is delegated to nvme_set_feature_queues(),
 * interrupt coalescing and vector-config values are stored in the softc.
 * Unknown features return Invalid Field.  NOTE(review): the switch line,
 * break statements, 'iv' declaration and several closing braces are
 * missing from this sampled view.
 */
836 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
837 struct nvme_completion* compl)
839 int feature = command->cdw10 & 0xFF;
842 DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
846 case NVME_FEAT_ARBITRATION:
847 DPRINTF((" arbitration 0x%x\r\n", command->cdw11));
849 case NVME_FEAT_POWER_MANAGEMENT:
850 DPRINTF((" power management 0x%x\r\n", command->cdw11));
852 case NVME_FEAT_LBA_RANGE_TYPE:
853 DPRINTF((" lba range 0x%x\r\n", command->cdw11));
855 case NVME_FEAT_TEMPERATURE_THRESHOLD:
856 DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11));
858 case NVME_FEAT_ERROR_RECOVERY:
859 DPRINTF((" error recovery 0x%x\r\n", command->cdw11));
861 case NVME_FEAT_VOLATILE_WRITE_CACHE:
862 DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11));
864 case NVME_FEAT_NUMBER_OF_QUEUES:
865 nvme_set_feature_queues(sc, command, compl);
867 case NVME_FEAT_INTERRUPT_COALESCING:
868 DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11));
/* Aggregation time is in 100us units per the spec. */
871 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
873 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
875 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
876 iv = command->cdw11 & 0xFFFF;
878 DPRINTF((" interrupt vector configuration 0x%x\r\n",
/* Apply coalescing-disable bit (CDW11[16]) to every CQ on this vector. */
881 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
882 if (sc->compl_queues[i].intr_vec == iv) {
883 if (command->cdw11 & (1 << 16))
884 sc->compl_queues[i].intr_en |=
887 sc->compl_queues[i].intr_en &=
892 case NVME_FEAT_WRITE_ATOMICITY:
893 DPRINTF((" write atomicity 0x%x\r\n", command->cdw11));
895 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
896 DPRINTF((" async event configuration 0x%x\r\n",
898 sc->async_ev_config = command->cdw11;
900 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
901 DPRINTF((" software progress marker 0x%x\r\n",
905 DPRINTF((" autonomous power state transition 0x%x\r\n",
909 WPRINTF(("%s invalid feature\r\n", __func__));
910 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
914 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Admin opcode: Get Features.  Returns stored values in CDW0 where the
 * emulation tracks them (Number of Queues, temperature thresholds);
 * other recognized features are logged and return success with CDW0
 * presumably zeroed by the caller.  Unknown features → Invalid Field.
 * NOTE(review): Get Features here WRITES sc->async_ev_config from CDW11
 * (L515) — a get handler mutating state looks like a copy/paste from the
 * set handler; confirm against upstream.  Switch line, breaks and braces
 * are missing from this sampled view.
 */
919 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
920 struct nvme_completion* compl)
922 int feature = command->cdw10 & 0xFF;
924 DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
929 case NVME_FEAT_ARBITRATION:
930 DPRINTF((" arbitration\r\n"));
932 case NVME_FEAT_POWER_MANAGEMENT:
933 DPRINTF((" power management\r\n"));
935 case NVME_FEAT_LBA_RANGE_TYPE:
936 DPRINTF((" lba range\r\n"));
938 case NVME_FEAT_TEMPERATURE_THRESHOLD:
939 DPRINTF((" temperature threshold\r\n"));
/* CDW11[21:20] = THSEL: 0 = over-temp, 1 = under-temp threshold. */
940 switch ((command->cdw11 >> 20) & 0x3) {
942 /* Over temp threshold */
943 compl->cdw0 = 0xFFFF;
946 /* Under temp threshold */
950 WPRINTF((" invalid threshold type select\r\n"));
951 pci_nvme_status_genc(&compl->status,
952 NVME_SC_INVALID_FIELD);
956 case NVME_FEAT_ERROR_RECOVERY:
957 DPRINTF((" error recovery\r\n"));
959 case NVME_FEAT_VOLATILE_WRITE_CACHE:
960 DPRINTF((" volatile write cache\r\n"));
962 case NVME_FEAT_NUMBER_OF_QUEUES:
963 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
965 DPRINTF((" number of queues (submit %u, completion %u)\r\n",
966 compl->cdw0 & 0xFFFF,
967 (compl->cdw0 >> 16) & 0xFFFF));
970 case NVME_FEAT_INTERRUPT_COALESCING:
971 DPRINTF((" interrupt coalescing\r\n"));
973 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
974 DPRINTF((" interrupt vector configuration\r\n"));
976 case NVME_FEAT_WRITE_ATOMICITY:
977 DPRINTF((" write atomicity\r\n"));
979 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
980 DPRINTF((" async event configuration\r\n"));
981 sc->async_ev_config = command->cdw11;
983 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
984 DPRINTF((" software progress marker\r\n"));
987 DPRINTF((" autonomous power state transition\r\n"));
990 WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
991 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
995 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Admin opcode: Abort.  Currently a no-op that reports success; the
 * "command not aborted" result is presumably set in the missing lines
 * (CDW0 bit 0 = 1 per the NVMe spec).
 */
1000 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1001 struct nvme_completion* compl)
1003 DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
1004 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1006 /* TODO: search for the command ID and abort it */
1009 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
/*
 * Admin opcode: Asynchronous Event Request.  Events are not implemented,
 * so the request is rejected with AER Limit Exceeded rather than being
 * parked until an event occurs.
 */
1014 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1015 struct nvme_command* command, struct nvme_completion* compl)
1017 DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
1020 * TODO: raise events when they happen based on the Set Features cmd.
1021 * These events happen async, so only set completion successful if
1022 * there is an event reflective of the request to get event.
1024 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1025 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
/*
 * Admin SQ doorbell handler: drain commands from Admin SQ 0 up to the
 * guest-written tail, dispatch each opcode, post completions to Admin
 * CQ 0 with the phase bit toggled, then publish the new SQ head and raise
 * MSI-X vector 0 if any handler asked for an interrupt.  'busy' provides
 * mutual exclusion against concurrent doorbell writes.
 * NOTE(review): break statements, the early return on busy, do_intr/compl
 * initialization, sqhead/do_intr declarations, compl.cdw0 reset, and
 * several braces are missing from this sampled view.
 */
1030 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1032 struct nvme_completion compl;
1033 struct nvme_command *cmd;
1034 struct nvme_submission_queue *sq;
1035 struct nvme_completion_queue *cq;
1039 DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
1041 sq = &sc->submit_queues[0];
1043 sqhead = atomic_load_acq_short(&sq->head);
/* Another thread is already draining this SQ; it will see our doorbell. */
1045 if (atomic_testandset_int(&sq->busy, 1)) {
1046 DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
1047 __func__, sqhead, sq->tail));
1051 DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
1053 while (sqhead != atomic_load_acq_short(&sq->tail)) {
1054 cmd = &(sq->qbase)[sqhead];
1059 case NVME_OPC_DELETE_IO_SQ:
1060 DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
1061 do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
1063 case NVME_OPC_CREATE_IO_SQ:
1064 DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
1065 do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
1067 case NVME_OPC_DELETE_IO_CQ:
1068 DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
1069 do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
1071 case NVME_OPC_CREATE_IO_CQ:
1072 DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
1073 do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
1075 case NVME_OPC_GET_LOG_PAGE:
1076 DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
1077 do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
1079 case NVME_OPC_IDENTIFY:
1080 DPRINTF(("%s command IDENTIFY\r\n", __func__));
1081 do_intr |= nvme_opc_identify(sc, cmd, &compl);
1083 case NVME_OPC_ABORT:
1084 DPRINTF(("%s command ABORT\r\n", __func__));
1085 do_intr |= nvme_opc_abort(sc, cmd, &compl);
1087 case NVME_OPC_SET_FEATURES:
1088 DPRINTF(("%s command SET_FEATURES\r\n", __func__));
1089 do_intr |= nvme_opc_set_features(sc, cmd, &compl);
1091 case NVME_OPC_GET_FEATURES:
1092 DPRINTF(("%s command GET_FEATURES\r\n", __func__));
1093 do_intr |= nvme_opc_get_features(sc, cmd, &compl);
1095 case NVME_OPC_ASYNC_EVENT_REQUEST:
1096 DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
1097 /* XXX dont care, unhandled for now
1098 do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
/* Synthetic "no status": suppresses posting a completion entry. */
1100 compl.status = NVME_NO_STATUS;
1103 WPRINTF(("0x%x command is not implemented\r\n",
1105 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1109 if (NVME_COMPLETION_VALID(compl)) {
1110 struct nvme_completion *cp;
1113 cq = &sc->compl_queues[0];
1115 cp = &(cq->qbase)[cq->tail];
1116 cp->cdw0 = compl.cdw0;
/* Toggle phase relative to the stale entry so the guest sees it as new. */
1121 phase = NVME_STATUS_GET_P(cp->status);
1122 cp->status = compl.status;
1123 pci_nvme_toggle_phase(&cp->status, phase);
/* NOTE(review): no CQ-full check before advancing the tail. */
1125 cq->tail = (cq->tail + 1) % cq->size;
1127 sqhead = (sqhead + 1) % sq->size;
1130 DPRINTF(("setting sqhead %u\r\n", sqhead));
1131 atomic_store_short(&sq->head, sqhead);
1132 atomic_store_int(&sq->busy, 0);
/* Admin completions always use MSI-X vector 0. */
1135 pci_generate_msix(sc->nsc_pi, 0);
/*
 * Add one guest PRP segment to the pending blockif request, coalescing
 * segments that are physically contiguous with the previous one.  If the
 * iov array fills up (NVME_MAX_BLOCKIOVS), the accumulated partial request
 * is issued synchronously via pci_nvme_io_partial and this thread blocks
 * on req->cv until it completes before continuing.  For the RAM-backed
 * store, the data is copied directly instead.
 * NOTE(review): return type/statements, the do_write branches around
 * blockif_read/write, error handling for paddr_guest2host, and many braces
 * are missing from this sampled view.
 */
1140 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1141 uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1146 /* concatenate contig block-iovs to minimize number of iovs */
1147 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1148 iovidx = req->io_req.br_iovcnt - 1;
/* NOTE(review): the re-map uses prev_gpaddr with the NEW segment's 'size'
 * rather than the combined prev_size + size — confirm against upstream;
 * host contiguity of guest-contiguous pages is being assumed here. */
1150 req->io_req.br_iov[iovidx].iov_base =
1151 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1152 req->prev_gpaddr, size);
1154 req->prev_size += size;
1155 req->io_req.br_resid += size;
1157 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1159 pthread_mutex_lock(&req->mtx);
1161 iovidx = req->io_req.br_iovcnt;
1162 if (iovidx == NVME_MAX_BLOCKIOVS) {
1165 DPRINTF(("large I/O, doing partial req\r\n"));
1168 req->io_req.br_iovcnt = 0;
/* Partial completion signals req->cv instead of finishing the command. */
1170 req->io_req.br_callback = pci_nvme_io_partial;
1173 err = blockif_read(sc->nvstore.ctx,
1176 err = blockif_write(sc->nvstore.ctx,
1179 /* wait until req completes before cont */
1181 pthread_cond_wait(&req->cv, &req->mtx);
/* Restart the request state for the remaining segments. */
1184 req->io_req.br_offset = lba;
1185 req->io_req.br_resid = 0;
1186 req->io_req.br_param = req;
1189 req->io_req.br_iov[iovidx].iov_base =
1190 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1193 req->io_req.br_iov[iovidx].iov_len = size;
1195 req->prev_gpaddr = gpaddr;
1196 req->prev_size = size;
1197 req->io_req.br_resid += size;
1199 req->io_req.br_iovcnt++;
1201 pthread_mutex_unlock(&req->mtx);
1204 /* RAM buffer: read/write directly */
1205 void *p = sc->nvstore.ctx;
1208 if ((lba + size) > sc->nvstore.size) {
1209 WPRINTF(("%s write would overflow RAM\r\n", __func__));
1213 p = (void *)((uintptr_t)p + (uintptr_t)lba);
1214 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1216 memcpy(p, gptr, size);
1218 memcpy(gptr, p, size);
/*
 * Post a completion entry for command 'cid' from submission queue 'sqid'
 * onto that SQ's completion queue: fill cdw0/sqhd/status under cq->mtx,
 * toggle the phase bit, advance the CQ tail, and fire the CQ's MSI-X
 * vector if interrupts are enabled and either 'ignore_busy' is set or the
 * SQ is no longer being drained (the drain loop raises the interrupt
 * itself in that case).  NOTE(review): sqid/cid assignments into the
 * entry, the 'phase' declaration and some braces are missing from this
 * sampled view; there is no CQ-full check before advancing the tail.
 */
1224 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1225 struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1226 uint32_t cdw0, uint16_t status, int ignore_busy)
1228 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1229 struct nvme_completion *compl;
1233 DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1234 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1235 NVME_STATUS_GET_SC(status)));
1237 pthread_mutex_lock(&cq->mtx);
1239 assert(cq->qbase != NULL);
1241 compl = &cq->qbase[cq->tail];
/* Report current SQ head so the guest can reclaim SQ slots. */
1243 compl->sqhd = atomic_load_acq_short(&sq->head);
1248 phase = NVME_STATUS_GET_P(compl->status);
1249 compl->status = status;
1250 pci_nvme_toggle_phase(&compl->status, phase);
1252 cq->tail = (cq->tail + 1) % cq->size;
1254 if (cq->intr_en & NVME_CQ_INTEN)
1257 pthread_mutex_unlock(&cq->mtx);
1259 if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1261 pci_generate_msix(sc->nsc_pi, cq->intr_vec);
/*
 * Return an I/O request slot to the softc free list and post one slot of
 * the I/O semaphore (pairs with sem_wait in pci_nvme_get_ioreq).
 * NOTE(review): intermediate source lines are elided in this view.
 */
1265 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1268 req->nvme_sq = NULL;
/* Free list and pending_ios/csts updates are guarded by the softc mutex. */
1271 pthread_mutex_lock(&sc->mtx);
1273 req->next = sc->ioreqs_free;
1274 sc->ioreqs_free = req;
1277 /* when no more IO pending, can set to ready if device reset/enabled */
1278 if (sc->pending_ios == 0 &&
1279 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1280 sc->regs.csts |= NVME_CSTS_RDY;
1282 pthread_mutex_unlock(&sc->mtx);
/* Wake one thread blocked waiting for a free request slot. */
1284 sem_post(&sc->iosemlock);
/*
 * Take an I/O request slot off the free list, blocking on the I/O
 * semaphore until one is available, and reset its blockif request fields
 * for reuse.
 * NOTE(review): stray double semicolon on the declaration line below is
 * harmless but should be removed.  Lines 1299-1304 are elided in this view.
 */
1287 static struct pci_nvme_ioreq *
1288 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1290 struct pci_nvme_ioreq *req = NULL;;
/* Semaphore guarantees a free slot exists before we take the mutex. */
1292 sem_wait(&sc->iosemlock);
1293 pthread_mutex_lock(&sc->mtx);
1295 req = sc->ioreqs_free;
1296 assert(req != NULL);
1298 sc->ioreqs_free = req->next;
1305 pthread_mutex_unlock(&sc->mtx);
1307 req->io_req.br_iovcnt = 0;
1308 req->io_req.br_offset = 0;
1309 req->io_req.br_resid = 0;
/* br_param lets the blockif completion callback recover the owning req. */
1310 req->io_req.br_param = req;
1311 req->prev_gpaddr = 0;
/*
 * blockif completion callback for a full I/O request: map the blockif
 * error to an NVMe generic status, post the completion on the owning SQ's
 * CQ, and recycle the request slot.
 */
1318 pci_nvme_io_done(struct blockif_req *br, int err)
1320 struct pci_nvme_ioreq *req = br->br_param;
1321 struct nvme_submission_queue *sq = req->nvme_sq;
1322 uint16_t code, status;
1324 DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1326 /* TODO return correct error */
1327 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1328 pci_nvme_status_genc(&status, code);
1330 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1331 pci_nvme_release_ioreq(req->sc, req);
/*
 * blockif callback for a partial request of an oversized transfer: wake
 * the thread blocked on req->cv (see the pthread_cond_wait in
 * pci_nvme_append_iov_req) so it can continue building the next chunk.
 * NOTE(review): 'err' is logged but not propagated from this path.
 */
1335 pci_nvme_io_partial(struct blockif_req *br, int err)
1337 struct pci_nvme_ioreq *req = br->br_param;
1339 DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1341 pthread_cond_signal(&req->cv);
/*
 * Process all pending I/O commands on submission queue 'idx', from the
 * current head up to the guest-written tail.  Handles FLUSH and (stubbed)
 * WRITE ZEROES inline; READ/WRITE are translated from PRP1/PRP2 entries
 * into an iovec-based blockif request, or copied directly for RAM-backed
 * storage.
 * NOTE(review): many intermediate source lines are elided in this view
 * (loop setup, switch statements, error paths); comments below are limited
 * to what the visible statements establish.
 */
1346 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1348 struct nvme_submission_queue *sq;
1353 /* handle all submissions up to sq->tail index */
1354 sq = &sc->submit_queues[idx];
/* Single-consumer guard: bail if another thread is draining this SQ. */
1356 if (atomic_testandset_int(&sq->busy, 1)) {
1357 DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1361 sqhead = atomic_load_acq_short(&sq->head);
1363 DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1364 idx, sqhead, sq->tail, sq->qbase));
1366 while (sqhead != atomic_load_acq_short(&sq->tail)) {
1367 struct nvme_command *cmd;
1368 struct pci_nvme_ioreq *req = NULL;
1370 uint64_t nblocks, bytes, size, cpsz;
1372 /* TODO: support scatter gather list handling */
1374 cmd = &sq->qbase[sqhead];
1375 sqhead = (sqhead + 1) % sq->size;
/* Starting LBA: CDW11 (high 32) | CDW10 (low 32). */
1377 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1379 if (cmd->opc == NVME_OPC_FLUSH) {
1380 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1381 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
/* 0x08 = Write Zeroes; acknowledged as success without zeroing (stub). */
1385 } else if (cmd->opc == 0x08) {
1386 /* TODO: write zeroes */
1387 WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1388 __func__, lba, cmd->cdw12 & 0xFFFF));
1389 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1390 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
/* NLB in CDW12 is zero-based, hence the +1. */
1396 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1398 bytes = nblocks * sc->nvstore.sectsz;
1400 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1401 req = pci_nvme_get_ioreq(sc);
1407 * If data starts mid-page and flows into the next page, then
1408 * increase page count
1411 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1413 sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1414 cmd->opc == NVME_OPC_WRITE ?
1416 lba, nblocks, bytes));
/* PRP entries: low 2 bits are reserved; mask them off. */
1418 cmd->prp1 &= ~(0x03UL);
1419 cmd->prp2 &= ~(0x03UL);
1421 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
/* Convert LBA to a byte offset for the backing store. */
1424 lba *= sc->nvstore.sectsz;
/* First chunk runs from prp1 to the end of its page. */
1426 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1432 req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1434 req->opc = cmd->opc;
1435 req->cid = cmd->cid;
1436 req->nsid = cmd->nsid;
1439 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1440 cmd->opc == NVME_OPC_WRITE, lba);
1447 if (size <= PAGE_SIZE) {
1448 /* prp2 is second (and final) page in transfer */
1450 err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1452 cmd->opc == NVME_OPC_WRITE,
1458 /* prp2 is pointer to a physical region page list */
1459 prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1460 cmd->prp2, PAGE_SIZE);
1464 cpsz = MIN(size, PAGE_SIZE);
1467 * Move to linked physical region page list
1470 if (i == (NVME_PRP2_ITEMS-1) &&
/* Chained PRP list pages must be page-aligned. */
1472 assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1473 prp_list = paddr_guest2host(
1474 sc->nsc_pi->pi_vmctx,
1475 prp_list[i], PAGE_SIZE);
1478 if (prp_list[i] == 0) {
1479 WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1484 err = pci_nvme_append_iov_req(sc, req,
1486 cmd->opc == NVME_OPC_WRITE, lba);
/* RAM-backed storage completes synchronously; report status now. */
1497 if (sc->nvstore.type == NVME_STOR_RAM) {
1498 uint16_t code, status;
1500 code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1502 pci_nvme_status_genc(&status, code);
1504 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1514 req->io_req.br_callback = pci_nvme_io_done;
1519 err = blockif_read(sc->nvstore.ctx, &req->io_req);
1521 case NVME_OPC_WRITE:
1522 err = blockif_write(sc->nvstore.ctx, &req->io_req);
1525 WPRINTF(("%s unhandled io command 0x%x\r\n",
1526 __func__, cmd->opc));
/* blockif submission failed: complete with error and recycle the slot. */
1534 pci_nvme_status_genc(&status,
1535 NVME_SC_DATA_TRANSFER_ERROR);
1537 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1539 pci_nvme_release_ioreq(sc, req);
/* Publish the consumed head and clear the busy flag taken above. */
1543 atomic_store_short(&sq->head, sqhead);
1544 atomic_store_int(&sq->busy, 0);
/*
 * Dispatch a guest doorbell write: for an SQ doorbell, record the new tail
 * and kick admin (queue 0) or I/O command processing; for a CQ doorbell,
 * record the new head.
 * NOTE(review): the bounds checks use '>' against num_squeues/num_cqueues —
 * whether index == num_*queues is valid depends on whether those counts are
 * zero-based; confirm against the queue-creation code (not visible here).
 * Intermediate source lines are elided in this view.
 */
1548 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1549 uint64_t idx, int is_sq, uint64_t value)
1551 DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1552 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1555 atomic_store_short(&sc->submit_queues[idx].tail,
/* Queue 0 is the admin queue. */
1559 pci_nvme_handle_admin_cmd(sc, value);
1561 /* submission queue; handle new entries in SQ */
1562 if (idx > sc->num_squeues) {
1563 WPRINTF(("%s SQ index %lu overflow from "
1564 "guest (max %u)\r\n",
1565 __func__, idx, sc->num_squeues));
1568 pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1571 if (idx > sc->num_cqueues) {
1572 WPRINTF(("%s queue index %lu overflow from "
1573 "guest (max %u)\r\n",
1574 __func__, idx, sc->num_cqueues));
/* CQ doorbell: guest reports how far it has consumed completions. */
1578 sc->compl_queues[idx].head = (uint16_t)value;
/*
 * Debug helper: print the symbolic name of the BAR0 controller register
 * being accessed at 'offset', tagged with the calling function and the
 * access direction.  Pure logging; no device state is touched.
 */
1583 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1585 const char *s = iswrite ? "WRITE" : "READ";
1588 case NVME_CR_CAP_LOW:
1589 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1591 case NVME_CR_CAP_HI:
1592 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1595 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1598 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1601 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1604 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1607 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1610 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1613 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1615 case NVME_CR_ASQ_LOW:
1616 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1618 case NVME_CR_ASQ_HI:
1619 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1621 case NVME_CR_ACQ_LOW:
1622 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1624 case NVME_CR_ACQ_HI:
1625 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1628 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
/*
 * Handle a guest write to BAR0.  Offsets at or beyond the doorbell region
 * are decoded into (queue index, SQ/CQ) and routed to the doorbell handler;
 * lower offsets are controller register writes (CC, AQA, ASQ, ACQ, ...)
 * applied under the softc mutex.
 * NOTE(review): intermediate source lines are elided in this view (several
 * case labels and returns are missing); comments cover visible code only.
 */
1634 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1635 uint64_t offset, int size, uint64_t value)
1639 if (offset >= NVME_DOORBELL_OFFSET) {
1640 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
/* Each queue pair has a 4-byte SQ tail and 4-byte CQ head doorbell. */
1641 uint64_t idx = belloffset / 8; /* door bell size = 2*int */
1642 int is_sq = (belloffset % 8) < 4;
1644 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1645 WPRINTF(("guest attempted an overflow write offset "
1646 "0x%lx, val 0x%lx in %s",
1647 offset, value, __func__));
1651 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1655 DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1656 offset, size, value));
1659 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1660 "val 0x%lx) to bar0 in %s",
1661 size, offset, value, __func__));
1662 /* TODO: shutdown device */
1666 pci_nvme_bar0_reg_dumps(__func__, offset, 1);
/* Register writes below are serialized by the softc mutex. */
1668 pthread_mutex_lock(&sc->mtx);
/* CAP is read-only; writes are ignored. */
1671 case NVME_CR_CAP_LOW:
1672 case NVME_CR_CAP_HI:
1679 /* MSI-X, so ignore */
1682 /* MSI-X, so ignore */
1685 ccreg = (uint32_t)value;
1687 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1690 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1691 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1692 NVME_CC_GET_IOCQES(ccreg)));
1694 if (NVME_CC_GET_SHN(ccreg)) {
1695 /* perform shutdown - flush out data to backend */
1696 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1697 NVME_CSTS_REG_SHST_SHIFT);
1698 sc->regs.csts |= NVME_SHST_COMPLETE <<
1699 NVME_CSTS_REG_SHST_SHIFT;
/* EN edge: 1->0 resets the controller, 0->1 (re)initializes it. */
1701 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1702 if (NVME_CC_GET_EN(ccreg) == 0)
1703 /* transition 1-> causes controller reset */
1704 pci_nvme_reset_locked(sc);
1706 pci_nvme_init_controller(ctx, sc);
1709 /* Insert the iocqes, iosqes and en bits from the write */
1710 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1711 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1712 if (NVME_CC_GET_EN(ccreg) == 0) {
1713 /* Insert the ams, mps and css bit fields */
1714 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1715 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1716 sc->regs.csts &= ~NVME_CSTS_RDY;
1717 } else if (sc->pending_ios == 0) {
1718 sc->regs.csts |= NVME_CSTS_RDY;
1724 /* ignore writes; don't support subsystem reset */
1727 sc->regs.aqa = (uint32_t)value;
/* ASQ/ACQ: merge 32-bit halves; low half is 4K-aligned (bits 11:0 zero). */
1729 case NVME_CR_ASQ_LOW:
1730 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1731 (0xFFFFF000 & value);
1733 case NVME_CR_ASQ_HI:
1734 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1737 case NVME_CR_ACQ_LOW:
1738 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1739 (0xFFFFF000 & value);
1741 case NVME_CR_ACQ_HI:
1742 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1746 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1747 __func__, offset, value, size));
1749 pthread_mutex_unlock(&sc->mtx);
/*
 * pci_emul BAR-write entry point: route MSI-X table/PBA writes to the
 * MSI-X emulation, BAR0 writes to pci_nvme_write_bar_0, and log anything
 * else.  Elided lines (the baridx switch skeleton) are missing here.
 */
1753 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1754 int baridx, uint64_t offset, int size, uint64_t value)
1756 struct pci_nvme_softc* sc = pi->pi_arg;
1758 if (baridx == pci_msix_table_bar(pi) ||
1759 baridx == pci_msix_pba_bar(pi)) {
1760 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1761 " value 0x%lx\r\n", baridx, offset, size, value));
1763 pci_emul_msix_twrite(pi, offset, size, value);
1769 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1773 DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1774 __func__, baridx, value));
/*
 * Handle a guest read of BAR0 below the doorbell region by copying 'size'
 * bytes straight out of the shadow register file (sc->regs) under the
 * softc mutex.  Doorbell-region reads are flagged as invalid.
 * NOTE(review): lines between 1792 and 1803 are elided; the mask to
 * 32 bits presumably applies to sub-8-byte reads — confirm in full source.
 */
1778 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1779 uint64_t offset, int size)
1783 pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1785 if (offset < NVME_DOORBELL_OFFSET) {
1786 void *p = &(sc->regs);
1787 pthread_mutex_lock(&sc->mtx);
/* Registers are mirrored in sc->regs, so a raw byte copy suffices. */
1788 memcpy(&value, (void *)((uintptr_t)p + offset), size);
1789 pthread_mutex_unlock(&sc->mtx);
1792 WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
1803 value &= 0xFFFFFFFF;
1807 DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1808 offset, size, (uint32_t)value));
/*
 * pci_emul BAR-read entry point: route MSI-X table/PBA reads to the MSI-X
 * emulation and BAR0 reads to pci_nvme_read_bar_0; log unknown BARs.
 * Elided lines (the baridx switch skeleton) are missing here.
 */
1816 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1817 uint64_t offset, int size)
1819 struct pci_nvme_softc* sc = pi->pi_arg;
1821 if (baridx == pci_msix_table_bar(pi) ||
1822 baridx == pci_msix_pba_bar(pi)) {
1823 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1824 baridx, offset, size));
1826 return pci_emul_msix_tread(pi, offset, size);
1831 return pci_nvme_read_bar_0(sc, offset, size);
1834 DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
/*
 * Parse the comma-separated device options (maxq, qsz, ioslots, sectsz,
 * ser, ram, eui64, or a bare backing-file path as the first option — see
 * the usage block at the top of the file), populating sc with defaults
 * first and validating the results.
 * NOTE(review): strtok() is non-reentrant; fine if option parsing is
 * single-threaded at init, but strtok_r() would be safer.  Also the RAM
 * path sets sectsz before the calloc NULL check — harmless, but the error
 * path ordering is worth confirming in the full source.  Intermediate
 * lines are elided in this view.
 */
1842 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1844 char bident[sizeof("XX:X:X")];
1845 char *uopt, *xopts, *config;
1849 sc->max_queues = NVME_QUEUES;
1850 sc->max_qentries = NVME_MAX_QENTRIES;
1851 sc->ioslots = NVME_IOSLOTS;
1852 sc->num_squeues = sc->max_queues;
1853 sc->num_cqueues = sc->max_queues;
/* Tokenize a private copy so the caller's opts string is untouched. */
1856 uopt = strdup(opts);
/* Default serial number derived from the PCI slot/function. */
1858 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1859 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1860 for (xopts = strtok(uopt, ",");
1862 xopts = strtok(NULL, ",")) {
/* Split "key=value" in place; config points at the value. */
1864 if ((config = strchr(xopts, '=')) != NULL)
1867 if (!strcmp("maxq", xopts)) {
1868 sc->max_queues = atoi(config);
1869 } else if (!strcmp("qsz", xopts)) {
1870 sc->max_qentries = atoi(config);
1871 } else if (!strcmp("ioslots", xopts)) {
1872 sc->ioslots = atoi(config);
1873 } else if (!strcmp("sectsz", xopts)) {
1874 sectsz = atoi(config);
1875 } else if (!strcmp("ser", xopts)) {
1877 * This field indicates the Product Serial Number in
1878 * 7-bit ASCII, unused bytes should be space characters.
1881 cpywithpad((char *)sc->ctrldata.sn,
1882 sizeof(sc->ctrldata.sn), config, ' ');
1883 } else if (!strcmp("ram", xopts)) {
/* "ram=<MiB>": size parsed from past the "ram=" prefix. */
1884 uint64_t sz = strtoull(&xopts[4], NULL, 10);
1886 sc->nvstore.type = NVME_STOR_RAM;
1887 sc->nvstore.size = sz * 1024 * 1024;
1888 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1889 sc->nvstore.sectsz = 4096;
1890 sc->nvstore.sectsz_bits = 12;
1891 if (sc->nvstore.ctx == NULL) {
1892 perror("Unable to allocate RAM");
1896 } else if (!strcmp("eui64", xopts)) {
1897 sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
/* First option with no recognized key is the blockif backing path. */
1898 } else if (optidx == 0) {
1899 snprintf(bident, sizeof(bident), "%d:%d",
1900 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1901 sc->nvstore.ctx = blockif_open(xopts, bident);
1902 if (sc->nvstore.ctx == NULL) {
1903 perror("Could not open backing file");
1907 sc->nvstore.type = NVME_STOR_BLOCKIF;
1908 sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1910 fprintf(stderr, "Invalid option %s\n", xopts);
1919 if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1920 fprintf(stderr, "backing store not specified\n");
/* Only 512/4096/8192 are accepted; else inherit from blockif. */
1923 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1924 sc->nvstore.sectsz = sectsz;
1925 else if (sc->nvstore.type != NVME_STOR_RAM)
1926 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
/* Derive log2(sectsz), starting from the 512-byte minimum. */
1927 for (sc->nvstore.sectsz_bits = 9;
1928 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1929 sc->nvstore.sectsz_bits++);
1931 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1932 sc->max_queues = NVME_QUEUES;
1934 if (sc->max_qentries <= 0) {
1935 fprintf(stderr, "Invalid qsz option\n");
1938 if (sc->ioslots <= 0) {
1939 fprintf(stderr, "Invalid ioslots option\n");
/*
 * Device init entry point (struct pci_devemu .pe_init): parse options,
 * build the ioreq free list, set PCI config space IDs, allocate the BAR0
 * memory window (registers + doorbells, min 16K), add MSI-X and PCIe
 * capabilities, and initialize controller/namespace/log-page data.
 * NOTE(review): intermediate source lines (error-path returns, reset call)
 * are elided in this view.
 */
1947 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1949 struct pci_nvme_softc *sc;
1950 uint32_t pci_membar_sz;
1955 sc = calloc(1, sizeof(struct pci_nvme_softc));
1959 error = pci_nvme_parse_opts(sc, opts);
/* Build a singly-linked free list over the ioreq slot array. */
1965 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1966 for (int i = 0; i < sc->ioslots; i++) {
1967 if (i < (sc->ioslots-1))
1968 sc->ioreqs[i].next = &sc->ioreqs[i+1];
1969 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1970 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1972 sc->ioreqs_free = sc->ioreqs;
1973 sc->intr_coales_aggr_thresh = 1;
1975 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1976 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1977 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1978 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1979 pci_set_cfgdata8(pi, PCIR_PROGIF,
1980 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1983 * Allocate size of NVMe registers + doorbell space for all queues.
1985 * The specification requires a minimum memory I/O window size of 16K.
1986 * The Windows driver will refuse to start a device with a smaller
1989 pci_membar_sz = sizeof(struct nvme_registers) +
1990 2 * sizeof(uint32_t) * (sc->max_queues + 1);
1991 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
1993 DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1995 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1997 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
/* One MSI-X vector per queue pair plus one for the admin queue. */
2001 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2003 WPRINTF(("%s pci add msixcap failed\r\n", __func__));
2007 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2009 WPRINTF(("%s pci add Express capability failed\r\n", __func__));
2013 pthread_mutex_init(&sc->mtx, NULL);
/* Semaphore counts free ioreq slots; gates pci_nvme_get_ioreq(). */
2014 sem_init(&sc->iosemlock, 0, sc->ioslots);
2017 pci_nvme_init_ctrldata(sc);
2018 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
2019 pci_nvme_init_logpages(sc);
2021 pci_lintr_request(pi);
/*
 * Registration with the bhyve PCI emulation framework: wires the NVMe
 * init and BAR read/write handlers into pci_emul's device table.
 */
2028 struct pci_devemu pci_de_nvme = {
2030 .pe_init = pci_nvme_init,
2031 .pe_barwrite = pci_nvme_write,
2032 .pe_barread = pci_nvme_read
2034 PCI_EMUL_SET(pci_de_nvme);