 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * bhyve PCIe-NVMe device emulation.
 *
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
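 *
 * Example invocation (hypothetical slot, backing path and option values):
 *  -s 4,nvme,/path/to/backing.img,maxq=4,qsz=256,ioslots=16,sectsz=512,ser=NVME0001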
 *    - create async event for SMART and log
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <net/ieee_oui.h>

#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>
static int nvme_debug = 0;
/* wrap in do/while so a trailing semicolon is safe inside if/else */
#define	DPRINTF(params) do { if (nvme_debug) printf params; } while (0)
#define	WPRINTF(params) printf params
/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MAX_BLOCKIOVS	512

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)
/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
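
/*
 * Illustrative encoding: with num_squeues == 4 and num_cqueues == 4, the
 * value is (3 & 0xffff) | ((3 & 0xffff) << 16) == 0x00030003.
 */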
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02
struct nvme_completion_queue {
	struct nvme_completion *qbase;
	uint32_t	size;
	uint16_t	tail;	/* nvme progress */
	uint16_t	head;	/* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
	pthread_mutex_t	mtx;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	uint32_t	size;
	uint16_t	head;	/* nvme progress */
	uint16_t	tail;	/* guest progress */
	uint16_t	cqid;	/* completion queue id */
	int		busy;	/* queue is being processed */
	int		qpriority;
};
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM,
};
struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
};

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	struct pci_nvme_ioreq *next;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;

	/*
	 * lock if all iovs consumed (big IO);
	 * complete transaction before continuing
	 */
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;

	struct blockif_req io_req;

	/* pad to fit up to 512 page descriptors from guest IO request */
	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;

	struct pci_nvme_ioreq *ioreqs;
	struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	/* controller features */
	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
	uint32_t	async_ev_config;         /* 0x0B: async event config */
};
static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}
static void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{
	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}
static void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{
	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
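
/*
 * e.g. pci_nvme_status_genc(&status, NVME_SC_SUCCESS) encodes SCT 0x0 and
 * SC 0x0 while leaving the phase bit of *status untouched.
 */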
static void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{
	if (prev)
		*status &= ~NVME_STATUS_P;
	else
		*status |= NVME_STATUS_P;
}
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */

	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    uint64_t eui64)
{
	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;

	/* Get LBA and backstore information from backing store */
	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */

	/* Create an EUI-64 if user did not provide one */
	if (eui64 == 0) {
		char *data = NULL;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
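
/*
 * Worked example: a 4096-byte-sector backing store has sectsz_bits == 12,
 * so LBAF[0].LBADS == 12 and the LBA data size is 2^12 == 4096 bytes.
 */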
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	DPRINTF(("%s\r\n", __func__));

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	sc->num_cqueues = sc->num_squeues = sc->max_queues;
	if (sc->submit_queues != NULL) {
		for (int i = 0; i < sc->num_squeues + 1; i++) {
			/*
			 * The Admin Submission Queue is at index 0.
			 * It must not be changed at reset otherwise the
			 * emulation will be out of sync with the guest.
			 */
			if (i != 0) {
				sc->submit_queues[i].qbase = NULL;
				sc->submit_queues[i].size = 0;
				sc->submit_queues[i].cqid = 0;
			}
			sc->submit_queues[i].tail = 0;
			sc->submit_queues[i].head = 0;
			sc->submit_queues[i].busy = 0;
		}
	} else
		sc->submit_queues = calloc(sc->num_squeues + 1,
		    sizeof(struct nvme_submission_queue));

	if (sc->compl_queues != NULL) {
		for (int i = 0; i < sc->num_cqueues + 1; i++) {
			/* See Admin Submission Queue note above */
			if (i != 0) {
				sc->compl_queues[i].qbase = NULL;
				sc->compl_queues[i].size = 0;
			}
			sc->compl_queues[i].tail = 0;
			sc->compl_queues[i].head = 0;
		}
	} else {
		sc->compl_queues = calloc(sc->num_cqueues + 1,
		    sizeof(struct nvme_completion_queue));

		for (int i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
	}
}
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF(("%s\r\n", __func__));

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase));

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
    size_t len)
{
	uint8_t *dst;
	size_t bytes;

	if (len > (8 * 1024))
		return (-1);

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);
	dst = vm_map_gpa(ctx, prp1, bytes);
	memcpy(dst, src, bytes);
	src += bytes;
	len -= bytes;
	len = MIN(len, PAGE_SIZE);
	dst = vm_map_gpa(ctx, prp2, len);
	memcpy(dst, src, len);
	return (0);
}
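
/*
 * Worked example (assuming 4KB pages): a 6KB copy whose prp1 points 1KB into
 * a page moves 3KB up to the page boundary from prp1, then the remaining 3KB
 * from prp2; transfers of at most two pages need no PRP list.
 */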
static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
	if (qid == 0 || qid > sc->num_squeues) {
		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
		    __func__, qid, sc->num_squeues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues)) {
			WPRINTF(("%s queue index %u > num_squeues %u\r\n",
			    __func__, qid, sc->num_squeues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);
		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid));

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF(("%s completed creating IOSQ qid %u\r\n",
		    __func__, qid));
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF(("%s unsupported non-contig (list-based) "
		    "create i/o submission queue\r\n", __func__));

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}
static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
	if (qid == 0 || qid > sc->num_cqueues) {
		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
		    __func__, qid, sc->num_cqueues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_completion_queue *ncq;

		if ((qid == 0) || (qid > sc->num_cqueues)) {
			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
			    __func__, qid, sc->num_cqueues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		ncq = &sc->compl_queues[qid];
		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
		    command->prp1,
		    sizeof(struct nvme_completion) * (size_t)ncq->size);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		/*
		 * Non-contig completion queue unsupported.
		 */
		WPRINTF(("%s unsupported non-contig (list-based) "
		    "create i/o completion queue\r\n",
		    __func__));

		/* 0x12 = Invalid Use of Controller Memory Buffer */
		pci_nvme_status_genc(&compl->status, 0x12);
	}
	return (1);
}
static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	/* NUMD is a zero-based count of dwords, i.e. 4-byte units */
	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4;
	uint8_t logpage = command->cdw10 & 0xFF;

	DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log, logsize);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		/* TODO: present some smart info */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log, logsize);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log, logsize);
		break;
	default:
		WPRINTF(("%s get log page %x command not supported\r\n",
		    __func__, logpage));

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;

	DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
	    command->cdw10 & 0xFF, command->nsid));

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata));
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		((uint32_t *)dest)[1] = 0;
		break;
	case 0x11:
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
		return (1);
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
	default:
		DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
		    __func__, command->cdw10 & 0xFF));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (-1);
	}
	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
		    sc->max_queues));
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (-1);
	}
	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
		    sc->max_queues));
		sc->num_cqueues = sc->max_queues;
	}

	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	return (0);
}
static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	int feature = command->cdw10 & 0xFF;
	uint32_t iv;

	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF((" arbitration 0x%x\r\n", command->cdw11));
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF((" power management 0x%x\r\n", command->cdw11));
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF((" lba range 0x%x\r\n", command->cdw11));
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11));
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF((" error recovery 0x%x\r\n", command->cdw11));
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11));
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		nvme_set_feature_queues(sc, command, compl);
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11));

		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		iv = command->cdw11 & 0xFFFF;

		DPRINTF((" interrupt vector configuration 0x%x\r\n",
		    command->cdw11));

		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
			if (sc->compl_queues[i].intr_vec == iv) {
				if (command->cdw11 & (1 << 16))
					sc->compl_queues[i].intr_en |=
					    NVME_CQ_INTCOAL;
				else
					sc->compl_queues[i].intr_en &=
					    ~NVME_CQ_INTCOAL;
			}
		}
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF((" write atomicity 0x%x\r\n", command->cdw11));
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF((" async event configuration 0x%x\r\n",
		    command->cdw11));
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF((" software progress marker 0x%x\r\n",
		    command->cdw11));
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF((" autonomous power state transition 0x%x\r\n",
		    command->cdw11));
		break;
	default:
		WPRINTF(("%s invalid feature\r\n", __func__));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	int feature = command->cdw10 & 0xFF;

	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));

	compl->cdw0 = 0;

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF((" arbitration\r\n"));
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF((" power management\r\n"));
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF((" lba range\r\n"));
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF((" temperature threshold\r\n"));
		switch ((command->cdw11 >> 20) & 0x3) {
		case 0:
			/* Over temp threshold */
			compl->cdw0 = 0xFFFF;
			break;
		case 1:
			/* Under temp threshold */
			compl->cdw0 = 0;
			break;
		default:
			WPRINTF((" invalid threshold type select\r\n"));
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			return (1);
		}
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF((" error recovery\r\n"));
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF((" volatile write cache\r\n"));
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

		DPRINTF((" number of queues (submit %u, completion %u)\r\n",
		    compl->cdw0 & 0xFFFF,
		    (compl->cdw0 >> 16) & 0xFFFF));
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF((" interrupt coalescing\r\n"));
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		DPRINTF((" interrupt vector configuration\r\n"));
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF((" write atomicity\r\n"));
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF((" async event configuration\r\n"));
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF((" software progress marker\r\n"));
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		DPRINTF((" autonomous power state transition\r\n"));
		break;
	default:
		WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));

	/*
	 * TODO: raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
	return (0);
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	int do_intr = 0;
	uint16_t sqhead;

	DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));

	sq = &sc->submit_queues[0];

	sqhead = atomic_load_acq_short(&sq->head);

	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
		    __func__, sqhead, sq->tail));
		return;
	}

	DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF(("%s command IDENTIFY\r\n", __func__));
			do_intr |= nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF(("%s command ABORT\r\n", __func__));
			do_intr |= nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF(("%s command SET_FEATURES\r\n", __func__));
			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF(("%s command GET_FEATURES\r\n", __func__));
			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
			/* XXX don't care, unhandled for now
			do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
			*/
			compl.status = NVME_NO_STATUS;
			break;
		default:
			WPRINTF(("0x%x command is not implemented\r\n",
			    cmd->opc));
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}

		if (NVME_COMPLETION_VALID(compl)) {
			struct nvme_completion *cp;
			int phase;

			cq = &sc->compl_queues[0];

			cp = &(cq->qbase)[cq->tail];
			cp->cdw0 = compl.cdw0;
			cp->sqid = 0;
			cp->sqhd = sqhead;
			cp->cid = cmd->cid;

			phase = NVME_STATUS_GET_P(cp->status);
			cp->status = compl.status;
			pci_nvme_toggle_phase(&cp->status, phase);
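
			/*
			 * Note: flipping P relative to the slot's previous
			 * value inverts the phase once per pass through the
			 * queue, which is how the guest distinguishes new
			 * completions from stale entries after a wrap.
			 */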
			cq->tail = (cq->tail + 1) % cq->size;
		}
		sqhead = (sqhead + 1) % sq->size;
	}

	DPRINTF(("setting sqhead %u\r\n", sqhead));
	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);

	if (do_intr)
		pci_generate_msix(sc->nsc_pi, 0);
}
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req != NULL) {
		/* concatenate contig block-iovs to minimize number of iovs */
		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
			iovidx = req->io_req.br_iovcnt - 1;

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			        req->prev_gpaddr, size);

			req->prev_size += size;
			req->io_req.br_resid += size;

			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
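
			/*
			 * e.g. two PRP entries covering physically adjacent
			 * guest pages collapse into a single iov whose length
			 * grows by PAGE_SIZE, instead of consuming two slots
			 */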
		} else {
			pthread_mutex_lock(&req->mtx);

			iovidx = req->io_req.br_iovcnt;
			if (iovidx == NVME_MAX_BLOCKIOVS) {
				int err = 0;

				DPRINTF(("large I/O, doing partial req\r\n"));

				iovidx = 0;
				req->io_req.br_iovcnt = 0;

				req->io_req.br_callback = pci_nvme_io_partial;

				if (!do_write)
					err = blockif_read(sc->nvstore.ctx,
					    &req->io_req);
				else
					err = blockif_write(sc->nvstore.ctx,
					    &req->io_req);

				/* wait until req completes before cont */
				if (err == 0)
					pthread_cond_wait(&req->cv, &req->mtx);

				req->io_req.br_offset = lba;
				req->io_req.br_resid = 0;
				req->io_req.br_param = req;
			}

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			        gpaddr, size);

			req->io_req.br_iov[iovidx].iov_len = size;

			req->prev_gpaddr = gpaddr;
			req->prev_size = size;
			req->io_req.br_resid += size;

			req->io_req.br_iovcnt++;

			pthread_mutex_unlock(&req->mtx);
		}
	} else {
		/* RAM buffer: read/write directly */
		void *p = sc->nvstore.ctx;
		void *gptr;

		if ((lba + size) > sc->nvstore.size) {
			WPRINTF(("%s write would overflow RAM\r\n", __func__));
			return (-1);
		}

		p = (void *)((uintptr_t)p + (uintptr_t)lba);
		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
		if (do_write)
			memcpy(p, gptr, size);
		else
			memcpy(gptr, p, size);
	}
	return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status, int ignore_busy)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
	struct nvme_completion *compl;
	int do_intr = 0;
	int phase;

	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status)));

	pthread_mutex_lock(&cq->mtx);

	assert(cq->qbase != NULL);

	compl = &cq->qbase[cq->tail];

	compl->sqhd = atomic_load_acq_short(&sq->head);
	compl->sqid = sqid;
	compl->cid = cid;

	phase = NVME_STATUS_GET_P(compl->status);
	compl->status = status;
	pci_nvme_toggle_phase(&compl->status, phase);

	cq->tail = (cq->tail + 1) % cq->size;

	if (cq->intr_en & NVME_CQ_INTEN)
		do_intr = 1;

	pthread_mutex_unlock(&cq->mtx);

	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
		if (do_intr)
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
}
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	req->next = sc->ioreqs_free;
	sc->ioreqs_free = req;
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = sc->ioreqs_free;
	assert(req != NULL);

	sc->ioreqs_free = req->next;
	req->next = NULL;
	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
	pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;

	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

	pthread_cond_signal(&req->cv);
}
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;
	int err;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
		return;
	}

	sqhead = atomic_load_acq_short(&sq->head);

	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
	    idx, sqhead, sq->tail, sq->qbase));

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req = NULL;
		uint64_t lba;
		uint64_t nblocks, bytes, size, cpsz;

		/* TODO: support scatter gather list handling */

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;

		if (cmd->opc == NVME_OPC_FLUSH) {
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		} else if (cmd->opc == 0x08) {
			/* TODO: write zeroes */
			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
			    __func__, lba, cmd->cdw12 & 0xFFFF));
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		}

		nblocks = (cmd->cdw12 & 0xFFFF) + 1;

		bytes = nblocks * sc->nvstore.sectsz;

		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
			req = pci_nvme_get_ioreq(sc);
			req->nvme_sq = sq;
			req->sqid = idx;
		}

		/*
		 * If data starts mid-page and flows into the next page, then
		 * increase page count
		 */

		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
		    "(%lu-bytes)\r\n",
		    sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
		    cmd->opc == NVME_OPC_WRITE ?
			"WRITE" : "READ",
		    lba, nblocks, bytes));

		cmd->prp1 &= ~(0x03UL);
		cmd->prp2 &= ~(0x03UL);

		DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));

		size = bytes;
		lba *= sc->nvstore.sectsz;

		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
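
		/*
		 * e.g. a prp1 that points 0x200 bytes into a page leaves
		 * cpsz == PAGE_SIZE - 0x200 bytes to copy from the first page
		 */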
		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
			    cmd->cdw10;
			req->opc = cmd->opc;
			req->cid = cmd->cid;
			req->nsid = cmd->nsid;
		}

		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
		    cmd->opc == NVME_OPC_WRITE, lba);
		lba += cpsz;
		size -= cpsz;
		if (size == 0)
			goto iodone;

		if (size <= PAGE_SIZE) {
			/* prp2 is second (and final) page in transfer */
			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
			    size,
			    cmd->opc == NVME_OPC_WRITE,
			    lba);
		} else {
			uint64_t *prp_list;
			int i;

			/* prp2 is pointer to a physical region page list */
			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
			    cmd->prp2, PAGE_SIZE);

			i = 0;
			while (size != 0) {
				cpsz = MIN(size, PAGE_SIZE);

				/*
				 * Move to linked physical region page list
				 * in last item
				 */
				if (i == (NVME_PRP2_ITEMS-1) &&
				    size > PAGE_SIZE) {
					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
					prp_list = paddr_guest2host(
					    sc->nsc_pi->pi_vmctx,
					    prp_list[i], PAGE_SIZE);
					i = 0;
				}
				if (prp_list[i] == 0) {
					WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
					err = 1;
					break;
				}

				err = pci_nvme_append_iov_req(sc, req,
				    prp_list[i], cpsz,
				    cmd->opc == NVME_OPC_WRITE, lba);
				lba += cpsz;
				size -= cpsz;
				i++;
			}
		}
		if (sc->nvstore.type == NVME_STOR_RAM) {
			uint16_t code, status;

			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
			    NVME_SC_SUCCESS;
			pci_nvme_status_genc(&status, code);

			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);
			continue;
		}
iodone:
		req->io_req.br_callback = pci_nvme_io_done;

		err = 0;
		switch (cmd->opc) {
		case NVME_OPC_READ:
			err = blockif_read(sc->nvstore.ctx, &req->io_req);
			break;
		case NVME_OPC_WRITE:
			err = blockif_write(sc->nvstore.ctx, &req->io_req);
			break;
		default:
			WPRINTF(("%s unhandled io command 0x%x\r\n",
			    __func__, cmd->opc));
			err = 1;
		}

		if (err) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));

	if (is_sq) {
		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF(("%s SQ index %lu overflow from "
				    "guest (max %u)\r\n",
				    __func__, idx, sc->num_squeues));
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF(("%s queue index %lu overflow from "
			    "guest (max %u)\r\n",
			    __func__, idx, sc->num_cqueues));
			return;
		}

		sc->compl_queues[idx].head = (uint16_t)value;
	}
}
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
		break;
	case NVME_CR_CAP_HI:
		DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
		break;
	case NVME_CR_VS:
		DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
		break;
	case NVME_CR_INTMS:
		DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
		break;
	case NVME_CR_INTMC:
		DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
		break;
	case NVME_CR_CC:
		DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
		break;
	case NVME_CR_CSTS:
		DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
		break;
	case NVME_CR_NSSR:
		DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
		break;
	case NVME_CR_AQA:
		DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
		break;
	default:
		DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
	}
}
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* doorbell pair is 2 x 4 bytes (SQ tail, CQ head) */
		int is_sq = (belloffset % 8) < 4;
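
		/*
		 * e.g. belloffset 0x0 is the SQ0 (admin) tail doorbell,
		 * 0x4 the CQ0 head doorbell, 0x8 the SQ1 tail doorbell
		 */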
		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF(("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__));
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
	    offset, size, value));

	if (size != 4) {
		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__));
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u\r\n",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg)));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}
		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
		    __func__, offset, value, size));
	}

	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
	int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx\r\n", baridx, offset, size, value));

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;
	default:
		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
		    __func__, baridx, value));
	}
}
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
	    offset, size, (uint32_t)value));

	return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
		    baridx, offset, size));

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);
	default:
		DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
	}

	return (0);
}
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII, unused bytes should be space characters.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			fprintf(stderr, "Invalid option %s\n", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		fprintf(stderr, "backing store not specified\n");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);
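
	/* e.g. sectsz 512 yields sectsz_bits 9; sectsz 4096 yields 12 */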
	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		fprintf(stderr, "Invalid qsz option\n");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		fprintf(stderr, "Invalid ioslots option\n");
		return (-1);
	}

	return (0);
}
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		if (i < (sc->ioslots-1))
			sc->ioreqs[i].next = &sc->ioreqs[i+1];
		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
	}
	sc->ioreqs_free = sc->ioreqs;
	sc->intr_coales_aggr_thresh = 1;

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * window.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
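
	/*
	 * e.g. with the default 16 queues, registers plus 17 doorbell pairs
	 * total well under 16K, so the BAR is rounded up to NVME_MMIO_SPACE_MIN
	 */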
1992 DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1994 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1996 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
2000 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2002 WPRINTF(("%s pci add msixcap failed\r\n", __func__));
2006 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2008 WPRINTF(("%s pci add Express capability failed\r\n", __func__));
2012 pthread_mutex_init(&sc->mtx, NULL);
2013 sem_init(&sc->iosemlock, 0, sc->ioslots);
2016 pci_nvme_init_ctrldata(sc);
2017 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
2018 pci_nvme_init_logpages(sc);
2020 pci_lintr_request(pi);
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);