2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2017 Shunsuke Mie
5 * Copyright (c) 2018 Leon Dang
6 * Copyright (c) 2020 Chuck Tuffli
8 * Function crc16 Copyright (c) 2017, Fedor Uporov
9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * bhyve PCIe-NVMe device emulation.
37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
44 * maxq = max number of queues
45 * qsz = max elements in each queue
46 * ioslots = max number of concurrent io requests
47 * sectsz = sector size (defaults to blockif sector size)
48 * ser = serial number (20 chars max)
49 * eui64 = IEEE Extended Unique Identifier (8 byte value)
50 * dsm = Dataset Management support. Option is one of: auto, enable, disable
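 *
 * Example (hypothetical values, shown wrapped for readability):
 *  -s 4,nvme,/dev/zvol/rdsk/tank/vm0disk,maxq=4,qsz=512,ioslots=16,
 *      sectsz=512,ser=NVME0001,dsm=auto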
55 - create async event for smart and log
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
81 #include <dev/nvme/nvme.h>
90 static int nvme_debug = 0;
91 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
92 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
94 /* defaults; can be overridden */
95 #define NVME_MSIX_BAR 4
97 #define NVME_IOSLOTS 8
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN (1 << 14)
102 #define NVME_QUEUES 16
103 #define NVME_MAX_QENTRIES 2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define NVME_MPSMIN 0
106 /* MPSMIN converted to bytes */
107 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN))
109 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t))
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1)
113 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS 0xffff
117 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS)
121 /* Convert a zero-based value into a one-based value */
122 #define ONE_BASED(zero) ((zero) + 1)
123 /* Convert a one-based value into a zero-based value */
124 #define ZERO_BASED(one) ((one) - 1)
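/*
 * Example: a Create I/O Queue command with CDW10.QSIZE=255 requests
 * ONE_BASED(255) == 256 entries, and a queue of 256 entries is reported
 * back to the guest as ZERO_BASED(256) == 255.
 */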
126 /* Encode number of SQs and CQs for Set/Get Features */
127 #define NVME_FEATURE_NUM_QUEUES(sc) \
128 ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
129 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
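/*
 * Example: with num_squeues == 4 and num_cqueues == 2, the macro yields
 * ZERO_BASED(4) | (ZERO_BASED(2) << 16) == 0x00010003.
 */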
131 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell)
133 enum nvme_controller_register_offsets {
134 NVME_CR_CAP_LOW = 0x00,
135 NVME_CR_CAP_HI = 0x04,
137 NVME_CR_INTMS = 0x0c,
138 NVME_CR_INTMC = 0x10,
143 NVME_CR_ASQ_LOW = 0x28,
144 NVME_CR_ASQ_HI = 0x2c,
145 NVME_CR_ACQ_LOW = 0x30,
146 NVME_CR_ACQ_HI = 0x34,
149 enum nvme_cmd_cdw11 {
150 NVME_CMD_CDW11_PC = 0x0001,
151 NVME_CMD_CDW11_IEN = 0x0002,
152 NVME_CMD_CDW11_IV = 0xFFFF0000,
160 #define NVME_CQ_INTEN 0x01
161 #define NVME_CQ_INTCOAL 0x02
163 struct nvme_completion_queue {
164 struct nvme_completion *qbase;
167 uint16_t tail; /* nvme progress */
168 uint16_t head; /* guest progress */
173 struct nvme_submission_queue {
174 struct nvme_command *qbase;
177 uint16_t head; /* nvme progress */
178 uint16_t tail; /* guest progress */
179 uint16_t cqid; /* completion queue id */
183 enum nvme_storage_type {
184 NVME_STOR_BLOCKIF = 0,
188 struct pci_nvme_blockstore {
189 enum nvme_storage_type type;
193 uint32_t sectsz_bits;
195 uint32_t deallocate:1;
199 * Calculate the number of additional page descriptors for guest IO requests
200 * based on the advertised Max Data Transfer Size (MDTS) and given the number of
201 * default iovec's in a struct blockif_req.
203 #define MDTS_PAD_SIZE \
204 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206 0 )
208 struct pci_nvme_ioreq {
209 struct pci_nvme_softc *sc;
210 STAILQ_ENTRY(pci_nvme_ioreq) link;
211 struct nvme_submission_queue *nvme_sq;
214 /* command information */
219 uint64_t prev_gpaddr;
223 struct blockif_req io_req;
225 struct iovec iovpadding[MDTS_PAD_SIZE];
229 /* Dataset Management bit in ONCS reflects backing storage capability */
230 NVME_DATASET_MANAGEMENT_AUTO,
231 /* Unconditionally set Dataset Management bit in ONCS */
232 NVME_DATASET_MANAGEMENT_ENABLE,
233 /* Unconditionally clear Dataset Management bit in ONCS */
234 NVME_DATASET_MANAGEMENT_DISABLE,
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241 struct nvme_feature_obj *,
242 struct nvme_command *,
243 struct nvme_completion *);
245 struct nvme_feature_obj {
249 bool namespace_specific;
252 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
255 PCI_NVME_AE_TYPE_ERROR = 0,
256 PCI_NVME_AE_TYPE_SMART,
257 PCI_NVME_AE_TYPE_NOTICE,
258 PCI_NVME_AE_TYPE_IO_CMD = 6,
259 PCI_NVME_AE_TYPE_VENDOR = 7,
260 PCI_NVME_AE_TYPE_MAX /* Must be last */
261 } pci_nvme_async_type;
263 /* Asynchronous Event Requests */
264 struct pci_nvme_aer {
265 STAILQ_ENTRY(pci_nvme_aer) link;
266 uint16_t cid; /* Command ID of the submitted AER */
270 PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
271 PCI_NVME_AE_INFO_FW_ACTIVATION,
272 PCI_NVME_AE_INFO_TELEMETRY_CHANGE,
273 PCI_NVME_AE_INFO_ANA_CHANGE,
274 PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE,
275 PCI_NVME_AE_INFO_LBA_STATUS_ALERT,
276 PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE,
277 PCI_NVME_AE_INFO_MAX,
278 } pci_nvme_async_info;
280 /* Asynchronous Event Notifications */
281 struct pci_nvme_aen {
282 pci_nvme_async_type atype;
287 struct pci_nvme_softc {
288 struct pci_devinst *nsc_pi;
292 struct nvme_registers regs;
294 struct nvme_namespace_data nsdata;
295 struct nvme_controller_data ctrldata;
296 struct nvme_error_information_entry err_log;
297 struct nvme_health_information_page health_log;
298 struct nvme_firmware_page fw_log;
300 struct pci_nvme_blockstore nvstore;
302 uint16_t max_qentries; /* max entries per queue */
303 uint32_t max_queues; /* max number of IO SQs or CQs */
304 uint32_t num_cqueues;
305 uint32_t num_squeues;
306 bool num_q_is_set; /* Has host set Number of Queues */
308 struct pci_nvme_ioreq *ioreqs;
309 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
310 uint32_t pending_ios;
315 * Memory mapped Submission and Completion queues
316 * Each array includes both Admin and IO queues
318 struct nvme_completion_queue *compl_queues;
319 struct nvme_submission_queue *submit_queues;
321 struct nvme_feature_obj feat[NVME_FID_MAX];
323 enum nvme_dsm_type dataset_management;
325 /* Accounting for SMART data */
326 __uint128_t read_data_units;
327 __uint128_t write_data_units;
328 __uint128_t read_commands;
329 __uint128_t write_commands;
330 uint32_t read_dunits_remainder;
331 uint32_t write_dunits_remainder;
333 STAILQ_HEAD(, pci_nvme_aer) aer_list;
334 pthread_mutex_t aer_mtx;
336 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
338 pthread_mutex_t aen_mtx;
339 pthread_cond_t aen_cond;
343 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
344 struct nvme_completion_queue *cq,
349 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
350 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
351 static void pci_nvme_io_done(struct blockif_req *, int);
353 /* Controller Configuration utils */
354 #define NVME_CC_GET_EN(cc) \
355 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
356 #define NVME_CC_GET_CSS(cc) \
357 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
358 #define NVME_CC_GET_SHN(cc) \
359 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
360 #define NVME_CC_GET_IOSQES(cc) \
361 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
362 #define NVME_CC_GET_IOCQES(cc) \
363 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
365 #define NVME_CC_WRITE_MASK \
366 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
367 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
368 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
370 #define NVME_CC_NEN_WRITE_MASK \
371 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
372 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
373 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
375 /* Controller Status utils */
376 #define NVME_CSTS_GET_RDY(sts) \
377 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
379 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT)
381 /* Completion Queue status word utils */
382 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT)
383 #define NVME_STATUS_MASK \
384 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
385 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
387 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
388 NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
390 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
391 struct nvme_feature_obj *,
392 struct nvme_command *,
393 struct nvme_completion *);
394 static void nvme_feature_num_queues(struct pci_nvme_softc *,
395 struct nvme_feature_obj *,
396 struct nvme_command *,
397 struct nvme_completion *);
398 static void nvme_feature_iv_config(struct pci_nvme_softc *,
399 struct nvme_feature_obj *,
400 struct nvme_command *,
401 struct nvme_completion *);
403 static void *aen_thr(void *arg);
406 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
410 len = strnlen(src, dst_size);
411 memset(dst, pad, dst_size);
412 memcpy(dst, src, len);
416 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
419 *status &= ~NVME_STATUS_MASK;
420 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
421 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
425 pci_nvme_status_genc(uint16_t *status, uint16_t code)
428 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
432 * Initialize the requested number of IO Submission and Completion Queues.
433 * Admin queues are allocated implicitly.
436 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
441 * Allocate and initialize the Submission Queues
443 if (nsq > NVME_QUEUES) {
444 WPRINTF("%s: clamping number of SQ from %u to %u",
445 __func__, nsq, NVME_QUEUES);
449 sc->num_squeues = nsq;
451 sc->submit_queues = calloc(sc->num_squeues + 1,
452 sizeof(struct nvme_submission_queue));
453 if (sc->submit_queues == NULL) {
454 WPRINTF("%s: SQ allocation failed", __func__);
457 struct nvme_submission_queue *sq = sc->submit_queues;
459 for (i = 0; i < sc->num_squeues; i++)
460 pthread_mutex_init(&sq[i].mtx, NULL);
464 * Allocate and initialize the Completion Queues
466 if (ncq > NVME_QUEUES) {
467 WPRINTF("%s: clamping number of CQ from %u to %u",
468 __func__, ncq, NVME_QUEUES);
472 sc->num_cqueues = ncq;
474 sc->compl_queues = calloc(sc->num_cqueues + 1,
475 sizeof(struct nvme_completion_queue));
476 if (sc->compl_queues == NULL) {
477 WPRINTF("%s: CQ allocation failed", __func__);
480 struct nvme_completion_queue *cq = sc->compl_queues;
482 for (i = 0; i < sc->num_cqueues; i++)
483 pthread_mutex_init(&cq[i].mtx, NULL);
488 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
490 struct nvme_controller_data *cd = &sc->ctrldata;
495 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
496 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
498 /* Num of submission commands that we can handle at a time (2^rab) */
508 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */
510 cd->ver = 0x00010300;
512 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
516 /* Advertise a single, read-only firmware slot */
517 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
518 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
519 cd->lpa = 0; /* TODO: support some simple things like SMART */
520 cd->elpe = 0; /* max error log page entries */
521 cd->npss = 1; /* number of power states supported */
523 /* Warning Composite Temperature Threshold */
526 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
527 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
528 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
529 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
530 cd->nn = 1; /* number of namespaces */
533 switch (sc->dataset_management) {
534 case NVME_DATASET_MANAGEMENT_AUTO:
535 if (sc->nvstore.deallocate)
536 cd->oncs |= NVME_ONCS_DSM;
538 case NVME_DATASET_MANAGEMENT_ENABLE:
539 cd->oncs |= NVME_ONCS_DSM;
547 cd->power_state[0].mp = 10;
551 * Calculate the CRC-16 of the given buffer
552 * See copyright attribution at top of file
555 crc16(uint16_t crc, const void *buffer, unsigned int len)
557 const unsigned char *cp = buffer;
558 /* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
559 static uint16_t const crc16_table[256] = {
560 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
561 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
562 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
563 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
564 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
565 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
566 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
567 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
568 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
569 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
570 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
571 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
572 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
573 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
574 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
575 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
576 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
577 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
578 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
579 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
580 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
581 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
582 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
583 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
584 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
585 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
586 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
587 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
588 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
589 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
590 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
591 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
595 crc = (((crc >> 8) & 0xffU) ^
596 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
601 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
602 struct nvme_namespace_data *nd, uint32_t nsid,
603 struct pci_nvme_blockstore *nvstore)
606 /* Get capacity and block size information from backing store */
607 nd->nsze = nvstore->size / nvstore->sectsz;
611 if (nvstore->type == NVME_STOR_BLOCKIF)
612 nvstore->deallocate = blockif_candelete(nvstore->ctx);
614 nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
617 /* Create an EUI-64 if user did not provide one */
618 if (nvstore->eui64 == 0) {
620 uint64_t eui64 = nvstore->eui64;
622 asprintf(&data, "%s%u%u%u", get_config_value("name"),
623 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
624 sc->nsc_pi->pi_func);
627 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
630 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
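	/*
	 * The generated EUI-64 thus carries OUI_FREEBSD_NVME_LOW and the
	 * CRC-16 of the name/bus/slot/function string in its upper 48 bits
	 * and the namespace ID in its low 16 bits.
	 */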
632 be64enc(nd->eui64, nvstore->eui64);
634 /* LBA data-sz = 2^lbads */
635 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
639 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
642 memset(&sc->err_log, 0, sizeof(sc->err_log));
643 memset(&sc->health_log, 0, sizeof(sc->health_log));
644 memset(&sc->fw_log, 0, sizeof(sc->fw_log));
646 /* Set read/write remainder to round up according to spec */
647 sc->read_dunits_remainder = 999;
648 sc->write_dunits_remainder = 999;
650 /* Set nominal Health values checked by implementations */
651 sc->health_log.temperature = 310;
652 sc->health_log.available_spare = 100;
653 sc->health_log.available_spare_threshold = 10;
657 pci_nvme_init_features(struct pci_nvme_softc *sc)
660 sc->feat[0].set = nvme_feature_invalid_cb;
661 sc->feat[0].get = nvme_feature_invalid_cb;
663 sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
664 sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
665 sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
666 sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
667 nvme_feature_iv_config;
668 /* Enable all AENs by default */
669 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f;
670 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
671 nvme_feature_invalid_cb;
672 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
673 nvme_feature_invalid_cb;
677 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
680 STAILQ_INIT(&sc->aer_list);
685 pci_nvme_aer_init(struct pci_nvme_softc *sc)
688 pthread_mutex_init(&sc->aer_mtx, NULL);
689 pci_nvme_aer_reset(sc);
693 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
695 struct pci_nvme_aer *aer = NULL;
697 pthread_mutex_lock(&sc->aer_mtx);
698 while (!STAILQ_EMPTY(&sc->aer_list)) {
699 aer = STAILQ_FIRST(&sc->aer_list);
700 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
703 pthread_mutex_unlock(&sc->aer_mtx);
705 pci_nvme_aer_reset(sc);
709 pci_nvme_aer_available(struct pci_nvme_softc *sc)
712 return (sc->aer_count != 0);
716 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
718 struct nvme_controller_data *cd = &sc->ctrldata;
720 /* AERL is a zero-based value while aer_count is one-based */
721 return (sc->aer_count == (cd->aerl + 1));
725 * Add an Async Event Request
727 * Stores an AER to be returned later if the Controller needs to notify the
729 * Note that while the NVMe spec doesn't require Controllers to return AERs
730 * in order, this implementation does preserve their order.
733 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
735 struct pci_nvme_aer *aer = NULL;
737 if (pci_nvme_aer_limit_reached(sc))
740 aer = calloc(1, sizeof(struct pci_nvme_aer));
744 /* Save the Command ID for use in the completion message */
747 pthread_mutex_lock(&sc->aer_mtx);
749 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
750 pthread_mutex_unlock(&sc->aer_mtx);
756 * Get an Async Event Request structure
758 * Returns a pointer to an AER previously submitted by the host or NULL if
759 * no AERs exist. Caller is responsible for freeing the returned struct.
761 static struct pci_nvme_aer *
762 pci_nvme_aer_get(struct pci_nvme_softc *sc)
764 struct pci_nvme_aer *aer = NULL;
766 pthread_mutex_lock(&sc->aer_mtx);
767 aer = STAILQ_FIRST(&sc->aer_list);
769 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
772 pthread_mutex_unlock(&sc->aer_mtx);
778 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
782 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
784 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
785 sc->aen[atype].atype = atype;
790 pci_nvme_aen_init(struct pci_nvme_softc *sc)
794 pci_nvme_aen_reset(sc);
796 pthread_mutex_init(&sc->aen_mtx, NULL);
797 pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
798 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
799 sc->nsc_pi->pi_func);
800 pthread_set_name_np(sc->aen_tid, nstr);
804 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
807 pci_nvme_aen_reset(sc);
810 /* Notify the AEN thread of pending work */
812 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
815 pthread_cond_signal(&sc->aen_cond);
819 * Post an Asynchronous Event Notification
822 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
825 struct pci_nvme_aen *aen;
827 if (atype >= PCI_NVME_AE_TYPE_MAX) {
831 pthread_mutex_lock(&sc->aen_mtx);
832 aen = &sc->aen[atype];
834 /* Has the controller already posted an event of this type? */
836 pthread_mutex_unlock(&sc->aen_mtx);
840 aen->event_data = event_data;
842 pthread_mutex_unlock(&sc->aen_mtx);
844 pci_nvme_aen_notify(sc);
850 pci_nvme_aen_process(struct pci_nvme_softc *sc)
852 struct pci_nvme_aer *aer;
853 struct pci_nvme_aen *aen;
854 pci_nvme_async_type atype;
859 assert(pthread_mutex_isowned_np(&sc->aen_mtx));
860 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
861 aen = &sc->aen[atype];
862 /* Previous iterations may have depleted the available AERs */
863 if (!pci_nvme_aer_available(sc)) {
864 DPRINTF("%s: no AER", __func__);
869 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
873 status = NVME_SC_SUCCESS;
875 /* Is the event masked? */
877 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
879 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
881 case PCI_NVME_AE_TYPE_ERROR:
882 lid = NVME_LOG_ERROR;
884 case PCI_NVME_AE_TYPE_SMART:
886 if ((mask & aen->event_data) == 0)
888 lid = NVME_LOG_HEALTH_INFORMATION;
890 case PCI_NVME_AE_TYPE_NOTICE:
891 if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
892 EPRINTLN("%s unknown AEN notice type %u",
893 __func__, aen->event_data);
894 status = NVME_SC_INTERNAL_DEVICE_ERROR;
898 if (((1 << aen->event_data) & mask) == 0)
900 switch (aen->event_data) {
901 case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
902 lid = NVME_LOG_CHANGED_NAMESPACE;
904 case PCI_NVME_AE_INFO_FW_ACTIVATION:
905 lid = NVME_LOG_FIRMWARE_SLOT;
907 case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
908 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
910 case PCI_NVME_AE_INFO_ANA_CHANGE:
911 lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; /* sic, misspelled in <dev/nvme/nvme.h> */
913 case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
914 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
916 case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
917 lid = NVME_LOG_LBA_STATUS_INFORMATION;
919 case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
920 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
928 EPRINTLN("%s unknown AEN type %u", __func__, atype);
929 status = NVME_SC_INTERNAL_DEVICE_ERROR;
933 aer = pci_nvme_aer_get(sc);
936 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
937 pci_nvme_cq_update(sc, &sc->compl_queues[0],
938 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
946 pci_generate_msix(sc->nsc_pi, 0);
953 struct pci_nvme_softc *sc;
957 pthread_mutex_lock(&sc->aen_mtx);
959 pci_nvme_aen_process(sc);
960 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
962 pthread_mutex_unlock(&sc->aen_mtx);
969 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
973 DPRINTF("%s", __func__);
975 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
976 (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
977 (60 << NVME_CAP_LO_REG_TO_SHIFT);
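	/* Note: CAP.TO is in 500ms units, so 60 advertises a 30s timeout */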
979 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
981 sc->regs.vs = 0x00010300; /* NVMe v1.3 */
986 assert(sc->submit_queues != NULL);
988 for (i = 0; i < sc->num_squeues + 1; i++) {
989 sc->submit_queues[i].qbase = NULL;
990 sc->submit_queues[i].size = 0;
991 sc->submit_queues[i].cqid = 0;
992 sc->submit_queues[i].tail = 0;
993 sc->submit_queues[i].head = 0;
996 assert(sc->compl_queues != NULL);
998 for (i = 0; i < sc->num_cqueues + 1; i++) {
999 sc->compl_queues[i].qbase = NULL;
1000 sc->compl_queues[i].size = 0;
1001 sc->compl_queues[i].tail = 0;
1002 sc->compl_queues[i].head = 0;
1005 sc->num_q_is_set = false;
1007 pci_nvme_aer_destroy(sc);
1008 pci_nvme_aen_destroy(sc);
1012 pci_nvme_reset(struct pci_nvme_softc *sc)
1014 pthread_mutex_lock(&sc->mtx);
1015 pci_nvme_reset_locked(sc);
1016 pthread_mutex_unlock(&sc->mtx);
1020 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1022 uint16_t acqs, asqs;
1024 DPRINTF("%s", __func__);
1026 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1027 sc->submit_queues[0].size = asqs;
1028 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1029 sizeof(struct nvme_command) * asqs);
1031 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1032 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1034 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1035 NVME_AQA_REG_ACQS_MASK) + 1;
1036 sc->compl_queues[0].size = acqs;
1037 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1038 sizeof(struct nvme_completion) * acqs);
1039 sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1041 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1042 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1046 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1047 size_t len, enum nvme_copy_dir dir)
1052 if (len > (8 * 1024)) {
1056 /* Copy from the start of prp1 to the end of the physical page */
1057 bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1058 bytes = MIN(bytes, len);
1060 p = vm_map_gpa(ctx, prp1, bytes);
1065 if (dir == NVME_COPY_TO_PRP)
1066 memcpy(p, b, bytes);
1068 memcpy(b, p, bytes);
1077 len = MIN(len, PAGE_SIZE);
1079 p = vm_map_gpa(ctx, prp2, len);
1084 if (dir == NVME_COPY_TO_PRP)
1093 * Write a Completion Queue Entry update
1095 * Write the completion and update the doorbell value
1098 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1099 struct nvme_completion_queue *cq,
1105 struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1106 struct nvme_completion *cqe;
1108 assert(cq->qbase != NULL);
1110 pthread_mutex_lock(&cq->mtx);
1112 cqe = &cq->qbase[cq->tail];
1114 /* Flip the phase bit */
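	/*
	 * The Phase Tag tells the guest which entries are new: the value of
	 * P is inverted on each pass through the queue, so XOR-ing the prior
	 * entry's P bit yields the value for the current pass.
	 */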
1115 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1118 cqe->sqhd = sq->head;
1121 cqe->status = status;
1124 if (cq->tail >= cq->size) {
1128 pthread_mutex_unlock(&cq->mtx);
1132 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1133 struct nvme_completion* compl)
1135 uint16_t qid = command->cdw10 & 0xffff;
1137 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1138 if (qid == 0 || qid > sc->num_squeues ||
1139 (sc->submit_queues[qid].qbase == NULL)) {
1140 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1141 __func__, qid, sc->num_squeues);
1142 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1143 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1147 sc->submit_queues[qid].qbase = NULL;
1148 sc->submit_queues[qid].cqid = 0;
1149 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1154 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1155 struct nvme_completion* compl)
1157 if (command->cdw11 & NVME_CMD_CDW11_PC) {
1158 uint16_t qid = command->cdw10 & 0xffff;
1159 struct nvme_submission_queue *nsq;
1161 if ((qid == 0) || (qid > sc->num_squeues) ||
1162 (sc->submit_queues[qid].qbase != NULL)) {
1163 WPRINTF("%s queue index %u > num_squeues %u",
1164 __func__, qid, sc->num_squeues);
1165 pci_nvme_status_tc(&compl->status,
1166 NVME_SCT_COMMAND_SPECIFIC,
1167 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1171 nsq = &sc->submit_queues[qid];
1172 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1173 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1174 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1176 * Queues must specify at least two entries
1177 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1178 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1180 pci_nvme_status_tc(&compl->status,
1181 NVME_SCT_COMMAND_SPECIFIC,
1182 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1185 nsq->head = nsq->tail = 0;
1187 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1188 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1189 pci_nvme_status_tc(&compl->status,
1190 NVME_SCT_COMMAND_SPECIFIC,
1191 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1195 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1196 pci_nvme_status_tc(&compl->status,
1197 NVME_SCT_COMMAND_SPECIFIC,
1198 NVME_SC_COMPLETION_QUEUE_INVALID);
1202 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1204 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1205 sizeof(struct nvme_command) * (size_t)nsq->size);
1207 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1208 qid, nsq->size, nsq->qbase, nsq->cqid);
1210 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1212 DPRINTF("%s completed creating IOSQ qid %u",
1216 * Guest sent non-cont submission queue request.
1217 * This setting is unsupported by this emulation.
1219 WPRINTF("%s unsupported non-contig (list-based) "
1220 "create i/o submission queue", __func__);
1222 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1228 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1229 struct nvme_completion* compl)
1231 uint16_t qid = command->cdw10 & 0xffff;
1234 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1235 if (qid == 0 || qid > sc->num_cqueues ||
1236 (sc->compl_queues[qid].qbase == NULL)) {
1237 WPRINTF("%s queue index %u / num_cqueues %u",
1238 __func__, qid, sc->num_cqueues);
1239 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1240 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1244 /* Deleting an Active CQ is an error */
1245 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1246 if (sc->submit_queues[sqid].cqid == qid) {
1247 pci_nvme_status_tc(&compl->status,
1248 NVME_SCT_COMMAND_SPECIFIC,
1249 NVME_SC_INVALID_QUEUE_DELETION);
1253 sc->compl_queues[qid].qbase = NULL;
1254 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1259 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1260 struct nvme_completion* compl)
1262 struct nvme_completion_queue *ncq;
1263 uint16_t qid = command->cdw10 & 0xffff;
1265 /* Only support Physically Contiguous queues */
1266 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1267 WPRINTF("%s unsupported non-contig (list-based) "
1268 "create i/o completion queue",
1271 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1275 if ((qid == 0) || (qid > sc->num_cqueues) ||
1276 (sc->compl_queues[qid].qbase != NULL)) {
1277 WPRINTF("%s queue index %u > num_cqueues %u",
1278 __func__, qid, sc->num_cqueues);
1279 pci_nvme_status_tc(&compl->status,
1280 NVME_SCT_COMMAND_SPECIFIC,
1281 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1285 ncq = &sc->compl_queues[qid];
1286 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1287 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1288 if (ncq->intr_vec > (sc->max_queues + 1)) {
1289 pci_nvme_status_tc(&compl->status,
1290 NVME_SCT_COMMAND_SPECIFIC,
1291 NVME_SC_INVALID_INTERRUPT_VECTOR);
1295 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1296 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
1298 * Queues must specify at least two entries
1299 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1300 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1302 pci_nvme_status_tc(&compl->status,
1303 NVME_SCT_COMMAND_SPECIFIC,
1304 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1307 ncq->head = ncq->tail = 0;
1308 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1310 sizeof(struct nvme_completion) * (size_t)ncq->size);
1312 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1319 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1320 struct nvme_completion* compl)
1323 uint8_t logpage = command->cdw10 & 0xFF;
1325 DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1327 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1330 * Command specifies the number of dwords to return in fields NUMDU
1331 * and NUMDL. This is a zero-based value.
1333 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1334 logsize *= sizeof(uint32_t);
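	/*
	 * e.g. NUMDU=0/NUMDL=0 encodes 1 dword (a 4 byte transfer), while
	 * NUMDU=0/NUMDL=0x3ff encodes 1024 dwords (4 KiB).
	 */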
1337 case NVME_LOG_ERROR:
1338 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1339 command->prp2, (uint8_t *)&sc->err_log,
1340 MIN(logsize, sizeof(sc->err_log)),
1343 case NVME_LOG_HEALTH_INFORMATION:
1344 pthread_mutex_lock(&sc->mtx);
1345 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1346 sizeof(sc->health_log.data_units_read));
1347 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1348 sizeof(sc->health_log.data_units_written));
1349 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1350 sizeof(sc->health_log.host_read_commands));
1351 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1352 sizeof(sc->health_log.host_write_commands));
1353 pthread_mutex_unlock(&sc->mtx);
1355 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1356 command->prp2, (uint8_t *)&sc->health_log,
1357 MIN(logsize, sizeof(sc->health_log)),
1360 case NVME_LOG_FIRMWARE_SLOT:
1361 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1362 command->prp2, (uint8_t *)&sc->fw_log,
1363 MIN(logsize, sizeof(sc->fw_log)),
1367 DPRINTF("%s get log page %x command not supported",
1370 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1371 NVME_SC_INVALID_LOG_PAGE);
1378 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1379 struct nvme_completion* compl)
1384 DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1385 command->cdw10 & 0xFF, command->nsid);
1387 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1389 switch (command->cdw10 & 0xFF) {
1390 case 0x00: /* return Identify Namespace data structure */
1391 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1392 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1395 case 0x01: /* return Identify Controller data structure */
1396 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1397 command->prp2, (uint8_t *)&sc->ctrldata,
1398 sizeof(sc->ctrldata),
1401 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1402 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1403 sizeof(uint32_t) * 1024);
1404 /* All unused entries shall be zero */
1405 bzero(dest, sizeof(uint32_t) * 1024);
1406 ((uint32_t *)dest)[0] = 1;
1408 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1409 if (command->nsid != 1) {
1410 pci_nvme_status_genc(&status,
1411 NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1414 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1415 sizeof(uint32_t) * 1024);
1416 /* All bytes after the descriptor shall be zero */
1417 bzero(dest, sizeof(uint32_t) * 1024);
1419 /* Return NIDT=1 (i.e. EUI64) descriptor */
1420 ((uint8_t *)dest)[0] = 1;
1421 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1422 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1425 DPRINTF("%s unsupported identify command requested 0x%x",
1426 __func__, command->cdw10 & 0xFF);
1427 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1431 compl->status = status;
1436 nvme_fid_to_name(uint8_t fid)
1441 case NVME_FEAT_ARBITRATION:
1442 name = "Arbitration";
1444 case NVME_FEAT_POWER_MANAGEMENT:
1445 name = "Power Management";
1447 case NVME_FEAT_LBA_RANGE_TYPE:
1448 name = "LBA Range Type";
1450 case NVME_FEAT_TEMPERATURE_THRESHOLD:
1451 name = "Temperature Threshold";
1453 case NVME_FEAT_ERROR_RECOVERY:
1454 name = "Error Recovery";
1456 case NVME_FEAT_VOLATILE_WRITE_CACHE:
1457 name = "Volatile Write Cache";
1459 case NVME_FEAT_NUMBER_OF_QUEUES:
1460 name = "Number of Queues";
1462 case NVME_FEAT_INTERRUPT_COALESCING:
1463 name = "Interrupt Coalescing";
1465 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1466 name = "Interrupt Vector Configuration";
1468 case NVME_FEAT_WRITE_ATOMICITY:
1469 name = "Write Atomicity Normal";
1471 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1472 name = "Asynchronous Event Configuration";
1474 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1475 name = "Autonomous Power State Transition";
1477 case NVME_FEAT_HOST_MEMORY_BUFFER:
1478 name = "Host Memory Buffer";
1480 case NVME_FEAT_TIMESTAMP:
1483 case NVME_FEAT_KEEP_ALIVE_TIMER:
1484 name = "Keep Alive Timer";
1486 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1487 name = "Host Controlled Thermal Management";
1489 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1490 name = "Non-Operational Power State Config";
1492 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1493 name = "Read Recovery Level Config";
1495 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1496 name = "Predictable Latency Mode Config";
1498 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1499 name = "Predictable Latency Mode Window";
1501 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1502 name = "LBA Status Information Report Interval";
1504 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1505 name = "Host Behavior Support";
1507 case NVME_FEAT_SANITIZE_CONFIG:
1508 name = "Sanitize Config";
1510 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1511 name = "Endurance Group Event Configuration";
1513 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1514 name = "Software Progress Marker";
1516 case NVME_FEAT_HOST_IDENTIFIER:
1517 name = "Host Identifier";
1519 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1520 name = "Reservation Notification Mask";
1522 case NVME_FEAT_RESERVATION_PERSISTENCE:
1523 name = "Reservation Persistence";
1525 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1526 name = "Namespace Write Protection Config";
1537 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1538 struct nvme_feature_obj *feat,
1539 struct nvme_command *command,
1540 struct nvme_completion *compl)
1543 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1547 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1548 struct nvme_feature_obj *feat,
1549 struct nvme_command *command,
1550 struct nvme_completion *compl)
1553 uint32_t cdw11 = command->cdw11;
1557 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1559 iv = cdw11 & 0xffff;
1560 cd = cdw11 & (1 << 16);
1562 if (iv > (sc->max_queues + 1)) {
1566 /* The Admin queue (IV 0) does not support Interrupt Coalescing, so CD must be set */
1567 if ((iv == 0) && !cd)
1570 /* Requested Interrupt Vector must be used by a CQ */
1571 for (i = 0; i < sc->num_cqueues + 1; i++) {
1572 if (sc->compl_queues[i].intr_vec == iv) {
1573 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1580 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1581 struct nvme_feature_obj *feat,
1582 struct nvme_command *command,
1583 struct nvme_completion *compl)
1585 uint16_t nqr; /* Number of Queues Requested */
1587 if (sc->num_q_is_set) {
1588 WPRINTF("%s: Number of Queues already set", __func__);
1589 pci_nvme_status_genc(&compl->status,
1590 NVME_SC_COMMAND_SEQUENCE_ERROR);
1594 nqr = command->cdw11 & 0xFFFF;
1595 if (nqr == 0xffff) {
1596 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1597 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1601 sc->num_squeues = ONE_BASED(nqr);
1602 if (sc->num_squeues > sc->max_queues) {
1603 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1605 sc->num_squeues = sc->max_queues;
1608 nqr = (command->cdw11 >> 16) & 0xFFFF;
1609 if (nqr == 0xffff) {
1610 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1611 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1615 sc->num_cqueues = ONE_BASED(nqr);
1616 if (sc->num_cqueues > sc->max_queues) {
1617 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1619 sc->num_cqueues = sc->max_queues;
1622 /* Patch the command value which will be saved on callback's return */
1623 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1624 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1626 sc->num_q_is_set = true;
1630 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1631 struct nvme_completion *compl)
1633 struct nvme_feature_obj *feat;
1634 uint32_t nsid = command->nsid;
1635 uint8_t fid = command->cdw10 & 0xFF;
1637 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1639 if (fid >= NVME_FID_MAX) {
1640 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1641 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1644 feat = &sc->feat[fid];
1646 if (!feat->namespace_specific &&
1647 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1648 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1649 NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1654 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1657 feat->set(sc, feat, command, compl);
1659 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1660 if (compl->status == NVME_SC_SUCCESS) {
1661 feat->cdw11 = command->cdw11;
1662 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1663 (command->cdw11 != 0))
1664 pci_nvme_aen_notify(sc);
1671 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1672 struct nvme_completion* compl)
1674 struct nvme_feature_obj *feat;
1675 uint8_t fid = command->cdw10 & 0xFF;
1677 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1679 if (fid >= NVME_FID_MAX) {
1680 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1681 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1686 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1688 feat = &sc->feat[fid];
1690 feat->get(sc, feat, command, compl);
1693 if (compl->status == NVME_SC_SUCCESS) {
1694 compl->cdw0 = feat->cdw11;
1701 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1702 struct nvme_completion* compl)
1704 uint8_t ses, lbaf, pi;
1706 /* Only supports Secure Erase Setting - User Data Erase */
1707 ses = (command->cdw10 >> 9) & 0x7;
1709 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1713 /* Only supports a single LBA Format */
1714 lbaf = command->cdw10 & 0xf;
1716 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1717 NVME_SC_INVALID_FORMAT);
1721 /* Doesn't support Protection Information */
1722 pi = (command->cdw10 >> 5) & 0x7;
1724 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1728 if (sc->nvstore.type == NVME_STOR_RAM) {
1729 if (sc->nvstore.ctx)
1730 free(sc->nvstore.ctx);
1731 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1732 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1734 struct pci_nvme_ioreq *req;
1737 req = pci_nvme_get_ioreq(sc);
1739 pci_nvme_status_genc(&compl->status,
1740 NVME_SC_INTERNAL_DEVICE_ERROR);
1741 WPRINTF("%s: unable to allocate IO req", __func__);
1744 req->nvme_sq = &sc->submit_queues[0];
1746 req->opc = command->opc;
1747 req->cid = command->cid;
1748 req->nsid = command->nsid;
1750 req->io_req.br_offset = 0;
1751 req->io_req.br_resid = sc->nvstore.size;
1752 req->io_req.br_callback = pci_nvme_io_done;
1754 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1756 pci_nvme_status_genc(&compl->status,
1757 NVME_SC_INTERNAL_DEVICE_ERROR);
1758 pci_nvme_release_ioreq(sc, req);
1766 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1767 struct nvme_completion* compl)
1769 DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1770 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1772 /* TODO: search for the command ID and abort it */
1775 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1780 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1781 struct nvme_command* command, struct nvme_completion* compl)
1783 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1784 sc->aer_count, sc->ctrldata.aerl, command->cid);
1786 /* Don't exceed the Async Event Request Limit (AERL). */
1787 if (pci_nvme_aer_limit_reached(sc)) {
1788 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1789 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1793 if (pci_nvme_aer_add(sc, command->cid)) {
1794 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1795 NVME_SC_INTERNAL_DEVICE_ERROR);
1800 * Async Event Requests complete only when an event occurs, so leave
1801 * the completion status unset here; the AEN thread completes the
1802 * request later when an event enabled via Set Features is posted.
1804 compl->status = NVME_NO_STATUS;
1805 pci_nvme_aen_notify(sc);
1811 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1813 struct nvme_completion compl;
1814 struct nvme_command *cmd;
1815 struct nvme_submission_queue *sq;
1816 struct nvme_completion_queue *cq;
1819 DPRINTF("%s index %u", __func__, (uint32_t)value);
1821 sq = &sc->submit_queues[0];
1822 cq = &sc->compl_queues[0];
1824 pthread_mutex_lock(&sq->mtx);
1827 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1829 while (sqhead != atomic_load_acq_short(&sq->tail)) {
1830 cmd = &(sq->qbase)[sqhead];
1835 case NVME_OPC_DELETE_IO_SQ:
1836 DPRINTF("%s command DELETE_IO_SQ", __func__);
1837 nvme_opc_delete_io_sq(sc, cmd, &compl);
1839 case NVME_OPC_CREATE_IO_SQ:
1840 DPRINTF("%s command CREATE_IO_SQ", __func__);
1841 nvme_opc_create_io_sq(sc, cmd, &compl);
1843 case NVME_OPC_DELETE_IO_CQ:
1844 DPRINTF("%s command DELETE_IO_CQ", __func__);
1845 nvme_opc_delete_io_cq(sc, cmd, &compl);
1847 case NVME_OPC_CREATE_IO_CQ:
1848 DPRINTF("%s command CREATE_IO_CQ", __func__);
1849 nvme_opc_create_io_cq(sc, cmd, &compl);
1851 case NVME_OPC_GET_LOG_PAGE:
1852 DPRINTF("%s command GET_LOG_PAGE", __func__);
1853 nvme_opc_get_log_page(sc, cmd, &compl);
1855 case NVME_OPC_IDENTIFY:
1856 DPRINTF("%s command IDENTIFY", __func__);
1857 nvme_opc_identify(sc, cmd, &compl);
1859 case NVME_OPC_ABORT:
1860 DPRINTF("%s command ABORT", __func__);
1861 nvme_opc_abort(sc, cmd, &compl);
1863 case NVME_OPC_SET_FEATURES:
1864 DPRINTF("%s command SET_FEATURES", __func__);
1865 nvme_opc_set_features(sc, cmd, &compl);
1867 case NVME_OPC_GET_FEATURES:
1868 DPRINTF("%s command GET_FEATURES", __func__);
1869 nvme_opc_get_features(sc, cmd, &compl);
1871 case NVME_OPC_FIRMWARE_ACTIVATE:
1872 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1873 pci_nvme_status_tc(&compl.status,
1874 NVME_SCT_COMMAND_SPECIFIC,
1875 NVME_SC_INVALID_FIRMWARE_SLOT);
1877 case NVME_OPC_ASYNC_EVENT_REQUEST:
1878 DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1879 nvme_opc_async_event_req(sc, cmd, &compl);
1881 case NVME_OPC_FORMAT_NVM:
1882 DPRINTF("%s command FORMAT_NVM", __func__);
1883 if ((sc->ctrldata.oacs &
1884 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1885 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1887 compl.status = NVME_NO_STATUS;
1888 nvme_opc_format_nvm(sc, cmd, &compl);
1891 DPRINTF("0x%x command is not implemented",
1893 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1895 sqhead = (sqhead + 1) % sq->size;
1897 if (NVME_COMPLETION_VALID(compl)) {
1898 pci_nvme_cq_update(sc, &sc->compl_queues[0],
1906 DPRINTF("setting sqhead %u", sqhead);
1909 if (cq->head != cq->tail)
1910 pci_generate_msix(sc->nsc_pi, 0);
1912 pthread_mutex_unlock(&sq->mtx);
1916 * Update the Write and Read statistics reported in SMART data
1918 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1919 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1920 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1923 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1924 size_t bytes, uint16_t status)
1927 pthread_mutex_lock(&sc->mtx);
1929 case NVME_OPC_WRITE:
1930 sc->write_commands++;
1931 if (status != NVME_SC_SUCCESS)
1933 sc->write_dunits_remainder += (bytes / 512);
1934 while (sc->write_dunits_remainder >= 1000) {
1935 sc->write_data_units++;
1936 sc->write_dunits_remainder -= 1000;
1940 sc->read_commands++;
1941 if (status != NVME_SC_SUCCESS)
1943 sc->read_dunits_remainder += (bytes / 512);
1944 while (sc->read_dunits_remainder >= 1000) {
1945 sc->read_data_units++;
1946 sc->read_dunits_remainder -= 1000;
1950 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1953 pthread_mutex_unlock(&sc->mtx);
1957 * Check if the combination of Starting LBA (slba) and Number of Logical
1958 * Blocks (nlb) exceeds the range of the underlying storage.
1960 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1961 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1962 * overflow.
1965 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1968 size_t offset, bytes;
1970 /* Overflow check of multiplying Starting LBA by the sector size */
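	/*
	 * e.g. with 512 byte sectors (sectsz_bits == 9), any slba with a bit
	 * set at position 55 or higher would overflow slba << 9.
	 */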
1971 if (slba >> (64 - nvstore->sectsz_bits))
1974 offset = slba << nvstore->sectsz_bits;
1975 bytes = nlb << nvstore->sectsz_bits;
1977 /* Overflow check of Number of Logical Blocks */
1978 if ((nvstore->size - offset) < bytes)
1985 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1986 uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1993 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1997 /* concatenate contig block-iovs to minimize number of iovs */
1998 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1999 iovidx = req->io_req.br_iovcnt - 1;
2001 req->io_req.br_iov[iovidx].iov_base =
2002 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2003 req->prev_gpaddr, size);
2005 req->prev_size += size;
2006 req->io_req.br_resid += size;
2008 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2010 iovidx = req->io_req.br_iovcnt;
2012 req->io_req.br_offset = lba;
2013 req->io_req.br_resid = 0;
2014 req->io_req.br_param = req;
2017 req->io_req.br_iov[iovidx].iov_base =
2018 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2021 req->io_req.br_iov[iovidx].iov_len = size;
2023 req->prev_gpaddr = gpaddr;
2024 req->prev_size = size;
2025 req->io_req.br_resid += size;
2027 req->io_req.br_iovcnt++;
2034 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2035 struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2036 uint32_t cdw0, uint16_t status)
2038 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2040 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2041 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2042 NVME_STATUS_GET_SC(status));
2044 pci_nvme_cq_update(sc, cq,
2050 if (cq->head != cq->tail) {
2051 if (cq->intr_en & NVME_CQ_INTEN) {
2052 pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2054 DPRINTF("%s: CQ%u interrupt disabled",
2055 __func__, sq->cqid);
2061 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2064 req->nvme_sq = NULL;
2067 pthread_mutex_lock(&sc->mtx);
2069 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2072 /* With no more IO pending, set RDY if the device was reset/enabled */
2073 if (sc->pending_ios == 0 &&
2074 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2075 sc->regs.csts |= NVME_CSTS_RDY;
2077 pthread_mutex_unlock(&sc->mtx);
2079 sem_post(&sc->iosemlock);
2082 static struct pci_nvme_ioreq *
2083 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2085 struct pci_nvme_ioreq *req = NULL;
2087 sem_wait(&sc->iosemlock);
2088 pthread_mutex_lock(&sc->mtx);
2090 req = STAILQ_FIRST(&sc->ioreqs_free);
2091 assert(req != NULL);
2092 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2098 pthread_mutex_unlock(&sc->mtx);
2100 req->io_req.br_iovcnt = 0;
2101 req->io_req.br_offset = 0;
2102 req->io_req.br_resid = 0;
2103 req->io_req.br_param = req;
2104 req->prev_gpaddr = 0;
2111 pci_nvme_io_done(struct blockif_req *br, int err)
2113 struct pci_nvme_ioreq *req = br->br_param;
2114 struct nvme_submission_queue *sq = req->nvme_sq;
2115 uint16_t code, status;
2117 DPRINTF("%s error %d %s", __func__, err, strerror(err));
2119 /* TODO return correct error */
2120 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2121 pci_nvme_status_genc(&status, code);
2123 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2124 pci_nvme_stats_write_read_update(req->sc, req->opc,
2125 req->bytes, status);
2126 pci_nvme_release_ioreq(req->sc, req);
2130 * Implements the Flush command. The specification states:
2131 * If a volatile write cache is not present, Flush commands complete
2132 * successfully and have no effect
2133 * in the description of the Volatile Write Cache (VWC) field of the Identify
2134 * Controller data. Therefore, set status to Success if the command is
2135 * not supported (i.e. RAM or as indicated by the blockif).
2138 nvme_opc_flush(struct pci_nvme_softc *sc,
2139 struct nvme_command *cmd,
2140 struct pci_nvme_blockstore *nvstore,
2141 struct pci_nvme_ioreq *req,
2144 bool pending = false;
2146 if (nvstore->type == NVME_STOR_RAM) {
2147 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2151 req->io_req.br_callback = pci_nvme_io_done;
2153 err = blockif_flush(nvstore->ctx, &req->io_req);
2159 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2162 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2170 nvme_write_read_ram(struct pci_nvme_softc *sc,
2171 struct pci_nvme_blockstore *nvstore,
2172 uint64_t prp1, uint64_t prp2,
2173 size_t offset, uint64_t bytes,
2176 uint8_t *buf = nvstore->ctx;
2177 enum nvme_copy_dir dir;
2181 dir = NVME_COPY_TO_PRP;
2183 dir = NVME_COPY_FROM_PRP;
2185 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2186 buf + offset, bytes, dir))
2187 pci_nvme_status_genc(&status,
2188 NVME_SC_DATA_TRANSFER_ERROR);
2190 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2196 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2197 struct pci_nvme_blockstore *nvstore,
2198 struct pci_nvme_ioreq *req,
2199 uint64_t prp1, uint64_t prp2,
2200 size_t offset, uint64_t bytes,
2205 uint16_t status = NVME_NO_STATUS;
2207 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2208 if (pci_nvme_append_iov_req(sc, req, prp1,
2209 size, is_write, offset)) {
2210 pci_nvme_status_genc(&status,
2211 NVME_SC_DATA_TRANSFER_ERROR);
2220 } else if (bytes <= PAGE_SIZE) {
2222 if (pci_nvme_append_iov_req(sc, req, prp2,
2223 size, is_write, offset)) {
2224 pci_nvme_status_genc(&status,
2225 NVME_SC_DATA_TRANSFER_ERROR);
2229 void *vmctx = sc->nsc_pi->pi_vmctx;
2230 uint64_t *prp_list = &prp2;
2231 uint64_t *last = prp_list;
2233 /* PRP2 is pointer to a physical region page list */
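		/*
		 * A list holds up to NVME_PRP2_ITEMS page addresses; when the
		 * transfer spans more pages than one list can describe, its
		 * final entry points to the next list instead of to data.
		 */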
2235 /* Last entry in list points to the next list */
2236 if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2237 uint64_t prp = *prp_list;
2239 prp_list = paddr_guest2host(vmctx, prp,
2240 PAGE_SIZE - (prp % PAGE_SIZE));
2241 last = prp_list + (NVME_PRP2_ITEMS - 1);
2244 size = MIN(bytes, PAGE_SIZE);
2246 if (pci_nvme_append_iov_req(sc, req, *prp_list,
2247 size, is_write, offset)) {
2248 pci_nvme_status_genc(&status,
2249 NVME_SC_DATA_TRANSFER_ERROR);
2259 req->io_req.br_callback = pci_nvme_io_done;
2261 err = blockif_write(nvstore->ctx, &req->io_req);
2263 err = blockif_read(nvstore->ctx, &req->io_req);
2266 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2272 nvme_opc_write_read(struct pci_nvme_softc *sc,
2273 struct nvme_command *cmd,
2274 struct pci_nvme_blockstore *nvstore,
2275 struct pci_nvme_ioreq *req,
2278 uint64_t lba, nblocks, bytes;
2280 bool is_write = cmd->opc == NVME_OPC_WRITE;
2281 bool pending = false;
2283 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2284 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2286 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2287 WPRINTF("%s command would exceed LBA range", __func__);
2288 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2292 bytes = nblocks << nvstore->sectsz_bits;
2293 if (bytes > NVME_MAX_DATA_SIZE) {
2294 WPRINTF("%s command would exceed MDTS", __func__);
2295 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2299 offset = lba << nvstore->sectsz_bits;
2302 req->io_req.br_offset = lba;
2304 /* PRP bits 1:0 must be zero */
2305 cmd->prp1 &= ~0x3UL;
2306 cmd->prp2 &= ~0x3UL;
2308 if (nvstore->type == NVME_STOR_RAM) {
2309 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2310 cmd->prp2, offset, bytes, is_write);
2312 *status = nvme_write_read_blockif(sc, nvstore, req,
2313 cmd->prp1, cmd->prp2, offset, bytes, is_write);
2315 if (*status == NVME_NO_STATUS)
2320 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
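/*
 * Editor's sketch of the deallocate state machine: for a three-range
 * trim, nvme_opc_dataset_mgmt() issues the first blockif_delete() and
 * sets prev_size = 3.  Each completion re-enters here, bumps
 * prev_gpaddr (1, then 2), and issues the delete for the next iov
 * entry; the request completes once prev_gpaddr + 1 == prev_size.
 */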
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range = NULL;	/* NULL so the early outs can free() safely */
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		goto out;
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0, dr = 0; r <= nr; r++) {
				offset = range[r].starting_lba << sectsz_bits;
				bytes = range[r].length << sectsz_bits;
				if (bytes == 0)
					continue;

				if ((nvstore->size - offset) < bytes) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				iov[dr].iov_base = (void *)offset;
				iov[dr].iov_len = bytes;
				dr++;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = dr;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;
	}
out:
	free(range);
	return (pending);
}
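/*
 * Editor's illustrative mapping (512-byte sectors): a guest trim of
 *	range[0] = { starting_lba = 0x1000, length = 8 }
 *	range[1] = { starting_lba = 0x2000, length = 0 }	(skipped)
 *	range[2] = { starting_lba = 0x3000, length = 16 }
 * yields iov[0] = { 0x1000 << 9, 4096 } and iov[1] = { 0x3000 << 9,
 * 8192 }, with prev_size = dr = 2 for pci_nvme_dealloc_sm() to walk.
 */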
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		uint16_t status;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
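/*
 * Editor's note on the doorbell layout (assuming the 4-byte stride
 * implied by CAP.DSTRD = 0): starting at NVME_DOORBELL_OFFSET, each
 * queue pair owns 8 bytes,
 *	SQ y tail doorbell: NVME_DOORBELL_OFFSET + (2 * y) * 4
 *	CQ y head doorbell: NVME_DOORBELL_OFFSET + (2 * y + 1) * 4
 * which is why the BAR 0 write handler below derives
 * idx = belloffset / 8 and is_sq = (belloffset % 8) < 4.
 */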
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}
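/*
 * Editor's note on the CC.EN handshake (illustrative register values,
 * not taken from this file): a guest bringing the controller up
 * typically programs the admin queues first and then sets CC.EN, e.g.
 *	write NVME_CR_AQA     = 0x001F001F	(32-entry admin SQ and CQ,
 *						 both fields zero based)
 *	write NVME_CR_ASQ/ACQ = queue physical address (bits 11:0 are
 *				 masked off by the 0xFFFFF000 masks above)
 *	write NVME_CR_CC      with the EN bit set
 * at which point pci_nvme_init_controller() runs and CSTS.RDY is set
 * once no I/Os are pending.
 */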
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
    uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
static int
pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
{
	char bident[sizeof("XX:X:X")];
	const char *value;
	uint32_t sectsz;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

	value = get_config_value_node(nvl, "maxq");
	if (value != NULL)
		sc->max_queues = atoi(value);
	value = get_config_value_node(nvl, "qsz");
	if (value != NULL) {
		sc->max_qentries = atoi(value);
		if (sc->max_qentries <= 0) {
			EPRINTLN("nvme: Invalid qsz option %d",
			    sc->max_qentries);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "ioslots");
	if (value != NULL) {
		sc->ioslots = atoi(value);
		if (sc->ioslots <= 0) {
			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "sectsz");
	if (value != NULL)
		sectsz = atoi(value);
	value = get_config_value_node(nvl, "ser");
	if (value != NULL) {
		/*
		 * This field indicates the Product Serial Number in
		 * 7-bit ASCII, unused bytes should be space characters.
		 */
		cpywithpad((char *)sc->ctrldata.sn,
		    sizeof(sc->ctrldata.sn), value, ' ');
	}
	value = get_config_value_node(nvl, "eui64");
	if (value != NULL)
		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
	value = get_config_value_node(nvl, "dsm");
	if (value != NULL) {
		if (strcmp(value, "auto") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
		else if (strcmp(value, "enable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
		else if (strcmp(value, "disable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
	}

	value = get_config_value_node(nvl, "ram");
	if (value != NULL) {
		uint64_t sz = strtoull(value, NULL, 10);

		sc->nvstore.type = NVME_STOR_RAM;
		sc->nvstore.size = sz * 1024 * 1024;
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		sc->nvstore.sectsz = 4096;
		sc->nvstore.sectsz_bits = 12;
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Unable to allocate RAM");
			return (-1);
		}
	} else {
		snprintf(bident, sizeof(bident), "%d:%d",
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
		sc->nvstore.ctx = blockif_open(nvl, bident);
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Could not open backing file: %s",
			    strerror(errno));
			return (-1);
		}
		sc->nvstore.type = NVME_STOR_BLOCKIF;
		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
	}

	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	/* derive sectsz_bits, e.g. 512 -> 9, 4096 -> 12 */
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	return (0);
}
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_config(sc, nvl);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of
	 * 16K.  The Windows driver will refuse to start a device with a
	 * smaller size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);
	pci_nvme_aen_init(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
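/*
 * Editor's sizing example (illustrative): with the default
 * max_queues = 16, the doorbell area adds 2 * sizeof(uint32_t) *
 * (16 + 1) = 136 bytes on top of sizeof(struct nvme_registers), well
 * under 16 KiB, so the NVME_MMIO_SPACE_MIN clamp decides the BAR size
 * in the common case.
 */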
static int
pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *ram;

	if (opts == NULL)
		return (0);

	if (strncmp(opts, "ram=", 4) == 0) {
		cp = strchr(opts, ',');
		if (cp == NULL) {
			set_config_value_node(nvl, "ram", opts + 4);
			return (0);
		}
		ram = strndup(opts + 4, cp - opts - 4);
		set_config_value_node(nvl, "ram", ram);
		free(ram);
		return (pci_parse_legacy_config(nvl, cp + 1));
	} else
		return (blockif_legacy_config(nvl, opts));
}
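/*
 * Editor's conversion example (illustrative option string): the legacy
 * option string "ram=512,ser=SN123" becomes the nvlist node ram=512,
 * with everything after the first comma handed to
 * pci_parse_legacy_config() unchanged, while a bare device path such as
 * a zvol falls through to blockif_legacy_config().
 */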
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_legacy_config = pci_nvme_legacy_config,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);