 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * bhyve PCIe-NVMe device emulation.
 *
 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
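 *
 * An illustrative invocation (the device path and option values below are
 * hypothetical, not defaults):
 *
 *  -s 4,nvme,/dev/zvol/tank/vm0,maxq=4,qsz=512,ioslots=16,ser=NVME0001,dsm=auto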
 *  - create async event for SMART and log
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <pthread_np.h>
#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>

static int nvme_debug = 0;
#define DPRINTF(fmt, args...) \
    do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define NVME_MSIX_BAR    4
#define NVME_IOSLOTS     8
/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN    (1 << 14)

#define NVME_QUEUES            16
#define NVME_MAX_QENTRIES      2048

/* Memory Page size Minimum reported in CAP register */
#define NVME_MPSMIN            0
/* MPSMIN converted to bytes */
#define NVME_MPSMIN_BYTES      (1 << (12 + NVME_MPSMIN))

#define NVME_PRP2_ITEMS        (PAGE_SIZE/sizeof(uint64_t))
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define NVME_MAX_IOVEC         ((1 << NVME_MDTS) + 1)
#define NVME_MAX_DATA_SIZE     ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
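/*
 * For example, with NVME_MPSMIN == 0 the minimum memory page size is
 * 1 << (12 + 0) == 4 KiB. If NVME_MDTS were 9 (a value used here purely
 * for illustration), the largest transfer would be 512 * 4 KiB == 2 MiB.
 */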
/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS         0xffff
#define NVME_COMPLETION_VALID(c)    ((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define NVME_TEMPERATURE       296
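/* 296 K is roughly 23 degrees Celsius (296 - 273 = 23) */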
/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)        ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)        ((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
    ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
     ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
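/*
 * e.g., with num_squeues == 4 and num_cqueues == 2 this encodes
 * (4 - 1) | ((2 - 1) << 16) == 0x00010003.
 */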
#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
    NVME_CR_CAP_LOW = 0x00,
    NVME_CR_CAP_HI  = 0x04,
    NVME_CR_INTMS   = 0x0c,
    NVME_CR_INTMC   = 0x10,
    NVME_CR_ASQ_LOW = 0x28,
    NVME_CR_ASQ_HI  = 0x2c,
    NVME_CR_ACQ_LOW = 0x30,
    NVME_CR_ACQ_HI  = 0x34,

enum nvme_cmd_cdw11 {
    NVME_CMD_CDW11_PC  = 0x0001,
    NVME_CMD_CDW11_IEN = 0x0002,
    NVME_CMD_CDW11_IV  = 0xFFFF0000,

#define NVME_CQ_INTEN    0x01
#define NVME_CQ_INTCOAL  0x02

struct nvme_completion_queue {
    struct nvme_completion *qbase;
    uint16_t tail;    /* nvme progress */
    uint16_t head;    /* guest progress */

struct nvme_submission_queue {
    struct nvme_command *qbase;
    uint16_t head;    /* nvme progress */
    uint16_t tail;    /* guest progress */
    uint16_t cqid;    /* completion queue id */

enum nvme_storage_type {
    NVME_STOR_BLOCKIF = 0,

struct pci_nvme_blockstore {
    enum nvme_storage_type type;
    uint32_t sectsz_bits;
    uint32_t deallocate:1;
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer Size (MDTS) and given the number
 * of default iovec's in a struct blockif_req.
 */
#define MDTS_PAD_SIZE \
    ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
      NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
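/*
 * e.g., if NVME_MAX_IOVEC were 513 entries and BLOCKIF_IOV_MAX were 128,
 * each request would need 385 iovec entries of padding (values here are
 * illustrative, not the build-time constants).
 */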
struct pci_nvme_ioreq {
    struct pci_nvme_softc *sc;
    STAILQ_ENTRY(pci_nvme_ioreq) link;
    struct nvme_submission_queue *nvme_sq;

    /* command information */

    uint64_t prev_gpaddr;

    struct blockif_req io_req;

    struct iovec iovpadding[MDTS_PAD_SIZE];

    /* Dataset Management bit in ONCS reflects backing storage capability */
    NVME_DATASET_MANAGEMENT_AUTO,
    /* Unconditionally set Dataset Management bit in ONCS */
    NVME_DATASET_MANAGEMENT_ENABLE,
    /* Unconditionally clear Dataset Management bit in ONCS */
    NVME_DATASET_MANAGEMENT_DISABLE,

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
    bool namespace_specific;

#define NVME_FID_MAX    (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

    PCI_NVME_AE_TYPE_ERROR = 0,
    PCI_NVME_AE_TYPE_SMART,
    PCI_NVME_AE_TYPE_NOTICE,
    PCI_NVME_AE_TYPE_IO_CMD = 6,
    PCI_NVME_AE_TYPE_VENDOR = 7,
    PCI_NVME_AE_TYPE_MAX    /* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
    STAILQ_ENTRY(pci_nvme_aer) link;
    uint16_t cid;    /* Command ID of the submitted AER */

/** Asynchronous Event Information - Notice */
    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
    PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
    PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
    PCI_NVME_AEI_NOTICE_ANA_CHANGE,
    PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
    PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
    PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
    PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define PCI_NVME_AEI_NOTICE_SHIFT    8
#define PCI_NVME_AEI_NOTICE_MASK(event)    (1 << ((event) + PCI_NVME_AEI_NOTICE_SHIFT))
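/*
 * e.g., PCI_NVME_AEI_NOTICE_MASK(PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED)
 * is 1 << (0 + 8) == 0x100.
 */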
/* Asynchronous Event Notifications */
struct pci_nvme_aen {
    pci_nvme_async_type atype;
/*
 * By default, enable all Asynchronous Event Notifications:
 *     SMART / Health Critical Warnings
 *     Namespace Attribute Notices
 */
#define PCI_NVME_AEN_DEFAULT_MASK    0x11f
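/*
 * 0x11f decomposes as 0x01f (the five SMART/Health critical warning bits)
 * plus 0x100 (the Namespace Attribute Notice bit, see
 * PCI_NVME_AEI_NOTICE_SHIFT above).
 */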
    NVME_CNTRLTYPE_IO = 1,
    NVME_CNTRLTYPE_DISCOVERY = 2,
    NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
    struct pci_devinst *nsc_pi;

    struct nvme_registers regs;

    struct nvme_namespace_data nsdata;
    struct nvme_controller_data ctrldata;
    struct nvme_error_information_entry err_log;
    struct nvme_health_information_page health_log;
    struct nvme_firmware_page fw_log;
    struct nvme_ns_list ns_log;

    struct pci_nvme_blockstore nvstore;

    uint16_t max_qentries;    /* max entries per queue */
    uint32_t max_queues;      /* max number of IO SQ's or CQ's */
    uint32_t num_cqueues;
    uint32_t num_squeues;
    bool     num_q_is_set;    /* Has host set Number of Queues */

    struct pci_nvme_ioreq *ioreqs;
    STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
    uint32_t pending_ios;

    /*
     * Memory mapped Submission and Completion queues
     * Each array includes both Admin and IO queues
     */
    struct nvme_completion_queue *compl_queues;
    struct nvme_submission_queue *submit_queues;

    struct nvme_feature_obj feat[NVME_FID_MAX];

    enum nvme_dsm_type dataset_management;

    /* Accounting for SMART data */
    __uint128_t read_data_units;
    __uint128_t write_data_units;
    __uint128_t read_commands;
    __uint128_t write_commands;
    uint32_t    read_dunits_remainder;
    uint32_t    write_dunits_remainder;

    STAILQ_HEAD(, pci_nvme_aer) aer_list;
    pthread_mutex_t aer_mtx;

    struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
    pthread_mutex_t aen_mtx;
    pthread_cond_t aen_cond;

static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
    ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
    ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
    ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
    ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
    ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
    ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
     (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
     (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
    ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
     (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
     (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
    ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY    (1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P    (1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
    ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
     (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM    (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
    NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);

cpywithpad(char *dst, size_t dst_size, const char *src, char pad)

    len = strnlen(src, dst_size);
    memset(dst, pad, dst_size);
    memcpy(dst, src, len);

pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)

    *status &= ~NVME_STATUS_MASK;
    *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
        (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;

pci_nvme_status_genc(uint16_t *status, uint16_t code)

    pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)

    /*
     * Allocate and initialize the Submission Queues
     */
    if (nsq > NVME_QUEUES) {
        WPRINTF("%s: clamping number of SQ from %u to %u",
            __func__, nsq, NVME_QUEUES);

    sc->num_squeues = nsq;

    sc->submit_queues = calloc(sc->num_squeues + 1,
        sizeof(struct nvme_submission_queue));
    if (sc->submit_queues == NULL) {
        WPRINTF("%s: SQ allocation failed", __func__);

    struct nvme_submission_queue *sq = sc->submit_queues;

    for (i = 0; i < sc->num_squeues; i++)
        pthread_mutex_init(&sq[i].mtx, NULL);

    /*
     * Allocate and initialize the Completion Queues
     */
    if (ncq > NVME_QUEUES) {
        WPRINTF("%s: clamping number of CQ from %u to %u",
            __func__, ncq, NVME_QUEUES);

    sc->num_cqueues = ncq;

    sc->compl_queues = calloc(sc->num_cqueues + 1,
        sizeof(struct nvme_completion_queue));
    if (sc->compl_queues == NULL) {
        WPRINTF("%s: CQ allocation failed", __func__);

    struct nvme_completion_queue *cq = sc->compl_queues;

    for (i = 0; i < sc->num_cqueues; i++)
        pthread_mutex_init(&cq[i].mtx, NULL);

pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)

    struct nvme_controller_data *cd = &sc->ctrldata;

    cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
    cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

    /* Num of submission commands that we can handle at a time (2^rab) */

    cd->mdts = NVME_MDTS;    /* max data transfer size (2^mdts * CAP.MPSMIN) */

    cd->ver = NVME_REV(1,4);

    cd->cntrltype = NVME_CNTRLTYPE_IO;
    cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
    cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);

    /* Advertise 1, Read-only firmware slot */
    cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
        (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
    cd->lpa = 0;     /* TODO: support some simple things like SMART */
    cd->elpe = 0;    /* max error log page entries */
    cd->npss = 1;    /* number of power states supported */
    /* Warning Composite Temperature Threshold */

    cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
        (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
    cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
        (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
    cd->nn = 1;    /* number of namespaces */

    switch (sc->dataset_management) {
    case NVME_DATASET_MANAGEMENT_AUTO:
        if (sc->nvstore.deallocate)
            cd->oncs |= NVME_ONCS_DSM;
    case NVME_DATASET_MANAGEMENT_ENABLE:
        cd->oncs |= NVME_ONCS_DSM;

    cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
        NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;

    cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;

    cd->power_state[0].mp = 10;

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
crc16(uint16_t crc, const void *buffer, unsigned int len)

    const unsigned char *cp = buffer;
    /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
    static uint16_t const crc16_table[256] = {
        0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
        0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
        0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
        0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
        0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
        0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
        0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
        0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
        0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
        0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
        0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
        0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
        0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
        0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
        0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
        0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
        0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
        0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
        0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
        0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
        0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
        0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
        0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
        0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
        0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
        0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
        0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
        0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
        0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
        0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
        0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
        0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040

        crc = (((crc >> 8) & 0xffU) ^
            crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)

    /* Get capacity and block size information from backing store */
    nd->nsze = nvstore->size / nvstore->sectsz;

pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)

    pci_nvme_init_nsdata_size(nvstore, nd);

    if (nvstore->type == NVME_STOR_BLOCKIF)
        nvstore->deallocate = blockif_candelete(nvstore->ctx);
    nd->nlbaf = 0;    /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
    /* Create an EUI-64 if user did not provide one */
    if (nvstore->eui64 == 0) {
        uint64_t eui64 = nvstore->eui64;

        asprintf(&data, "%s%u%u%u", get_config_value("name"),
            sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
            sc->nsc_pi->pi_func);

        eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));

        nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);

    be64enc(nd->eui64, nvstore->eui64);

    /* LBA data-sz = 2^lbads */
    nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;

pci_nvme_init_logpages(struct pci_nvme_softc *sc)

    memset(&sc->err_log, 0, sizeof(sc->err_log));
    memset(&sc->health_log, 0, sizeof(sc->health_log));
    memset(&sc->fw_log, 0, sizeof(sc->fw_log));
    memset(&sc->ns_log, 0, sizeof(sc->ns_log));

    /* Set read/write remainder to round up according to spec */
    sc->read_dunits_remainder = 999;
    sc->write_dunits_remainder = 999;
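    /*
     * Starting the remainders at 999 makes the accounting round up:
     * e.g., the very first 512-byte block read or written brings the
     * remainder to 1000, which immediately counts as one full data unit.
     */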
    /* Set nominal Health values checked by implementations */
    sc->health_log.temperature = NVME_TEMPERATURE;
    sc->health_log.available_spare = 100;
    sc->health_log.available_spare_threshold = 10;

pci_nvme_init_features(struct pci_nvme_softc *sc)

    enum nvme_feature fid;

    for (fid = 0; fid < NVME_FID_MAX; fid++) {
        case NVME_FEAT_ARBITRATION:
        case NVME_FEAT_POWER_MANAGEMENT:
        case NVME_FEAT_INTERRUPT_COALESCING: //XXX
        case NVME_FEAT_WRITE_ATOMICITY:
            /* Mandatory but no special handling required */
        //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
        //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
        //           this returns a data buffer
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
            sc->feat[fid].set = nvme_feature_temperature;
        case NVME_FEAT_ERROR_RECOVERY:
            sc->feat[fid].namespace_specific = true;
        case NVME_FEAT_NUMBER_OF_QUEUES:
            sc->feat[fid].set = nvme_feature_num_queues;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
            sc->feat[fid].set = nvme_feature_iv_config;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
            sc->feat[fid].set = nvme_feature_async_event;
            /* Enable all AENs by default */
            sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
            sc->feat[fid].set = nvme_feature_invalid_cb;
            sc->feat[fid].get = nvme_feature_invalid_cb;

pci_nvme_aer_reset(struct pci_nvme_softc *sc)

    STAILQ_INIT(&sc->aer_list);

pci_nvme_aer_init(struct pci_nvme_softc *sc)

    pthread_mutex_init(&sc->aer_mtx, NULL);
    pci_nvme_aer_reset(sc);

pci_nvme_aer_destroy(struct pci_nvme_softc *sc)

    struct pci_nvme_aer *aer = NULL;

    pthread_mutex_lock(&sc->aer_mtx);
    while (!STAILQ_EMPTY(&sc->aer_list)) {
        aer = STAILQ_FIRST(&sc->aer_list);
        STAILQ_REMOVE_HEAD(&sc->aer_list, link);
    pthread_mutex_unlock(&sc->aer_mtx);

    pci_nvme_aer_reset(sc);

pci_nvme_aer_available(struct pci_nvme_softc *sc)

    return (sc->aer_count != 0);

pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)

    struct nvme_controller_data *cd = &sc->ctrldata;
    /* AERL is a zero-based value while aer_count is one-based */
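    /* e.g., aerl == 3 would permit up to four outstanding AERs */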
    return (sc->aer_count == (cd->aerl + 1));

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)

    struct pci_nvme_aer *aer = NULL;

    aer = calloc(1, sizeof(struct pci_nvme_aer));

    /* Save the Command ID for use in the completion message */

    pthread_mutex_lock(&sc->aer_mtx);
    STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
    pthread_mutex_unlock(&sc->aer_mtx);

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)

    struct pci_nvme_aer *aer = NULL;

    pthread_mutex_lock(&sc->aer_mtx);
    aer = STAILQ_FIRST(&sc->aer_list);
        STAILQ_REMOVE_HEAD(&sc->aer_list, link);
    pthread_mutex_unlock(&sc->aer_mtx);

pci_nvme_aen_reset(struct pci_nvme_softc *sc)

    memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

    for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
        sc->aen[atype].atype = atype;

pci_nvme_aen_init(struct pci_nvme_softc *sc)

    pci_nvme_aen_reset(sc);

    pthread_mutex_init(&sc->aen_mtx, NULL);
    pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
    snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
        sc->nsc_pi->pi_func);
    pthread_set_name_np(sc->aen_tid, nstr);

pci_nvme_aen_destroy(struct pci_nvme_softc *sc)

    pci_nvme_aen_reset(sc);

/* Notify the AEN thread of pending work */
pci_nvme_aen_notify(struct pci_nvme_softc *sc)

    pthread_cond_signal(&sc->aen_cond);

/*
 * Post an Asynchronous Event Notification
 */
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,

    struct pci_nvme_aen *aen;

    if (atype >= PCI_NVME_AE_TYPE_MAX) {

    pthread_mutex_lock(&sc->aen_mtx);
    aen = &sc->aen[atype];

    /* Has the controller already posted an event of this type? */
        pthread_mutex_unlock(&sc->aen_mtx);

    aen->event_data = event_data;

    pthread_mutex_unlock(&sc->aen_mtx);

    pci_nvme_aen_notify(sc);

pci_nvme_aen_process(struct pci_nvme_softc *sc)

    struct pci_nvme_aer *aer;
    struct pci_nvme_aen *aen;
    pci_nvme_async_type atype;

    assert(pthread_mutex_isowned_np(&sc->aen_mtx));
    for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
        aen = &sc->aen[atype];
        /* Previous iterations may have depleted the available AER's */
        if (!pci_nvme_aer_available(sc)) {
            DPRINTF("%s: no AER", __func__);

            DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);

        status = NVME_SC_SUCCESS;

        /* Is the event masked? */
            sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

        DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__,
            atype, mask, aen->event_data);
        case PCI_NVME_AE_TYPE_ERROR:
            lid = NVME_LOG_ERROR;
        case PCI_NVME_AE_TYPE_SMART:
            if ((mask & aen->event_data) == 0)
            lid = NVME_LOG_HEALTH_INFORMATION;
        case PCI_NVME_AE_TYPE_NOTICE:
            if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
                EPRINTLN("%s unknown AEN notice type %u",
                    __func__, aen->event_data);
                status = NVME_SC_INTERNAL_DEVICE_ERROR;
            if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
            switch (aen->event_data) {
            case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
                lid = NVME_LOG_CHANGED_NAMESPACE;
            case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
                lid = NVME_LOG_FIRMWARE_SLOT;
            case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
                lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
            case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
                lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
            case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
                lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
            case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
                lid = NVME_LOG_LBA_STATUS_INFORMATION;
            case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
                lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;

            EPRINTLN("%s unknown AEN type %u", __func__, atype);
            status = NVME_SC_INTERNAL_DEVICE_ERROR;

        aer = pci_nvme_aer_get(sc);

        DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid,
            (lid << 16) | (aen->event_data << 8) | atype);
        pci_nvme_cq_update(sc, &sc->compl_queues[0],
            (lid << 16) | (aen->event_data << 8) | atype,    /* cdw0 */

        aen->event_data = 0;
        aen->posted = false;

        pci_generate_msix(sc->nsc_pi, 0);

    struct pci_nvme_softc *sc;

    pthread_mutex_lock(&sc->aen_mtx);
        pci_nvme_aen_process(sc);
        pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
    pthread_mutex_unlock(&sc->aen_mtx);

pci_nvme_reset_locked(struct pci_nvme_softc *sc)

    DPRINTF("%s", __func__);

    sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
        (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
        (60 << NVME_CAP_LO_REG_TO_SHIFT);

    sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

    sc->regs.vs = NVME_REV(1,4);    /* NVMe v1.4 */

    assert(sc->submit_queues != NULL);

    for (i = 0; i < sc->num_squeues + 1; i++) {
        sc->submit_queues[i].qbase = NULL;
        sc->submit_queues[i].size = 0;
        sc->submit_queues[i].cqid = 0;
        sc->submit_queues[i].tail = 0;
        sc->submit_queues[i].head = 0;

    assert(sc->compl_queues != NULL);

    for (i = 0; i < sc->num_cqueues + 1; i++) {
        sc->compl_queues[i].qbase = NULL;
        sc->compl_queues[i].size = 0;
        sc->compl_queues[i].tail = 0;
        sc->compl_queues[i].head = 0;

    sc->num_q_is_set = false;

    pci_nvme_aer_destroy(sc);
    pci_nvme_aen_destroy(sc);

    /*
     * Clear CSTS.RDY last to prevent the host from enabling Controller
     * before cleanup completes
     */

pci_nvme_reset(struct pci_nvme_softc *sc)

    pthread_mutex_lock(&sc->mtx);
    pci_nvme_reset_locked(sc);
    pthread_mutex_unlock(&sc->mtx);

pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)

    uint16_t acqs, asqs;

    DPRINTF("%s", __func__);

    asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
    sc->submit_queues[0].size = asqs;
    sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
        sizeof(struct nvme_command) * asqs);

    DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
        __func__, sc->regs.asq, sc->submit_queues[0].qbase);

    acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
        NVME_AQA_REG_ACQS_MASK) + 1;
    sc->compl_queues[0].size = acqs;
    sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
        sizeof(struct nvme_completion) * acqs);
    sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

    DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
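/*
 * Copy data between a guest physical address range described by PRP1/PRP2
 * and a host buffer. With only two PRP entries and no PRP list, at most two
 * pages can be mapped, which is what the 8 KiB length check below enforces
 * (assuming 4 KiB pages).
 */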
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)

    if (len > (8 * 1024)) {

    /* Copy from the start of prp1 to the end of the physical page */
    bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
    bytes = MIN(bytes, len);

    p = vm_map_gpa(ctx, prp1, bytes);

    if (dir == NVME_COPY_TO_PRP)
        memcpy(p, b, bytes);
        memcpy(b, p, bytes);

    len = MIN(len, PAGE_SIZE);

    p = vm_map_gpa(ctx, prp2, len);

    if (dir == NVME_COPY_TO_PRP)

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,

    struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
    struct nvme_completion *cqe;

    assert(cq->qbase != NULL);

    pthread_mutex_lock(&cq->mtx);

    cqe = &cq->qbase[cq->tail];

    /* Flip the phase bit */
    status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

    cqe->sqhd = sq->head;
    cqe->status = status;

    if (cq->tail >= cq->size) {

    pthread_mutex_unlock(&cq->mtx);

nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

    uint16_t qid = command->cdw10 & 0xffff;

    DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
    if (qid == 0 || qid > sc->num_squeues ||
        (sc->submit_queues[qid].qbase == NULL)) {
        WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
            __func__, qid, sc->num_squeues);
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_QUEUE_IDENTIFIER);

    sc->submit_queues[qid].qbase = NULL;
    sc->submit_queues[qid].cqid = 0;
    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

    if (command->cdw11 & NVME_CMD_CDW11_PC) {
        uint16_t qid = command->cdw10 & 0xffff;
        struct nvme_submission_queue *nsq;

        if ((qid == 0) || (qid > sc->num_squeues) ||
            (sc->submit_queues[qid].qbase != NULL)) {
            WPRINTF("%s queue index %u > num_squeues %u",
                __func__, qid, sc->num_squeues);
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_QUEUE_IDENTIFIER);

        nsq = &sc->submit_queues[qid];
        nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
        DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
        if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
            /*
             * Queues must specify at least two entries
             * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
             * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
             */
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);

        nsq->head = nsq->tail = 0;

        nsq->cqid = (command->cdw11 >> 16) & 0xffff;
        if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_QUEUE_IDENTIFIER);

        if (sc->compl_queues[nsq->cqid].qbase == NULL) {
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_COMPLETION_QUEUE_INVALID);

        nsq->qpriority = (command->cdw11 >> 1) & 0x03;

        nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
            sizeof(struct nvme_command) * (size_t)nsq->size);

        DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
            qid, nsq->size, nsq->qbase, nsq->cqid);

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        DPRINTF("%s completed creating IOSQ qid %u",

        /*
         * Guest sent non-cont submission queue request.
         * This setting is unsupported by this emulation.
         */
        WPRINTF("%s unsupported non-contig (list-based) "
            "create i/o submission queue", __func__);

        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

    uint16_t qid = command->cdw10 & 0xffff;

    DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
    if (qid == 0 || qid > sc->num_cqueues ||
        (sc->compl_queues[qid].qbase == NULL)) {
        WPRINTF("%s queue index %u / num_cqueues %u",
            __func__, qid, sc->num_cqueues);
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_QUEUE_IDENTIFIER);

    /* Deleting an Active CQ is an error */
    for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
        if (sc->submit_queues[sqid].cqid == qid) {
            pci_nvme_status_tc(&compl->status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_QUEUE_DELETION);

    sc->compl_queues[qid].qbase = NULL;
    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

    struct nvme_completion_queue *ncq;
    uint16_t qid = command->cdw10 & 0xffff;

    /* Only support Physically Contiguous queues */
    if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
        WPRINTF("%s unsupported non-contig (list-based) "
            "create i/o completion queue",

        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    if ((qid == 0) || (qid > sc->num_cqueues) ||
        (sc->compl_queues[qid].qbase != NULL)) {
        WPRINTF("%s queue index %u > num_cqueues %u",
            __func__, qid, sc->num_cqueues);
        pci_nvme_status_tc(&compl->status,
            NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_QUEUE_IDENTIFIER);

    ncq = &sc->compl_queues[qid];
    ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
    ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
    if (ncq->intr_vec > (sc->max_queues + 1)) {
        pci_nvme_status_tc(&compl->status,
            NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_INTERRUPT_VECTOR);

    ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
    if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
        /*
         * Queues must specify at least two entries
         * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
         * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
         */
        pci_nvme_status_tc(&compl->status,
            NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);

    ncq->head = ncq->tail = 0;
    ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
        sizeof(struct nvme_command) * (size_t)ncq->size);

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

    uint8_t logpage = command->cdw10 & 0xFF;

    DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

    /*
     * Command specifies the number of dwords to return in fields NUMDU
     * and NUMDL. This is a zero-based value.
     */
    logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
    logsize *= sizeof(uint32_t);
    logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

    case NVME_LOG_ERROR:
        if (logoff >= sizeof(sc->err_log)) {
            pci_nvme_status_genc(&compl->status,
                NVME_SC_INVALID_FIELD);

        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->err_log + logoff,
            MIN(logsize - logoff, sizeof(sc->err_log)),
    case NVME_LOG_HEALTH_INFORMATION:
        if (logoff >= sizeof(sc->health_log)) {
            pci_nvme_status_genc(&compl->status,
                NVME_SC_INVALID_FIELD);

        pthread_mutex_lock(&sc->mtx);
        memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
            sizeof(sc->health_log.data_units_read));
        memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
            sizeof(sc->health_log.data_units_written));
        memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
            sizeof(sc->health_log.host_read_commands));
        memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
            sizeof(sc->health_log.host_write_commands));
        pthread_mutex_unlock(&sc->mtx);

        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->health_log + logoff,
            MIN(logsize - logoff, sizeof(sc->health_log)),
    case NVME_LOG_FIRMWARE_SLOT:
        if (logoff >= sizeof(sc->fw_log)) {
            pci_nvme_status_genc(&compl->status,
                NVME_SC_INVALID_FIELD);

        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->fw_log + logoff,
            MIN(logsize - logoff, sizeof(sc->fw_log)),
    case NVME_LOG_CHANGED_NAMESPACE:
        if (logoff >= sizeof(sc->ns_log)) {
            pci_nvme_status_genc(&compl->status,
                NVME_SC_INVALID_FIELD);

        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->ns_log + logoff,
            MIN(logsize - logoff, sizeof(sc->ns_log)),
        memset(&sc->ns_log, 0, sizeof(sc->ns_log));

        DPRINTF("%s get log page %x command not supported",

        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_LOG_PAGE);

nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

    DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
        command->cdw10 & 0xFF, command->nsid);

    pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

    switch (command->cdw10 & 0xFF) {
    case 0x00: /* return Identify Namespace data structure */
        /* Global NS only valid with NS Management */
        if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
            pci_nvme_status_genc(&status,
                NVME_SC_INVALID_NAMESPACE_OR_FORMAT);

        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
    case 0x01: /* return Identify Controller data structure */
        nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
            command->prp2, (uint8_t *)&sc->ctrldata,
            sizeof(sc->ctrldata),
    case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
        dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
            sizeof(uint32_t) * 1024);
        /* All unused entries shall be zero */
        bzero(dest, sizeof(uint32_t) * 1024);
        ((uint32_t *)dest)[0] = 1;
    case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
        if (command->nsid != 1) {
            pci_nvme_status_genc(&status,
                NVME_SC_INVALID_NAMESPACE_OR_FORMAT);

        dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
            sizeof(uint32_t) * 1024);
        /* All bytes after the descriptor shall be zero */
        bzero(dest, sizeof(uint32_t) * 1024);

        /* Return NIDT=1 (i.e. EUI64) descriptor */
        ((uint8_t *)dest)[0] = 1;
        ((uint8_t *)dest)[1] = sizeof(uint64_t);
        bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));

        DPRINTF("%s unsupported identify command requested 0x%x",
            __func__, command->cdw10 & 0xFF);
        pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);

    compl->status = status;

nvme_fid_to_name(uint8_t fid)

    case NVME_FEAT_ARBITRATION:
        name = "Arbitration";
    case NVME_FEAT_POWER_MANAGEMENT:
        name = "Power Management";
    case NVME_FEAT_LBA_RANGE_TYPE:
        name = "LBA Range Type";
    case NVME_FEAT_TEMPERATURE_THRESHOLD:
        name = "Temperature Threshold";
    case NVME_FEAT_ERROR_RECOVERY:
        name = "Error Recovery";
    case NVME_FEAT_VOLATILE_WRITE_CACHE:
        name = "Volatile Write Cache";
    case NVME_FEAT_NUMBER_OF_QUEUES:
        name = "Number of Queues";
    case NVME_FEAT_INTERRUPT_COALESCING:
        name = "Interrupt Coalescing";
    case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
        name = "Interrupt Vector Configuration";
    case NVME_FEAT_WRITE_ATOMICITY:
        name = "Write Atomicity Normal";
    case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
        name = "Asynchronous Event Configuration";
    case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
        name = "Autonomous Power State Transition";
    case NVME_FEAT_HOST_MEMORY_BUFFER:
        name = "Host Memory Buffer";
    case NVME_FEAT_TIMESTAMP:
    case NVME_FEAT_KEEP_ALIVE_TIMER:
        name = "Keep Alive Timer";
    case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
        name = "Host Controlled Thermal Management";
    case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
        name = "Non-Operational Power State Config";
    case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
        name = "Read Recovery Level Config";
    case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
        name = "Predictable Latency Mode Config";
    case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
        name = "Predictable Latency Mode Window";
    case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
        name = "LBA Status Information Report Interval";
    case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
        name = "Host Behavior Support";
    case NVME_FEAT_SANITIZE_CONFIG:
        name = "Sanitize Config";
    case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
        name = "Endurance Group Event Configuration";
    case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
        name = "Software Progress Marker";
    case NVME_FEAT_HOST_IDENTIFIER:
        name = "Host Identifier";
    case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
        name = "Reservation Notification Mask";
    case NVME_FEAT_RESERVATION_PERSISTENCE:
        name = "Reservation Persistence";
    case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
        name = "Namespace Write Protection Config";

nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)

    pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)

    uint32_t cdw11 = command->cdw11;

    pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    iv = cdw11 & 0xffff;
    cd = cdw11 & (1 << 16);

    if (iv > (sc->max_queues + 1)) {

    /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
    if ((iv == 0) && !cd)

    /* Requested Interrupt Vector must be used by a CQ */
    for (i = 0; i < sc->num_cqueues + 1; i++) {
        if (sc->compl_queues[i].intr_vec == iv) {
            pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

#define NVME_ASYNC_EVENT_ENDURANCE_GROUP    (0x4000)
nvme_feature_async_event(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)

    if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

#define NVME_TEMP_THRESH_OVER    0
#define NVME_TEMP_THRESH_UNDER   1
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)

    uint16_t tmpth;     /* Temperature Threshold */
    uint8_t  tmpsel;    /* Threshold Temperature Select */
    uint8_t  thsel;     /* Threshold Type Select */
    bool     set_crit = false;

    tmpth = command->cdw11 & 0xffff;
    tmpsel = (command->cdw11 >> 16) & 0xf;
    thsel = (command->cdw11 >> 20) & 0x3;

    DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel,
        thsel);

    /* Check for unsupported values */
    if (((tmpsel != 0) && (tmpsel != 0xf)) ||
        (thsel > NVME_TEMP_THRESH_UNDER)) {
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) ||
        ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))

    pthread_mutex_lock(&sc->mtx);
        sc->health_log.critical_warning |=
            NVME_CRIT_WARN_ST_TEMPERATURE;
        sc->health_log.critical_warning &=
            ~NVME_CRIT_WARN_ST_TEMPERATURE;
    pthread_mutex_unlock(&sc->mtx);

        pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
            sc->health_log.critical_warning);

    DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__,
        set_crit ? 'T' : 'F', sc->health_log.critical_warning, compl->status);

nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)

    uint16_t nqr;    /* Number of Queues Requested */

    if (sc->num_q_is_set) {
        WPRINTF("%s: Number of Queues already set", __func__);
        pci_nvme_status_genc(&compl->status,
            NVME_SC_COMMAND_SEQUENCE_ERROR);

    nqr = command->cdw11 & 0xFFFF;
    if (nqr == 0xffff) {
        WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    sc->num_squeues = ONE_BASED(nqr);
    if (sc->num_squeues > sc->max_queues) {
        DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
        sc->num_squeues = sc->max_queues;

    nqr = (command->cdw11 >> 16) & 0xFFFF;
    if (nqr == 0xffff) {
        WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    sc->num_cqueues = ONE_BASED(nqr);
    if (sc->num_cqueues > sc->max_queues) {
        DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
        sc->num_cqueues = sc->max_queues;

    /* Patch the command value which will be saved on callback's return */
    command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
    compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

    sc->num_q_is_set = true;

nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)

    struct nvme_feature_obj *feat;
    uint32_t nsid = command->nsid;
    uint8_t fid = command->cdw10 & 0xFF;

    DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

    if (fid >= NVME_FID_MAX) {
        DPRINTF("%s invalid feature 0x%x", __func__, fid);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    feat = &sc->feat[fid];

    if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    if (!feat->namespace_specific &&
        !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_FEATURE_NOT_NS_SPECIFIC);

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

    feat->set(sc, feat, command, compl);

    DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status,
        command->cdw11);
    if (compl->status == NVME_SC_SUCCESS) {
        feat->cdw11 = command->cdw11;
        if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
            (command->cdw11 != 0))
            pci_nvme_aen_notify(sc);

#define NVME_FEATURES_SEL_SUPPORTED    0x3
#define NVME_FEATURES_NS_SPECIFIC      (1 << 1)

nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

    struct nvme_feature_obj *feat;
    uint8_t fid = command->cdw10 & 0xFF;
    uint8_t sel = (command->cdw10 >> 8) & 0x7;

    DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

    if (fid >= NVME_FID_MAX) {
        DPRINTF("%s invalid feature 0x%x", __func__, fid);
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

    feat = &sc->feat[fid];

    feat->get(sc, feat, command, compl);

    if (compl->status == NVME_SC_SUCCESS) {
        if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
            compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
            compl->cdw0 = feat->cdw11;

nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

    uint8_t ses, lbaf, pi;

    /* Only supports Secure Erase Setting - User Data Erase */
    ses = (command->cdw10 >> 9) & 0x7;
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    /* Only supports a single LBA Format */
    lbaf = command->cdw10 & 0xf;
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_INVALID_FORMAT);
    /* Doesn't support Protection Information */
    pi = (command->cdw10 >> 5) & 0x7;
        pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

    if (sc->nvstore.type == NVME_STOR_RAM) {
        if (sc->nvstore.ctx)
            free(sc->nvstore.ctx);
        sc->nvstore.ctx = calloc(1, sc->nvstore.size);
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        struct pci_nvme_ioreq *req;

        req = pci_nvme_get_ioreq(sc);
            pci_nvme_status_genc(&compl->status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
            WPRINTF("%s: unable to allocate IO req", __func__);
        req->nvme_sq = &sc->submit_queues[0];
        req->opc = command->opc;
        req->cid = command->cid;
        req->nsid = command->nsid;

        req->io_req.br_offset = 0;
        req->io_req.br_resid = sc->nvstore.size;
        req->io_req.br_callback = pci_nvme_io_done;

        err = blockif_delete(sc->nvstore.ctx, &req->io_req);
            pci_nvme_status_genc(&compl->status,
                NVME_SC_INTERNAL_DEVICE_ERROR);
            pci_nvme_release_ioreq(sc, req);
            compl->status = NVME_NO_STATUS;

nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)

    DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

    /* TODO: search for the command ID and abort it */

    pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)

    DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
        sc->aer_count, sc->ctrldata.aerl, command->cid);

    /* Don't exceed the Async Event Request Limit (AERL). */
    if (pci_nvme_aer_limit_reached(sc)) {
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);

    if (pci_nvme_aer_add(sc, command->cid)) {
        pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
            NVME_SC_INTERNAL_DEVICE_ERROR);
    /*
     * Raise events when they happen based on the Set Features cmd.
     * These events happen asynchronously, so only complete this command
     * once an event matching the request actually occurs.
     */
    compl->status = NVME_NO_STATUS;
    pci_nvme_aen_notify(sc);
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)

    struct nvme_completion compl;
    struct nvme_command *cmd;
    struct nvme_submission_queue *sq;
    struct nvme_completion_queue *cq;

    DPRINTF("%s index %u", __func__, (uint32_t)value);

    sq = &sc->submit_queues[0];
    cq = &sc->compl_queues[0];

    pthread_mutex_lock(&sq->mtx);

    DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

    while (sqhead != atomic_load_acq_short(&sq->tail)) {
        cmd = &(sq->qbase)[sqhead];

        case NVME_OPC_DELETE_IO_SQ:
            DPRINTF("%s command DELETE_IO_SQ", __func__);
            nvme_opc_delete_io_sq(sc, cmd, &compl);
        case NVME_OPC_CREATE_IO_SQ:
            DPRINTF("%s command CREATE_IO_SQ", __func__);
            nvme_opc_create_io_sq(sc, cmd, &compl);
        case NVME_OPC_DELETE_IO_CQ:
            DPRINTF("%s command DELETE_IO_CQ", __func__);
            nvme_opc_delete_io_cq(sc, cmd, &compl);
        case NVME_OPC_CREATE_IO_CQ:
            DPRINTF("%s command CREATE_IO_CQ", __func__);
            nvme_opc_create_io_cq(sc, cmd, &compl);
        case NVME_OPC_GET_LOG_PAGE:
            DPRINTF("%s command GET_LOG_PAGE", __func__);
            nvme_opc_get_log_page(sc, cmd, &compl);
        case NVME_OPC_IDENTIFY:
            DPRINTF("%s command IDENTIFY", __func__);
            nvme_opc_identify(sc, cmd, &compl);
        case NVME_OPC_ABORT:
            DPRINTF("%s command ABORT", __func__);
            nvme_opc_abort(sc, cmd, &compl);
        case NVME_OPC_SET_FEATURES:
            DPRINTF("%s command SET_FEATURES", __func__);
            nvme_opc_set_features(sc, cmd, &compl);
        case NVME_OPC_GET_FEATURES:
            DPRINTF("%s command GET_FEATURES", __func__);
            nvme_opc_get_features(sc, cmd, &compl);
        case NVME_OPC_FIRMWARE_ACTIVATE:
            DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
            pci_nvme_status_tc(&compl.status,
                NVME_SCT_COMMAND_SPECIFIC,
                NVME_SC_INVALID_FIRMWARE_SLOT);
        case NVME_OPC_ASYNC_EVENT_REQUEST:
            DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
            nvme_opc_async_event_req(sc, cmd, &compl);
        case NVME_OPC_FORMAT_NVM:
            DPRINTF("%s command FORMAT_NVM", __func__);
            if ((sc->ctrldata.oacs &
                (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
                pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
            nvme_opc_format_nvm(sc, cmd, &compl);
        case NVME_OPC_SECURITY_SEND:
        case NVME_OPC_SECURITY_RECEIVE:
        case NVME_OPC_SANITIZE:
        case NVME_OPC_GET_LBA_STATUS:
            DPRINTF("%s command OPC=%#x (unsupported)", __func__,
            /* Valid but unsupported opcodes */
            pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
            DPRINTF("%s command OPC=%#X (not implemented)",
            pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);

        sqhead = (sqhead + 1) % sq->size;

        if (NVME_COMPLETION_VALID(compl)) {
            pci_nvme_cq_update(sc, &sc->compl_queues[0],

    DPRINTF("setting sqhead %u", sqhead);

    if (cq->head != cq->tail)
        pci_generate_msix(sc->nsc_pi, 0);

    pthread_mutex_unlock(&sq->mtx);
/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512-byte blocks, rounded up.
 * E.g. 1 data unit covers 1 - 1,000 512-byte blocks, and 3 data units cover
 * 2,001 - 3,000 512-byte blocks. Rounding up is achieved by initializing the
 * remainder to 999.
 */
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)

    pthread_mutex_lock(&sc->mtx);
    case NVME_OPC_WRITE:
        sc->write_commands++;
        if (status != NVME_SC_SUCCESS)
        sc->write_dunits_remainder += (bytes / 512);
        while (sc->write_dunits_remainder >= 1000) {
            sc->write_data_units++;
            sc->write_dunits_remainder -= 1000;

        sc->read_commands++;
        if (status != NVME_SC_SUCCESS)
        sc->read_dunits_remainder += (bytes / 512);
        while (sc->read_dunits_remainder >= 1000) {
            sc->read_data_units++;
            sc->read_dunits_remainder -= 1000;

        DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);

    pthread_mutex_unlock(&sc->mtx);

/*
 * Check if the combination of Starting LBA (slba) and number of blocks
 * exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
 */
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,

    size_t offset, bytes;

    /* Overflow check of multiplying Starting LBA by the sector size */
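    /* e.g., with 512-byte sectors (sectsz_bits == 9), any slba >= 2^55
     * would overflow the 64-bit result of slba << 9, and is caught here */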
2156 if (slba >> (64 - nvstore->sectsz_bits))
2159 offset = slba << nvstore->sectsz_bits;
2160 bytes = nblocks << nvstore->sectsz_bits;
2162 /* Overflow check of Number of Logical Blocks */
2163 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2170 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2171 uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
2178 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2182 /* concatenate contig block-iovs to minimize number of iovs */
2183 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
2184 iovidx = req->io_req.br_iovcnt - 1;
2186 req->io_req.br_iov[iovidx].iov_base =
2187 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2188 req->prev_gpaddr, size);
2190 req->prev_size += size;
2191 req->io_req.br_resid += size;
2193 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2195 iovidx = req->io_req.br_iovcnt;
2197 req->io_req.br_offset = lba;
2198 req->io_req.br_resid = 0;
2199 req->io_req.br_param = req;
2202 req->io_req.br_iov[iovidx].iov_base =
2203 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2206 req->io_req.br_iov[iovidx].iov_len = size;
2208 req->prev_gpaddr = gpaddr;
2209 req->prev_size = size;
2210 req->io_req.br_resid += size;
2212 req->io_req.br_iovcnt++;
2219 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2220 struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2221 uint32_t cdw0, uint16_t status)
2223 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2225 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2226 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2227 NVME_STATUS_GET_SC(status));
2229 pci_nvme_cq_update(sc, cq,
2235 if (cq->head != cq->tail) {
2236 if (cq->intr_en & NVME_CQ_INTEN) {
2237 pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2239 DPRINTF("%s: CQ%u interrupt disabled",
2240 __func__, sq->cqid);
2246 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2249 req->nvme_sq = NULL;
2252 pthread_mutex_lock(&sc->mtx);
2254 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2257 /* when no more IO pending, can set to ready if device reset/enabled */
2258 if (sc->pending_ios == 0 &&
2259 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2260 sc->regs.csts |= NVME_CSTS_RDY;
2262 pthread_mutex_unlock(&sc->mtx);
2264 sem_post(&sc->iosemlock);
2267 static struct pci_nvme_ioreq *
2268 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2270 struct pci_nvme_ioreq *req = NULL;
2272 sem_wait(&sc->iosemlock);
2273 pthread_mutex_lock(&sc->mtx);
2275 req = STAILQ_FIRST(&sc->ioreqs_free);
2276 assert(req != NULL);
2277 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2283 pthread_mutex_unlock(&sc->mtx);
2285 req->io_req.br_iovcnt = 0;
2286 req->io_req.br_offset = 0;
2287 req->io_req.br_resid = 0;
2288 req->io_req.br_param = req;
2289 req->prev_gpaddr = 0;
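
/*
 * Illustrative sketch of the ioreq pool pattern used by
 * pci_nvme_get_ioreq()/pci_nvme_release_ioreq() (hypothetical types,
 * not part of the emulation): a counting semaphore initialized to the
 * slot count bounds concurrency, while a mutex-protected STAILQ hands
 * out the structures. Initialize with STAILQ_INIT() and
 * sem_init(&sem, 0, nslots) before use.
 */
struct example_item {
	STAILQ_ENTRY(example_item) link;
};

struct example_pool {
	sem_t sem;			/* counts available slots */
	pthread_mutex_t mtx;		/* protects free_list */
	STAILQ_HEAD(, example_item) free_list;
};

static struct example_item *
example_pool_get(struct example_pool *p)
{
	struct example_item *it;

	sem_wait(&p->sem);		/* may block until a slot frees up */
	pthread_mutex_lock(&p->mtx);
	it = STAILQ_FIRST(&p->free_list);
	STAILQ_REMOVE_HEAD(&p->free_list, link);
	pthread_mutex_unlock(&p->mtx);
	return (it);
}

static void
example_pool_put(struct example_pool *p, struct example_item *it)
{
	pthread_mutex_lock(&p->mtx);
	STAILQ_INSERT_TAIL(&p->free_list, it, link);
	pthread_mutex_unlock(&p->mtx);
	sem_post(&p->sem);		/* wake one blocked submitter */
}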
2296 pci_nvme_io_done(struct blockif_req *br, int err)
2298 struct pci_nvme_ioreq *req = br->br_param;
2299 struct nvme_submission_queue *sq = req->nvme_sq;
2300 uint16_t code, status;
2302 DPRINTF("%s error %d %s", __func__, err, strerror(err));
2304 /* TODO return correct error */
2305 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2306 pci_nvme_status_genc(&status, code);
2308 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2309 pci_nvme_stats_write_read_update(req->sc, req->opc,
2310 req->bytes, status);
2311 pci_nvme_release_ioreq(req->sc, req);
2315 * Implements the Flush command. The specification states:
2316 *    If a volatile write cache is not present, Flush commands complete
2317 *    successfully and have no effect
2318 * in the description of the Volatile Write Cache (VWC) field of the Identify
2319 * Controller data. Therefore, set status to Success if the command is
2320 * not supported (i.e. RAM or as indicated by the blockif).
2323 nvme_opc_flush(struct pci_nvme_softc *sc,
2324 struct nvme_command *cmd,
2325 struct pci_nvme_blockstore *nvstore,
2326 struct pci_nvme_ioreq *req,
2329 bool pending = false;
2331 if (nvstore->type == NVME_STOR_RAM) {
2332 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2333 } else {
2334 int err;
2336 req->io_req.br_callback = pci_nvme_io_done;
2338 err = blockif_flush(nvstore->ctx, &req->io_req);
2344 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2347 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2355 nvme_write_read_ram(struct pci_nvme_softc *sc,
2356 struct pci_nvme_blockstore *nvstore,
2357 uint64_t prp1, uint64_t prp2,
2358 size_t offset, uint64_t bytes,
2361 uint8_t *buf = nvstore->ctx;
2362 enum nvme_copy_dir dir;
2365 if (is_write)
2366 dir = NVME_COPY_TO_PRP;
2367 else
2368 dir = NVME_COPY_FROM_PRP;
2370 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2371 buf + offset, bytes, dir))
2372 pci_nvme_status_genc(&status,
2373 NVME_SC_DATA_TRANSFER_ERROR);
2375 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2381 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2382 struct pci_nvme_blockstore *nvstore,
2383 struct pci_nvme_ioreq *req,
2384 uint64_t prp1, uint64_t prp2,
2385 size_t offset, uint64_t bytes,
2390 uint16_t status = NVME_NO_STATUS;
2392 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2393 if (pci_nvme_append_iov_req(sc, req, prp1,
2394 size, is_write, offset)) {
2395 pci_nvme_status_genc(&status,
2396 NVME_SC_DATA_TRANSFER_ERROR);
2405 } else if (bytes <= PAGE_SIZE) {
2406 size = bytes;
2407 if (pci_nvme_append_iov_req(sc, req, prp2,
2408 size, is_write, offset)) {
2409 pci_nvme_status_genc(&status,
2410 NVME_SC_DATA_TRANSFER_ERROR);
2414 void *vmctx = sc->nsc_pi->pi_vmctx;
2415 uint64_t *prp_list = &prp2;
2416 uint64_t *last = prp_list;
2418 /* PRP2 is pointer to a physical region page list */
2419 while (bytes) {
2420 /* Last entry in list points to the next list */
2421 if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2422 uint64_t prp = *prp_list;
2424 prp_list = paddr_guest2host(vmctx, prp,
2425 PAGE_SIZE - (prp % PAGE_SIZE));
2426 last = prp_list + (NVME_PRP2_ITEMS - 1);
2429 size = MIN(bytes, PAGE_SIZE);
2431 if (pci_nvme_append_iov_req(sc, req, *prp_list,
2432 size, is_write, offset)) {
2433 pci_nvme_status_genc(&status,
2434 NVME_SC_DATA_TRANSFER_ERROR);
2444 req->io_req.br_callback = pci_nvme_io_done;
2445 if (is_write)
2446 err = blockif_write(nvstore->ctx, &req->io_req);
2447 else
2448 err = blockif_read(nvstore->ctx, &req->io_req);
2450 if (err)
2451 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
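
/*
 * Illustrative sketch of the PRP accounting above (hypothetical
 * helper, not part of the emulation): how many PRP entries a transfer
 * consumes. The first entry (PRP1) may start mid-page and covers only
 * the rest of that page; every later entry is a full, page-aligned
 * page. Transfers needing more than two entries use a PRP list in
 * PRP2, walked as in nvme_write_read_blockif() above.
 */
static size_t
example_prp_entry_count(uint64_t prp1, size_t bytes)
{
	size_t first = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	size_t rest = bytes - first;

	/* one entry for PRP1 plus one per remaining page */
	return (1 + (rest + PAGE_SIZE - 1) / PAGE_SIZE);
}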
2457 nvme_opc_write_read(struct pci_nvme_softc *sc,
2458 struct nvme_command *cmd,
2459 struct pci_nvme_blockstore *nvstore,
2460 struct pci_nvme_ioreq *req,
2463 uint64_t lba, nblocks, bytes;
2465 bool is_write = cmd->opc == NVME_OPC_WRITE;
2466 bool pending = false;
2468 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2469 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2471 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2472 WPRINTF("%s command would exceed LBA range (slba=%#lx nblocks=%#lx)",
2473 __func__, lba, nblocks);
2474 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2478 bytes = nblocks << nvstore->sectsz_bits;
2479 if (bytes > NVME_MAX_DATA_SIZE) {
2480 WPRINTF("%s command would exceed MDTS", __func__);
2481 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2485 offset = lba << nvstore->sectsz_bits;
2488 req->io_req.br_offset = lba;
2490 /* PRP bits 1:0 must be zero */
2491 cmd->prp1 &= ~0x3UL;
2492 cmd->prp2 &= ~0x3UL;
2494 if (nvstore->type == NVME_STOR_RAM) {
2495 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2496 cmd->prp2, offset, bytes, is_write);
2497 } else {
2498 *status = nvme_write_read_blockif(sc, nvstore, req,
2499 cmd->prp1, cmd->prp2, offset, bytes, is_write);
2501 if (*status == NVME_NO_STATUS)
2502 pending = true;
2503 }
2504 out:
2505 if (!pending)
2506 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
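
/*
 * Illustrative sketch of the Read/Write decode above (hypothetical
 * helper, not part of the emulation): the Starting LBA spans two
 * command dwords, and the 16-bit Number of Logical Blocks in CDW12 is
 * zero-based, so a raw value of 0 means one block and 0xFFFF means
 * 65,536 blocks.
 */
static void
example_rw_decode(uint32_t cdw10, uint32_t cdw11, uint32_t cdw12,
    uint64_t *slba, uint32_t *nlb)
{
	*slba = ((uint64_t)cdw11 << 32) | cdw10;
	*nlb = (cdw12 & 0xFFFF) + 1;	/* zero-based field */
}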
2512 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2514 struct pci_nvme_ioreq *req = br->br_param;
2515 struct pci_nvme_softc *sc = req->sc;
2520 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2521 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2522 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2524 struct iovec *iov = req->io_req.br_iov;
2527 iov += req->prev_gpaddr;
2529 /* The iov_* values already include the sector size */
2530 req->io_req.br_offset = (off_t)iov->iov_base;
2531 req->io_req.br_resid = iov->iov_len;
2532 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2533 pci_nvme_status_genc(&status,
2534 NVME_SC_INTERNAL_DEVICE_ERROR);
2540 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2541 req->cid, 0, status);
2542 pci_nvme_release_ioreq(sc, req);
2547 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2548 struct nvme_command *cmd,
2549 struct pci_nvme_blockstore *nvstore,
2550 struct pci_nvme_ioreq *req,
2553 struct nvme_dsm_range *range;
2554 uint32_t nr, r, non_zero, dr;
2556 bool pending = false;
2558 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2559 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2563 nr = cmd->cdw10 & 0xff;
2565 /* copy locally because a range entry could straddle PRPs */
2566 range = calloc(1, NVME_MAX_DSM_TRIM);
2567 if (range == NULL) {
2568 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2571 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2572 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2574 /* Check for invalid ranges and the number of non-zero lengths */
2576 for (r = 0; r <= nr; r++) {
2577 if (pci_nvme_out_of_range(nvstore,
2578 range[r].starting_lba, range[r].length)) {
2579 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2582 if (range[r].length != 0)
2583 non_zero++;
2586 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2587 size_t offset, bytes;
2588 int sectsz_bits = sc->nvstore.sectsz_bits;
2591 * DSM calls are advisory only, and compliant controllers
2592 * may choose to take no actions (i.e. return Success).
2594 if (!nvstore->deallocate) {
2595 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2599 /* If all ranges have a zero length, return Success */
2600 if (non_zero == 0) {
2601 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2606 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2610 offset = range[0].starting_lba << sectsz_bits;
2611 bytes = range[0].length << sectsz_bits;
2614 * If the request is for more than a single range, store
2615 * the ranges in the br_iov. Optimize for the common case
2616 * of a single range.
2618 * Note that the NVMe Number of Ranges is a zero-based value
2620 req->io_req.br_iovcnt = 0;
2621 req->io_req.br_offset = offset;
2622 req->io_req.br_resid = bytes;
2625 req->io_req.br_callback = pci_nvme_io_done;
2627 struct iovec *iov = req->io_req.br_iov;
2629 for (r = 0, dr = 0; r <= nr; r++) {
2630 offset = range[r].starting_lba << sectsz_bits;
2631 bytes = range[r].length << sectsz_bits;
2635 if ((nvstore->size - offset) < bytes) {
2636 pci_nvme_status_genc(status,
2637 NVME_SC_LBA_OUT_OF_RANGE);
2640 iov[dr].iov_base = (void *)offset;
2641 iov[dr].iov_len = bytes;
2644 req->io_req.br_callback = pci_nvme_dealloc_sm;
2647 * Use prev_gpaddr to track the current entry and
2648 * prev_size to track the number of entries
2650 req->prev_gpaddr = 0;
2651 req->prev_size = dr;
2654 err = blockif_delete(nvstore->ctx, &req->io_req);
2655 if (err)
2656 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
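
/*
 * Illustrative sketch of the DSM decode above (hypothetical helper,
 * not part of the emulation): sum the blocks covered by a deallocate
 * request. The range count in CDW10 is zero-based, and each entry's
 * length is in logical blocks. Assumes the ranges were already copied
 * out of guest memory, as done above via nvme_prp_memcpy().
 */
static uint64_t
example_dsm_total_blocks(const struct nvme_dsm_range *range, uint32_t cdw10)
{
	uint32_t nr = (cdw10 & 0xff) + 1;	/* zero-based count */
	uint64_t blocks = 0;

	for (uint32_t r = 0; r < nr; r++)
		blocks += range[r].length;
	return (blocks);
}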
2666 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2668 struct nvme_submission_queue *sq;
2672 /* handle all submissions up to sq->tail index */
2673 sq = &sc->submit_queues[idx];
2675 pthread_mutex_lock(&sq->mtx);
2678 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2679 idx, sqhead, sq->tail, sq->qbase);
2681 while (sqhead != atomic_load_acq_short(&sq->tail)) {
2682 struct nvme_command *cmd;
2683 struct pci_nvme_ioreq *req;
2691 cmd = &sq->qbase[sqhead];
2692 sqhead = (sqhead + 1) % sq->size;
2694 nsid = le32toh(cmd->nsid);
2695 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2696 pci_nvme_status_genc(&status,
2697 NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2698 status |=
2699 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2703 req = pci_nvme_get_ioreq(sc);
2704 if (req == NULL) {
2705 pci_nvme_status_genc(&status,
2706 NVME_SC_INTERNAL_DEVICE_ERROR);
2707 WPRINTF("%s: unable to allocate IO req", __func__);
2712 req->opc = cmd->opc;
2713 req->cid = cmd->cid;
2714 req->nsid = cmd->nsid;
2717 case NVME_OPC_FLUSH:
2718 pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2721 case NVME_OPC_WRITE:
2722 case NVME_OPC_READ:
2723 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2726 case NVME_OPC_WRITE_ZEROES:
2727 /* TODO: write zeroes
2728 WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2729 __func__, lba, cmd->cdw12 & 0xFFFF); */
2730 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2732 case NVME_OPC_DATASET_MANAGEMENT:
2733 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2737 WPRINTF("%s unhandled io command 0x%x",
2738 __func__, cmd->opc);
2739 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2743 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2746 pci_nvme_release_ioreq(sc, req);
2752 pthread_mutex_unlock(&sq->mtx);
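
/*
 * Illustrative sketch of the ring arithmetic used by the loop above
 * (hypothetical helper, not part of the emulation): the number of
 * submission queue entries still to consume, given the guest-written
 * tail and the emulation-owned head, wrapping modulo the queue size.
 */
static uint16_t
example_sq_pending_entries(uint16_t head, uint16_t tail, uint16_t size)
{
	return ((tail + size - head) % size);
}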
2756 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2757 uint64_t idx, int is_sq, uint64_t value)
2759 DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2760 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2763 if (idx > sc->num_squeues) {
2764 WPRINTF("%s queue index %lu overflow from "
2765 "guest (max %u)",
2766 __func__, idx, sc->num_squeues);
2770 atomic_store_short(&sc->submit_queues[idx].tail,
2774 pci_nvme_handle_admin_cmd(sc, value);
2776 /* submission queue; handle new entries in SQ */
2777 if (idx > sc->num_squeues) {
2778 WPRINTF("%s SQ index %lu overflow from "
2779 "guest (max %u)",
2780 __func__, idx, sc->num_squeues);
2783 pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2786 if (idx > sc->num_cqueues) {
2787 WPRINTF("%s queue index %lu overflow from "
2788 "guest (max %u)",
2789 __func__, idx, sc->num_cqueues);
2793 atomic_store_short(&sc->compl_queues[idx].head,
2799 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2801 const char *s = iswrite ? "WRITE" : "READ";
2804 case NVME_CR_CAP_LOW:
2805 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2807 case NVME_CR_CAP_HI:
2808 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2810 case NVME_CR_VS:
2811 DPRINTF("%s %s NVME_CR_VS", func, s);
2812 break;
2813 case NVME_CR_INTMS:
2814 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2815 break;
2816 case NVME_CR_INTMC:
2817 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2818 break;
2819 case NVME_CR_CC:
2820 DPRINTF("%s %s NVME_CR_CC", func, s);
2821 break;
2822 case NVME_CR_CSTS:
2823 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2824 break;
2825 case NVME_CR_NSSR:
2826 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2827 break;
2828 case NVME_CR_AQA:
2829 DPRINTF("%s %s NVME_CR_AQA", func, s);
2830 break;
2831 case NVME_CR_ASQ_LOW:
2832 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2834 case NVME_CR_ASQ_HI:
2835 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2837 case NVME_CR_ACQ_LOW:
2838 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2840 case NVME_CR_ACQ_HI:
2841 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2842 break;
2843 default:
2844 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2850 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2851 uint64_t offset, int size, uint64_t value)
2855 if (offset >= NVME_DOORBELL_OFFSET) {
2856 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2857 uint64_t idx = belloffset / 8; /* doorbell size = 2 * sizeof(uint32_t) */
2858 int is_sq = (belloffset % 8) < 4;
2860 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2861 WPRINTF("guest attempted an overflow write offset "
2862 "0x%lx, val 0x%lx in %s",
2863 offset, value, __func__);
2867 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2871 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2872 offset, size, value);
2875 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2876 "val 0x%lx) to bar0 in %s",
2877 size, offset, value, __func__);
2878 /* TODO: shutdown device */
2882 pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2884 pthread_mutex_lock(&sc->mtx);
2887 case NVME_CR_CAP_LOW:
2888 case NVME_CR_CAP_HI:
2894 case NVME_CR_INTMS:
2895 /* MSI-X, so ignore */
2896 break;
2897 case NVME_CR_INTMC:
2898 /* MSI-X, so ignore */
2899 break;
2900 case NVME_CR_CC:
2901 ccreg = (uint32_t)value;
2903 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2906 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2907 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2908 NVME_CC_GET_IOCQES(ccreg));
2910 if (NVME_CC_GET_SHN(ccreg)) {
2911 /* perform shutdown - flush out data to backend */
2912 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2913 NVME_CSTS_REG_SHST_SHIFT);
2914 sc->regs.csts |= NVME_SHST_COMPLETE <<
2915 NVME_CSTS_REG_SHST_SHIFT;
2917 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2918 if (NVME_CC_GET_EN(ccreg) == 0)
2919 /* transition 1->0 causes controller reset */
2920 pci_nvme_reset_locked(sc);
2922 pci_nvme_init_controller(ctx, sc);
2925 /* Insert the iocqes, iosqes and en bits from the write */
2926 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2927 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2928 if (NVME_CC_GET_EN(ccreg) == 0) {
2929 /* Insert the ams, mps and css bit fields */
2930 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2931 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2932 sc->regs.csts &= ~NVME_CSTS_RDY;
2933 } else if (sc->pending_ios == 0) {
2934 sc->regs.csts |= NVME_CSTS_RDY;
2939 case NVME_CR_NSSR:
2940 /* ignore writes; don't support subsystem reset */
2941 break;
2942 case NVME_CR_AQA:
2943 sc->regs.aqa = (uint32_t)value;
2944 break;
2945 case NVME_CR_ASQ_LOW:
2946 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2947 (0xFFFFF000 & value);
2948 break;
2949 case NVME_CR_ASQ_HI:
2950 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2951 (value << 32);
2952 break;
2953 case NVME_CR_ACQ_LOW:
2954 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2955 (0xFFFFF000 & value);
2956 break;
2957 case NVME_CR_ACQ_HI:
2958 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2959 (value << 32);
2960 break;
2961 default:
2962 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2963 __func__, offset, value, size);
2965 pthread_mutex_unlock(&sc->mtx);
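
/*
 * Illustrative sketch of the doorbell decode in pci_nvme_write_bar_0()
 * above (hypothetical helper, not part of the emulation), assuming the
 * default 4 byte doorbell stride (CAP.DSTRD == 0): each queue pair
 * owns 8 bytes past NVME_DOORBELL_OFFSET, with the SQ tail in the
 * first dword and the CQ head in the second.
 */
static void
example_doorbell_decode(uint64_t belloffset, uint64_t *idx, int *is_sq)
{
	*idx = belloffset / 8;		/* queue pair index */
	*is_sq = (belloffset % 8) < 4;	/* first dword is the SQ tail */
}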
2969 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2970 int baridx, uint64_t offset, int size, uint64_t value)
2972 struct pci_nvme_softc* sc = pi->pi_arg;
2974 if (baridx == pci_msix_table_bar(pi) ||
2975 baridx == pci_msix_pba_bar(pi)) {
2976 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2977 " value 0x%lx", baridx, offset, size, value);
2979 pci_emul_msix_twrite(pi, offset, size, value);
2985 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2989 DPRINTF("%s unknown baridx %d, val 0x%lx",
2990 __func__, baridx, value);
2994 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2995 uint64_t offset, int size)
2999 pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3001 if (offset < NVME_DOORBELL_OFFSET) {
3002 void *p = &(sc->regs);
3003 pthread_mutex_lock(&sc->mtx);
3004 memcpy(&value, (void *)((uintptr_t)p + offset), size);
3005 pthread_mutex_unlock(&sc->mtx);
3008 WPRINTF("pci_nvme: read invalid offset %lu", offset);
3019 value &= 0xFFFFFFFF;
3023 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
3024 offset, size, (uint32_t)value);
3032 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
3033 uint64_t offset, int size)
3035 struct pci_nvme_softc* sc = pi->pi_arg;
3037 if (baridx == pci_msix_table_bar(pi) ||
3038 baridx == pci_msix_pba_bar(pi)) {
3039 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3040 baridx, offset, size);
3042 return pci_emul_msix_tread(pi, offset, size);
3047 return pci_nvme_read_bar_0(sc, offset, size);
3050 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3057 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3059 char bident[sizeof("XX:X:X")];
3063 sc->max_queues = NVME_QUEUES;
3064 sc->max_qentries = NVME_MAX_QENTRIES;
3065 sc->ioslots = NVME_IOSLOTS;
3066 sc->num_squeues = sc->max_queues;
3067 sc->num_cqueues = sc->max_queues;
3068 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3070 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3071 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3073 value = get_config_value_node(nvl, "maxq");
3074 if (value != NULL)
3075 sc->max_queues = atoi(value);
3076 value = get_config_value_node(nvl, "qsz");
3077 if (value != NULL) {
3078 sc->max_qentries = atoi(value);
3079 if (sc->max_qentries <= 0) {
3080 EPRINTLN("nvme: Invalid qsz option %d",
3081 sc->max_qentries);
3085 value = get_config_value_node(nvl, "ioslots");
3086 if (value != NULL) {
3087 sc->ioslots = atoi(value);
3088 if (sc->ioslots <= 0) {
3089 EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3093 value = get_config_value_node(nvl, "sectsz");
3094 if (value != NULL)
3095 sectsz = atoi(value);
3096 value = get_config_value_node(nvl, "ser");
3097 if (value != NULL) {
3099 * This field indicates the Product Serial Number in
3100 * 7-bit ASCII; unused bytes should be space characters.
3103 cpywithpad((char *)sc->ctrldata.sn,
3104 sizeof(sc->ctrldata.sn), value, ' ');
3106 value = get_config_value_node(nvl, "eui64");
3107 if (value != NULL)
3108 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3109 value = get_config_value_node(nvl, "dsm");
3110 if (value != NULL) {
3111 if (strcmp(value, "auto") == 0)
3112 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3113 else if (strcmp(value, "enable") == 0)
3114 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3115 else if (strcmp(value, "disable") == 0)
3116 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3119 value = get_config_value_node(nvl, "ram");
3120 if (value != NULL) {
3121 uint64_t sz = strtoull(value, NULL, 10);
3123 sc->nvstore.type = NVME_STOR_RAM;
3124 sc->nvstore.size = sz * 1024 * 1024;
3125 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3126 sc->nvstore.sectsz = 4096;
3127 sc->nvstore.sectsz_bits = 12;
3128 if (sc->nvstore.ctx == NULL) {
3129 EPRINTLN("nvme: Unable to allocate RAM");
3133 snprintf(bident, sizeof(bident), "%d:%d",
3134 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3135 sc->nvstore.ctx = blockif_open(nvl, bident);
3136 if (sc->nvstore.ctx == NULL) {
3137 EPRINTLN("nvme: Could not open backing file: %s",
3141 sc->nvstore.type = NVME_STOR_BLOCKIF;
3142 sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3145 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3146 sc->nvstore.sectsz = sectsz;
3147 else if (sc->nvstore.type != NVME_STOR_RAM)
3148 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
3149 for (sc->nvstore.sectsz_bits = 9;
3150 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3151 sc->nvstore.sectsz_bits++);
3153 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3154 sc->max_queues = NVME_QUEUES;
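
/*
 * Illustrative equivalent of the sector-size loop in
 * pci_nvme_parse_config() above (hypothetical helper, not part of the
 * emulation): find the smallest power-of-two exponent covering the
 * sector size, starting from 9 (512 bytes). E.g. 512 -> 9, 4096 -> 12,
 * 8192 -> 13.
 */
static int
example_sectsz_bits(int sectsz)
{
	int bits = 9;

	while ((1 << bits) < sectsz)
		bits++;
	return (bits);
}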
3160 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
3162 struct pci_nvme_softc *sc;
3163 struct pci_nvme_blockstore *nvstore;
3164 struct nvme_namespace_data *nd;
3166 sc = arg;
3167 nvstore = &sc->nvstore;
3168 nd = &sc->nsdata;
3170 nvstore->size = new_size;
3171 pci_nvme_init_nsdata_size(nvstore, nd);
3173 /* Add changed NSID to list */
3174 sc->ns_log.ns[0] = 1;
3175 sc->ns_log.ns[1] = 0;
3177 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3178 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3182 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3184 struct pci_nvme_softc *sc;
3185 uint32_t pci_membar_sz;
3190 sc = calloc(1, sizeof(struct pci_nvme_softc));
3194 error = pci_nvme_parse_config(sc, nvl);
3200 STAILQ_INIT(&sc->ioreqs_free);
3201 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3202 for (int i = 0; i < sc->ioslots; i++) {
3203 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3206 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3207 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3208 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3209 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3210 pci_set_cfgdata8(pi, PCIR_PROGIF,
3211 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3214 * Allocate size of NVMe registers + doorbell space for all queues.
3216 * The specification requires a minimum memory I/O window size of 16K.
3217 * The Windows driver will refuse to start a device with a smaller
3218 * window.
3220 pci_membar_sz = sizeof(struct nvme_registers) +
3221 2 * sizeof(uint32_t) * (sc->max_queues + 1);
3222 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3224 DPRINTF("nvme membar size: %u", pci_membar_sz);
3226 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3228 WPRINTF("%s pci alloc mem bar failed", __func__);
3232 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3234 WPRINTF("%s pci add msixcap failed", __func__);
3238 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3240 WPRINTF("%s pci add Express capability failed", __func__);
3244 pthread_mutex_init(&sc->mtx, NULL);
3245 sem_init(&sc->iosemlock, 0, sc->ioslots);
3246 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3248 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3250 * Controller data depends on Namespace data so initialize Namespace
3253 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3254 pci_nvme_init_ctrldata(sc);
3255 pci_nvme_init_logpages(sc);
3256 pci_nvme_init_features(sc);
3258 pci_nvme_aer_init(sc);
3259 pci_nvme_aen_init(sc);
3263 pci_lintr_request(pi);
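
/*
 * Illustrative sketch of the BAR sizing in pci_nvme_init() above
 * (hypothetical helper, not part of the emulation): the register file
 * is followed by a pair of 32-bit doorbells for each of the
 * (max_queues + 1) queue pairs (admin plus IO), and the result is
 * rounded up to the 16K minimum the specification requires.
 */
static uint32_t
example_membar_size(uint32_t max_queues, size_t regs_size)
{
	uint32_t sz;

	sz = (uint32_t)(regs_size + 2 * sizeof(uint32_t) * (max_queues + 1));
	return (MAX(sz, NVME_MMIO_SPACE_MIN));
}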
3270 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3277 if (strncmp(opts, "ram=", 4) == 0) {
3278 cp = strchr(opts, ',');
3280 set_config_value_node(nvl, "ram", opts + 4);
3283 ram = strndup(opts + 4, cp - opts - 4);
3284 set_config_value_node(nvl, "ram", ram);
3286 return (pci_parse_legacy_config(nvl, cp + 1));
3288 return (blockif_legacy_config(nvl, opts));
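
/*
 * Illustrative sketch of the "ram=" splitting in
 * pci_nvme_legacy_config() above (hypothetical helper, error handling
 * omitted): copy the value up to the first comma, if any, so the
 * remaining options can be handed to the generic parser.
 */
static char *
example_first_value(const char *opts)
{
	const char *cp = strchr(opts, ',');

	if (cp == NULL)
		return (strdup(opts));
	return (strndup(opts, cp - opts));
}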
3291 static const struct pci_devemu pci_de_nvme = {
3293 .pe_init = pci_nvme_init,
3294 .pe_legacy_config = pci_nvme_legacy_config,
3295 .pe_barwrite = pci_nvme_write,
3296 .pe_barread = pci_nvme_read
3298 PCI_EMUL_SET(pci_de_nvme);