/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. One of: auto, enable, disable
 *
 * TODO:
 *  - create async event for smart and log
 */
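
/*
 * Illustrative invocation (the slot number, backing path, and option
 * values below are examples only, not defaults):
 *
 *  -s 4,nvme,/dev/zvol/tank/vm-disk0,maxq=4,qsz=512,ioslots=16,ser=NVME0001,dsm=auto
 */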
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <pthread_np.h>
#include <semaphore.h>

#include <machine/atomic.h>
#include <machine/vmm.h>

#include <dev/nvme/nvme.h>
static int nvme_debug = 0;
/* Wrapped in do/while so the conditional cannot capture a caller's else */
#define DPRINTF(fmt, args...) \
	do { \
		if (nvme_debug) \
			PRINTLN(fmt, ##args); \
	} while (0)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
/* defaults; can be overridden */
#define NVME_MSIX_BAR		4

#define NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

#define NVME_QUEUES		16
#define NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define NVME_PRP2_ITEMS		(PAGE_SIZE / sizeof(uint64_t))

/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
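
/*
 * Worked example (assuming an NVME_MDTS exponent of 9; see its actual
 * definition above): with MPSMIN of 0 (4 KiB pages), NVME_MAX_DATA_SIZE
 * is (1 << 9) * 4096 = 2 MiB per request, and NVME_MAX_IOVEC allows
 * 512 + 1 = 513 page descriptors.
 */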
/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature: 296 K is about 23 C) */
#define NVME_TEMPERATURE 296
/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
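
/*
 * For example, if the host was granted 4 SQs and 4 CQs (stored
 * one-based), NVME_FEATURE_NUM_QUEUES() yields 0x00030003: the number
 * of SQs allocated in the low word and CQs in the high word, both
 * reported zero-based as 3.
 */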
#define NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};
#define NVME_CQ_INTEN	0x01
#define NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	uint16_t	tail;	/* nvme progress */
	uint16_t	head;	/* guest progress */
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	uint16_t	head;	/* nvme progress */
	uint16_t	tail;	/* guest progress */
	uint16_t	cqid;	/* completion queue id */
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	uint32_t	sectsz_bits;
	uint32_t	deallocate:1;
};
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;

	/* command information */

	uint64_t	prev_gpaddr;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	bool	namespace_specific;
};

#define NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};
/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define PCI_NVME_AEI_NOTICE_SHIFT	8
#define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << ((event) + PCI_NVME_AEI_NOTICE_SHIFT))
/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *	SMART / Health Critical Warnings
 *	Namespace Attribute Notices
 */
#define PCI_NVME_AEN_DEFAULT_MASK	0x11f
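
/*
 * I.e. 0x11f enables bits 4:0 (the five SMART/Health critical warning
 * bits) plus bit 8, which is
 * PCI_NVME_AEI_NOTICE_MASK(PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED).
 */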
typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;

	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};
static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,

static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
#define NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{
	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

pci_nvme_status_genc(uint16_t *status, uint16_t code)
{
	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
	}

	struct nvme_submission_queue *sq = sc->submit_queues;

	for (i = 0; i < sc->num_squeues + 1; i++)
		pthread_mutex_init(&sq[i].mtx, NULL);

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
	}

	struct nvme_completion_queue *cq = sc->compl_queues;

	for (i = 0; i < sc->num_cqueues + 1; i++)
		pthread_mutex_init(&cq[i].mtx, NULL);
}
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1, 4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */

	/*
	 * Report a single power state (zero-based value)
	 * power_state[] values are left as zero to indicate "Not reported"
	 */

	/* Warning Composite Temperature Threshold */

	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
	    NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	}

	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;

	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return (crc);
}
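
/*
 * Usage sketch: pci_nvme_init_nsdata() below seeds crc16() with 0 and
 * runs it over the VM name plus PCI bus/slot/function string to form
 * the low 16 bits of an auto-generated EUI-64.
 */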
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{
	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
}

pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{
	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));

		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	__uint128_t power_cycles = 1;

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;

	/* Set Active Firmware Info to slot 1 */
	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
	    sizeof(sc->fw_log.revision[0]));

	memcpy(&sc->health_log.power_cycles, &power_cycles,
	    sizeof(sc->health_log.power_cycles));
}
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		//		  this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_async_event;
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{
	STAILQ_INIT(&sc->aer_list);
}

pci_nvme_aer_init(struct pci_nvme_softc *sc)
{
	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}

pci_nvme_aer_available(struct pci_nvme_softc *sc)
{
	return (sc->aer_count != 0);
}

pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1U));
}
/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host.
 * Note that while the NVMe spec doesn't require Controllers to return AERs
 * in order, this implementation does preserve the order.
 */
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AERs exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		sc->aer_count--;
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}

pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{
	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{
	pthread_cond_signal(&sc->aen_cond);
}

/*
 * Post an Asynchronous Event Notification
 */
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
    uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return;
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return;
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);
}
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AERs */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				break;
			}
			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			}
			break;
		default:
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;

		pci_generate_msix(sc->nsc_pi, 0);
	}
}
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
	pthread_mutex_unlock(&sc->aen_mtx);
}
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = NVME_REV(1, 4);	/* NVMe v1.4 */

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}

pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
pci_nvme_init_controller(struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	/*
	 * NVMe 2.0 states that "enabling a controller while this field is
	 * cleared to 0h produces undefined results" for both ACQS and
	 * ASQS. If zero, set CFS and do not become ready.
	 */
	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
	if (asqs < 2) {
		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
		    asqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return;
	}
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.asq, sizeof(struct nvme_command) * asqs);
	if (sc->submit_queues[0].qbase == NULL) {
		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.asq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return;
	}

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK);
	if (acqs < 2) {
		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
		    acqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return;
	}
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.acq, sizeof(struct nvme_completion) * acqs);
	if (sc->compl_queues[0].qbase == NULL) {
		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.acq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return;
	}
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);
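
	/*
	 * E.g. if prp1 points 0x800 bytes into a 4 KiB page, this first
	 * segment copies at most 4096 - 0x800 = 2 KiB; any remainder is
	 * sourced from the page at prp2.
	 */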
	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0)
		return (0);

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
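
	/*
	 * E.g. on a freshly created (zeroed) CQ, the first pass through the
	 * queue writes entries with P=1; after the tail wraps, entries that
	 * previously read P=1 are rewritten with P=0, and so on.
	 */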
	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}

nvme_opc_delete_io_sq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
nvme_opc_create_io_sq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return;
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return;
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return;
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent a non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
}
nvme_opc_delete_io_cq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return;
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
nvme_opc_create_io_cq(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return;
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return;
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return;
	}
	ncq->head = ncq->tail = 0;
	/* CQ entries are completion-sized, not command-sized */
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_completion) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
nvme_opc_get_log_page(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logpage = command->cdw10 & 0xFF;
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}
}
nvme_opc_identify(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	status = 0;
	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		/* Global NS only valid with NS Management */
		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}

		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
		break;
	case 0x13:
		/*
		 * Controller list is optional but used by UNH tests. Return
		 * a valid but empty list.
		 */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint16_t) * 2048);
		memset(dest, 0, sizeof(uint16_t) * 2048);
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
}
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operational Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}
nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command __unused,
    struct nvme_completion *compl)
{
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}
#define NVME_ASYNC_EVENT_ENDURANCE_GROUP	(0x4000)
nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

#define NVME_TEMP_THRESH_OVER	0
#define NVME_TEMP_THRESH_UNDER	1
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel;	/* Threshold Temperature Select */
	uint8_t		thsel;	/* Threshold Type Select */
	bool		set_crit = false;
	bool		report_crit;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
	    NVME_CRIT_WARN_ST_TEMPERATURE;

	if (set_crit && report_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);

	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T' : 'F', sc->health_log.critical_warning, compl->status);
}
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (sv) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_SAVEABLE);
		return;
	}

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return;
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);
	else {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_CHANGEABLE);
		return;
	}

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}
}
#define NVME_FEATURES_SEL_SUPPORTED	0x3
#define NVME_FEATURES_NS_SPECIFIC	(1 << 1)

nvme_opc_get_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;
	uint8_t sel = (command->cdw10 >> 8) & 0x7;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
		else
			compl->cdw0 = feat->cdw11;
	}
}
nvme_opc_format_nvm(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return;
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return;
		}
		req->nvme_sq = &sc->submit_queues[0];

		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
	}
}
nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
    struct nvme_completion *compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
}
nvme_opc_async_event_req(struct pci_nvme_softc *sc,
    struct nvme_command *command, struct nvme_completion *compl)
{
	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
	    sc->aer_count, sc->ctrldata.aerl, command->cid);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		return;
	}

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		return;
	}

	/*
	 * Raise events when they happen based on the Set Features cmd.
	 * These events happen asynchronously, so only complete this command
	 * once an event matching the request actually occurs.
	 */
	compl->status = NVME_NO_STATUS;
	pci_nvme_aen_notify(sc);
}
pci_nvme_handle_admin_cmd(struct pci_nvme_softc *sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
				break;
			}
			nvme_opc_format_nvm(sc, cmd, &compl);
			break;
		case NVME_OPC_SECURITY_SEND:
		case NVME_OPC_SECURITY_RECEIVE:
		case NVME_OPC_SANITIZE:
		case NVME_OPC_GET_LBA_STATUS:
			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
			    cmd->opc);
			/* Valid but unsupported opcodes */
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
			break;
		default:
			DPRINTF("%s command OPC=%#X (not implemented)",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}
/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
 * E.g. 1 data unit covers 1 - 1,000 512 byte blocks, and 3 data units cover
 * 2,001 - 3,000 512 byte blocks. Rounding up is achieved by initializing the
 * remainder to 999.
 */
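
/*
 * Worked example: starting from the initial remainder of 999, a
 * successful write of 1,536 512-byte blocks adds 1536 to
 * write_dunits_remainder (999 + 1536 = 2535), which increments
 * write_data_units twice and leaves a remainder of 535.
 */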
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{
	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}
2216 * Check if the combination of Starting LBA (slba) and number of blocks
2217 * exceeds the range of the underlying storage.
2219 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2220 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2224 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2227 size_t offset, bytes;
2229 /* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nblocks << nvstore->sectsz_bits;

	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
		return (true);

	return (false);
}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused,
    struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset)
{
	int iovidx;
	bool range_is_contiguous;

	if (req == NULL)
		return (-1);

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/*
	 * Minimize the number of IOVs by concatenating contiguous address
	 * ranges. If the IOV count is zero, there is no previous range to
	 * concatenate.
	 */
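	/*
	 * For example, PRP entries covering guest pages 0x10000 and 0x11000
	 * describe one contiguous 8 KiB range and collapse into a single
	 * IOV entry (assuming 4 KiB pages).
	 */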
	if (req->io_req.br_iovcnt == 0)
		range_is_contiguous = false;
	else
		range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;

	if (range_is_contiguous) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    req->prev_gpaddr, size);
		if (req->io_req.br_iov[iovidx].iov_base == NULL)
			return (-1);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = offset;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    gpaddr, size);
		if (req->io_req.br_iov[iovidx].iov_base == NULL)
			return (-1);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* Once no more I/O is pending, set ready if device was reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return (req);
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	status = 0;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}

/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc __unused,
    struct nvme_command *cmd __unused,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}

static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

	if (is_write)
		dir = NVME_COPY_TO_PRP;
	else
		dir = NVME_COPY_FROM_PRP;

	status = 0;
	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}

static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err = 0;
	uint16_t status = NVME_NO_STATUS;

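	/*
	 * Per the NVMe PRP scheme: PRP1 covers the first (possibly
	 * unaligned) page of the transfer. For transfers that fit in two
	 * pages, PRP2 points directly at the second page; for larger
	 * transfers, PRP2 points at a PRP list, whose last entry chains to
	 * the next list page.
	 */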
	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) {
		err = -1;
		goto out;
	}

	offset += size;
	bytes -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) {
			err = -1;
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				if (prp_list == NULL) {
					err = -1;
					goto out;
				}
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list, size,
			    offset)) {
				err = -1;
				goto out;
			}

			offset += size;
			bytes -= size;

			prp_list++;
		}
	}
	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);
out:
	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);

	return (status);
}

static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
	bytes = nblocks << nvstore->sectsz_bits;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
		    __func__, lba, nblocks);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	offset = lba << nvstore->sectsz_bits;

	req->bytes = bytes;
	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}
out:
	if (!pending)
		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

	return (pending);
}
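
/*
 * Completion handler ("state machine") for multi-range Dataset Management
 * deallocate requests: prev_gpaddr tracks the index of the range currently
 * being trimmed and prev_size the total number of ranges, so each
 * blockif_delete() completion issues the next range until all are done or
 * an error occurs.
 */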
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	status = 0;
	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid,
		    status);
		pci_nvme_release_ioreq(sc, req);
	}
}

static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range = NULL;
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		goto out;
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}
	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0, dr = 0; r <= nr; r++) {
				offset = range[r].starting_lba << sectsz_bits;
				bytes = range[r].length << sectsz_bits;
				if (bytes == 0)
					continue;

				if ((nvstore->size - offset) < bytes) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				iov[dr].iov_base = (void *)offset;
				iov[dr].iov_len = bytes;
				dr++;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = dr;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;
	}
out:
	free(range);
	return (pending);
}

static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}

static void
pci_nvme_handle_doorbell(struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}

static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}

static void
pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size,
    uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;
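		/*
		 * Doorbell layout (CAP.DSTRD = 0): each queue pair owns an
		 * 8 byte slot with the SQ tail doorbell in the low 4 bytes
		 * and the CQ head doorbell in the high 4 bytes, e.g. a write
		 * at doorbell offset 0xc targets the CQ head doorbell of
		 * queue 1.
		 */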
		if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
			WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
			    offset);
			return;
		}

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		if (is_sq) {
			if (sc->submit_queues[idx].qbase == NULL)
				return;
		} else if (sc->compl_queues[idx].qbase == NULL)
			return;

		pci_nvme_handle_doorbell(sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if ((sc->pending_ios == 0) &&
		    !(sc->regs.csts & NVME_CSTS_CFS)) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
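	/*
	 * The admin queue base registers must be page aligned: bits 11:0 of
	 * ASQ and ACQ are reserved, so the 0xFFFFF000 mask below drops them
	 * from the low dword written by the guest.
	 */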
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
    uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    "value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}

static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF("nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}

static uint64_t
pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}

static int
pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
{
	char bident[sizeof("XXX:XXX")];
	const char *value;
	uint32_t sectsz;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

	value = get_config_value_node(nvl, "maxq");
	if (value != NULL)
		sc->max_queues = atoi(value);
	value = get_config_value_node(nvl, "qsz");
	if (value != NULL) {
		sc->max_qentries = atoi(value);
		if (sc->max_qentries <= 0) {
			EPRINTLN("nvme: Invalid qsz option %d",
			    sc->max_qentries);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "ioslots");
	if (value != NULL) {
		sc->ioslots = atoi(value);
		if (sc->ioslots <= 0) {
			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "sectsz");
	if (value != NULL)
		sectsz = atoi(value);
	value = get_config_value_node(nvl, "ser");
	if (value != NULL) {
		/*
		 * This field indicates the Product Serial Number in
		 * 7-bit ASCII, unused bytes should be space characters.
		 */
		cpywithpad((char *)sc->ctrldata.sn,
		    sizeof(sc->ctrldata.sn), value, ' ');
	}
	value = get_config_value_node(nvl, "eui64");
	if (value != NULL)
		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
	value = get_config_value_node(nvl, "dsm");
	if (value != NULL) {
		if (strcmp(value, "auto") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
		else if (strcmp(value, "enable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
		else if (strcmp(value, "disable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
	}

	value = get_config_value_node(nvl, "ram");
	if (value != NULL) {
		uint64_t sz = strtoull(value, NULL, 10);

		sc->nvstore.type = NVME_STOR_RAM;
		sc->nvstore.size = sz * 1024 * 1024;
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		sc->nvstore.sectsz = 4096;
		sc->nvstore.sectsz_bits = 12;
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Unable to allocate RAM");
			return (-1);
		}
	} else {
		snprintf(bident, sizeof(bident), "%u:%u",
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
		sc->nvstore.ctx = blockif_open(nvl, bident);
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Could not open backing file: %s",
			    strerror(errno));
			return (-1);
		}
		sc->nvstore.type = NVME_STOR_BLOCKIF;
		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
	}

	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
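	/*
	 * Derive sectsz_bits as log2(sectsz); the loop starts at 9 because
	 * 512 (1 << 9) is the smallest supported sector size. E.g. a 4096
	 * byte sector yields sectsz_bits = 12.
	 */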
	for (sc->nvstore.sectsz_bits = 9;
	    (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	    sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	return (0);
}

static void
pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
    size_t new_size)
{
	struct pci_nvme_softc *sc;
	struct pci_nvme_blockstore *nvstore;
	struct nvme_namespace_data *nd;

	sc = arg;
	nvstore = &sc->nvstore;
	nd = &sc->nsdata;

	nvstore->size = new_size;
	pci_nvme_init_nsdata_size(nvstore, nd);

	/* Add changed NSID to list */
	sc->ns_log.ns[0] = 1;
	sc->ns_log.ns[1] = 0;
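	/*
	 * (The Changed Namespace List log page is an array of NSIDs: entry 0
	 * records NSID 1, this emulation's only namespace, and the zero in
	 * entry 1 terminates the list.)
	 */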

	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
}

static int
pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_config(sc, nvl);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (uint32_t i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
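	/*
	 * With the default of 16 queues this computes to well under 16 KiB
	 * (the register block plus 17 doorbell pairs), so in practice
	 * NVME_MMIO_SPACE_MIN determines the BAR size.
	 */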

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);
	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);
	pci_nvme_aen_init(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}

static int
pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *ram;

	if (opts == NULL)
		return (blockif_legacy_config(nvl, opts));

	if (strncmp(opts, "ram=", 4) == 0) {
		cp = strchr(opts, ',');
		if (cp == NULL) {
			set_config_value_node(nvl, "ram", opts + 4);
			return (0);
		}
		ram = strndup(opts + 4, cp - opts - 4);
		set_config_value_node(nvl, "ram", ram);
		free(ram);
		return (pci_parse_legacy_config(nvl, cp + 1));
	} else
		return (blockif_legacy_config(nvl, opts));
}
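
/*
 * For example, the legacy option string "ram=1024,ser=FOO" becomes the
 * config nodes ram=1024 and ser=FOO; any option string not starting with
 * "ram=" is handed to blockif for block-device style parsing.
 */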

static const struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_legacy_config = pci_nvme_legacy_config,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);