FreeBSD/FreeBSD.git: usr.sbin/bhyve/pci_nvme.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
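/*
 * Illustrative invocations of the option string above (the zvol path and
 * serial number here are hypothetical, not taken from this file):
 *
 *   -s 4,nvme,/dev/zvol/tank/vm0-disk0,ser=NVME0001,maxq=8
 *   -s 4,nvme,ram=4096               (4096 MiB RAM-backed namespace)
 */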
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80
81 #include <dev/nvme/nvme.h>
82
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88
89
90 static int nvme_debug = 0;
91 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
92 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93
94 /* defaults; can be overridden */
95 #define NVME_MSIX_BAR           4
96
97 #define NVME_IOSLOTS            8
98
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN     (1 << 14)
101
102 #define NVME_QUEUES             16
103 #define NVME_MAX_QENTRIES       2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define NVME_MPSMIN             0
106 /* MPSMIN converted to bytes */
107 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
108
109 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
110 #define NVME_MDTS               9
111 /* Note: the + 1 allows the initial descriptor to be non-page-aligned */
112 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
113 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
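/*
 * Worked example with the defaults above: MPSMIN = 0 gives a 4 KiB minimum
 * memory page (NVME_MPSMIN_BYTES = 1 << 12), and MDTS = 9 advertises
 * transfers of up to 2^9 = 512 pages, i.e. NVME_MAX_DATA_SIZE =
 * 512 * 4 KiB = 2 MiB. NVME_MAX_IOVEC is therefore 513: 512 page
 * descriptors plus one extra in case the first descriptor is not
 * page aligned.
 */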
114
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS          0xffff
117 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
118
119 /* Reported temperature in Kelvin (i.e. room temperature) */
120 #define NVME_TEMPERATURE 296
121
122 /* helpers */
123
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)         ((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)         ((one)  - 1)
128
129 /* Encode number of SQ's and CQ's for Set/Get Features */
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131         (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
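/*
 * Example encoding: with num_squeues = 8 and num_cqueues = 8, the macro
 * packs the zero-based counts as (7 & 0xffff) | ((7 & 0xffff) << 16)
 * = 0x00070007, the value returned in CDW0 for the Number of Queues
 * feature.
 */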
133
134 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
135
136 enum nvme_controller_register_offsets {
137         NVME_CR_CAP_LOW = 0x00,
138         NVME_CR_CAP_HI  = 0x04,
139         NVME_CR_VS      = 0x08,
140         NVME_CR_INTMS   = 0x0c,
141         NVME_CR_INTMC   = 0x10,
142         NVME_CR_CC      = 0x14,
143         NVME_CR_CSTS    = 0x1c,
144         NVME_CR_NSSR    = 0x20,
145         NVME_CR_AQA     = 0x24,
146         NVME_CR_ASQ_LOW = 0x28,
147         NVME_CR_ASQ_HI  = 0x2c,
148         NVME_CR_ACQ_LOW = 0x30,
149         NVME_CR_ACQ_HI  = 0x34,
150 };
151
152 enum nvme_cmd_cdw11 {
153         NVME_CMD_CDW11_PC  = 0x0001,
154         NVME_CMD_CDW11_IEN = 0x0002,
155         NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157
158 enum nvme_copy_dir {
159         NVME_COPY_TO_PRP,
160         NVME_COPY_FROM_PRP,
161 };
162
163 #define NVME_CQ_INTEN   0x01
164 #define NVME_CQ_INTCOAL 0x02
165
166 struct nvme_completion_queue {
167         struct nvme_completion *qbase;
168         pthread_mutex_t mtx;
169         uint32_t        size;
170         uint16_t        tail; /* nvme progress */
171         uint16_t        head; /* guest progress */
172         uint16_t        intr_vec;
173         uint32_t        intr_en;
174 };
175
176 struct nvme_submission_queue {
177         struct nvme_command *qbase;
178         pthread_mutex_t mtx;
179         uint32_t        size;
180         uint16_t        head; /* nvme progress */
181         uint16_t        tail; /* guest progress */
182         uint16_t        cqid; /* completion queue id */
183         int             qpriority;
184 };
185
186 enum nvme_storage_type {
187         NVME_STOR_BLOCKIF = 0,
188         NVME_STOR_RAM = 1,
189 };
190
191 struct pci_nvme_blockstore {
192         enum nvme_storage_type type;
193         void            *ctx;
194         uint64_t        size;
195         uint32_t        sectsz;
196         uint32_t        sectsz_bits;
197         uint64_t        eui64;
198         uint32_t        deallocate:1;
199 };
200
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Max Data Transfer (MDTS) and given the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207         ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208           NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209           0 )
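/*
 * For example, if BLOCKIF_IOV_MAX were 128, MDTS_PAD_SIZE would evaluate
 * to NVME_MAX_IOVEC - 128 = 385, i.e. the number of extra iovec entries
 * struct pci_nvme_ioreq must carry (in iovpadding below) so a maximally
 * sized transfer never overruns the iovec array embedded in blockif_req.
 */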
210
211 struct pci_nvme_ioreq {
212         struct pci_nvme_softc *sc;
213         STAILQ_ENTRY(pci_nvme_ioreq) link;
214         struct nvme_submission_queue *nvme_sq;
215         uint16_t        sqid;
216
217         /* command information */
218         uint16_t        opc;
219         uint16_t        cid;
220         uint32_t        nsid;
221
222         uint64_t        prev_gpaddr;
223         size_t          prev_size;
224         size_t          bytes;
225
226         struct blockif_req io_req;
227
228         struct iovec    iovpadding[MDTS_PAD_SIZE];
229 };
230
231 enum nvme_dsm_type {
232         /* Dataset Management bit in ONCS reflects backing storage capability */
233         NVME_DATASET_MANAGEMENT_AUTO,
234         /* Unconditionally set Dataset Management bit in ONCS */
235         NVME_DATASET_MANAGEMENT_ENABLE,
236         /* Unconditionally clear Dataset Management bit in ONCS */
237         NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247
248 struct nvme_feature_obj {
249         uint32_t        cdw11;
250         nvme_feature_cb set;
251         nvme_feature_cb get;
252         bool namespace_specific;
253 };
254
255 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256
257 typedef enum {
258         PCI_NVME_AE_TYPE_ERROR = 0,
259         PCI_NVME_AE_TYPE_SMART,
260         PCI_NVME_AE_TYPE_NOTICE,
261         PCI_NVME_AE_TYPE_IO_CMD = 6,
262         PCI_NVME_AE_TYPE_VENDOR = 7,
263         PCI_NVME_AE_TYPE_MAX            /* Must be last */
264 } pci_nvme_async_type;
265
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268         STAILQ_ENTRY(pci_nvme_aer) link;
269         uint16_t        cid;    /* Command ID of the submitted AER */
270 };
271
272 /** Asynchronous Event Information - Notice */
273 typedef enum {
274         PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
275         PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
276         PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
277         PCI_NVME_AEI_NOTICE_ANA_CHANGE,
278         PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
279         PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
280         PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
281         PCI_NVME_AEI_NOTICE_MAX,
282 } pci_nvme_async_event_info_notice;
283
284 #define PCI_NVME_AEI_NOTICE_SHIFT               8
285 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))
286
287 /* Asynchronous Event Notifications */
288 struct pci_nvme_aen {
289         pci_nvme_async_type atype;
290         uint32_t        event_data;
291         bool            posted;
292 };
293
294 /*
295  * By default, enable all Asynchronous Event Notifications:
296  *     SMART / Health Critical Warnings
297  *     Namespace Attribute Notices
298  */
299 #define PCI_NVME_AEN_DEFAULT_MASK       0x11f
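/*
 * Decoding the default mask: bits 7:0 select SMART / Health Critical
 * Warning events (0x1f sets bits 4:0), and bit 8 corresponds to
 * PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED shifted by PCI_NVME_AEI_NOTICE_SHIFT,
 * i.e. Namespace Attribute Notices, matching the comment above.
 */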
300
301 typedef enum {
302         NVME_CNTRLTYPE_IO = 1,
303         NVME_CNTRLTYPE_DISCOVERY = 2,
304         NVME_CNTRLTYPE_ADMIN = 3,
305 } pci_nvme_cntrl_type;
306
307 struct pci_nvme_softc {
308         struct pci_devinst *nsc_pi;
309
310         pthread_mutex_t mtx;
311
312         struct nvme_registers regs;
313
314         struct nvme_namespace_data  nsdata;
315         struct nvme_controller_data ctrldata;
316         struct nvme_error_information_entry err_log;
317         struct nvme_health_information_page health_log;
318         struct nvme_firmware_page fw_log;
319         struct nvme_ns_list ns_log;
320
321         struct pci_nvme_blockstore nvstore;
322
323         uint16_t        max_qentries;   /* max entries per queue */
324         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
325         uint32_t        num_cqueues;
326         uint32_t        num_squeues;
327         bool            num_q_is_set; /* Has host set Number of Queues */
328
329         struct pci_nvme_ioreq *ioreqs;
330         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
331         uint32_t        pending_ios;
332         uint32_t        ioslots;
333         sem_t           iosemlock;
334
335         /*
336          * Memory mapped Submission and Completion queues
337          * Each array includes both Admin and IO queues
338          */
339         struct nvme_completion_queue *compl_queues;
340         struct nvme_submission_queue *submit_queues;
341
342         struct nvme_feature_obj feat[NVME_FID_MAX];
343
344         enum nvme_dsm_type dataset_management;
345
346         /* Accounting for SMART data */
347         __uint128_t     read_data_units;
348         __uint128_t     write_data_units;
349         __uint128_t     read_commands;
350         __uint128_t     write_commands;
351         uint32_t        read_dunits_remainder;
352         uint32_t        write_dunits_remainder;
353
354         STAILQ_HEAD(, pci_nvme_aer) aer_list;
355         pthread_mutex_t aer_mtx;
356         uint32_t        aer_count;
357         struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
358         pthread_t       aen_tid;
359         pthread_mutex_t aen_mtx;
360         pthread_cond_t  aen_cond;
361 };
362
363
364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
365     struct nvme_completion_queue *cq,
366     uint32_t cdw0,
367     uint16_t cid,
368     uint16_t sqid,
369     uint16_t status);
370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
372 static void pci_nvme_io_done(struct blockif_req *, int);
373
374 /* Controller Configuration utils */
375 #define NVME_CC_GET_EN(cc) \
376         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
377 #define NVME_CC_GET_CSS(cc) \
378         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
379 #define NVME_CC_GET_SHN(cc) \
380         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
381 #define NVME_CC_GET_IOSQES(cc) \
382         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
383 #define NVME_CC_GET_IOCQES(cc) \
384         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
385
386 #define NVME_CC_WRITE_MASK \
387         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
388          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
389          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
390
391 #define NVME_CC_NEN_WRITE_MASK \
392         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
393          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
394          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
395
396 /* Controller Status utils */
397 #define NVME_CSTS_GET_RDY(sts) \
398         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
399
400 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
401 #define NVME_CSTS_CFS   (1 << NVME_CSTS_REG_CFS_SHIFT)
402
403 /* Completion Queue status word utils */
404 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
405 #define NVME_STATUS_MASK \
406         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
407          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
408
409 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
410         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
411
412 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
413     struct nvme_feature_obj *,
414     struct nvme_command *,
415     struct nvme_completion *);
416 static void nvme_feature_temperature(struct pci_nvme_softc *,
417     struct nvme_feature_obj *,
418     struct nvme_command *,
419     struct nvme_completion *);
420 static void nvme_feature_num_queues(struct pci_nvme_softc *,
421     struct nvme_feature_obj *,
422     struct nvme_command *,
423     struct nvme_completion *);
424 static void nvme_feature_iv_config(struct pci_nvme_softc *,
425     struct nvme_feature_obj *,
426     struct nvme_command *,
427     struct nvme_completion *);
428 static void nvme_feature_async_event(struct pci_nvme_softc *,
429     struct nvme_feature_obj *,
430     struct nvme_command *,
431     struct nvme_completion *);
432
433 static void *aen_thr(void *arg);
434
435 static __inline void
436 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
437 {
438         size_t len;
439
440         len = strnlen(src, dst_size);
441         memset(dst, pad, dst_size);
442         memcpy(dst, src, len);
443 }
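/*
 * Example: the controller-data init below calls
 * cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ') to
 * space-fill the Model Number field, matching the NVMe requirement that
 * ASCII identity fields be left justified and padded with spaces.
 */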
444
445 static __inline void
446 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
447 {
448
449         *status &= ~NVME_STATUS_MASK;
450         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
451                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
452 }
453
454 static __inline void
455 pci_nvme_status_genc(uint16_t *status, uint16_t code)
456 {
457
458         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
459 }
460
461 /*
462  * Initialize the requested number of IO Submission and Completion Queues.
463  * Admin queues are allocated implicitly.
464  */
465 static void
466 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
467 {
468         uint32_t i;
469
470         /*
471          * Allocate and initialize the Submission Queues
472          */
473         if (nsq > NVME_QUEUES) {
474                 WPRINTF("%s: clamping number of SQ from %u to %u",
475                                         __func__, nsq, NVME_QUEUES);
476                 nsq = NVME_QUEUES;
477         }
478
479         sc->num_squeues = nsq;
480
481         sc->submit_queues = calloc(sc->num_squeues + 1,
482                                 sizeof(struct nvme_submission_queue));
483         if (sc->submit_queues == NULL) {
484                 WPRINTF("%s: SQ allocation failed", __func__);
485                 sc->num_squeues = 0;
486         } else {
487                 struct nvme_submission_queue *sq = sc->submit_queues;
488
489                 for (i = 0; i < sc->num_squeues + 1; i++)
490                         pthread_mutex_init(&sq[i].mtx, NULL);
491         }
492
493         /*
494          * Allocate and initialize the Completion Queues
495          */
496         if (ncq > NVME_QUEUES) {
497                 WPRINTF("%s: clamping number of CQ from %u to %u",
498                                         __func__, ncq, NVME_QUEUES);
499                 ncq = NVME_QUEUES;
500         }
501
502         sc->num_cqueues = ncq;
503
504         sc->compl_queues = calloc(sc->num_cqueues + 1,
505                                 sizeof(struct nvme_completion_queue));
506         if (sc->compl_queues == NULL) {
507                 WPRINTF("%s: CQ allocation failed", __func__);
508                 sc->num_cqueues = 0;
509         } else {
510                 struct nvme_completion_queue *cq = sc->compl_queues;
511
512                 for (i = 0; i < sc->num_cqueues + 1; i++)
513                         pthread_mutex_init(&cq[i].mtx, NULL);
514         }
515 }
516
517 static void
518 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
519 {
520         struct nvme_controller_data *cd = &sc->ctrldata;
521
522         cd->vid = 0xFB5D;
523         cd->ssvid = 0x0000;
524
525         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
526         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
527
528         /* Num of submission commands that we can handle at a time (2^rab) */
529         cd->rab   = 4;
530
531         /* FreeBSD OUI */
532         cd->ieee[0] = 0x58;
533         cd->ieee[1] = 0x9c;
534         cd->ieee[2] = 0xfc;
535
536         cd->mic = 0;
537
538         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
539
540         cd->ver = NVME_REV(1,4);
541
542         cd->cntrltype = NVME_CNTRLTYPE_IO;
543         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
544         cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
545         cd->acl = 2;
546         cd->aerl = 4;
547
548         /* Advertise 1, Read-only firmware slot */
549         cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
550             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
551         cd->lpa = 0;    /* TODO: support some simple things like SMART */
552         cd->elpe = 0;   /* max error log page entries */
553         /*
554          * Report a single power state (zero-based value)
555          * power_state[] values are left as zero to indicate "Not reported"
556          */
557         cd->npss = 0;
558
559         /* Warning Composite Temperature Threshold */
560         cd->wctemp = 0x0157;
561         cd->cctemp = 0x0157;
562
563         /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
564         cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
565                         NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);
566
567         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
568             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
569         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
570             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
571         cd->nn = 1;     /* number of namespaces */
572
573         cd->oncs = 0;
574         switch (sc->dataset_management) {
575         case NVME_DATASET_MANAGEMENT_AUTO:
576                 if (sc->nvstore.deallocate)
577                         cd->oncs |= NVME_ONCS_DSM;
578                 break;
579         case NVME_DATASET_MANAGEMENT_ENABLE:
580                 cd->oncs |= NVME_ONCS_DSM;
581                 break;
582         default:
583                 break;
584         }
585
586         cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
587             NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
588
589         cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
590 }
591
592 /*
593  * Calculate the CRC-16 of the given buffer
594  * See copyright attribution at top of file
595  */
596 static uint16_t
597 crc16(uint16_t crc, const void *buffer, unsigned int len)
598 {
599         const unsigned char *cp = buffer;
600         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
601         static uint16_t const crc16_table[256] = {
602                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
603                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
604                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
605                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
606                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
607                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
608                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
609                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
610                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
611                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
612                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
613                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
614                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
615                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
616                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
617                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
618                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
619                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
620                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
621                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
622                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
623                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
624                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
625                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
626                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
627                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
628                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
629                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
630                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
631                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
632                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
633                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
634         };
635
636         while (len--)
637                 crc = (((crc >> 8) & 0xffU) ^
638                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
639         return crc;
640 }
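/*
 * This is the reflected table-driven CRC-16 with the 0x8005 polynomial,
 * init 0 and no final XOR (commonly called CRC-16/ARC); as a sanity check,
 * crc16(0, "123456789", 9) should yield 0xBB3D. Below it is used to fold an
 * arbitrary-length identity string into the low 16 bits of a synthesized
 * EUI-64.
 */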
641
642 static void
643 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
644     struct nvme_namespace_data *nd)
645 {
646
647         /* Get capacity and block size information from backing store */
648         nd->nsze = nvstore->size / nvstore->sectsz;
649         nd->ncap = nd->nsze;
650         nd->nuse = nd->nsze;
651 }
652
653 static void
654 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
655     struct nvme_namespace_data *nd, uint32_t nsid,
656     struct pci_nvme_blockstore *nvstore)
657 {
658
659         pci_nvme_init_nsdata_size(nvstore, nd);
660
661         if (nvstore->type == NVME_STOR_BLOCKIF)
662                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
663
664         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
665         nd->flbas = 0;
666
667         /* Create an EUI-64 if user did not provide one */
668         if (nvstore->eui64 == 0) {
669                 char *data = NULL;
670                 uint64_t eui64 = nvstore->eui64;
671
672                 asprintf(&data, "%s%u%u%u", get_config_value("name"),
673                     sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
674                     sc->nsc_pi->pi_func);
675
676                 if (data != NULL) {
677                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
678                         free(data);
679                 }
680                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
681         }
682         be64enc(nd->eui64, nvstore->eui64);
683
684         /* LBA data-sz = 2^lbads */
685         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
686 }
687
688 static void
689 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
690 {
691         __uint128_t power_cycles = 1;
692
693         memset(&sc->err_log, 0, sizeof(sc->err_log));
694         memset(&sc->health_log, 0, sizeof(sc->health_log));
695         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
696         memset(&sc->ns_log, 0, sizeof(sc->ns_log));
697
698         /* Set read/write remainder to round up according to spec */
699         sc->read_dunits_remainder = 999;
700         sc->write_dunits_remainder = 999;
701
702         /* Set nominal Health values checked by implementations */
703         sc->health_log.temperature = NVME_TEMPERATURE;
704         sc->health_log.available_spare = 100;
705         sc->health_log.available_spare_threshold = 10;
706
707         /* Set Active Firmware Info to slot 1 */
708         sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
709         memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
710             sizeof(sc->fw_log.revision[0]));
711
712         memcpy(&sc->health_log.power_cycles, &power_cycles,
713             sizeof(sc->health_log.power_cycles));
714 }
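/*
 * A note on the 999 initialization above: SMART Data Units are reported in
 * thousands of 512-byte units, rounded up. Starting the remainders at 999
 * means the first 512-byte unit read or written carries into the
 * corresponding Data Units counter (the carry itself happens in the I/O
 * accounting path, not shown in this excerpt), giving the spec's round-up
 * behavior.
 */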
715
716 static void
717 pci_nvme_init_features(struct pci_nvme_softc *sc)
718 {
719         enum nvme_feature       fid;
720
721         for (fid = 0; fid < NVME_FID_MAX; fid++) {
722                 switch (fid) {
723                 case NVME_FEAT_ARBITRATION:
724                 case NVME_FEAT_POWER_MANAGEMENT:
725                 case NVME_FEAT_INTERRUPT_COALESCING: //XXX
726                 case NVME_FEAT_WRITE_ATOMICITY:
727                         /* Mandatory but no special handling required */
728                 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
729                 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
730                 //                this returns a data buffer
731                         break;
732                 case NVME_FEAT_TEMPERATURE_THRESHOLD:
733                         sc->feat[fid].set = nvme_feature_temperature;
734                         break;
735                 case NVME_FEAT_ERROR_RECOVERY:
736                         sc->feat[fid].namespace_specific = true;
737                         break;
738                 case NVME_FEAT_NUMBER_OF_QUEUES:
739                         sc->feat[fid].set = nvme_feature_num_queues;
740                         break;
741                 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
742                         sc->feat[fid].set = nvme_feature_iv_config;
743                         break;
744                 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
745                         sc->feat[fid].set = nvme_feature_async_event;
746                         /* Enable all AENs by default */
747                         sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
748                         break;
749                 default:
750                         sc->feat[fid].set = nvme_feature_invalid_cb;
751                         sc->feat[fid].get = nvme_feature_invalid_cb;
752                 }
753         }
754 }
755
756 static void
757 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
758 {
759
760         STAILQ_INIT(&sc->aer_list);
761         sc->aer_count = 0;
762 }
763
764 static void
765 pci_nvme_aer_init(struct pci_nvme_softc *sc)
766 {
767
768         pthread_mutex_init(&sc->aer_mtx, NULL);
769         pci_nvme_aer_reset(sc);
770 }
771
772 static void
773 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
774 {
775         struct pci_nvme_aer *aer = NULL;
776
777         pthread_mutex_lock(&sc->aer_mtx);
778         while (!STAILQ_EMPTY(&sc->aer_list)) {
779                 aer = STAILQ_FIRST(&sc->aer_list);
780                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
781                 free(aer);
782         }
783         pthread_mutex_unlock(&sc->aer_mtx);
784
785         pci_nvme_aer_reset(sc);
786 }
787
788 static bool
789 pci_nvme_aer_available(struct pci_nvme_softc *sc)
790 {
791
792         return (sc->aer_count != 0);
793 }
794
795 static bool
796 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
797 {
798         struct nvme_controller_data *cd = &sc->ctrldata;
799
800         /* AERL is a zero-based value while aer_count is one-based */
801         return (sc->aer_count == (cd->aerl + 1U));
802 }
803
804 /*
805  * Add an Async Event Request
806  *
807  * Stores an AER to be returned later if the Controller needs to notify the
808  * host of an event.
809  * Note that while the NVMe spec doesn't require Controllers to return AER's
810  * in order, this implementation does preserve the order.
811  */
812 static int
813 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
814 {
815         struct pci_nvme_aer *aer = NULL;
816
817         aer = calloc(1, sizeof(struct pci_nvme_aer));
818         if (aer == NULL)
819                 return (-1);
820
821         /* Save the Command ID for use in the completion message */
822         aer->cid = cid;
823
824         pthread_mutex_lock(&sc->aer_mtx);
825         sc->aer_count++;
826         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
827         pthread_mutex_unlock(&sc->aer_mtx);
828
829         return (0);
830 }
831
832 /*
833  * Get an Async Event Request structure
834  *
835  * Returns a pointer to an AER previously submitted by the host or NULL if
836  * no AER's exist. Caller is responsible for freeing the returned struct.
837  */
838 static struct pci_nvme_aer *
839 pci_nvme_aer_get(struct pci_nvme_softc *sc)
840 {
841         struct pci_nvme_aer *aer = NULL;
842
843         pthread_mutex_lock(&sc->aer_mtx);
844         aer = STAILQ_FIRST(&sc->aer_list);
845         if (aer != NULL) {
846                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
847                 sc->aer_count--;
848         }
849         pthread_mutex_unlock(&sc->aer_mtx);
850
851         return (aer);
852 }
853
854 static void
855 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
856 {
857         uint32_t        atype;
858
859         memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
860
861         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
862                 sc->aen[atype].atype = atype;
863         }
864 }
865
866 static void
867 pci_nvme_aen_init(struct pci_nvme_softc *sc)
868 {
869         char nstr[80];
870
871         pci_nvme_aen_reset(sc);
872
873         pthread_mutex_init(&sc->aen_mtx, NULL);
874         pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
875         snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
876             sc->nsc_pi->pi_func);
877         pthread_set_name_np(sc->aen_tid, nstr);
878 }
879
880 static void
881 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
882 {
883
884         pci_nvme_aen_reset(sc);
885 }
886
887 /* Notify the AEN thread of pending work */
888 static void
889 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
890 {
891
892         pthread_cond_signal(&sc->aen_cond);
893 }
894
895 /*
896  * Post an Asynchronous Event Notification
897  */
898 static int32_t
899 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
900                 uint32_t event_data)
901 {
902         struct pci_nvme_aen *aen;
903
904         if (atype >= PCI_NVME_AE_TYPE_MAX) {
905                 return(EINVAL);
906         }
907
908         pthread_mutex_lock(&sc->aen_mtx);
909         aen = &sc->aen[atype];
910
911         /* Has the controller already posted an event of this type? */
912         if (aen->posted) {
913                 pthread_mutex_unlock(&sc->aen_mtx);
914                 return(EALREADY);
915         }
916
917         aen->event_data = event_data;
918         aen->posted = true;
919         pthread_mutex_unlock(&sc->aen_mtx);
920
921         pci_nvme_aen_notify(sc);
922
923         return(0);
924 }
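/*
 * Illustrative call sequence: after a namespace changes, the emulation can
 * post a Notice-type event with
 *
 *     pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
 *         PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
 *
 * which wakes aen_thr(); the event is completed against a host AER only if
 * one is outstanding and the event is not masked off in the Asynchronous
 * Event Configuration feature.
 */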
925
926 static void
927 pci_nvme_aen_process(struct pci_nvme_softc *sc)
928 {
929         struct pci_nvme_aer *aer;
930         struct pci_nvme_aen *aen;
931         pci_nvme_async_type atype;
932         uint32_t mask;
933         uint16_t status;
934         uint8_t lid;
935
936         assert(pthread_mutex_isowned_np(&sc->aen_mtx));
937         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
938                 aen = &sc->aen[atype];
939                 /* Previous iterations may have depleted the available AER's */
940                 if (!pci_nvme_aer_available(sc)) {
941                         DPRINTF("%s: no AER", __func__);
942                         break;
943                 }
944
945                 if (!aen->posted) {
946                         DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
947                         continue;
948                 }
949
950                 status = NVME_SC_SUCCESS;
951
952                 /* Is the event masked? */
953                 mask =
954                     sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
955
956                 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
957                 switch (atype) {
958                 case PCI_NVME_AE_TYPE_ERROR:
959                         lid = NVME_LOG_ERROR;
960                         break;
961                 case PCI_NVME_AE_TYPE_SMART:
962                         mask &= 0xff;
963                         if ((mask & aen->event_data) == 0)
964                                 continue;
965                         lid = NVME_LOG_HEALTH_INFORMATION;
966                         break;
967                 case PCI_NVME_AE_TYPE_NOTICE:
968                         if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
969                                 EPRINTLN("%s unknown AEN notice type %u",
970                                     __func__, aen->event_data);
971                                 status = NVME_SC_INTERNAL_DEVICE_ERROR;
972                                 lid = 0;
973                                 break;
974                         }
975                         if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
976                                 continue;
977                         switch (aen->event_data) {
978                         case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
979                                 lid = NVME_LOG_CHANGED_NAMESPACE;
980                                 break;
981                         case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
982                                 lid = NVME_LOG_FIRMWARE_SLOT;
983                                 break;
984                         case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
985                                 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
986                                 break;
987                         case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
988                                 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
989                                 break;
990                         case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
991                                 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
992                                 break;
993                         case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
994                                 lid = NVME_LOG_LBA_STATUS_INFORMATION;
995                                 break;
996                         case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
997                                 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
998                                 break;
999                         default:
1000                                 lid = 0;
1001                         }
1002                         break;
1003                 default:
1004                         /* bad type?!? */
1005                         EPRINTLN("%s unknown AEN type %u", __func__, atype);
1006                         status = NVME_SC_INTERNAL_DEVICE_ERROR;
1007                         lid = 0;
1008                         break;
1009                 }
1010
1011                 aer = pci_nvme_aer_get(sc);
1012                 assert(aer != NULL);
1013
1014                 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
1015                 pci_nvme_cq_update(sc, &sc->compl_queues[0],
1016                     (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
1017                     aer->cid,
1018                     0,          /* SQID */
1019                     status);
1020
1021                 aen->event_data = 0;
1022                 aen->posted = false;
1023
1024                 pci_generate_msix(sc->nsc_pi, 0);
1025         }
1026 }
1027
1028 static void *
1029 aen_thr(void *arg)
1030 {
1031         struct pci_nvme_softc *sc;
1032
1033         sc = arg;
1034
1035         pthread_mutex_lock(&sc->aen_mtx);
1036         for (;;) {
1037                 pci_nvme_aen_process(sc);
1038                 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1039         }
1040         pthread_mutex_unlock(&sc->aen_mtx);
1041
1042         pthread_exit(NULL);
1043         return (NULL);
1044 }
1045
1046 static void
1047 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1048 {
1049         uint32_t i;
1050
1051         DPRINTF("%s", __func__);
1052
1053         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1054             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1055             (60 << NVME_CAP_LO_REG_TO_SHIFT);
1056
1057         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1058
1059         sc->regs.vs = NVME_REV(1,4);    /* NVMe v1.4 */
1060
1061         sc->regs.cc = 0;
1062
1063         assert(sc->submit_queues != NULL);
1064
1065         for (i = 0; i < sc->num_squeues + 1; i++) {
1066                 sc->submit_queues[i].qbase = NULL;
1067                 sc->submit_queues[i].size = 0;
1068                 sc->submit_queues[i].cqid = 0;
1069                 sc->submit_queues[i].tail = 0;
1070                 sc->submit_queues[i].head = 0;
1071         }
1072
1073         assert(sc->compl_queues != NULL);
1074
1075         for (i = 0; i < sc->num_cqueues + 1; i++) {
1076                 sc->compl_queues[i].qbase = NULL;
1077                 sc->compl_queues[i].size = 0;
1078                 sc->compl_queues[i].tail = 0;
1079                 sc->compl_queues[i].head = 0;
1080         }
1081
1082         sc->num_q_is_set = false;
1083
1084         pci_nvme_aer_destroy(sc);
1085         pci_nvme_aen_destroy(sc);
1086
1087         /*
1088          * Clear CSTS.RDY last to prevent the host from enabling Controller
1089          * before cleanup completes
1090          */
1091         sc->regs.csts = 0;
1092 }
1093
1094 static void
1095 pci_nvme_reset(struct pci_nvme_softc *sc)
1096 {
1097         pthread_mutex_lock(&sc->mtx);
1098         pci_nvme_reset_locked(sc);
1099         pthread_mutex_unlock(&sc->mtx);
1100 }
1101
1102 static int
1103 pci_nvme_init_controller(struct pci_nvme_softc *sc)
1104 {
1105         uint16_t acqs, asqs;
1106
1107         DPRINTF("%s", __func__);
1108
1109         /*
1110          * NVMe 2.0 states that "enabling a controller while this field is
1111          * cleared to 0h produces undefined results" for both ACQS and
1112          * ASQS. If zero, set CFS and do not become ready.
1113          */
1114         asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
1115         if (asqs < 2) {
1116                 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
1117                     asqs - 1, sc->regs.aqa);
1118                 sc->regs.csts |= NVME_CSTS_CFS;
1119                 return (-1);
1120         }
1121         sc->submit_queues[0].size = asqs;
1122         sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1123             sc->regs.asq, sizeof(struct nvme_command) * asqs);
1124         if (sc->submit_queues[0].qbase == NULL) {
1125                 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
1126                     sc->regs.asq);
1127                 sc->regs.csts |= NVME_CSTS_CFS;
1128                 return (-1);
1129         }
1130
1131         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1132                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1133
1134         acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1135             NVME_AQA_REG_ACQS_MASK);
1136         if (acqs < 2) {
1137                 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
1138                     acqs - 1, sc->regs.aqa);
1139                 sc->regs.csts |= NVME_CSTS_CFS;
1140                 return (-1);
1141         }
1142         sc->compl_queues[0].size = acqs;
1143         sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1144             sc->regs.acq, sizeof(struct nvme_completion) * acqs);
1145         if (sc->compl_queues[0].qbase == NULL) {
1146                 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
1147                     sc->regs.acq);
1148                 sc->regs.csts |= NVME_CSTS_CFS;
1149                 return (-1);
1150         }
1151         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1152
1153         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1154                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1155
1156         return (0);
1157 }
1158
1159 static int
1160 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1161         size_t len, enum nvme_copy_dir dir)
1162 {
1163         uint8_t *p;
1164         size_t bytes;
1165
1166         if (len > (8 * 1024)) {
1167                 return (-1);
1168         }
1169
1170         /* Copy from the start of prp1 to the end of the physical page */
1171         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1172         bytes = MIN(bytes, len);
1173
1174         p = vm_map_gpa(ctx, prp1, bytes);
1175         if (p == NULL) {
1176                 return (-1);
1177         }
1178
1179         if (dir == NVME_COPY_TO_PRP)
1180                 memcpy(p, b, bytes);
1181         else
1182                 memcpy(b, p, bytes);
1183
1184         b += bytes;
1185
1186         len -= bytes;
1187         if (len == 0) {
1188                 return (0);
1189         }
1190
1191         len = MIN(len, PAGE_SIZE);
1192
1193         p = vm_map_gpa(ctx, prp2, len);
1194         if (p == NULL) {
1195                 return (-1);
1196         }
1197
1198         if (dir == NVME_COPY_TO_PRP)
1199                 memcpy(p, b, len);
1200         else
1201                 memcpy(b, p, len);
1202
1203         return (0);
1204 }
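/*
 * Example: a 6 KiB copy with a page-aligned prp1 moves the first 4 KiB from
 * the page at prp1 and the remaining 2 KiB from the page at prp2. Note that
 * prp2 is treated here as a second page pointer, never as a pointer to a
 * PRP list, which is why callers are limited to transfers of at most 8 KiB.
 */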
1205
1206 /*
1207  * Write a Completion Queue Entry update
1208  *
1209  * Write the completion and update the doorbell value
1210  */
1211 static void
1212 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1213                 struct nvme_completion_queue *cq,
1214                 uint32_t cdw0,
1215                 uint16_t cid,
1216                 uint16_t sqid,
1217                 uint16_t status)
1218 {
1219         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1220         struct nvme_completion *cqe;
1221
1222         assert(cq->qbase != NULL);
1223
1224         pthread_mutex_lock(&cq->mtx);
1225
1226         cqe = &cq->qbase[cq->tail];
1227
1228         /* Flip the phase bit */
1229         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1230
1231         cqe->cdw0 = cdw0;
1232         cqe->sqhd = sq->head;
1233         cqe->sqid = sqid;
1234         cqe->cid = cid;
1235         cqe->status = status;
1236
1237         cq->tail++;
1238         if (cq->tail >= cq->size) {
1239                 cq->tail = 0;
1240         }
1241
1242         pthread_mutex_unlock(&cq->mtx);
1243 }
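/*
 * Phase bit example: completion queue memory starts out zeroed, so on the
 * first pass through the queue the XOR above stores entries with P = 1;
 * after the tail wraps, newly written entries carry P = 0. The host keeps
 * an expected phase value (initially 1), treats an entry whose phase bit
 * matches that expectation as new, and flips the expectation each time its
 * head pointer wraps, which is why the bit must toggle on every pass.
 */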
1244
1245 static int
1246 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1247         struct nvme_completion* compl)
1248 {
1249         uint16_t qid = command->cdw10 & 0xffff;
1250
1251         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1252         if (qid == 0 || qid > sc->num_squeues ||
1253             (sc->submit_queues[qid].qbase == NULL)) {
1254                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1255                         __func__, qid, sc->num_squeues);
1256                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1257                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1258                 return (1);
1259         }
1260
1261         sc->submit_queues[qid].qbase = NULL;
1262         sc->submit_queues[qid].cqid = 0;
1263         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1264         return (1);
1265 }
1266
1267 static int
1268 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1269         struct nvme_completion* compl)
1270 {
1271         if (command->cdw11 & NVME_CMD_CDW11_PC) {
1272                 uint16_t qid = command->cdw10 & 0xffff;
1273                 struct nvme_submission_queue *nsq;
1274
1275                 if ((qid == 0) || (qid > sc->num_squeues) ||
1276                     (sc->submit_queues[qid].qbase != NULL)) {
1277                         WPRINTF("%s queue index %u > num_squeues %u",
1278                                 __func__, qid, sc->num_squeues);
1279                         pci_nvme_status_tc(&compl->status,
1280                             NVME_SCT_COMMAND_SPECIFIC,
1281                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1282                         return (1);
1283                 }
1284
1285                 nsq = &sc->submit_queues[qid];
1286                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1287                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1288                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1289                         /*
1290                          * Queues must specify at least two entries
1291                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1292                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1293                          */
1294                         pci_nvme_status_tc(&compl->status,
1295                             NVME_SCT_COMMAND_SPECIFIC,
1296                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1297                         return (1);
1298                 }
1299                 nsq->head = nsq->tail = 0;
1300
1301                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1302                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1303                         pci_nvme_status_tc(&compl->status,
1304                             NVME_SCT_COMMAND_SPECIFIC,
1305                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1306                         return (1);
1307                 }
1308
1309                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1310                         pci_nvme_status_tc(&compl->status,
1311                             NVME_SCT_COMMAND_SPECIFIC,
1312                             NVME_SC_COMPLETION_QUEUE_INVALID);
1313                         return (1);
1314                 }
1315
1316                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1317
1318                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1319                               sizeof(struct nvme_command) * (size_t)nsq->size);
1320
1321                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1322                         qid, nsq->size, nsq->qbase, nsq->cqid);
1323
1324                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1325
1326                 DPRINTF("%s completed creating IOSQ qid %u",
1327                          __func__, qid);
1328         } else {
1329                 /*
1330                  * Guest sent a non-contiguous submission queue request.
1331                  * This setting is unsupported by this emulation.
1332                  */
1333                 WPRINTF("%s unsupported non-contig (list-based) "
1334                          "create i/o submission queue", __func__);
1335
1336                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1337         }
1338         return (1);
1339 }
1340
1341 static int
1342 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1343         struct nvme_completion* compl)
1344 {
1345         uint16_t qid = command->cdw10 & 0xffff;
1346         uint16_t sqid;
1347
1348         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1349         if (qid == 0 || qid > sc->num_cqueues ||
1350             (sc->compl_queues[qid].qbase == NULL)) {
1351                 WPRINTF("%s queue index %u / num_cqueues %u",
1352                         __func__, qid, sc->num_cqueues);
1353                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1354                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1355                 return (1);
1356         }
1357
1358         /* Deleting an Active CQ is an error */
1359         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1360                 if (sc->submit_queues[sqid].cqid == qid) {
1361                         pci_nvme_status_tc(&compl->status,
1362                             NVME_SCT_COMMAND_SPECIFIC,
1363                             NVME_SC_INVALID_QUEUE_DELETION);
1364                         return (1);
1365                 }
1366
1367         sc->compl_queues[qid].qbase = NULL;
1368         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1369         return (1);
1370 }
1371
1372 static int
1373 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1374         struct nvme_completion* compl)
1375 {
1376         struct nvme_completion_queue *ncq;
1377         uint16_t qid = command->cdw10 & 0xffff;
1378
1379         /* Only support Physically Contiguous queues */
1380         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1381                 WPRINTF("%s unsupported non-contig (list-based) "
1382                          "create i/o completion queue",
1383                          __func__);
1384
1385                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1386                 return (1);
1387         }
1388
1389         if ((qid == 0) || (qid > sc->num_cqueues) ||
1390             (sc->compl_queues[qid].qbase != NULL)) {
1391                 WPRINTF("%s queue index %u > num_cqueues %u",
1392                         __func__, qid, sc->num_cqueues);
1393                 pci_nvme_status_tc(&compl->status,
1394                     NVME_SCT_COMMAND_SPECIFIC,
1395                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1396                 return (1);
1397         }
1398
1399         ncq = &sc->compl_queues[qid];
1400         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1401         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1402         if (ncq->intr_vec > (sc->max_queues + 1)) {
1403                 pci_nvme_status_tc(&compl->status,
1404                     NVME_SCT_COMMAND_SPECIFIC,
1405                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1406                 return (1);
1407         }
1408
1409         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1410         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1411                 /*
1412                  * Queues must specify at least two entries
1413                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1414                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1415                  */
1416                 pci_nvme_status_tc(&compl->status,
1417                     NVME_SCT_COMMAND_SPECIFIC,
1418                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1419                 return (1);
1420         }
1421         ncq->head = ncq->tail = 0;
1422         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1423                      command->prp1,
1424                      sizeof(struct nvme_command) * (size_t)ncq->size);
1425
1426         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1427
1428
1429         return (1);
1430 }
1431
1432 static int
1433 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1434         struct nvme_completion* compl)
1435 {
1436         uint64_t logoff;
1437         uint32_t logsize;
1438         uint8_t logpage;
1439
1440         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1441
1442         /*
1443          * Command specifies the number of dwords to return in fields NUMDU
1444          * and NUMDL. This is a zero-based value.
1445          */
1446         logpage = command->cdw10 & 0xFF;
1447         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1448         logsize *= sizeof(uint32_t);
1449         logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1450
1451         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1452
1453         switch (logpage) {
1454         case NVME_LOG_ERROR:
1455                 if (logoff >= sizeof(sc->err_log)) {
1456                         pci_nvme_status_genc(&compl->status,
1457                             NVME_SC_INVALID_FIELD);
1458                         break;
1459                 }
1460
1461                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1462                     command->prp2, (uint8_t *)&sc->err_log + logoff,
1463                     MIN(logsize - logoff, sizeof(sc->err_log)),
1464                     NVME_COPY_TO_PRP);
1465                 break;
1466         case NVME_LOG_HEALTH_INFORMATION:
1467                 if (logoff >= sizeof(sc->health_log)) {
1468                         pci_nvme_status_genc(&compl->status,
1469                             NVME_SC_INVALID_FIELD);
1470                         break;
1471                 }
1472
1473                 pthread_mutex_lock(&sc->mtx);
1474                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1475                     sizeof(sc->health_log.data_units_read));
1476                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1477                     sizeof(sc->health_log.data_units_written));
1478                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1479                     sizeof(sc->health_log.host_read_commands));
1480                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1481                     sizeof(sc->health_log.host_write_commands));
1482                 pthread_mutex_unlock(&sc->mtx);
1483
1484                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1485                     command->prp2, (uint8_t *)&sc->health_log + logoff,
1486                     MIN(logsize, sizeof(sc->health_log) - logoff),
1487                     NVME_COPY_TO_PRP);
1488                 break;
1489         case NVME_LOG_FIRMWARE_SLOT:
1490                 if (logoff >= sizeof(sc->fw_log)) {
1491                         pci_nvme_status_genc(&compl->status,
1492                             NVME_SC_INVALID_FIELD);
1493                         break;
1494                 }
1495
1496                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1497                     command->prp2, (uint8_t *)&sc->fw_log + logoff,
1498                     MIN(logsize, sizeof(sc->fw_log) - logoff),
1499                     NVME_COPY_TO_PRP);
1500                 break;
1501         case NVME_LOG_CHANGED_NAMESPACE:
1502                 if (logoff >= sizeof(sc->ns_log)) {
1503                         pci_nvme_status_genc(&compl->status,
1504                             NVME_SC_INVALID_FIELD);
1505                         break;
1506                 }
1507
1508                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1509                     command->prp2, (uint8_t *)&sc->ns_log + logoff,
1510                     MIN(logsize, sizeof(sc->ns_log) - logoff),
1511                     NVME_COPY_TO_PRP);
1512                 memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1513                 break;
1514         default:
1515                 DPRINTF("%s get log page %x command not supported",
1516                         __func__, logpage);
1517
1518                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1519                     NVME_SC_INVALID_LOG_PAGE);
1520         }
1521
1522         return (1);
1523 }
1524
1525 static int
1526 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1527         struct nvme_completion* compl)
1528 {
1529         void *dest;
1530         uint16_t status;
1531
1532         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1533                 command->cdw10 & 0xFF, command->nsid);
1534
1535         status = 0;
1536         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1537
1538         switch (command->cdw10 & 0xFF) {
1539         case 0x00: /* return Identify Namespace data structure */
1540                 /* Global NS only valid with NS Management */
1541                 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1542                         pci_nvme_status_genc(&status,
1543                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1544                         break;
1545                 }
1546                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1547                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1548                     NVME_COPY_TO_PRP);
1549                 break;
1550         case 0x01: /* return Identify Controller data structure */
1551                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1552                     command->prp2, (uint8_t *)&sc->ctrldata,
1553                     sizeof(sc->ctrldata),
1554                     NVME_COPY_TO_PRP);
1555                 break;
1556         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1557                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1558                                   sizeof(uint32_t) * 1024);
1559                 /* All unused entries shall be zero */
1560                 memset(dest, 0, sizeof(uint32_t) * 1024);
1561                 ((uint32_t *)dest)[0] = 1;
1562                 break;
1563         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1564                 if (command->nsid != 1) {
1565                         pci_nvme_status_genc(&status,
1566                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1567                         break;
1568                 }
1569                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1570                                   sizeof(uint32_t) * 1024);
1571                 /* All bytes after the descriptor shall be zero */
1572                 memset(dest, 0, sizeof(uint32_t) * 1024);
1573
1574                 /* Return NIDT=1 (i.e. EUI64) descriptor */
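                /*
                 * Descriptor layout: byte 0 = NIDT, byte 1 = NIDL,
                 * bytes 2-3 reserved, identifier data from byte 4.
                 * For an EUI-64 identifier, NIDT is 1 and NIDL is 8.
                 */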
1575                 ((uint8_t *)dest)[0] = 1;
1576                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1577                 memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
1578                 break;
1579         case 0x13:
1580                 /*
1581                  * Controller list is optional but used by UNH tests. Return
1582                  * a valid but empty list.
1583                  */
1584                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1585                                   sizeof(uint16_t) * 2048);
1586                 memset(dest, 0, sizeof(uint16_t) * 2048);
1587                 break;
1588         default:
1589                 DPRINTF("%s unsupported identify command requested 0x%x",
1590                          __func__, command->cdw10 & 0xFF);
1591                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1592                 break;
1593         }
1594
1595         compl->status = status;
1596         return (1);
1597 }
1598
1599 static const char *
1600 nvme_fid_to_name(uint8_t fid)
1601 {
1602         const char *name;
1603
1604         switch (fid) {
1605         case NVME_FEAT_ARBITRATION:
1606                 name = "Arbitration";
1607                 break;
1608         case NVME_FEAT_POWER_MANAGEMENT:
1609                 name = "Power Management";
1610                 break;
1611         case NVME_FEAT_LBA_RANGE_TYPE:
1612                 name = "LBA Range Type";
1613                 break;
1614         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1615                 name = "Temperature Threshold";
1616                 break;
1617         case NVME_FEAT_ERROR_RECOVERY:
1618                 name = "Error Recovery";
1619                 break;
1620         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1621                 name = "Volatile Write Cache";
1622                 break;
1623         case NVME_FEAT_NUMBER_OF_QUEUES:
1624                 name = "Number of Queues";
1625                 break;
1626         case NVME_FEAT_INTERRUPT_COALESCING:
1627                 name = "Interrupt Coalescing";
1628                 break;
1629         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1630                 name = "Interrupt Vector Configuration";
1631                 break;
1632         case NVME_FEAT_WRITE_ATOMICITY:
1633                 name = "Write Atomicity Normal";
1634                 break;
1635         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1636                 name = "Asynchronous Event Configuration";
1637                 break;
1638         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1639                 name = "Autonomous Power State Transition";
1640                 break;
1641         case NVME_FEAT_HOST_MEMORY_BUFFER:
1642                 name = "Host Memory Buffer";
1643                 break;
1644         case NVME_FEAT_TIMESTAMP:
1645                 name = "Timestamp";
1646                 break;
1647         case NVME_FEAT_KEEP_ALIVE_TIMER:
1648                 name = "Keep Alive Timer";
1649                 break;
1650         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1651                 name = "Host Controlled Thermal Management";
1652                 break;
1653         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1654                 name = "Non-Operational Power State Config";
1655                 break;
1656         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1657                 name = "Read Recovery Level Config";
1658                 break;
1659         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1660                 name = "Predictable Latency Mode Config";
1661                 break;
1662         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1663                 name = "Predictable Latency Mode Window";
1664                 break;
1665         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1666                 name = "LBA Status Information Report Interval";
1667                 break;
1668         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1669                 name = "Host Behavior Support";
1670                 break;
1671         case NVME_FEAT_SANITIZE_CONFIG:
1672                 name = "Sanitize Config";
1673                 break;
1674         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1675                 name = "Endurance Group Event Configuration";
1676                 break;
1677         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1678                 name = "Software Progress Marker";
1679                 break;
1680         case NVME_FEAT_HOST_IDENTIFIER:
1681                 name = "Host Identifier";
1682                 break;
1683         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1684                 name = "Reservation Notification Mask";
1685                 break;
1686         case NVME_FEAT_RESERVATION_PERSISTENCE:
1687                 name = "Reservation Persistence";
1688                 break;
1689         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1690                 name = "Namespace Write Protection Config";
1691                 break;
1692         default:
1693                 name = "Unknown";
1694                 break;
1695         }
1696
1697         return (name);
1698 }
1699
1700 static void
1701 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
1702     struct nvme_feature_obj *feat __unused,
1703     struct nvme_command *command __unused,
1704     struct nvme_completion *compl)
1705 {
1706         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1707 }
1708
1709 static void
1710 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1711     struct nvme_feature_obj *feat __unused,
1712     struct nvme_command *command,
1713     struct nvme_completion *compl)
1714 {
1715         uint32_t i;
1716         uint32_t cdw11 = command->cdw11;
1717         uint16_t iv;
1718         bool cd;
1719
1720         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1721
1722         iv = cdw11 & 0xffff;
1723         cd = cdw11 & (1 << 16);
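        /* CDW11 layout for this feature: bits 15:0 = IV, bit 16 = CD */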
1724
1725         if (iv > (sc->max_queues + 1)) {
1726                 return;
1727         }
1728
1729         /* Interrupt Coalescing may not be enabled (i.e. CD clear) for the Admin Q */
1730         if ((iv == 0) && !cd)
1731                 return;
1732
1733         /* Requested Interrupt Vector must be used by a CQ */
1734         for (i = 0; i < sc->num_cqueues + 1; i++) {
1735                 if (sc->compl_queues[i].intr_vec == iv) {
1736                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1737                 }
1738         }
1739 }
1740
1741 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP                (0x4000)
1742 static void
1743 nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
1744     struct nvme_feature_obj *feat __unused,
1745     struct nvme_command *command,
1746     struct nvme_completion *compl)
1747 {
1748         if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1749                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1750 }
1751
1752 #define NVME_TEMP_THRESH_OVER   0
1753 #define NVME_TEMP_THRESH_UNDER  1
1754 static void
1755 nvme_feature_temperature(struct pci_nvme_softc *sc,
1756     struct nvme_feature_obj *feat __unused,
1757     struct nvme_command *command,
1758     struct nvme_completion *compl)
1759 {
1760         uint16_t        tmpth;  /* Temperature Threshold */
1761         uint8_t         tmpsel; /* Threshold Temperature Select */
1762         uint8_t         thsel;  /* Threshold Type Select */
1763         bool            set_crit = false;
1764         bool            report_crit;
1765
1766         tmpth  = command->cdw11 & 0xffff;
1767         tmpsel = (command->cdw11 >> 16) & 0xf;
1768         thsel  = (command->cdw11 >> 20) & 0x3;
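        /* CDW11 layout: TMPTH in bits 15:0, TMPSEL in 19:16, THSEL in 21:20 */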
1769
1770         DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1771
1772         /* Check for unsupported values */
1773         if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1774             (thsel > NVME_TEMP_THRESH_UNDER)) {
1775                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1776                 return;
1777         }
1778
1779         if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1780             ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1781                 set_crit = true;
1782
1783         pthread_mutex_lock(&sc->mtx);
1784         if (set_crit)
1785                 sc->health_log.critical_warning |=
1786                     NVME_CRIT_WARN_ST_TEMPERATURE;
1787         else
1788                 sc->health_log.critical_warning &=
1789                     ~NVME_CRIT_WARN_ST_TEMPERATURE;
1790         pthread_mutex_unlock(&sc->mtx);
1791
1792         report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
1793             NVME_CRIT_WARN_ST_TEMPERATURE;
1794
1795         if (set_crit && report_crit)
1796                 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1797                     sc->health_log.critical_warning);
1798
1799         DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1800 }
1801
1802 static void
1803 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1804     struct nvme_feature_obj *feat __unused,
1805     struct nvme_command *command,
1806     struct nvme_completion *compl)
1807 {
1808         uint16_t nqr;   /* Number of Queues Requested */
1809
1810         if (sc->num_q_is_set) {
1811                 WPRINTF("%s: Number of Queues already set", __func__);
1812                 pci_nvme_status_genc(&compl->status,
1813                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1814                 return;
1815         }
1816
1817         nqr = command->cdw11 & 0xFFFF;
1818         if (nqr == 0xffff) {
1819                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1820                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1821                 return;
1822         }
1823
1824         sc->num_squeues = ONE_BASED(nqr);
1825         if (sc->num_squeues > sc->max_queues) {
1826                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1827                                         sc->max_queues);
1828                 sc->num_squeues = sc->max_queues;
1829         }
1830
1831         nqr = (command->cdw11 >> 16) & 0xFFFF;
1832         if (nqr == 0xffff) {
1833                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1834                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1835                 return;
1836         }
1837
1838         sc->num_cqueues = ONE_BASED(nqr);
1839         if (sc->num_cqueues > sc->max_queues) {
1840                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1841                                         sc->max_queues);
1842                 sc->num_cqueues = sc->max_queues;
1843         }
1844
1845         /* Patch the command value which will be saved on callback's return */
1846         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1847         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
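        /* cdw0 reports back the number of queues actually allocated */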
1848
1849         sc->num_q_is_set = true;
1850 }
1851
1852 static int
1853 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1854         struct nvme_completion *compl)
1855 {
1856         struct nvme_feature_obj *feat;
1857         uint32_t nsid = command->nsid;
1858         uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
1859         bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);
1860
1861         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1862
1863         if (fid >= NVME_FID_MAX) {
1864                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1865                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1866                 return (1);
1867         }
1868
1869         if (sv) {
1870                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1871                     NVME_SC_FEATURE_NOT_SAVEABLE);
1872                 return (1);
1873         }
1874
1875         feat = &sc->feat[fid];
1876
1877         if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1878                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1879                 return (1);
1880         }
1881
1882         if (!feat->namespace_specific &&
1883             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1884                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1885                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1886                 return (1);
1887         }
1888
1889         compl->cdw0 = 0;
1890         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1891
1892         if (feat->set)
1893                 feat->set(sc, feat, command, compl);
1894         else {
1895                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1896                     NVME_SC_FEATURE_NOT_CHANGEABLE);
1897                 return (1);
1898         }
1899
1900         DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1901         if (compl->status == NVME_SC_SUCCESS) {
1902                 feat->cdw11 = command->cdw11;
1903                 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1904                     (command->cdw11 != 0))
1905                         pci_nvme_aen_notify(sc);
1906         }
1907
1908         return (0);
1909 }
1910
1911 #define NVME_FEATURES_SEL_SUPPORTED     0x3
1912 #define NVME_FEATURES_NS_SPECIFIC       (1 << 1)
1913
1914 static int
1915 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1916         struct nvme_completion* compl)
1917 {
1918         struct nvme_feature_obj *feat;
1919         uint8_t fid = command->cdw10 & 0xFF;
1920         uint8_t sel = (command->cdw10 >> 8) & 0x7;
1921
1922         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1923
1924         if (fid >= NVME_FID_MAX) {
1925                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1926                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1927                 return (1);
1928         }
1929
1930         compl->cdw0 = 0;
1931         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1932
1933         feat = &sc->feat[fid];
1934         if (feat->get) {
1935                 feat->get(sc, feat, command, compl);
1936         }
1937
1938         if (compl->status == NVME_SC_SUCCESS) {
1939                 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1940                         compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1941                 else
1942                         compl->cdw0 = feat->cdw11;
1943         }
1944
1945         return (0);
1946 }
1947
1948 static int
1949 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1950         struct nvme_completion* compl)
1951 {
1952         uint8_t ses, lbaf, pi;
1953
1954         /* Only supports Secure Erase Setting - User Data Erase */
1955         ses = (command->cdw10 >> 9) & 0x7;
1956         if (ses > 0x1) {
1957                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1958                 return (1);
1959         }
1960
1961         /* Only supports a single LBA Format */
1962         lbaf = command->cdw10 & 0xf;
1963         if (lbaf != 0) {
1964                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1965                     NVME_SC_INVALID_FORMAT);
1966                 return (1);
1967         }
1968
1969         /* Doesn't support Protection Information */
1970         pi = (command->cdw10 >> 5) & 0x7;
1971         if (pi != 0) {
1972                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1973                 return (1);
1974         }
1975
1976         if (sc->nvstore.type == NVME_STOR_RAM) {
1977                 if (sc->nvstore.ctx)
1978                         free(sc->nvstore.ctx);
1979                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1980                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1981         } else {
1982                 struct pci_nvme_ioreq *req;
1983                 int err;
1984
1985                 req = pci_nvme_get_ioreq(sc);
1986                 if (req == NULL) {
1987                         pci_nvme_status_genc(&compl->status,
1988                             NVME_SC_INTERNAL_DEVICE_ERROR);
1989                         WPRINTF("%s: unable to allocate IO req", __func__);
1990                         return (1);
1991                 }
1992                 req->nvme_sq = &sc->submit_queues[0];
1993                 req->sqid = 0;
1994                 req->opc = command->opc;
1995                 req->cid = command->cid;
1996                 req->nsid = command->nsid;
1997
1998                 req->io_req.br_offset = 0;
1999                 req->io_req.br_resid = sc->nvstore.size;
2000                 req->io_req.br_callback = pci_nvme_io_done;
2001
2002                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
2003                 if (err) {
2004                         pci_nvme_status_genc(&compl->status,
2005                             NVME_SC_INTERNAL_DEVICE_ERROR);
2006                         pci_nvme_release_ioreq(sc, req);
2007                 } else
2008                         compl->status = NVME_NO_STATUS;
2009         }
2010
2011         return (1);
2012 }
2013
2014 static int
2015 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
2016     struct nvme_completion *compl)
2017 {
2018         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
2019                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
2020
2021         /* TODO: search for the command ID and abort it */
2022
2023         compl->cdw0 = 1;
2024         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
2025         return (1);
2026 }
2027
2028 static int
2029 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
2030         struct nvme_command* command, struct nvme_completion* compl)
2031 {
2032         DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
2033             sc->aer_count, sc->ctrldata.aerl, command->cid);
2034
2035         /* Don't exceed the Async Event Request Limit (AERL). */
2036         if (pci_nvme_aer_limit_reached(sc)) {
2037                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
2038                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
2039                 return (1);
2040         }
2041
2042         if (pci_nvme_aer_add(sc, command->cid)) {
2043                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
2044                                 NVME_SC_INTERNAL_DEVICE_ERROR);
2045                 return (1);
2046         }
2047
2048         /*
2049          * Events are raised as they occur, as configured via the Set Features cmd.
2050          * These events happen asynchronously, so do not post a completion now;
2051          * one is posted later when an event matching the request occurs.
2052          */
2053         compl->status = NVME_NO_STATUS;
2054         pci_nvme_aen_notify(sc);
2055
2056         return (0);
2057 }
2058
2059 static void
2060 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
2061 {
2062         struct nvme_completion compl;
2063         struct nvme_command *cmd;
2064         struct nvme_submission_queue *sq;
2065         struct nvme_completion_queue *cq;
2066         uint16_t sqhead;
2067
2068         DPRINTF("%s index %u", __func__, (uint32_t)value);
2069
2070         sq = &sc->submit_queues[0];
2071         cq = &sc->compl_queues[0];
2072
2073         pthread_mutex_lock(&sq->mtx);
2074
2075         sqhead = sq->head;
2076         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2077
2078         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2079                 cmd = &(sq->qbase)[sqhead];
2080                 compl.cdw0 = 0;
2081                 compl.status = 0;
2082
2083                 switch (cmd->opc) {
2084                 case NVME_OPC_DELETE_IO_SQ:
2085                         DPRINTF("%s command DELETE_IO_SQ", __func__);
2086                         nvme_opc_delete_io_sq(sc, cmd, &compl);
2087                         break;
2088                 case NVME_OPC_CREATE_IO_SQ:
2089                         DPRINTF("%s command CREATE_IO_SQ", __func__);
2090                         nvme_opc_create_io_sq(sc, cmd, &compl);
2091                         break;
2092                 case NVME_OPC_DELETE_IO_CQ:
2093                         DPRINTF("%s command DELETE_IO_CQ", __func__);
2094                         nvme_opc_delete_io_cq(sc, cmd, &compl);
2095                         break;
2096                 case NVME_OPC_CREATE_IO_CQ:
2097                         DPRINTF("%s command CREATE_IO_CQ", __func__);
2098                         nvme_opc_create_io_cq(sc, cmd, &compl);
2099                         break;
2100                 case NVME_OPC_GET_LOG_PAGE:
2101                         DPRINTF("%s command GET_LOG_PAGE", __func__);
2102                         nvme_opc_get_log_page(sc, cmd, &compl);
2103                         break;
2104                 case NVME_OPC_IDENTIFY:
2105                         DPRINTF("%s command IDENTIFY", __func__);
2106                         nvme_opc_identify(sc, cmd, &compl);
2107                         break;
2108                 case NVME_OPC_ABORT:
2109                         DPRINTF("%s command ABORT", __func__);
2110                         nvme_opc_abort(sc, cmd, &compl);
2111                         break;
2112                 case NVME_OPC_SET_FEATURES:
2113                         DPRINTF("%s command SET_FEATURES", __func__);
2114                         nvme_opc_set_features(sc, cmd, &compl);
2115                         break;
2116                 case NVME_OPC_GET_FEATURES:
2117                         DPRINTF("%s command GET_FEATURES", __func__);
2118                         nvme_opc_get_features(sc, cmd, &compl);
2119                         break;
2120                 case NVME_OPC_FIRMWARE_ACTIVATE:
2121                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2122                         pci_nvme_status_tc(&compl.status,
2123                             NVME_SCT_COMMAND_SPECIFIC,
2124                             NVME_SC_INVALID_FIRMWARE_SLOT);
2125                         break;
2126                 case NVME_OPC_ASYNC_EVENT_REQUEST:
2127                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2128                         nvme_opc_async_event_req(sc, cmd, &compl);
2129                         break;
2130                 case NVME_OPC_FORMAT_NVM:
2131                         DPRINTF("%s command FORMAT_NVM", __func__);
2132                         if ((sc->ctrldata.oacs &
2133                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2134                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2135                                 break;
2136                         }
2137                         nvme_opc_format_nvm(sc, cmd, &compl);
2138                         break;
2139                 case NVME_OPC_SECURITY_SEND:
2140                 case NVME_OPC_SECURITY_RECEIVE:
2141                 case NVME_OPC_SANITIZE:
2142                 case NVME_OPC_GET_LBA_STATUS:
2143                         DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2144                             cmd->opc);
2145                         /* Valid but unsupported opcodes */
2146                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2147                         break;
2148                 default:
2149                         DPRINTF("%s command OPC=%#X (not implemented)",
2150                             __func__,
2151                             cmd->opc);
2152                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2153                 }
2154                 sqhead = (sqhead + 1) % sq->size;
2155
2156                 if (NVME_COMPLETION_VALID(compl)) {
2157                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
2158                             compl.cdw0,
2159                             cmd->cid,
2160                             0,          /* SQID */
2161                             compl.status);
2162                 }
2163         }
2164
2165         DPRINTF("setting sqhead %u", sqhead);
2166         sq->head = sqhead;
2167
2168         if (cq->head != cq->tail)
2169                 pci_generate_msix(sc->nsc_pi, 0);
2170
2171         pthread_mutex_unlock(&sq->mtx);
2172 }
2173
2174 /*
2175  * Update the Write and Read statistics reported in SMART data
2176  *
2177  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2178  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2179  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
2180  */
2181 static void
2182 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2183     size_t bytes, uint16_t status)
2184 {
2185
2186         pthread_mutex_lock(&sc->mtx);
2187         switch (opc) {
2188         case NVME_OPC_WRITE:
2189                 sc->write_commands++;
2190                 if (status != NVME_SC_SUCCESS)
2191                         break;
2192                 sc->write_dunits_remainder += (bytes / 512);
2193                 while (sc->write_dunits_remainder >= 1000) {
2194                         sc->write_data_units++;
2195                         sc->write_dunits_remainder -= 1000;
2196                 }
2197                 break;
2198         case NVME_OPC_READ:
2199                 sc->read_commands++;
2200                 if (status != NVME_SC_SUCCESS)
2201                         break;
2202                 sc->read_dunits_remainder += (bytes / 512);
2203                 while (sc->read_dunits_remainder >= 1000) {
2204                         sc->read_data_units++;
2205                         sc->read_dunits_remainder -= 1000;
2206                 }
2207                 break;
2208         default:
2209                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2210                 break;
2211         }
2212         pthread_mutex_unlock(&sc->mtx);
2213 }
2214
2215 /*
2216  * Check if the combination of Starting LBA (slba) and number of blocks
2217  * exceeds the range of the underlying storage.
2218  *
2219  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2220  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2221  * overflow.
2222  */
2223 static bool
2224 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2225     uint32_t nblocks)
2226 {
2227         size_t  offset, bytes;
2228
2229         /* Overflow check of multiplying Starting LBA by the sector size */
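        /*
         * E.g. with 512-byte sectors (sectsz_bits == 9), any slba at or
         * above (1ULL << 55) would wrap the 64-bit byte offset below.
         */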
2230         if (slba >> (64 - nvstore->sectsz_bits))
2231                 return (true);
2232
2233         offset = slba << nvstore->sectsz_bits;
2234         bytes = nblocks << nvstore->sectsz_bits;
2235
2236         /* Overflow check of Number of Logical Blocks */
2237         if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2238                 return (true);
2239
2240         return (false);
2241 }
2242
2243 static int
2244 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused,
2245     struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset)
2246 {
2247         int iovidx;
2248         bool range_is_contiguous;
2249
2250         if (req == NULL)
2251                 return (-1);
2252
2253         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2254                 return (-1);
2255         }
2256
2257         /*
2258          * Minimize the number of IOVs by concatenating contiguous address
2259          * ranges. If the IOV count is zero, there is no previous range to
2260          * concatenate.
2261          */
2262         if (req->io_req.br_iovcnt == 0)
2263                 range_is_contiguous = false;
2264         else
2265                 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
2266
2267         if (range_is_contiguous) {
2268                 iovidx = req->io_req.br_iovcnt - 1;
2269
2270                 req->io_req.br_iov[iovidx].iov_base =
2271                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2272                                      req->prev_gpaddr, size);
2273                 if (req->io_req.br_iov[iovidx].iov_base == NULL)
2274                         return (-1);
2275
2276                 req->prev_size += size;
2277                 req->io_req.br_resid += size;
2278
2279                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2280         } else {
2281                 iovidx = req->io_req.br_iovcnt;
2282                 if (iovidx == 0) {
2283                         req->io_req.br_offset = offset;
2284                         req->io_req.br_resid = 0;
2285                         req->io_req.br_param = req;
2286                 }
2287
2288                 req->io_req.br_iov[iovidx].iov_base =
2289                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2290                                      gpaddr, size);
2291                 if (req->io_req.br_iov[iovidx].iov_base == NULL)
2292                         return (-1);
2293
2294                 req->io_req.br_iov[iovidx].iov_len = size;
2295
2296                 req->prev_gpaddr = gpaddr;
2297                 req->prev_size = size;
2298                 req->io_req.br_resid += size;
2299
2300                 req->io_req.br_iovcnt++;
2301         }
2302
2303         return (0);
2304 }
2305
2306 static void
2307 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2308     struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
2309 {
2310         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2311
2312         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2313                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2314                  NVME_STATUS_GET_SC(status));
2315
2316         pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);
2317
2318         if (cq->head != cq->tail) {
2319                 if (cq->intr_en & NVME_CQ_INTEN) {
2320                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2321                 } else {
2322                         DPRINTF("%s: CQ%u interrupt disabled",
2323                                                 __func__, sq->cqid);
2324                 }
2325         }
2326 }
2327
2328 static void
2329 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2330 {
2331         req->sc = NULL;
2332         req->nvme_sq = NULL;
2333         req->sqid = 0;
2334
2335         pthread_mutex_lock(&sc->mtx);
2336
2337         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2338         sc->pending_ios--;
2339
2340         /* With no more I/O pending, set RDY if the device has been reset and enabled */
2341         if (sc->pending_ios == 0 &&
2342             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2343                 sc->regs.csts |= NVME_CSTS_RDY;
2344
2345         pthread_mutex_unlock(&sc->mtx);
2346
2347         sem_post(&sc->iosemlock);
2348 }
2349
2350 static struct pci_nvme_ioreq *
2351 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2352 {
2353         struct pci_nvme_ioreq *req = NULL;
2354
2355         sem_wait(&sc->iosemlock);
2356         pthread_mutex_lock(&sc->mtx);
2357
2358         req = STAILQ_FIRST(&sc->ioreqs_free);
2359         assert(req != NULL);
2360         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2361
2362         req->sc = sc;
2363
2364         sc->pending_ios++;
2365
2366         pthread_mutex_unlock(&sc->mtx);
2367
2368         req->io_req.br_iovcnt = 0;
2369         req->io_req.br_offset = 0;
2370         req->io_req.br_resid = 0;
2371         req->io_req.br_param = req;
2372         req->prev_gpaddr = 0;
2373         req->prev_size = 0;
2374
2375         return req;
2376 }
2377
2378 static void
2379 pci_nvme_io_done(struct blockif_req *br, int err)
2380 {
2381         struct pci_nvme_ioreq *req = br->br_param;
2382         struct nvme_submission_queue *sq = req->nvme_sq;
2383         uint16_t code, status;
2384
2385         DPRINTF("%s error %d %s", __func__, err, strerror(err));
2386
2387         /* TODO return correct error */
2388         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2389         status = 0;
2390         pci_nvme_status_genc(&status, code);
2391
2392         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
2393         pci_nvme_stats_write_read_update(req->sc, req->opc,
2394             req->bytes, status);
2395         pci_nvme_release_ioreq(req->sc, req);
2396 }
2397
2398 /*
2399  * Implements the Flush command. The specification states:
2400  *    If a volatile write cache is not present, Flush commands complete
2401  *    successfully and have no effect
2402  * in the description of the Volatile Write Cache (VWC) field of the Identify
2403  * Controller data. Therefore, set status to Success if the command is
2404  * not supported (i.e. RAM or as indicated by the blockif).
2405  */
2406 static bool
2407 nvme_opc_flush(struct pci_nvme_softc *sc __unused,
2408     struct nvme_command *cmd __unused,
2409     struct pci_nvme_blockstore *nvstore,
2410     struct pci_nvme_ioreq *req,
2411     uint16_t *status)
2412 {
2413         bool pending = false;
2414
2415         if (nvstore->type == NVME_STOR_RAM) {
2416                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2417         } else {
2418                 int err;
2419
2420                 req->io_req.br_callback = pci_nvme_io_done;
2421
2422                 err = blockif_flush(nvstore->ctx, &req->io_req);
2423                 switch (err) {
2424                 case 0:
2425                         pending = true;
2426                         break;
2427                 case EOPNOTSUPP:
2428                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2429                         break;
2430                 default:
2431                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2432                 }
2433         }
2434
2435         return (pending);
2436 }
2437
2438 static uint16_t
2439 nvme_write_read_ram(struct pci_nvme_softc *sc,
2440     struct pci_nvme_blockstore *nvstore,
2441     uint64_t prp1, uint64_t prp2,
2442     size_t offset, uint64_t bytes,
2443     bool is_write)
2444 {
2445         uint8_t *buf = nvstore->ctx;
2446         enum nvme_copy_dir dir;
2447         uint16_t status;
2448
2449         if (is_write)
2450                 dir = NVME_COPY_TO_PRP;
2451         else
2452                 dir = NVME_COPY_FROM_PRP;
2453
2454         status = 0;
2455         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2456             buf + offset, bytes, dir))
2457                 pci_nvme_status_genc(&status,
2458                     NVME_SC_DATA_TRANSFER_ERROR);
2459         else
2460                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2461
2462         return (status);
2463 }
2464
2465 static uint16_t
2466 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2467     struct pci_nvme_blockstore *nvstore,
2468     struct pci_nvme_ioreq *req,
2469     uint64_t prp1, uint64_t prp2,
2470     size_t offset, uint64_t bytes,
2471     bool is_write)
2472 {
2473         uint64_t size;
2474         int err;
2475         uint16_t status = NVME_NO_STATUS;
2476
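        /*
         * PRP1 may carry a page offset, so the first segment runs only to
         * the end of that page. E.g. a prp1 offset of 0x200 in a 4 KiB
         * page contributes at most 0xE00 bytes; the rest comes from PRP2.
         */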
2477         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2478         if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) {
2479                 err = -1;
2480                 goto out;
2481         }
2482
2483         offset += size;
2484         bytes  -= size;
2485
2486         if (bytes == 0) {
2487                 ;
2488         } else if (bytes <= PAGE_SIZE) {
2489                 size = bytes;
2490                 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) {
2491                         err = -1;
2492                         goto out;
2493                 }
2494         } else {
2495                 void *vmctx = sc->nsc_pi->pi_vmctx;
2496                 uint64_t *prp_list = &prp2;
2497                 uint64_t *last = prp_list;
2498
2499                 /* PRP2 is a pointer to a physical region page list */
2500                 while (bytes) {
2501                         /* Last entry in list points to the next list */
2502                         if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2503                                 uint64_t prp = *prp_list;
2504
2505                                 prp_list = paddr_guest2host(vmctx, prp,
2506                                     PAGE_SIZE - (prp % PAGE_SIZE));
2507                                 if (prp_list == NULL) {
2508                                         err = -1;
2509                                         goto out;
2510                                 }
2511                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
2512                         }
2513
2514                         size = MIN(bytes, PAGE_SIZE);
2515
2516                         if (pci_nvme_append_iov_req(sc, req, *prp_list, size,
2517                             offset)) {
2518                                 err = -1;
2519                                 goto out;
2520                         }
2521
2522                         offset += size;
2523                         bytes  -= size;
2524
2525                         prp_list++;
2526                 }
2527         }
2528         req->io_req.br_callback = pci_nvme_io_done;
2529         if (is_write)
2530                 err = blockif_write(nvstore->ctx, &req->io_req);
2531         else
2532                 err = blockif_read(nvstore->ctx, &req->io_req);
2533 out:
2534         if (err)
2535                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2536
2537         return (status);
2538 }
2539
2540 static bool
2541 nvme_opc_write_read(struct pci_nvme_softc *sc,
2542     struct nvme_command *cmd,
2543     struct pci_nvme_blockstore *nvstore,
2544     struct pci_nvme_ioreq *req,
2545     uint16_t *status)
2546 {
2547         uint64_t lba, nblocks, bytes;
2548         size_t offset;
2549         bool is_write = cmd->opc == NVME_OPC_WRITE;
2550         bool pending = false;
2551
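        /* SLBA is cdw11:cdw10; NLB (cdw12 bits 15:0) is a zero-based count */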
2552         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2553         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2554         bytes = nblocks << nvstore->sectsz_bits;
2555         if (bytes > NVME_MAX_DATA_SIZE) {
2556                 WPRINTF("%s command would exceed MDTS", __func__);
2557                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2558                 goto out;
2559         }
2560
2561         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2562                 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2563                     __func__, lba, nblocks);
2564                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2565                 goto out;
2566         }
2567
2568         offset = lba << nvstore->sectsz_bits;
2569
2570         req->bytes = bytes;
2571         req->io_req.br_offset = lba;
2572
2573         /* PRP bits 1:0 must be zero */
2574         cmd->prp1 &= ~0x3UL;
2575         cmd->prp2 &= ~0x3UL;
2576
2577         if (nvstore->type == NVME_STOR_RAM) {
2578                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2579                     cmd->prp2, offset, bytes, is_write);
2580         } else {
2581                 *status = nvme_write_read_blockif(sc, nvstore, req,
2582                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2583
2584                 if (*status == NVME_NO_STATUS)
2585                         pending = true;
2586         }
2587 out:
2588         if (!pending)
2589                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2590
2591         return (pending);
2592 }
2593
2594 static void
2595 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2596 {
2597         struct pci_nvme_ioreq *req = br->br_param;
2598         struct pci_nvme_softc *sc = req->sc;
2599         bool done = true;
2600         uint16_t status;
2601
2602         status = 0;
2603         if (err) {
2604                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2605         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2606                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2607         } else {
2608                 struct iovec *iov = req->io_req.br_iov;
2609
2610                 req->prev_gpaddr++;
2611                 iov += req->prev_gpaddr;
2612
2613                 /* The iov_* values are already in bytes (sector size applied) */
2614                 req->io_req.br_offset = (off_t)iov->iov_base;
2615                 req->io_req.br_resid = iov->iov_len;
2616                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2617                         pci_nvme_status_genc(&status,
2618                             NVME_SC_INTERNAL_DEVICE_ERROR);
2619                 } else
2620                         done = false;
2621         }
2622
2623         if (done) {
2624                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid,
2625                     status);
2626                 pci_nvme_release_ioreq(sc, req);
2627         }
2628 }
2629
2630 static bool
2631 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2632     struct nvme_command *cmd,
2633     struct pci_nvme_blockstore *nvstore,
2634     struct pci_nvme_ioreq *req,
2635     uint16_t *status)
2636 {
2637         struct nvme_dsm_range *range = NULL;
2638         uint32_t nr, r, non_zero, dr;
2639         int err;
2640         bool pending = false;
2641
2642         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2643                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2644                 goto out;
2645         }
2646
2647         nr = cmd->cdw10 & 0xff;
2648
2649         /* copy locally because a range entry could straddle PRPs */
2650         range = calloc(1, NVME_MAX_DSM_TRIM);
2651         if (range == NULL) {
2652                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2653                 goto out;
2654         }
2655         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2656             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2657
2658         /* Check for invalid ranges and the number of non-zero lengths */
2659         non_zero = 0;
2660         for (r = 0; r <= nr; r++) {
2661                 if (pci_nvme_out_of_range(nvstore,
2662                     range[r].starting_lba, range[r].length)) {
2663                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2664                         goto out;
2665                 }
2666                 if (range[r].length != 0)
2667                         non_zero++;
2668         }
2669
2670         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2671                 size_t offset, bytes;
2672                 int sectsz_bits = sc->nvstore.sectsz_bits;
2673
2674                 /*
2675                  * DSM calls are advisory only, and compliant controllers
2676                  * may choose to take no actions (i.e. return Success).
2677                  */
2678                 if (!nvstore->deallocate) {
2679                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2680                         goto out;
2681                 }
2682
2683                 /* If all ranges have a zero length, return Success */
2684                 if (non_zero == 0) {
2685                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2686                         goto out;
2687                 }
2688
2689                 if (req == NULL) {
2690                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2691                         goto out;
2692                 }
2693
2694                 offset = range[0].starting_lba << sectsz_bits;
2695                 bytes = range[0].length << sectsz_bits;
2696
2697                 /*
2698                  * If the request is for more than a single range, store
2699                  * the ranges in the br_iov. Optimize for the common case
2700                  * of a single range.
2701                  *
2702                  * Note that the NVMe Number of Ranges is a zero-based value.
2703                  */
2704                 req->io_req.br_iovcnt = 0;
2705                 req->io_req.br_offset = offset;
2706                 req->io_req.br_resid = bytes;
2707
2708                 if (nr == 0) {
2709                         req->io_req.br_callback = pci_nvme_io_done;
2710                 } else {
2711                         struct iovec *iov = req->io_req.br_iov;
2712
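                        /*
                         * Stage each non-zero range as a (byte offset,
                         * byte length) pair in br_iov[]; the
                         * pci_nvme_dealloc_sm() callback then issues one
                         * blockif_delete() per staged entry.
                         */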
2713                         for (r = 0, dr = 0; r <= nr; r++) {
2714                                 offset = range[r].starting_lba << sectsz_bits;
2715                                 bytes = range[r].length << sectsz_bits;
2716                                 if (bytes == 0)
2717                                         continue;
2718
2719                                 if ((nvstore->size - offset) < bytes) {
2720                                         pci_nvme_status_genc(status,
2721                                             NVME_SC_LBA_OUT_OF_RANGE);
2722                                         goto out;
2723                                 }
2724                                 iov[dr].iov_base = (void *)offset;
2725                                 iov[dr].iov_len = bytes;
2726                                 dr++;
2727                         }
2728                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2729
2730                         /*
2731                          * Use prev_gpaddr to track the current entry and
2732                          * prev_size to track the number of entries
2733                          */
2734                         req->prev_gpaddr = 0;
2735                         req->prev_size = dr;
2736                 }
2737
2738                 err = blockif_delete(nvstore->ctx, &req->io_req);
2739                 if (err)
2740                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2741                 else
2742                         pending = true;
2743         }
2744 out:
2745         free(range);
2746         return (pending);
2747 }
2748
2749 static void
2750 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2751 {
2752         struct nvme_submission_queue *sq;
2753         uint16_t status;
2754         uint16_t sqhead;
2755
2756         /* handle all submissions up to sq->tail index */
2757         sq = &sc->submit_queues[idx];
2758
2759         pthread_mutex_lock(&sq->mtx);
2760
2761         sqhead = sq->head;
2762         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2763                  idx, sqhead, sq->tail, sq->qbase);
2764
2765         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2766                 struct nvme_command *cmd;
2767                 struct pci_nvme_ioreq *req;
2768                 uint32_t nsid;
2769                 bool pending;
2770
2771                 pending = false;
2772                 req = NULL;
2773                 status = 0;
2774
2775                 cmd = &sq->qbase[sqhead];
2776                 sqhead = (sqhead + 1) % sq->size;
2777
2778                 nsid = le32toh(cmd->nsid);
2779                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2780                         pci_nvme_status_genc(&status,
2781                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2782                         status |=
2783                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2784                         goto complete;
2785                 }
2786
2787                 req = pci_nvme_get_ioreq(sc);
2788                 if (req == NULL) {
2789                         pci_nvme_status_genc(&status,
2790                             NVME_SC_INTERNAL_DEVICE_ERROR);
2791                         WPRINTF("%s: unable to allocate IO req", __func__);
2792                         goto complete;
2793                 }
2794                 req->nvme_sq = sq;
2795                 req->sqid = idx;
2796                 req->opc = cmd->opc;
2797                 req->cid = cmd->cid;
2798                 req->nsid = cmd->nsid;
2799
2800                 switch (cmd->opc) {
2801                 case NVME_OPC_FLUSH:
2802                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2803                             req, &status);
2804                         break;
2805                 case NVME_OPC_WRITE:
2806                 case NVME_OPC_READ:
2807                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2808                             req, &status);
2809                         break;
2810                 case NVME_OPC_WRITE_ZEROES:
2811                         /* TODO: write zeroes
2812                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2813                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2814                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2815                         break;
2816                 case NVME_OPC_DATASET_MANAGEMENT:
2817                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2818                             req, &status);
2819                         break;
2820                 default:
2821                         WPRINTF("%s unhandled io command 0x%x",
2822                             __func__, cmd->opc);
2823                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2824                 }
2825 complete:
2826                 if (!pending) {
2827                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
2828                         if (req != NULL)
2829                                 pci_nvme_release_ioreq(sc, req);
2830                 }
2831         }
2832
2833         sq->head = sqhead;
2834
2835         pthread_mutex_unlock(&sq->mtx);
2836 }
2837
2838 static void
2839 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc,
2840         uint64_t idx, int is_sq, uint64_t value)
2841 {
2842         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2843                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2844
2845         if (is_sq) {
2846                 if (idx > sc->num_squeues) {
2847                         WPRINTF("%s queue index %lu overflow from "
2848                                  "guest (max %u)",
2849                                  __func__, idx, sc->num_squeues);
2850                         return;
2851                 }
2852
2853                 atomic_store_short(&sc->submit_queues[idx].tail,
2854                                    (uint16_t)value);
2855
2856                 if (idx == 0) {
2857                         pci_nvme_handle_admin_cmd(sc, value);
2858                 } else {
2859                         /* submission queue; handle new entries in SQ */
2860                         if (idx > sc->num_squeues) {
2861                                 WPRINTF("%s SQ index %lu overflow from "
2862                                          "guest (max %u)",
2863                                          __func__, idx, sc->num_squeues);
2864                                 return;
2865                         }
2866                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2867                 }
2868         } else {
2869                 if (idx > sc->num_cqueues) {
2870                         WPRINTF("%s queue index %lu overflow from "
2871                                  "guest (max %u)",
2872                                  __func__, idx, sc->num_cqueues);
2873                         return;
2874                 }
2875
2876                 atomic_store_short(&sc->compl_queues[idx].head,
2877                                 (uint16_t)value);
2878         }
2879 }
2880
2881 static void
2882 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2883 {
2884         const char *s = iswrite ? "WRITE" : "READ";
2885
2886         switch (offset) {
2887         case NVME_CR_CAP_LOW:
2888                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2889                 break;
2890         case NVME_CR_CAP_HI:
2891                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2892                 break;
2893         case NVME_CR_VS:
2894                 DPRINTF("%s %s NVME_CR_VS", func, s);
2895                 break;
2896         case NVME_CR_INTMS:
2897                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2898                 break;
2899         case NVME_CR_INTMC:
2900                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2901                 break;
2902         case NVME_CR_CC:
2903                 DPRINTF("%s %s NVME_CR_CC", func, s);
2904                 break;
2905         case NVME_CR_CSTS:
2906                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2907                 break;
2908         case NVME_CR_NSSR:
2909                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2910                 break;
2911         case NVME_CR_AQA:
2912                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2913                 break;
2914         case NVME_CR_ASQ_LOW:
2915                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2916                 break;
2917         case NVME_CR_ASQ_HI:
2918                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2919                 break;
2920         case NVME_CR_ACQ_LOW:
2921                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2922                 break;
2923         case NVME_CR_ACQ_HI:
2924                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2925                 break;
2926         default:
2927                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2928         }
2929
2930 }
2931
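     /*
      * Handle a guest write to BAR0.  Offsets at or above
      * NVME_DOORBELL_OFFSET are queue doorbells and require the controller
      * to be ready (CSTS.RDY); everything below is a controller register,
      * accepted only as a 4-byte access and updated under the softc mutex.
      */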
2932 static void
2933 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size,
2934     uint64_t value)
2935 {
2936         uint32_t ccreg;
2937
2938         if (offset >= NVME_DOORBELL_OFFSET) {
2939                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2940                 uint64_t idx = belloffset / 8; /* doorbell pair (SQ tail + CQ head) = 2 * 4 bytes */
2941                 int is_sq = (belloffset % 8) < 4;
2942
2943                 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
2944                         WPRINTF("doorbell write prior to RDY (offset=%#lx)",
2945                             offset);
2946                         return;
2947                 }
2948
2949                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2950                         WPRINTF("guest attempted an overflow write offset "
2951                                  "0x%lx, val 0x%lx in %s",
2952                                  offset, value, __func__);
2953                         return;
2954                 }
2955
2956                 if (is_sq) {
2957                         if (sc->submit_queues[idx].qbase == NULL)
2958                                 return;
2959                 } else if (sc->compl_queues[idx].qbase == NULL)
2960                         return;
2961
2962                 pci_nvme_handle_doorbell(sc, idx, is_sq, value);
2963                 return;
2964         }
2965
2966         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2967                 offset, size, value);
2968
2969         if (size != 4) {
2970                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2971                          "val 0x%lx) to bar0 in %s",
2972                          size, offset, value, __func__);
2973                 /* TODO: shutdown device */
2974                 return;
2975         }
2976
2977         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2978
2979         pthread_mutex_lock(&sc->mtx);
2980
2981         switch (offset) {
2982         case NVME_CR_CAP_LOW:
2983         case NVME_CR_CAP_HI:
2984                 /* readonly */
2985                 break;
2986         case NVME_CR_VS:
2987                 /* readonly */
2988                 break;
2989         case NVME_CR_INTMS:
2990                 /* MSI-X, so ignore */
2991                 break;
2992         case NVME_CR_INTMC:
2993                 /* MSI-X, so ignore */
2994                 break;
2995         case NVME_CR_CC:
2996                 ccreg = (uint32_t)value;
2997
2998                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2999                          "iocqes %u",
3000                         __func__,
3001                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
3002                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
3003                          NVME_CC_GET_IOCQES(ccreg));
3004
3005                 if (NVME_CC_GET_SHN(ccreg)) {
3006                         /* perform shutdown - flush out data to backend */
3007                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
3008                             NVME_CSTS_REG_SHST_SHIFT);
3009                         sc->regs.csts |= NVME_SHST_COMPLETE <<
3010                             NVME_CSTS_REG_SHST_SHIFT;
3011                 }
3012                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
3013                         if (NVME_CC_GET_EN(ccreg) == 0)
3014                                 /* transition 1->0 causes controller reset */
3015                                 pci_nvme_reset_locked(sc);
3016                         else
3017                                 pci_nvme_init_controller(sc);
3018                 }
3019
3020                 /* Insert the iocqes, iosqes and en bits from the write */
3021                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
3022                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
3023                 if (NVME_CC_GET_EN(ccreg) == 0) {
3024                         /* Insert the ams, mps and css bit fields */
3025                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
3026                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
3027                         sc->regs.csts &= ~NVME_CSTS_RDY;
3028                 } else if ((sc->pending_ios == 0) &&
3029                     !(sc->regs.csts & NVME_CSTS_CFS)) {
3030                         sc->regs.csts |= NVME_CSTS_RDY;
3031                 }
3032                 break;
3033         case NVME_CR_CSTS:
3034                 break;
3035         case NVME_CR_NSSR:
3036                 /* ignore writes; don't support subsystem reset */
3037                 break;
3038         case NVME_CR_AQA:
3039                 sc->regs.aqa = (uint32_t)value;
3040                 break;
3041         case NVME_CR_ASQ_LOW:
3042                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
3043                                (0xFFFFF000 & value);
3044                 break;
3045         case NVME_CR_ASQ_HI:
3046                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
3047                                (value << 32);
3048                 break;
3049         case NVME_CR_ACQ_LOW:
3050                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
3051                                (0xFFFFF000 & value);
3052                 break;
3053         case NVME_CR_ACQ_HI:
3054                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
3055                                (value << 32);
3056                 break;
3057         default:
3058                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3059                          __func__, offset, value, size);
3060         }
3061         pthread_mutex_unlock(&sc->mtx);
3062 }
3063
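     /*
      * BAR write entry point: MSI-X table/PBA accesses go to the generic
      * MSI-X emulation; BAR0 accesses go to pci_nvme_write_bar_0().
      */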
3064 static void
3065 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size,
3066     uint64_t value)
3067 {
3068         struct pci_nvme_softc* sc = pi->pi_arg;
3069
3070         if (baridx == pci_msix_table_bar(pi) ||
3071             baridx == pci_msix_pba_bar(pi)) {
3072                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3073                          "value 0x%lx", baridx, offset, size, value);
3074
3075                 pci_emul_msix_twrite(pi, offset, size, value);
3076                 return;
3077         }
3078
3079         switch (baridx) {
3080         case 0:
3081                 pci_nvme_write_bar_0(sc, offset, size, value);
3082                 break;
3083
3084         default:
3085                 DPRINTF("%s unknown baridx %d, val 0x%lx",
3086                          __func__, baridx, value);
3087         }
3088 }
3089
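     /*
      * Handle a guest read from BAR0.  Register reads are served from the
      * in-memory register copy under the softc mutex; reads in the doorbell
      * range return 0.  The result is masked to the access size.
      */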
3090 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3091         uint64_t offset, int size)
3092 {
3093         uint64_t value;
3094
3095         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3096
3097         if (offset < NVME_DOORBELL_OFFSET) {
3098                 void *p = &(sc->regs);
3099                 pthread_mutex_lock(&sc->mtx);
3100                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
3101                 pthread_mutex_unlock(&sc->mtx);
3102         } else {
3103                 value = 0;
3104                 WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
3105         }
3106
3107         switch (size) {
3108         case 1:
3109                 value &= 0xFF;
3110                 break;
3111         case 2:
3112                 value &= 0xFFFF;
3113                 break;
3114         case 4:
3115                 value &= 0xFFFFFFFF;
3116                 break;
3117         }
3118
3119         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3120                  offset, size, (uint32_t)value);
3121
3122         return (value);
3123 }
3124
3125
3126
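     /*
      * BAR read entry point: MSI-X table/PBA accesses go to the generic
      * MSI-X emulation; BAR0 accesses go to pci_nvme_read_bar_0().
      */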
3127 static uint64_t
3128 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size)
3129 {
3130         struct pci_nvme_softc* sc = pi->pi_arg;
3131
3132         if (baridx == pci_msix_table_bar(pi) ||
3133             baridx == pci_msix_pba_bar(pi)) {
3134                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3135                         baridx, offset, size);
3136
3137                 return pci_emul_msix_tread(pi, offset, size);
3138         }
3139
3140         switch (baridx) {
3141         case 0:
3142                 return pci_nvme_read_bar_0(sc, offset, size);
3143
3144         default:
3145                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3146         }
3147
3148         return (0);
3149 }
3150
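     /*
      * Parse device options from the configuration nvlist and set up the
      * backing store: "ram" selects a memory-backed namespace, otherwise a
      * blockif device or file image is opened.  An illustrative (not
      * exhaustive) invocation with a hypothetical backing device might be:
      *
      *   -s 4,nvme,/dev/zvol/tank/nvme0,maxq=4,qsz=512,ioslots=16,sectsz=4096
      *
      * Unset options keep the NVME_* defaults assigned below.
      */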
3151 static int
3152 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3153 {
3154         char bident[sizeof("XXX:XXX")];
3155         const char *value;
3156         uint32_t sectsz;
3157
3158         sc->max_queues = NVME_QUEUES;
3159         sc->max_qentries = NVME_MAX_QENTRIES;
3160         sc->ioslots = NVME_IOSLOTS;
3161         sc->num_squeues = sc->max_queues;
3162         sc->num_cqueues = sc->max_queues;
3163         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3164         sectsz = 0;
3165         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3166                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3167
3168         value = get_config_value_node(nvl, "maxq");
3169         if (value != NULL)
3170                 sc->max_queues = atoi(value);
3171         value = get_config_value_node(nvl, "qsz");
3172         if (value != NULL) {
3173                 sc->max_qentries = atoi(value);
3174                 if (sc->max_qentries <= 0) {
3175                         EPRINTLN("nvme: Invalid qsz option %d",
3176                             sc->max_qentries);
3177                         return (-1);
3178                 }
3179         }
3180         value = get_config_value_node(nvl, "ioslots");
3181         if (value != NULL) {
3182                 sc->ioslots = atoi(value);
3183                 if (sc->ioslots <= 0) {
3184                         EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3185                         return (-1);
3186                 }
3187         }
3188         value = get_config_value_node(nvl, "sectsz");
3189         if (value != NULL)
3190                 sectsz = atoi(value);
3191         value = get_config_value_node(nvl, "ser");
3192         if (value != NULL) {
3193                 /*
3194                  * This field indicates the Product Serial Number in
3195                  * 7-bit ASCII, unused bytes should be space characters.
3196                  * Ref: NVMe v1.3c.
3197                  */
3198                 cpywithpad((char *)sc->ctrldata.sn,
3199                     sizeof(sc->ctrldata.sn), value, ' ');
3200         }
3201         value = get_config_value_node(nvl, "eui64");
3202         if (value != NULL)
3203                 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3204         value = get_config_value_node(nvl, "dsm");
3205         if (value != NULL) {
3206                 if (strcmp(value, "auto") == 0)
3207                         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3208                 else if (strcmp(value, "enable") == 0)
3209                         sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3210                 else if (strcmp(value, "disable") == 0)
3211                         sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3212         }
3213
3214         value = get_config_value_node(nvl, "ram");
3215         if (value != NULL) {
3216                 uint64_t sz = strtoull(value, NULL, 10);
3217
3218                 sc->nvstore.type = NVME_STOR_RAM;
3219                 sc->nvstore.size = sz * 1024 * 1024;
3220                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3221                 sc->nvstore.sectsz = 4096;
3222                 sc->nvstore.sectsz_bits = 12;
3223                 if (sc->nvstore.ctx == NULL) {
3224                         EPRINTLN("nvme: Unable to allocate RAM");
3225                         return (-1);
3226                 }
3227         } else {
3228                 snprintf(bident, sizeof(bident), "%u:%u",
3229                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3230                 sc->nvstore.ctx = blockif_open(nvl, bident);
3231                 if (sc->nvstore.ctx == NULL) {
3232                         EPRINTLN("nvme: Could not open backing file: %s",
3233                             strerror(errno));
3234                         return (-1);
3235                 }
3236                 sc->nvstore.type = NVME_STOR_BLOCKIF;
3237                 sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3238         }
3239
3240         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3241                 sc->nvstore.sectsz = sectsz;
3242         else if (sc->nvstore.type != NVME_STOR_RAM)
3243                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
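             /* Compute sectsz_bits = log2(sectsz) (sector sizes are >= 512) */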
3244         for (sc->nvstore.sectsz_bits = 9;
3245              (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3246              sc->nvstore.sectsz_bits++);
3247
3248         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3249                 sc->max_queues = NVME_QUEUES;
3250
3251         return (0);
3252 }
3253
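     /*
      * blockif resize callback: record the new backing store size, refresh
      * the namespace data, mark NSID 1 in the Changed Namespace List, and
      * post a Namespace Attribute Changed asynchronous event to the guest.
      */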
3254 static void
3255 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
3256     size_t new_size)
3257 {
3258         struct pci_nvme_softc *sc;
3259         struct pci_nvme_blockstore *nvstore;
3260         struct nvme_namespace_data *nd;
3261
3262         sc = arg;
3263         nvstore = &sc->nvstore;
3264         nd = &sc->nsdata;
3265
3266         nvstore->size = new_size;
3267         pci_nvme_init_nsdata_size(nvstore, nd);
3268
3269         /* Add changed NSID to list */
3270         sc->ns_log.ns[0] = 1;
3271         sc->ns_log.ns[1] = 0;
3272
3273         pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3274             PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3275 }
3276
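     /*
      * Device initialization: parse the configuration, allocate the I/O
      * request slots, program PCI config space, size the 64-bit memory BAR
      * (registers plus doorbells, at least NVME_MMIO_SPACE_MIN), add MSI-X
      * and PCI Express capabilities, then initialize namespace, controller,
      * log page, feature, and AER/AEN state before resetting the controller.
      */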
3277 static int
3278 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl)
3279 {
3280         struct pci_nvme_softc *sc;
3281         uint32_t pci_membar_sz;
3282         int     error;
3283
3284         error = 0;
3285
3286         sc = calloc(1, sizeof(struct pci_nvme_softc));
3287         pi->pi_arg = sc;
3288         sc->nsc_pi = pi;
3289
3290         error = pci_nvme_parse_config(sc, nvl);
3291         if (error < 0)
3292                 goto done;
3293         else
3294                 error = 0;
3295
3296         STAILQ_INIT(&sc->ioreqs_free);
3297         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3298         for (uint32_t i = 0; i < sc->ioslots; i++) {
3299                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3300         }
3301
3302         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3303         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3304         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3305         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3306         pci_set_cfgdata8(pi, PCIR_PROGIF,
3307                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3308
3309         /*
3310          * Allocate size of NVMe registers + doorbell space for all queues.
3311          *
3312          * The specification requires a minimum memory I/O window size of 16K.
3313          * The Windows driver will refuse to start a device with a smaller
3314          * window.
3315          */
3316         pci_membar_sz = sizeof(struct nvme_registers) +
3317             2 * sizeof(uint32_t) * (sc->max_queues + 1);
3318         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3319
3320         DPRINTF("nvme membar size: %u", pci_membar_sz);
3321
3322         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3323         if (error) {
3324                 WPRINTF("%s pci alloc mem bar failed", __func__);
3325                 goto done;
3326         }
3327
3328         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3329         if (error) {
3330                 WPRINTF("%s pci add msixcap failed", __func__);
3331                 goto done;
3332         }
3333
3334         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3335         if (error) {
3336                 WPRINTF("%s pci add Express capability failed", __func__);
3337                 goto done;
3338         }
3339
3340         pthread_mutex_init(&sc->mtx, NULL);
3341         sem_init(&sc->iosemlock, 0, sc->ioslots);
3342         blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3343
3344         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3345         /*
3346          * Controller data depends on Namespace data so initialize Namespace
3347          * data first.
3348          */
3349         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3350         pci_nvme_init_ctrldata(sc);
3351         pci_nvme_init_logpages(sc);
3352         pci_nvme_init_features(sc);
3353
3354         pci_nvme_aer_init(sc);
3355         pci_nvme_aen_init(sc);
3356
3357         pci_nvme_reset(sc);
3358
3359         pci_lintr_request(pi);
3360
3361 done:
3362         return (error);
3363 }
3364
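     /*
      * Convert a legacy option string into config nodes: a leading
      * "ram=<MiB>" selects the memory-backed store and the remainder is
      * parsed as generic PCI options; anything else is handed to
      * blockif_legacy_config() as a device path plus block device options.
      */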
3365 static int
3366 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3367 {
3368         char *cp, *ram;
3369
3370         if (opts == NULL)
3371                 return (0);
3372
3373         if (strncmp(opts, "ram=", 4) == 0) {
3374                 cp = strchr(opts, ',');
3375                 if (cp == NULL) {
3376                         set_config_value_node(nvl, "ram", opts + 4);
3377                         return (0);
3378                 }
3379                 ram = strndup(opts + 4, cp - opts - 4);
3380                 set_config_value_node(nvl, "ram", ram);
3381                 free(ram);
3382                 return (pci_parse_legacy_config(nvl, cp + 1));
3383         } else
3384                 return (blockif_legacy_config(nvl, opts));
3385 }
3386
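     /* Registration of the "nvme" PCI device emulation with bhyve. */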
3387 static const struct pci_devemu pci_de_nvme = {
3388         .pe_emu =       "nvme",
3389         .pe_init =      pci_nvme_init,
3390         .pe_legacy_config = pci_nvme_legacy_config,
3391         .pe_barwrite =  pci_nvme_write,
3392         .pe_barread =   pci_nvme_read
3393 };
3394 PCI_EMUL_SET(pci_de_nvme);