1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov 
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
51  *
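 *  Example (illustrative slot number, device path, and option values only):
 *    -s 4,nvme,/dev/zvol/tank/bhyve-disk0,maxq=8,qsz=1024,ioslots=16,ser=BHYVE0001,dsm=auto
 *  or, using a RAM-backed namespace:
 *    -s 4,nvme,ram=1024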
52  */
53
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80
81 #include <dev/nvme/nvme.h>
82
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88
89
90 static int nvme_debug = 0;
#define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
92 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93
94 /* defaults; can be overridden */
95 #define NVME_MSIX_BAR           4
96
97 #define NVME_IOSLOTS            8
98
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN     (1 << 14)
101
102 #define NVME_QUEUES             16
103 #define NVME_MAX_QENTRIES       2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define NVME_MPSMIN             0
106 /* MPSMIN converted to bytes */
107 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
108
109 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
110 #define NVME_MDTS               9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
113 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
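/*
 * e.g. with MDTS = 9 and MPSMIN = 0 (4 KiB pages), the maximum data
 * transfer size is 2^9 * 4 KiB = 2 MiB
 */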
114
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS          0xffff
117 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
118
119 /* Reported temperature in Kelvin (i.e. room temperature) */
120 #define NVME_TEMPERATURE 296
121
122 /* helpers */
123
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)         ((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)         ((one)  - 1)
128
129 /* Encode number of SQ's and CQ's for Set/Get Features */
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131         (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
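/* e.g. 4 SQs and 4 CQs are encoded as 0x00030003 (both counts are zero-based) */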
133
134 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
135
136 enum nvme_controller_register_offsets {
137         NVME_CR_CAP_LOW = 0x00,
138         NVME_CR_CAP_HI  = 0x04,
139         NVME_CR_VS      = 0x08,
140         NVME_CR_INTMS   = 0x0c,
141         NVME_CR_INTMC   = 0x10,
142         NVME_CR_CC      = 0x14,
143         NVME_CR_CSTS    = 0x1c,
144         NVME_CR_NSSR    = 0x20,
145         NVME_CR_AQA     = 0x24,
146         NVME_CR_ASQ_LOW = 0x28,
147         NVME_CR_ASQ_HI  = 0x2c,
148         NVME_CR_ACQ_LOW = 0x30,
149         NVME_CR_ACQ_HI  = 0x34,
150 };
151
152 enum nvme_cmd_cdw11 {
153         NVME_CMD_CDW11_PC  = 0x0001,
154         NVME_CMD_CDW11_IEN = 0x0002,
155         NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157
158 enum nvme_copy_dir {
159         NVME_COPY_TO_PRP,
160         NVME_COPY_FROM_PRP,
161 };
162
163 #define NVME_CQ_INTEN   0x01
164 #define NVME_CQ_INTCOAL 0x02
165
166 struct nvme_completion_queue {
167         struct nvme_completion *qbase;
168         pthread_mutex_t mtx;
169         uint32_t        size;
170         uint16_t        tail; /* nvme progress */
171         uint16_t        head; /* guest progress */
172         uint16_t        intr_vec;
173         uint32_t        intr_en;
174 };
175
176 struct nvme_submission_queue {
177         struct nvme_command *qbase;
178         pthread_mutex_t mtx;
179         uint32_t        size;
180         uint16_t        head; /* nvme progress */
181         uint16_t        tail; /* guest progress */
182         uint16_t        cqid; /* completion queue id */
183         int             qpriority;
184 };
185
186 enum nvme_storage_type {
187         NVME_STOR_BLOCKIF = 0,
188         NVME_STOR_RAM = 1,
189 };
190
191 struct pci_nvme_blockstore {
192         enum nvme_storage_type type;
193         void            *ctx;
194         uint64_t        size;
195         uint32_t        sectsz;
196         uint32_t        sectsz_bits;
197         uint64_t        eui64;
198         uint32_t        deallocate:1;
199 };
200
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Max Data Transfer (MDTS) and given the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207         ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208           NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209           0 )
210
211 struct pci_nvme_ioreq {
212         struct pci_nvme_softc *sc;
213         STAILQ_ENTRY(pci_nvme_ioreq) link;
214         struct nvme_submission_queue *nvme_sq;
215         uint16_t        sqid;
216
217         /* command information */
218         uint16_t        opc;
219         uint16_t        cid;
220         uint32_t        nsid;
221
222         uint64_t        prev_gpaddr;
223         size_t          prev_size;
224         size_t          bytes;
225
226         struct blockif_req io_req;
227
228         struct iovec    iovpadding[MDTS_PAD_SIZE];
229 };
230
231 enum nvme_dsm_type {
232         /* Dataset Management bit in ONCS reflects backing storage capability */
233         NVME_DATASET_MANAGEMENT_AUTO,
234         /* Unconditionally set Dataset Management bit in ONCS */
235         NVME_DATASET_MANAGEMENT_ENABLE,
236         /* Unconditionally clear Dataset Management bit in ONCS */
237         NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247
248 struct nvme_feature_obj {
249         uint32_t        cdw11;
250         nvme_feature_cb set;
251         nvme_feature_cb get;
252         bool namespace_specific;
253 };
254
255 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256
257 typedef enum {
258         PCI_NVME_AE_TYPE_ERROR = 0,
259         PCI_NVME_AE_TYPE_SMART,
260         PCI_NVME_AE_TYPE_NOTICE,
261         PCI_NVME_AE_TYPE_IO_CMD = 6,
262         PCI_NVME_AE_TYPE_VENDOR = 7,
263         PCI_NVME_AE_TYPE_MAX            /* Must be last */
264 } pci_nvme_async_type;
265
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268         STAILQ_ENTRY(pci_nvme_aer) link;
269         uint16_t        cid;    /* Command ID of the submitted AER */
270 };
271
272 /** Asynchronous Event Information - Notice */
273 typedef enum {
274         PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
275         PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
276         PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
277         PCI_NVME_AEI_NOTICE_ANA_CHANGE,
278         PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
279         PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
280         PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
281         PCI_NVME_AEI_NOTICE_MAX,
282 } pci_nvme_async_event_info_notice;
283
284 #define PCI_NVME_AEI_NOTICE_SHIFT               8
285 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))
286
287 /* Asynchronous Event Notifications */
288 struct pci_nvme_aen {
289         pci_nvme_async_type atype;
290         uint32_t        event_data;
291         bool            posted;
292 };
293
294 /*
 * By default, enable all Asynchronous Event Notifications:
296  *     SMART / Health Critical Warnings
297  *     Namespace Attribute Notices
298  */
299 #define PCI_NVME_AEN_DEFAULT_MASK       0x11f
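/* i.e. bits 4:0 (SMART / Health critical warnings) and bit 8 (Namespace Attribute Notices) */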
300
301 typedef enum {
302         NVME_CNTRLTYPE_IO = 1,
303         NVME_CNTRLTYPE_DISCOVERY = 2,
304         NVME_CNTRLTYPE_ADMIN = 3,
305 } pci_nvme_cntrl_type;
306
307 struct pci_nvme_softc {
308         struct pci_devinst *nsc_pi;
309
310         pthread_mutex_t mtx;
311
312         struct nvme_registers regs;
313
314         struct nvme_namespace_data  nsdata;
315         struct nvme_controller_data ctrldata;
316         struct nvme_error_information_entry err_log;
317         struct nvme_health_information_page health_log;
318         struct nvme_firmware_page fw_log;
319         struct nvme_ns_list ns_log;
320
321         struct pci_nvme_blockstore nvstore;
322
323         uint16_t        max_qentries;   /* max entries per queue */
324         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
325         uint32_t        num_cqueues;
326         uint32_t        num_squeues;
327         bool            num_q_is_set; /* Has host set Number of Queues */
328
329         struct pci_nvme_ioreq *ioreqs;
330         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
331         uint32_t        pending_ios;
332         uint32_t        ioslots;
333         sem_t           iosemlock;
334
335         /*
336          * Memory mapped Submission and Completion queues
337          * Each array includes both Admin and IO queues
338          */
339         struct nvme_completion_queue *compl_queues;
340         struct nvme_submission_queue *submit_queues;
341
342         struct nvme_feature_obj feat[NVME_FID_MAX];
343
344         enum nvme_dsm_type dataset_management;
345
346         /* Accounting for SMART data */
347         __uint128_t     read_data_units;
348         __uint128_t     write_data_units;
349         __uint128_t     read_commands;
350         __uint128_t     write_commands;
351         uint32_t        read_dunits_remainder;
352         uint32_t        write_dunits_remainder;
353
354         STAILQ_HEAD(, pci_nvme_aer) aer_list;
355         pthread_mutex_t aer_mtx;
356         uint32_t        aer_count;
357         struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
358         pthread_t       aen_tid;
359         pthread_mutex_t aen_mtx;
360         pthread_cond_t  aen_cond;
361 };
362
363
364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
365     struct nvme_completion_queue *cq,
366     uint32_t cdw0,
367     uint16_t cid,
368     uint16_t sqid,
369     uint16_t status);
370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
372 static void pci_nvme_io_done(struct blockif_req *, int);
373
374 /* Controller Configuration utils */
375 #define NVME_CC_GET_EN(cc) \
376         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
377 #define NVME_CC_GET_CSS(cc) \
378         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
379 #define NVME_CC_GET_SHN(cc) \
380         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
381 #define NVME_CC_GET_IOSQES(cc) \
382         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
383 #define NVME_CC_GET_IOCQES(cc) \
384         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
385
386 #define NVME_CC_WRITE_MASK \
387         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
388          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
389          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
390
391 #define NVME_CC_NEN_WRITE_MASK \
392         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
393          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
394          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
395
396 /* Controller Status utils */
397 #define NVME_CSTS_GET_RDY(sts) \
398         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
399
400 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
401
402 /* Completion Queue status word utils */
403 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
404 #define NVME_STATUS_MASK \
405         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
406          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
407
408 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
409         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
410
411 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
412     struct nvme_feature_obj *,
413     struct nvme_command *,
414     struct nvme_completion *);
415 static void nvme_feature_temperature(struct pci_nvme_softc *,
416     struct nvme_feature_obj *,
417     struct nvme_command *,
418     struct nvme_completion *);
419 static void nvme_feature_num_queues(struct pci_nvme_softc *,
420     struct nvme_feature_obj *,
421     struct nvme_command *,
422     struct nvme_completion *);
423 static void nvme_feature_iv_config(struct pci_nvme_softc *,
424     struct nvme_feature_obj *,
425     struct nvme_command *,
426     struct nvme_completion *);
427 static void nvme_feature_async_event(struct pci_nvme_softc *,
428     struct nvme_feature_obj *,
429     struct nvme_command *,
430     struct nvme_completion *);
431
432 static void *aen_thr(void *arg);
433
434 static __inline void
435 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
436 {
437         size_t len;
438
439         len = strnlen(src, dst_size);
440         memset(dst, pad, dst_size);
441         memcpy(dst, src, len);
442 }
443
444 static __inline void
445 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
446 {
447
448         *status &= ~NVME_STATUS_MASK;
449         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
450                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
451 }
452
453 static __inline void
454 pci_nvme_status_genc(uint16_t *status, uint16_t code)
455 {
456
457         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
458 }
459
460 /*
 * Initialize the requested number of IO Submission and Completion Queues.
462  * Admin queues are allocated implicitly.
463  */
464 static void
465 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
466 {
467         uint32_t i;
468
469         /*
470          * Allocate and initialize the Submission Queues
471          */
472         if (nsq > NVME_QUEUES) {
473                 WPRINTF("%s: clamping number of SQ from %u to %u",
474                                         __func__, nsq, NVME_QUEUES);
475                 nsq = NVME_QUEUES;
476         }
477
478         sc->num_squeues = nsq;
479
480         sc->submit_queues = calloc(sc->num_squeues + 1,
481                                 sizeof(struct nvme_submission_queue));
482         if (sc->submit_queues == NULL) {
483                 WPRINTF("%s: SQ allocation failed", __func__);
484                 sc->num_squeues = 0;
485         } else {
486                 struct nvme_submission_queue *sq = sc->submit_queues;
487
488                 for (i = 0; i < sc->num_squeues; i++)
489                         pthread_mutex_init(&sq[i].mtx, NULL);
490         }
491
492         /*
493          * Allocate and initialize the Completion Queues
494          */
495         if (ncq > NVME_QUEUES) {
496                 WPRINTF("%s: clamping number of CQ from %u to %u",
497                                         __func__, ncq, NVME_QUEUES);
498                 ncq = NVME_QUEUES;
499         }
500
501         sc->num_cqueues = ncq;
502
503         sc->compl_queues = calloc(sc->num_cqueues + 1,
504                                 sizeof(struct nvme_completion_queue));
505         if (sc->compl_queues == NULL) {
506                 WPRINTF("%s: CQ allocation failed", __func__);
507                 sc->num_cqueues = 0;
508         } else {
509                 struct nvme_completion_queue *cq = sc->compl_queues;
510
511                 for (i = 0; i < sc->num_cqueues; i++)
512                         pthread_mutex_init(&cq[i].mtx, NULL);
513         }
514 }
515
516 static void
517 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
518 {
519         struct nvme_controller_data *cd = &sc->ctrldata;
520
521         cd->vid = 0xFB5D;
522         cd->ssvid = 0x0000;
523
524         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
525         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
526
527         /* Num of submission commands that we can handle at a time (2^rab) */
528         cd->rab   = 4;
529
530         /* FreeBSD OUI */
531         cd->ieee[0] = 0x58;
532         cd->ieee[1] = 0x9c;
533         cd->ieee[2] = 0xfc;
534
535         cd->mic = 0;
536
537         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
538
539         cd->ver = NVME_REV(1,4);
540
541         cd->cntrltype = NVME_CNTRLTYPE_IO;
542         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
543         cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
544         cd->acl = 2;
545         cd->aerl = 4;
546
547         /* Advertise 1, Read-only firmware slot */
548         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
549             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
550         cd->lpa = 0;    /* TODO: support some simple things like SMART */
551         cd->elpe = 0;   /* max error log page entries */
552         cd->npss = 1;   /* number of power states support */
553
554         /* Warning Composite Temperature Threshold */
555         cd->wctemp = 0x0157;
556         cd->cctemp = 0x0157;
557
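	/*
	 * Queue entry sizes are encoded as powers of two: SQ entries are
	 * 2^6 = 64 bytes, CQ entries are 2^4 = 16 bytes
	 */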
558         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
559             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
560         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
561             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
562         cd->nn = 1;     /* number of namespaces */
563
564         cd->oncs = 0;
565         switch (sc->dataset_management) {
566         case NVME_DATASET_MANAGEMENT_AUTO:
567                 if (sc->nvstore.deallocate)
568                         cd->oncs |= NVME_ONCS_DSM;
569                 break;
570         case NVME_DATASET_MANAGEMENT_ENABLE:
571                 cd->oncs |= NVME_ONCS_DSM;
572                 break;
573         default:
574                 break;
575         }
576
577         cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
578             NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
579
580         cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
581
582         cd->power_state[0].mp = 10;
583 }
584
585 /*
586  * Calculate the CRC-16 of the given buffer
587  * See copyright attribution at top of file
588  */
589 static uint16_t
590 crc16(uint16_t crc, const void *buffer, unsigned int len)
591 {
592         const unsigned char *cp = buffer;
593         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
594         static uint16_t const crc16_table[256] = {
595                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
596                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
597                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
598                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
599                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
600                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
601                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
602                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
603                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
604                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
605                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
606                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
607                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
608                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
609                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
610                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
611                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
612                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
613                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
614                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
615                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
616                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
617                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
618                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
619                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
620                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
621                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
622                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
623                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
624                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
625                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
626                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
627         };
628
629         while (len--)
630                 crc = (((crc >> 8) & 0xffU) ^
631                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
632         return crc;
633 }
634
635 static void
636 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
637     struct nvme_namespace_data *nd)
638 {
639
640         /* Get capacity and block size information from backing store */
641         nd->nsze = nvstore->size / nvstore->sectsz;
642         nd->ncap = nd->nsze;
643         nd->nuse = nd->nsze;
644 }
645
646 static void
647 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
648     struct nvme_namespace_data *nd, uint32_t nsid,
649     struct pci_nvme_blockstore *nvstore)
650 {
651
652         pci_nvme_init_nsdata_size(nvstore, nd);
653
654         if (nvstore->type == NVME_STOR_BLOCKIF)
655                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
656
657         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
658         nd->flbas = 0;
659
660         /* Create an EUI-64 if user did not provide one */
661         if (nvstore->eui64 == 0) {
662                 char *data = NULL;
663                 uint64_t eui64 = nvstore->eui64;
664
665                 asprintf(&data, "%s%u%u%u", get_config_value("name"),
666                     sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
667                     sc->nsc_pi->pi_func);
668
669                 if (data != NULL) {
670                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
671                         free(data);
672                 }
673                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
674         }
675         be64enc(nd->eui64, nvstore->eui64);
676
677         /* LBA data-sz = 2^lbads */
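	/* e.g. a 512 byte sector (sectsz_bits = 9) is reported as LBADS = 9 since 2^9 = 512 */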
678         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
679 }
680
681 static void
682 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
683 {
684
685         memset(&sc->err_log, 0, sizeof(sc->err_log));
686         memset(&sc->health_log, 0, sizeof(sc->health_log));
687         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
688         memset(&sc->ns_log, 0, sizeof(sc->ns_log));
689
690         /* Set read/write remainder to round up according to spec */
691         sc->read_dunits_remainder = 999;
692         sc->write_dunits_remainder = 999;
693
694         /* Set nominal Health values checked by implementations */
695         sc->health_log.temperature = NVME_TEMPERATURE;
696         sc->health_log.available_spare = 100;
697         sc->health_log.available_spare_threshold = 10;
698 }
699
700 static void
701 pci_nvme_init_features(struct pci_nvme_softc *sc)
702 {
703         enum nvme_feature       fid;
704
705         for (fid = 0; fid < NVME_FID_MAX; fid++) {
706                 switch (fid) {
707                 case NVME_FEAT_ARBITRATION:
708                 case NVME_FEAT_POWER_MANAGEMENT:
709                 case NVME_FEAT_INTERRUPT_COALESCING: //XXX
710                 case NVME_FEAT_WRITE_ATOMICITY:
711                         /* Mandatory but no special handling required */
712                 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
713                 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
714                 //                this returns a data buffer
715                         break;
716                 case NVME_FEAT_TEMPERATURE_THRESHOLD:
717                         sc->feat[fid].set = nvme_feature_temperature;
718                         break;
719                 case NVME_FEAT_ERROR_RECOVERY:
720                         sc->feat[fid].namespace_specific = true;
721                         break;
722                 case NVME_FEAT_NUMBER_OF_QUEUES:
723                         sc->feat[fid].set = nvme_feature_num_queues;
724                         break;
725                 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
726                         sc->feat[fid].set = nvme_feature_iv_config;
727                         break;
728                 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
729                         sc->feat[fid].set = nvme_feature_async_event;
730                         /* Enable all AENs by default */
731                         sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
732                         break;
733                 default:
734                         sc->feat[fid].set = nvme_feature_invalid_cb;
735                         sc->feat[fid].get = nvme_feature_invalid_cb;
736                 }
737         }
738 }
739
740 static void
741 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
742 {
743
744         STAILQ_INIT(&sc->aer_list);
745         sc->aer_count = 0;
746 }
747
748 static void
749 pci_nvme_aer_init(struct pci_nvme_softc *sc)
750 {
751
752         pthread_mutex_init(&sc->aer_mtx, NULL);
753         pci_nvme_aer_reset(sc);
754 }
755
756 static void
757 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
758 {
759         struct pci_nvme_aer *aer = NULL;
760
761         pthread_mutex_lock(&sc->aer_mtx);
762         while (!STAILQ_EMPTY(&sc->aer_list)) {
763                 aer = STAILQ_FIRST(&sc->aer_list);
764                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
765                 free(aer);
766         }
767         pthread_mutex_unlock(&sc->aer_mtx);
768
769         pci_nvme_aer_reset(sc);
770 }
771
772 static bool
773 pci_nvme_aer_available(struct pci_nvme_softc *sc)
774 {
775
776         return (sc->aer_count != 0);
777 }
778
779 static bool
780 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
781 {
782         struct nvme_controller_data *cd = &sc->ctrldata;
783
	/* AERL is a zero-based value while aer_count is one-based */
785         return (sc->aer_count == (cd->aerl + 1));
786 }
787
788 /*
789  * Add an Async Event Request
790  *
791  * Stores an AER to be returned later if the Controller needs to notify the
792  * host of an event.
793  * Note that while the NVMe spec doesn't require Controllers to return AER's
794  * in order, this implementation does preserve the order.
795  */
796 static int
797 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
798 {
799         struct pci_nvme_aer *aer = NULL;
800
801         aer = calloc(1, sizeof(struct pci_nvme_aer));
802         if (aer == NULL)
803                 return (-1);
804
805         /* Save the Command ID for use in the completion message */
806         aer->cid = cid;
807
808         pthread_mutex_lock(&sc->aer_mtx);
809         sc->aer_count++;
810         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
811         pthread_mutex_unlock(&sc->aer_mtx);
812
813         return (0);
814 }
815
816 /*
817  * Get an Async Event Request structure
818  *
819  * Returns a pointer to an AER previously submitted by the host or NULL if
820  * no AER's exist. Caller is responsible for freeing the returned struct.
821  */
822 static struct pci_nvme_aer *
823 pci_nvme_aer_get(struct pci_nvme_softc *sc)
824 {
825         struct pci_nvme_aer *aer = NULL;
826
827         pthread_mutex_lock(&sc->aer_mtx);
828         aer = STAILQ_FIRST(&sc->aer_list);
829         if (aer != NULL) {
830                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
831                 sc->aer_count--;
832         }
833         pthread_mutex_unlock(&sc->aer_mtx);
834         
835         return (aer);
836 }
837
838 static void
839 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
840 {
841         uint32_t        atype;
842
843         memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
844
845         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
846                 sc->aen[atype].atype = atype;
847         }
848 }
849
850 static void
851 pci_nvme_aen_init(struct pci_nvme_softc *sc)
852 {
853         char nstr[80];
854
855         pci_nvme_aen_reset(sc);
856
	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_cond_init(&sc->aen_cond, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
859         snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
860             sc->nsc_pi->pi_func);
861         pthread_set_name_np(sc->aen_tid, nstr);
862 }
863
864 static void
865 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
866 {
867
868         pci_nvme_aen_reset(sc);
869 }
870
871 /* Notify the AEN thread of pending work */
872 static void
873 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
874 {
875
876         pthread_cond_signal(&sc->aen_cond);
877 }
878
879 /*
880  * Post an Asynchronous Event Notification
881  */
882 static int32_t
883 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
884                 uint32_t event_data)
885 {
886         struct pci_nvme_aen *aen;
887
888         if (atype >= PCI_NVME_AE_TYPE_MAX) {
889                 return(EINVAL);
890         }
891
892         pthread_mutex_lock(&sc->aen_mtx);
893         aen = &sc->aen[atype];
894
895         /* Has the controller already posted an event of this type? */
896         if (aen->posted) {
897                 pthread_mutex_unlock(&sc->aen_mtx);
898                 return(EALREADY);
899         }
900
901         aen->event_data = event_data;
902         aen->posted = true;
903         pthread_mutex_unlock(&sc->aen_mtx);
904
905         pci_nvme_aen_notify(sc);
906
907         return(0);
908 }
909
910 static void
911 pci_nvme_aen_process(struct pci_nvme_softc *sc)
912 {
913         struct pci_nvme_aer *aer;
914         struct pci_nvme_aen *aen;
915         pci_nvme_async_type atype;
916         uint32_t mask;
917         uint16_t status;
918         uint8_t lid;
919
920         assert(pthread_mutex_isowned_np(&sc->aen_mtx));
921         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
922                 aen = &sc->aen[atype];
923                 /* Previous iterations may have depleted the available AER's */
924                 if (!pci_nvme_aer_available(sc)) {
925                         DPRINTF("%s: no AER", __func__);
926                         break;
927                 }
928
929                 if (!aen->posted) {
930                         DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
931                         continue;
932                 }
933
934                 status = NVME_SC_SUCCESS;
935
936                 /* Is the event masked? */
937                 mask =
938                     sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
939
940                 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
941                 switch (atype) {
942                 case PCI_NVME_AE_TYPE_ERROR:
943                         lid = NVME_LOG_ERROR;
944                         break;
945                 case PCI_NVME_AE_TYPE_SMART:
946                         mask &= 0xff;
947                         if ((mask & aen->event_data) == 0)
948                                 continue;
949                         lid = NVME_LOG_HEALTH_INFORMATION;
950                         break;
951                 case PCI_NVME_AE_TYPE_NOTICE:
952                         if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
953                                 EPRINTLN("%s unknown AEN notice type %u",
954                                     __func__, aen->event_data);
955                                 status = NVME_SC_INTERNAL_DEVICE_ERROR;
956                                 break;
957                         }
958                         if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
959                                 continue;
960                         switch (aen->event_data) {
961                         case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
962                                 lid = NVME_LOG_CHANGED_NAMESPACE;
963                                 break;
964                         case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
965                                 lid = NVME_LOG_FIRMWARE_SLOT;
966                                 break;
967                         case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
968                                 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
969                                 break;
970                         case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
971                                 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
972                                 break;
973                         case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
974                                 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
975                                 break;
976                         case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
977                                 lid = NVME_LOG_LBA_STATUS_INFORMATION;
978                                 break;
979                         case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
980                                 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
981                                 break;
982                         default:
983                                 lid = 0;
984                         }
985                         break;
986                 default:
987                         /* bad type?!? */
988                         EPRINTLN("%s unknown AEN type %u", __func__, atype);
989                         status = NVME_SC_INTERNAL_DEVICE_ERROR;
990                         break;
991                 }
992
993                 aer = pci_nvme_aer_get(sc);
994                 assert(aer != NULL);
995
996                 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
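		/*
		 * Completion dword 0 for an AER encodes the Asynchronous
		 * Event Type in bits 2:0, the Event Information in bits
		 * 15:8, and the associated Log Page Identifier in bits 23:16.
		 */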
997                 pci_nvme_cq_update(sc, &sc->compl_queues[0],
998                     (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
999                     aer->cid,
1000                     0,          /* SQID */
1001                     status);
1002
1003                 aen->event_data = 0;
1004                 aen->posted = false;
1005
1006                 pci_generate_msix(sc->nsc_pi, 0);
1007         }
1008 }
1009
1010 static void *
1011 aen_thr(void *arg)
1012 {
1013         struct pci_nvme_softc *sc;
1014
1015         sc = arg;
1016
1017         pthread_mutex_lock(&sc->aen_mtx);
1018         for (;;) {
1019                 pci_nvme_aen_process(sc);
1020                 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1021         }
1022         pthread_mutex_unlock(&sc->aen_mtx);
1023
1024         pthread_exit(NULL);
1025         return (NULL);
1026 }
1027
1028 static void
1029 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1030 {
1031         uint32_t i;
1032
1033         DPRINTF("%s", __func__);
1034
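	/*
	 * CAP reports the zero-based maximum queue entries (MQES), that
	 * contiguous queues are required (CQR), and a ready timeout (TO) of
	 * 60 x 500 milliseconds (i.e. 30 seconds)
	 */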
1035         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1036             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1037             (60 << NVME_CAP_LO_REG_TO_SHIFT);
1038
1039         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1040
1041         sc->regs.vs = NVME_REV(1,4);    /* NVMe v1.4 */
1042
1043         sc->regs.cc = 0;
1044
1045         assert(sc->submit_queues != NULL);
1046
1047         for (i = 0; i < sc->num_squeues + 1; i++) {
1048                 sc->submit_queues[i].qbase = NULL;
1049                 sc->submit_queues[i].size = 0;
1050                 sc->submit_queues[i].cqid = 0;
1051                 sc->submit_queues[i].tail = 0;
1052                 sc->submit_queues[i].head = 0;
1053         }
1054
1055         assert(sc->compl_queues != NULL);
1056
1057         for (i = 0; i < sc->num_cqueues + 1; i++) {
1058                 sc->compl_queues[i].qbase = NULL;
1059                 sc->compl_queues[i].size = 0;
1060                 sc->compl_queues[i].tail = 0;
1061                 sc->compl_queues[i].head = 0;
1062         }
1063
1064         sc->num_q_is_set = false;
1065
1066         pci_nvme_aer_destroy(sc);
1067         pci_nvme_aen_destroy(sc);
1068
1069         /*
1070          * Clear CSTS.RDY last to prevent the host from enabling Controller
1071          * before cleanup completes
1072          */
1073         sc->regs.csts = 0;
1074 }
1075
1076 static void
1077 pci_nvme_reset(struct pci_nvme_softc *sc)
1078 {
1079         pthread_mutex_lock(&sc->mtx);
1080         pci_nvme_reset_locked(sc);
1081         pthread_mutex_unlock(&sc->mtx);
1082 }
1083
1084 static void
1085 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1086 {
1087         uint16_t acqs, asqs;
1088
1089         DPRINTF("%s", __func__);
1090
1091         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1092         sc->submit_queues[0].size = asqs;
1093         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1094                     sizeof(struct nvme_command) * asqs);
1095
1096         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1097                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1098
1099         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
1100             NVME_AQA_REG_ACQS_MASK) + 1;
1101         sc->compl_queues[0].size = acqs;
1102         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1103                  sizeof(struct nvme_completion) * acqs);
1104         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1105
1106         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1107                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1108 }
1109
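/*
 * Copy data between a host buffer and a guest buffer described by a
 * PRP1/PRP2 pair. Transfers are limited to two physical pages (PRP2 is
 * treated as a second page address, not as a pointer to a PRP list), which
 * is sufficient for the Admin command payloads this helper services.
 */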
1110 static int
1111 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1112         size_t len, enum nvme_copy_dir dir)
1113 {
1114         uint8_t *p;
1115         size_t bytes;
1116
1117         if (len > (8 * 1024)) {
1118                 return (-1);
1119         }
1120
1121         /* Copy from the start of prp1 to the end of the physical page */
1122         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1123         bytes = MIN(bytes, len);
1124
1125         p = vm_map_gpa(ctx, prp1, bytes);
1126         if (p == NULL) {
1127                 return (-1);
1128         }
1129
1130         if (dir == NVME_COPY_TO_PRP)
1131                 memcpy(p, b, bytes);
1132         else
1133                 memcpy(b, p, bytes);
1134
1135         b += bytes;
1136
1137         len -= bytes;
1138         if (len == 0) {
1139                 return (0);
1140         }
1141
1142         len = MIN(len, PAGE_SIZE);
1143
1144         p = vm_map_gpa(ctx, prp2, len);
1145         if (p == NULL) {
1146                 return (-1);
1147         }
1148
1149         if (dir == NVME_COPY_TO_PRP)
1150                 memcpy(p, b, len);
1151         else
1152                 memcpy(b, p, len);
1153
1154         return (0);
1155 }
1156
1157 /*
1158  * Write a Completion Queue Entry update
1159  *
1160  * Write the completion and update the doorbell value
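 * The Phase Tag in the status word is flipped relative to the stale entry
 * so the host can distinguish newly posted completions from old ones.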
1161  */
1162 static void
1163 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1164                 struct nvme_completion_queue *cq,
1165                 uint32_t cdw0,
1166                 uint16_t cid,
1167                 uint16_t sqid,
1168                 uint16_t status)
1169 {
1170         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1171         struct nvme_completion *cqe;
1172
1173         assert(cq->qbase != NULL);
1174
1175         pthread_mutex_lock(&cq->mtx);
1176
1177         cqe = &cq->qbase[cq->tail];
1178
1179         /* Flip the phase bit */
1180         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1181
1182         cqe->cdw0 = cdw0;
1183         cqe->sqhd = sq->head;
1184         cqe->sqid = sqid;
1185         cqe->cid = cid;
1186         cqe->status = status;
1187
1188         cq->tail++;
1189         if (cq->tail >= cq->size) {
1190                 cq->tail = 0;
1191         }
1192
1193         pthread_mutex_unlock(&cq->mtx);
1194 }
1195
1196 static int
1197 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1198         struct nvme_completion* compl)
1199 {
1200         uint16_t qid = command->cdw10 & 0xffff;
1201
1202         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1203         if (qid == 0 || qid > sc->num_squeues ||
1204             (sc->submit_queues[qid].qbase == NULL)) {
1205                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1206                         __func__, qid, sc->num_squeues);
1207                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1208                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1209                 return (1);
1210         }
1211
1212         sc->submit_queues[qid].qbase = NULL;
1213         sc->submit_queues[qid].cqid = 0;
1214         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1215         return (1);
1216 }
1217
1218 static int
1219 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1220         struct nvme_completion* compl)
1221 {
1222         if (command->cdw11 & NVME_CMD_CDW11_PC) {
1223                 uint16_t qid = command->cdw10 & 0xffff;
1224                 struct nvme_submission_queue *nsq;
1225
1226                 if ((qid == 0) || (qid > sc->num_squeues) ||
1227                     (sc->submit_queues[qid].qbase != NULL)) {
1228                         WPRINTF("%s queue index %u > num_squeues %u",
1229                                 __func__, qid, sc->num_squeues);
1230                         pci_nvme_status_tc(&compl->status,
1231                             NVME_SCT_COMMAND_SPECIFIC,
1232                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1233                         return (1);
1234                 }
1235
1236                 nsq = &sc->submit_queues[qid];
1237                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1238                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1239                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1240                         /*
1241                          * Queues must specify at least two entries
1242                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1243                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1244                          */
1245                         pci_nvme_status_tc(&compl->status,
1246                             NVME_SCT_COMMAND_SPECIFIC,
1247                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1248                         return (1);
1249                 }
1250                 nsq->head = nsq->tail = 0;
1251
1252                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1253                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1254                         pci_nvme_status_tc(&compl->status,
1255                             NVME_SCT_COMMAND_SPECIFIC,
1256                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1257                         return (1);
1258                 }
1259
1260                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1261                         pci_nvme_status_tc(&compl->status,
1262                             NVME_SCT_COMMAND_SPECIFIC,
1263                             NVME_SC_COMPLETION_QUEUE_INVALID);
1264                         return (1);
1265                 }
1266
1267                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1268
1269                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1270                               sizeof(struct nvme_command) * (size_t)nsq->size);
1271
1272                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1273                         qid, nsq->size, nsq->qbase, nsq->cqid);
1274
1275                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1276
1277                 DPRINTF("%s completed creating IOSQ qid %u",
1278                          __func__, qid);
1279         } else {
1280                 /* 
		 * Guest requested a non-contiguous (PRP list based)
		 * Submission Queue, which this emulation does not support.
1283                  */
1284                 WPRINTF("%s unsupported non-contig (list-based) "
1285                          "create i/o submission queue", __func__);
1286
1287                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1288         }
1289         return (1);
1290 }
1291
1292 static int
1293 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1294         struct nvme_completion* compl)
1295 {
1296         uint16_t qid = command->cdw10 & 0xffff;
1297         uint16_t sqid;
1298
1299         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1300         if (qid == 0 || qid > sc->num_cqueues ||
1301             (sc->compl_queues[qid].qbase == NULL)) {
1302                 WPRINTF("%s queue index %u / num_cqueues %u",
1303                         __func__, qid, sc->num_cqueues);
1304                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1305                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1306                 return (1);
1307         }
1308
1309         /* Deleting an Active CQ is an error */
1310         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1311                 if (sc->submit_queues[sqid].cqid == qid) {
1312                         pci_nvme_status_tc(&compl->status,
1313                             NVME_SCT_COMMAND_SPECIFIC,
1314                             NVME_SC_INVALID_QUEUE_DELETION);
1315                         return (1);
1316                 }
1317
1318         sc->compl_queues[qid].qbase = NULL;
1319         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1320         return (1);
1321 }
1322
1323 static int
1324 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1325         struct nvme_completion* compl)
1326 {
1327         struct nvme_completion_queue *ncq;
1328         uint16_t qid = command->cdw10 & 0xffff;
1329
1330         /* Only support Physically Contiguous queues */
1331         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1332                 WPRINTF("%s unsupported non-contig (list-based) "
1333                          "create i/o completion queue",
1334                          __func__);
1335
1336                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1337                 return (1);
1338         }
1339
1340         if ((qid == 0) || (qid > sc->num_cqueues) ||
1341             (sc->compl_queues[qid].qbase != NULL)) {
1342                 WPRINTF("%s queue index %u > num_cqueues %u",
1343                         __func__, qid, sc->num_cqueues);
1344                 pci_nvme_status_tc(&compl->status,
1345                     NVME_SCT_COMMAND_SPECIFIC,
1346                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1347                 return (1);
1348         }
1349
1350         ncq = &sc->compl_queues[qid];
1351         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1352         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1353         if (ncq->intr_vec > (sc->max_queues + 1)) {
1354                 pci_nvme_status_tc(&compl->status,
1355                     NVME_SCT_COMMAND_SPECIFIC,
1356                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1357                 return (1);
1358         }
1359
1360         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1361         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1362                 /*
1363                  * Queues must specify at least two entries
1364                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1365                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1366                  */
1367                 pci_nvme_status_tc(&compl->status,
1368                     NVME_SCT_COMMAND_SPECIFIC,
1369                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1370                 return (1);
1371         }
1372         ncq->head = ncq->tail = 0;
1373         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1374                      command->prp1,
1375                      sizeof(struct nvme_command) * (size_t)ncq->size);
1376
1377         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
1381 }
1382
1383 static int
1384 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1385         struct nvme_completion* compl)
1386 {
1387         uint64_t logoff;
1388         uint32_t logsize;
1389         uint8_t logpage = command->cdw10 & 0xFF;
1390
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1402
1403         switch (logpage) {
1404         case NVME_LOG_ERROR:
1405                 if (logoff >= sizeof(sc->err_log)) {
1406                         pci_nvme_status_genc(&compl->status,
1407                             NVME_SC_INVALID_FIELD);
1408                         break;
1409                 }
1410
1411                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1412                     command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize, sizeof(sc->err_log) - logoff),
1414                     NVME_COPY_TO_PRP);
1415                 break;
1416         case NVME_LOG_HEALTH_INFORMATION:
1417                 if (logoff >= sizeof(sc->health_log)) {
1418                         pci_nvme_status_genc(&compl->status,
1419                             NVME_SC_INVALID_FIELD);
1420                         break;
1421                 }
1422
1423                 pthread_mutex_lock(&sc->mtx);
1424                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1425                     sizeof(sc->health_log.data_units_read));
1426                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1427                     sizeof(sc->health_log.data_units_written));
1428                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1429                     sizeof(sc->health_log.host_read_commands));
1430                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1431                     sizeof(sc->health_log.host_write_commands));
1432                 pthread_mutex_unlock(&sc->mtx);
1433
1434                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1435                     command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize, sizeof(sc->health_log) - logoff),
1437                     NVME_COPY_TO_PRP);
1438                 break;
1439         case NVME_LOG_FIRMWARE_SLOT:
1440                 if (logoff >= sizeof(sc->fw_log)) {
1441                         pci_nvme_status_genc(&compl->status,
1442                             NVME_SC_INVALID_FIELD);
1443                         break;
1444                 }
1445
1446                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1447                     command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize, sizeof(sc->fw_log) - logoff),
1449                     NVME_COPY_TO_PRP);
1450                 break;
1451         case NVME_LOG_CHANGED_NAMESPACE:
1452                 if (logoff >= sizeof(sc->ns_log)) {
1453                         pci_nvme_status_genc(&compl->status,
1454                             NVME_SC_INVALID_FIELD);
1455                         break;
1456                 }
1457
1458                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1459                     command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize, sizeof(sc->ns_log) - logoff),
1461                     NVME_COPY_TO_PRP);
1462                 memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1463                 break;
1464         default:
1465                 DPRINTF("%s get log page %x command not supported",
1466                         __func__, logpage);
1467
1468                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1469                     NVME_SC_INVALID_LOG_PAGE);
1470         }
1471
1472         return (1);
1473 }
1474
1475 static int
1476 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1477         struct nvme_completion* compl)
1478 {
1479         void *dest;
1480         uint16_t status;
1481
1482         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1483                 command->cdw10 & 0xFF, command->nsid);
1484
1485         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1486
1487         switch (command->cdw10 & 0xFF) {
1488         case 0x00: /* return Identify Namespace data structure */
1489                 /* Global NS only valid with NS Management */
1490                 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1491                         pci_nvme_status_genc(&status,
1492                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1493                         break;
1494                 }
1495                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1496                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1497                     NVME_COPY_TO_PRP);
1498                 break;
1499         case 0x01: /* return Identify Controller data structure */
1500                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1501                     command->prp2, (uint8_t *)&sc->ctrldata,
1502                     sizeof(sc->ctrldata),
1503                     NVME_COPY_TO_PRP);
1504                 break;
1505         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1506                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1507                                   sizeof(uint32_t) * 1024);
1508                 /* All unused entries shall be zero */
1509                 bzero(dest, sizeof(uint32_t) * 1024);
1510                 ((uint32_t *)dest)[0] = 1;
1511                 break;
1512         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1513                 if (command->nsid != 1) {
1514                         pci_nvme_status_genc(&status,
1515                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1516                         break;
1517                 }
1518                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1519                                   sizeof(uint32_t) * 1024);
1520                 /* All bytes after the descriptor shall be zero */
1521                 bzero(dest, sizeof(uint32_t) * 1024);
1522
1523                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1524                 ((uint8_t *)dest)[0] = 1;
1525                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1526                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1527                 break;
1528         default:
1529                 DPRINTF("%s unsupported identify command requested 0x%x",
1530                          __func__, command->cdw10 & 0xFF);
1531                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1532                 break;
1533         }
1534
1535         compl->status = status;
1536         return (1);
1537 }
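
/*
 * Namespace Identification Descriptor returned for CNS 0x03 above (layout
 * per the NVMe spec; offsets are within the 4096 byte buffer):
 *
 *   byte  0      NIDT = 0x01 (EUI64)
 *   byte  1      NIDL = 0x08 (identifier length)
 *   bytes 2-3    reserved (zero)
 *   bytes 4-11   the namespace EUI64 (sc->nsdata.eui64)
 *   bytes 12+    zero, i.e. no further descriptors
 */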
1538
1539 static const char *
1540 nvme_fid_to_name(uint8_t fid)
1541 {
1542         const char *name;
1543
1544         switch (fid) {
1545         case NVME_FEAT_ARBITRATION:
1546                 name = "Arbitration";
1547                 break;
1548         case NVME_FEAT_POWER_MANAGEMENT:
1549                 name = "Power Management";
1550                 break;
1551         case NVME_FEAT_LBA_RANGE_TYPE:
1552                 name = "LBA Range Type";
1553                 break;
1554         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1555                 name = "Temperature Threshold";
1556                 break;
1557         case NVME_FEAT_ERROR_RECOVERY:
1558                 name = "Error Recovery";
1559                 break;
1560         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1561                 name = "Volatile Write Cache";
1562                 break;
1563         case NVME_FEAT_NUMBER_OF_QUEUES:
1564                 name = "Number of Queues";
1565                 break;
1566         case NVME_FEAT_INTERRUPT_COALESCING:
1567                 name = "Interrupt Coalescing";
1568                 break;
1569         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1570                 name = "Interrupt Vector Configuration";
1571                 break;
1572         case NVME_FEAT_WRITE_ATOMICITY:
1573                 name = "Write Atomicity Normal";
1574                 break;
1575         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1576                 name = "Asynchronous Event Configuration";
1577                 break;
1578         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1579                 name = "Autonomous Power State Transition";
1580                 break;
1581         case NVME_FEAT_HOST_MEMORY_BUFFER:
1582                 name = "Host Memory Buffer";
1583                 break;
1584         case NVME_FEAT_TIMESTAMP:
1585                 name = "Timestamp";
1586                 break;
1587         case NVME_FEAT_KEEP_ALIVE_TIMER:
1588                 name = "Keep Alive Timer";
1589                 break;
1590         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1591                 name = "Host Controlled Thermal Management";
1592                 break;
1593         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1594                 name = "Non-Operational Power State Config";
1595                 break;
1596         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1597                 name = "Read Recovery Level Config";
1598                 break;
1599         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1600                 name = "Predictable Latency Mode Config";
1601                 break;
1602         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1603                 name = "Predictable Latency Mode Window";
1604                 break;
1605         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1606                 name = "LBA Status Information Report Interval";
1607                 break;
1608         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1609                 name = "Host Behavior Support";
1610                 break;
1611         case NVME_FEAT_SANITIZE_CONFIG:
1612                 name = "Sanitize Config";
1613                 break;
1614         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1615                 name = "Endurance Group Event Configuration";
1616                 break;
1617         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1618                 name = "Software Progress Marker";
1619                 break;
1620         case NVME_FEAT_HOST_IDENTIFIER:
1621                 name = "Host Identifier";
1622                 break;
1623         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1624                 name = "Reservation Notification Mask";
1625                 break;
1626         case NVME_FEAT_RESERVATION_PERSISTENCE:
1627                 name = "Reservation Persistence";
1628                 break;
1629         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1630                 name = "Namespace Write Protection Config";
1631                 break;
1632         default:
1633                 name = "Unknown";
1634                 break;
1635         }
1636
1637         return (name);
1638 }
1639
1640 static void
1641 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1642     struct nvme_feature_obj *feat,
1643     struct nvme_command *command,
1644     struct nvme_completion *compl)
1645 {
1646
1647         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1648 }
1649
1650 static void
1651 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1652     struct nvme_feature_obj *feat,
1653     struct nvme_command *command,
1654     struct nvme_completion *compl)
1655 {
1656         uint32_t i;
1657         uint32_t cdw11 = command->cdw11;
1658         uint16_t iv;
1659         bool cd;
1660
1661         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1662
1663         iv = cdw11 & 0xffff;
1664         cd = cdw11 & (1 << 16);
1665
1666         if (iv > (sc->max_queues + 1)) {
1667                 return;
1668         }
1669
1670         /* No Interrupt Coalescing on the Admin Q, so IV 0 requires Coalescing Disable */
1671         if ((iv == 0) && !cd)
1672                 return;
1673
1674         /* Requested Interrupt Vector must be used by a CQ */
1675         for (i = 0; i < sc->num_cqueues + 1; i++) {
1676                 if (sc->compl_queues[i].intr_vec == iv) {
1677                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1678                 }
1679         }
1680 }
1681
1682 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP                (0x4000)
1683 static void
1684 nvme_feature_async_event(struct pci_nvme_softc *sc,
1685     struct nvme_feature_obj *feat,
1686     struct nvme_command *command,
1687     struct nvme_completion *compl)
1688 {
1689
1690         if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1691                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1692 }
1693
1694 #define NVME_TEMP_THRESH_OVER   0
1695 #define NVME_TEMP_THRESH_UNDER  1
1696 static void
1697 nvme_feature_temperature(struct pci_nvme_softc *sc,
1698     struct nvme_feature_obj *feat,
1699     struct nvme_command *command,
1700     struct nvme_completion *compl)
1701 {
1702         uint16_t        tmpth;  /* Temperature Threshold */
1703         uint8_t         tmpsel; /* Threshold Temperature Select */
1704         uint8_t         thsel;  /* Threshold Type Select */
1705         bool            set_crit = false;
1706
1707         tmpth  = command->cdw11 & 0xffff;
1708         tmpsel = (command->cdw11 >> 16) & 0xf;
1709         thsel  = (command->cdw11 >> 20) & 0x3;
1710
1711         DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1712
1713         /* Check for unsupported values */
1714         if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1715             (thsel > NVME_TEMP_THRESH_UNDER)) {
1716                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1717                 return;
1718         }
1719
1720         if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1721             ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1722                 set_crit = true;
1723
1724         pthread_mutex_lock(&sc->mtx);
1725         if (set_crit)
1726                 sc->health_log.critical_warning |=
1727                     NVME_CRIT_WARN_ST_TEMPERATURE;
1728         else
1729                 sc->health_log.critical_warning &=
1730                     ~NVME_CRIT_WARN_ST_TEMPERATURE;
1731         pthread_mutex_unlock(&sc->mtx);
1732
1733         if (set_crit)
1734                 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1735                     sc->health_log.critical_warning);
1736
1737
1738         DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1739 }
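
/*
 * Worked example of the Temperature Threshold encoding handled above,
 * assuming the emulated composite temperature (NVME_TEMPERATURE) is 310 K:
 *
 *   cdw11 = 0x00100150
 *     tmpth  (bits 15:0)  = 0x0150 = 336 K
 *     tmpsel (bits 19:16) = 0x0    = composite temperature
 *     thsel  (bits 21:20) = 0x1    = under temperature threshold
 *
 * Since 310 <= 336, the under threshold is considered exceeded: the
 * temperature bit is set in critical_warning and a SMART AEN is posted.
 */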
1740
1741 static void
1742 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1743     struct nvme_feature_obj *feat,
1744     struct nvme_command *command,
1745     struct nvme_completion *compl)
1746 {
1747         uint16_t nqr;   /* Number of Queues Requested */
1748
1749         if (sc->num_q_is_set) {
1750                 WPRINTF("%s: Number of Queues already set", __func__);
1751                 pci_nvme_status_genc(&compl->status,
1752                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1753                 return;
1754         }
1755
1756         nqr = command->cdw11 & 0xFFFF;
1757         if (nqr == 0xffff) {
1758                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1759                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1760                 return;
1761         }
1762
1763         sc->num_squeues = ONE_BASED(nqr);
1764         if (sc->num_squeues > sc->max_queues) {
1765                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1766                                         sc->max_queues);
1767                 sc->num_squeues = sc->max_queues;
1768         }
1769
1770         nqr = (command->cdw11 >> 16) & 0xFFFF;
1771         if (nqr == 0xffff) {
1772                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1773                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1774                 return;
1775         }
1776
1777         sc->num_cqueues = ONE_BASED(nqr);
1778         if (sc->num_cqueues > sc->max_queues) {
1779                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1780                                         sc->max_queues);
1781                 sc->num_cqueues = sc->max_queues;
1782         }
1783
1784         /* Patch the command value, which will be saved on the callback's return */
1785         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1786         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1787
1788         sc->num_q_is_set = true;
1789 }
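
/*
 * Example: cdw11 = 0x00030007 requests 8 submission queues (NSQR = 7,
 * zero-based) and 4 completion queues (NCQR = 3, zero-based). ONE_BASED()
 * converts these to 8 and 4, the results are clamped to sc->max_queues, and
 * the allocated counts are returned to the guest in cdw0 via
 * NVME_FEATURE_NUM_QUEUES(), again as zero-based values.
 */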
1790
1791 static int
1792 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1793         struct nvme_completion *compl)
1794 {
1795         struct nvme_feature_obj *feat;
1796         uint32_t nsid = command->nsid;
1797         uint8_t fid = command->cdw10 & 0xFF;
1798
1799         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1800
1801         if (fid >= NVME_FID_MAX) {
1802                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1803                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1804                 return (1);
1805         }
1806         feat = &sc->feat[fid];
1807
1808         if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1809                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1810                 return (1);
1811         }
1812
1813         if (!feat->namespace_specific &&
1814             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1815                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1816                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1817                 return (1);
1818         }
1819
1820         compl->cdw0 = 0;
1821         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1822
1823         if (feat->set)
1824                 feat->set(sc, feat, command, compl);
1825
1826         DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1827         if (compl->status == NVME_SC_SUCCESS) {
1828                 feat->cdw11 = command->cdw11;
1829                 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1830                     (command->cdw11 != 0))
1831                         pci_nvme_aen_notify(sc);
1832         }
1833
1834         return (0);
1835 }
1836
1837 #define NVME_FEATURES_SEL_SUPPORTED     0x3
1838 #define NVME_FEATURES_NS_SPECIFIC       (1 << 1)
1839
1840 static int
1841 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1842         struct nvme_completion* compl)
1843 {
1844         struct nvme_feature_obj *feat;
1845         uint8_t fid = command->cdw10 & 0xFF;
1846         uint8_t sel = (command->cdw10 >> 8) & 0x7;
1847
1848         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1849
1850         if (fid >= NVME_FID_MAX) {
1851                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1852                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1853                 return (1);
1854         }
1855
1856         compl->cdw0 = 0;
1857         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1858
1859         feat = &sc->feat[fid];
1860         if (feat->get) {
1861                 feat->get(sc, feat, command, compl);
1862         }
1863
1864         if (compl->status == NVME_SC_SUCCESS) {
1865                 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1866                         compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1867                 else
1868                         compl->cdw0 = feat->cdw11;
1869         }
1870
1871         return (0);
1872 }
1873
1874 static int
1875 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1876         struct nvme_completion* compl)
1877 {
1878         uint8_t ses, lbaf, pi;
1879
1880         /* Only supports Secure Erase Setting - User Data Erase */
1881         ses = (command->cdw10 >> 9) & 0x7;
1882         if (ses > 0x1) {
1883                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1884                 return (1);
1885         }
1886
1887         /* Only supports a single LBA Format */
1888         lbaf = command->cdw10 & 0xf;
1889         if (lbaf != 0) {
1890                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1891                     NVME_SC_INVALID_FORMAT);
1892                 return (1);
1893         }
1894
1895         /* Doesn't support Protection Information */
1896         pi = (command->cdw10 >> 5) & 0x7;
1897         if (pi != 0) {
1898                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1899                 return (1);
1900         }
1901
1902         if (sc->nvstore.type == NVME_STOR_RAM) {
1903                 if (sc->nvstore.ctx)
1904                         free(sc->nvstore.ctx);
1905                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1906                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1907         } else {
1908                 struct pci_nvme_ioreq *req;
1909                 int err;
1910
1911                 req = pci_nvme_get_ioreq(sc);
1912                 if (req == NULL) {
1913                         pci_nvme_status_genc(&compl->status,
1914                             NVME_SC_INTERNAL_DEVICE_ERROR);
1915                         WPRINTF("%s: unable to allocate IO req", __func__);
1916                         return (1);
1917                 }
1918                 req->nvme_sq = &sc->submit_queues[0];
1919                 req->sqid = 0;
1920                 req->opc = command->opc;
1921                 req->cid = command->cid;
1922                 req->nsid = command->nsid;
1923
1924                 req->io_req.br_offset = 0;
1925                 req->io_req.br_resid = sc->nvstore.size;
1926                 req->io_req.br_callback = pci_nvme_io_done;
1927
1928                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1929                 if (err) {
1930                         pci_nvme_status_genc(&compl->status,
1931                             NVME_SC_INTERNAL_DEVICE_ERROR);
1932                         pci_nvme_release_ioreq(sc, req);
1933                 } else
1934                         compl->status = NVME_NO_STATUS;
1935         }
1936
1937         return (1);
1938 }
1939
1940 static int
1941 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1942         struct nvme_completion* compl)
1943 {
1944         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1945                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1946
1947         /* TODO: search for the command ID and abort it */
1948
1949         compl->cdw0 = 1;
1950         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1951         return (1);
1952 }
1953
1954 static int
1955 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1956         struct nvme_command* command, struct nvme_completion* compl)
1957 {
1958         DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1959             sc->aer_count, sc->ctrldata.aerl, command->cid);
1960
1961         /* Don't exceed the Async Event Request Limit (AERL). */
1962         if (pci_nvme_aer_limit_reached(sc)) {
1963                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1964                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1965                 return (1);
1966         }
1967
1968         if (pci_nvme_aer_add(sc, command->cid)) {
1969                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1970                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1971                 return (1);
1972         }
1973
1974         /*
1975          * Events are raised as they occur, per the Asynchronous Event
1976          * Configuration set via Set Features. Because events arrive
1977          * asynchronously, defer completion until a matching event is posted.
1978          */
1979         compl->status = NVME_NO_STATUS;
1980         pci_nvme_aen_notify(sc);
1981
1982         return (0);
1983 }
1984
1985 static void
1986 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1987 {
1988         struct nvme_completion compl;
1989         struct nvme_command *cmd;
1990         struct nvme_submission_queue *sq;
1991         struct nvme_completion_queue *cq;
1992         uint16_t sqhead;
1993
1994         DPRINTF("%s index %u", __func__, (uint32_t)value);
1995
1996         sq = &sc->submit_queues[0];
1997         cq = &sc->compl_queues[0];
1998
1999         pthread_mutex_lock(&sq->mtx);
2000
2001         sqhead = sq->head;
2002         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2003         
2004         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2005                 cmd = &(sq->qbase)[sqhead];
2006                 compl.cdw0 = 0;
2007                 compl.status = 0;
2008
2009                 switch (cmd->opc) {
2010                 case NVME_OPC_DELETE_IO_SQ:
2011                         DPRINTF("%s command DELETE_IO_SQ", __func__);
2012                         nvme_opc_delete_io_sq(sc, cmd, &compl);
2013                         break;
2014                 case NVME_OPC_CREATE_IO_SQ:
2015                         DPRINTF("%s command CREATE_IO_SQ", __func__);
2016                         nvme_opc_create_io_sq(sc, cmd, &compl);
2017                         break;
2018                 case NVME_OPC_DELETE_IO_CQ:
2019                         DPRINTF("%s command DELETE_IO_CQ", __func__);
2020                         nvme_opc_delete_io_cq(sc, cmd, &compl);
2021                         break;
2022                 case NVME_OPC_CREATE_IO_CQ:
2023                         DPRINTF("%s command CREATE_IO_CQ", __func__);
2024                         nvme_opc_create_io_cq(sc, cmd, &compl);
2025                         break;
2026                 case NVME_OPC_GET_LOG_PAGE:
2027                         DPRINTF("%s command GET_LOG_PAGE", __func__);
2028                         nvme_opc_get_log_page(sc, cmd, &compl);
2029                         break;
2030                 case NVME_OPC_IDENTIFY:
2031                         DPRINTF("%s command IDENTIFY", __func__);
2032                         nvme_opc_identify(sc, cmd, &compl);
2033                         break;
2034                 case NVME_OPC_ABORT:
2035                         DPRINTF("%s command ABORT", __func__);
2036                         nvme_opc_abort(sc, cmd, &compl);
2037                         break;
2038                 case NVME_OPC_SET_FEATURES:
2039                         DPRINTF("%s command SET_FEATURES", __func__);
2040                         nvme_opc_set_features(sc, cmd, &compl);
2041                         break;
2042                 case NVME_OPC_GET_FEATURES:
2043                         DPRINTF("%s command GET_FEATURES", __func__);
2044                         nvme_opc_get_features(sc, cmd, &compl);
2045                         break;
2046                 case NVME_OPC_FIRMWARE_ACTIVATE:
2047                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2048                         pci_nvme_status_tc(&compl.status,
2049                             NVME_SCT_COMMAND_SPECIFIC,
2050                             NVME_SC_INVALID_FIRMWARE_SLOT);
2051                         break;
2052                 case NVME_OPC_ASYNC_EVENT_REQUEST:
2053                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2054                         nvme_opc_async_event_req(sc, cmd, &compl);
2055                         break;
2056                 case NVME_OPC_FORMAT_NVM:
2057                         DPRINTF("%s command FORMAT_NVM", __func__);
2058                         if ((sc->ctrldata.oacs &
2059                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2060                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2061                                 break;
2062                         }
2063                         nvme_opc_format_nvm(sc, cmd, &compl);
2064                         break;
2065                 case NVME_OPC_SECURITY_SEND:
2066                 case NVME_OPC_SECURITY_RECEIVE:
2067                 case NVME_OPC_SANITIZE:
2068                 case NVME_OPC_GET_LBA_STATUS:
2069                         DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2070                             cmd->opc);
2071                         /* Valid but unsupported opcodes */
2072                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2073                         break;
2074                 default:
2075                         DPRINTF("%s command OPC=%#X (not implemented)",
2076                             __func__,
2077                             cmd->opc);
2078                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2079                 }
2080                 sqhead = (sqhead + 1) % sq->size;
2081
2082                 if (NVME_COMPLETION_VALID(compl)) {
2083                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
2084                             compl.cdw0,
2085                             cmd->cid,
2086                             0,          /* SQID */
2087                             compl.status);
2088                 }
2089         }
2090
2091         DPRINTF("setting sqhead %u", sqhead);
2092         sq->head = sqhead;
2093
2094         if (cq->head != cq->tail)
2095                 pci_generate_msix(sc->nsc_pi, 0);
2096
2097         pthread_mutex_unlock(&sq->mtx);
2098 }
2099
2100 /*
2101  * Update the Write and Read statistics reported in SMART data
2102  *
2103  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2104  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2105  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
2106  */
2107 static void
2108 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2109     size_t bytes, uint16_t status)
2110 {
2111
2112         pthread_mutex_lock(&sc->mtx);
2113         switch (opc) {
2114         case NVME_OPC_WRITE:
2115                 sc->write_commands++;
2116                 if (status != NVME_SC_SUCCESS)
2117                         break;
2118                 sc->write_dunits_remainder += (bytes / 512);
2119                 while (sc->write_dunits_remainder >= 1000) {
2120                         sc->write_data_units++;
2121                         sc->write_dunits_remainder -= 1000;
2122                 }
2123                 break;
2124         case NVME_OPC_READ:
2125                 sc->read_commands++;
2126                 if (status != NVME_SC_SUCCESS)
2127                         break;
2128                 sc->read_dunits_remainder += (bytes / 512);
2129                 while (sc->read_dunits_remainder >= 1000) {
2130                         sc->read_data_units++;
2131                         sc->read_dunits_remainder -= 1000;
2132                 }
2133                 break;
2134         default:
2135                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2136                 break;
2137         }
2138         pthread_mutex_unlock(&sc->mtx);
2139 }
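
/*
 * Example of the rounding above: a successful 2,001 block (1,024,512 byte)
 * write adds 2,001 to write_dunits_remainder. With the remainder initialized
 * to 999, the running total reaches 3,000, so write_data_units is incremented
 * three times, matching "3 data units are 2,001 - 3,000 512 byte blocks".
 */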
2140
2141 /*
2142  * Check if the combination of Starting LBA (slba) and number of blocks
2143  * exceeds the range of the underlying storage.
2144  *
2145  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2146  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2147  * overflow.
2148  */
2149 static bool
2150 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2151     uint32_t nblocks)
2152 {
2153         size_t  offset, bytes;
2154
2155         /* Overflow check of multiplying Starting LBA by the sector size */
2156         if (slba >> (64 - nvstore->sectsz_bits))
2157                 return (true);
2158
2159         offset = slba << nvstore->sectsz_bits;
2160         bytes = (size_t)nblocks << nvstore->sectsz_bits;
2161
2162         /* Overflow check of Number of Logical Blocks */
2163         if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2164                 return (true);
2165
2166         return (false);
2167 }
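
/*
 * Example with 512 byte sectors (sectsz_bits = 9) and a 1 GiB backing store:
 * an slba with any of bits 63:55 set (e.g. 1ULL << 55) would wrap when
 * shifted left by 9, so the first check rejects it. For slba = 0x1ffff0 and
 * nblocks = 0x20, offset = 0x3fffe000 and bytes = 0x4000; only 0x2000 bytes
 * remain before the 0x40000000 byte end of the store, so the second check
 * rejects it.
 */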
2168
2169 static int
2170 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2171         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
2172 {
2173         int iovidx;
2174
2175         if (req == NULL)
2176                 return (-1);
2177
2178         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2179                 return (-1);
2180         }
2181
2182         /* concatenate contiguous block-iovs to minimize the number of iovs */
2183         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
2184                 iovidx = req->io_req.br_iovcnt - 1;
2185
2186                 req->io_req.br_iov[iovidx].iov_base =
2187                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2188                                      req->prev_gpaddr, size);
2189
2190                 req->prev_size += size;
2191                 req->io_req.br_resid += size;
2192
2193                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2194         } else {
2195                 iovidx = req->io_req.br_iovcnt;
2196                 if (iovidx == 0) {
2197                         req->io_req.br_offset = lba;
2198                         req->io_req.br_resid = 0;
2199                         req->io_req.br_param = req;
2200                 }
2201
2202                 req->io_req.br_iov[iovidx].iov_base =
2203                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2204                                      gpaddr, size);
2205
2206                 req->io_req.br_iov[iovidx].iov_len = size;
2207
2208                 req->prev_gpaddr = gpaddr;
2209                 req->prev_size = size;
2210                 req->io_req.br_resid += size;
2211
2212                 req->io_req.br_iovcnt++;
2213         }
2214
2215         return (0);
2216 }
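
/*
 * Example of the concatenation above: if the previous PRP entry ended at
 * guest physical address 0x10000000 and the next entry starts at 0x10000000
 * with size 4096, the last iov entry is extended by 4096 bytes rather than
 * consuming another of the NVME_MAX_IOVEC slots. A non-adjacent PRP entry
 * starts a new iov entry instead.
 */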
2217
2218 static void
2219 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2220         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2221         uint32_t cdw0, uint16_t status)
2222 {
2223         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2224
2225         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2226                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2227                  NVME_STATUS_GET_SC(status));
2228
2229         pci_nvme_cq_update(sc, cq,
2230             0,          /* CDW0 */
2231             cid,
2232             sqid,
2233             status);
2234
2235         if (cq->head != cq->tail) {
2236                 if (cq->intr_en & NVME_CQ_INTEN) {
2237                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2238                 } else {
2239                         DPRINTF("%s: CQ%u interrupt disabled",
2240                                                 __func__, sq->cqid);
2241                 }
2242         }
2243 }
2244
2245 static void
2246 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2247 {
2248         req->sc = NULL;
2249         req->nvme_sq = NULL;
2250         req->sqid = 0;
2251
2252         pthread_mutex_lock(&sc->mtx);
2253
2254         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2255         sc->pending_ios--;
2256
2257         /* when no more IO is pending, set ready if the device was reset/enabled */
2258         if (sc->pending_ios == 0 &&
2259             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2260                 sc->regs.csts |= NVME_CSTS_RDY;
2261
2262         pthread_mutex_unlock(&sc->mtx);
2263
2264         sem_post(&sc->iosemlock);
2265 }
2266
2267 static struct pci_nvme_ioreq *
2268 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2269 {
2270         struct pci_nvme_ioreq *req = NULL;
2271
2272         sem_wait(&sc->iosemlock);
2273         pthread_mutex_lock(&sc->mtx);
2274
2275         req = STAILQ_FIRST(&sc->ioreqs_free);
2276         assert(req != NULL);
2277         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2278
2279         req->sc = sc;
2280
2281         sc->pending_ios++;
2282
2283         pthread_mutex_unlock(&sc->mtx);
2284
2285         req->io_req.br_iovcnt = 0;
2286         req->io_req.br_offset = 0;
2287         req->io_req.br_resid = 0;
2288         req->io_req.br_param = req;
2289         req->prev_gpaddr = 0;
2290         req->prev_size = 0;
2291
2292         return req;
2293 }
2294
2295 static void
2296 pci_nvme_io_done(struct blockif_req *br, int err)
2297 {
2298         struct pci_nvme_ioreq *req = br->br_param;
2299         struct nvme_submission_queue *sq = req->nvme_sq;
2300         uint16_t code, status;
2301
2302         DPRINTF("%s error %d %s", __func__, err, strerror(err));
2303
2304         /* TODO return correct error */
2305         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2306         pci_nvme_status_genc(&status, code);
2307
2308         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2309         pci_nvme_stats_write_read_update(req->sc, req->opc,
2310             req->bytes, status);
2311         pci_nvme_release_ioreq(req->sc, req);
2312 }
2313
2314 /*
2315  * Implements the Flush command. The specification states:
2316  *    If a volatile write cache is not present, Flush commands complete
2317  *    successfully and have no effect
2318  * in the description of the Volatile Write Cache (VWC) field of the Identify
2319  * Controller data. Therefore, set status to Success if the command is
2320  * not supported (i.e. RAM or as indicated by the blockif).
2321  */
2322 static bool
2323 nvme_opc_flush(struct pci_nvme_softc *sc,
2324     struct nvme_command *cmd,
2325     struct pci_nvme_blockstore *nvstore,
2326     struct pci_nvme_ioreq *req,
2327     uint16_t *status)
2328 {
2329         bool pending = false;
2330
2331         if (nvstore->type == NVME_STOR_RAM) {
2332                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2333         } else {
2334                 int err;
2335
2336                 req->io_req.br_callback = pci_nvme_io_done;
2337
2338                 err = blockif_flush(nvstore->ctx, &req->io_req);
2339                 switch (err) {
2340                 case 0:
2341                         pending = true;
2342                         break;
2343                 case EOPNOTSUPP:
2344                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2345                         break;
2346                 default:
2347                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2348                 }
2349         }
2350
2351         return (pending);
2352 }
2353
2354 static uint16_t
2355 nvme_write_read_ram(struct pci_nvme_softc *sc,
2356     struct pci_nvme_blockstore *nvstore,
2357     uint64_t prp1, uint64_t prp2,
2358     size_t offset, uint64_t bytes,
2359     bool is_write)
2360 {
2361         uint8_t *buf = nvstore->ctx;
2362         enum nvme_copy_dir dir;
2363         uint16_t status;
2364
2365         if (is_write)
2366                 dir = NVME_COPY_TO_PRP;
2367         else
2368                 dir = NVME_COPY_FROM_PRP;
2369
2370         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2371             buf + offset, bytes, dir))
2372                 pci_nvme_status_genc(&status,
2373                     NVME_SC_DATA_TRANSFER_ERROR);
2374         else
2375                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2376
2377         return (status);
2378 }
2379
2380 static uint16_t
2381 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2382     struct pci_nvme_blockstore *nvstore,
2383     struct pci_nvme_ioreq *req,
2384     uint64_t prp1, uint64_t prp2,
2385     size_t offset, uint64_t bytes,
2386     bool is_write)
2387 {
2388         uint64_t size;
2389         int err;
2390         uint16_t status = NVME_NO_STATUS;
2391
2392         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2393         if (pci_nvme_append_iov_req(sc, req, prp1,
2394             size, is_write, offset)) {
2395                 pci_nvme_status_genc(&status,
2396                     NVME_SC_DATA_TRANSFER_ERROR);
2397                 goto out;
2398         }
2399
2400         offset += size;
2401         bytes  -= size;
2402
2403         if (bytes == 0) {
2404                 ;
2405         } else if (bytes <= PAGE_SIZE) {
2406                 size = bytes;
2407                 if (pci_nvme_append_iov_req(sc, req, prp2,
2408                     size, is_write, offset)) {
2409                         pci_nvme_status_genc(&status,
2410                             NVME_SC_DATA_TRANSFER_ERROR);
2411                         goto out;
2412                 }
2413         } else {
2414                 void *vmctx = sc->nsc_pi->pi_vmctx;
2415                 uint64_t *prp_list = &prp2;
2416                 uint64_t *last = prp_list;
2417
2418                 /* PRP2 is pointer to a physical region page list */
2419                 while (bytes) {
2420                         /* Last entry in list points to the next list */
2421                         if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2422                                 uint64_t prp = *prp_list;
2423
2424                                 prp_list = paddr_guest2host(vmctx, prp,
2425                                     PAGE_SIZE - (prp % PAGE_SIZE));
2426                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
2427                         }
2428
2429                         size = MIN(bytes, PAGE_SIZE);
2430
2431                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
2432                             size, is_write, offset)) {
2433                                 pci_nvme_status_genc(&status,
2434                                     NVME_SC_DATA_TRANSFER_ERROR);
2435                                 goto out;
2436                         }
2437
2438                         offset += size;
2439                         bytes  -= size;
2440
2441                         prp_list++;
2442                 }
2443         }
2444         req->io_req.br_callback = pci_nvme_io_done;
2445         if (is_write)
2446                 err = blockif_write(nvstore->ctx, &req->io_req);
2447         else
2448                 err = blockif_read(nvstore->ctx, &req->io_req);
2449
2450         if (err)
2451                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2452 out:
2453         return (status);
2454 }
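
/*
 * Example of the PRP handling above, assuming 4 KiB pages: for a 12 KiB
 * transfer with a page aligned prp1, the first append covers the 4 KiB at
 * prp1. The remaining 8 KiB exceeds one page, so prp2 is treated as the
 * guest physical address of a PRP list and the loop appends one iov per
 * list entry (two entries here). For an 8 KiB transfer, prp2 would instead
 * be used directly as the second data pointer.
 */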
2455
2456 static bool
2457 nvme_opc_write_read(struct pci_nvme_softc *sc,
2458     struct nvme_command *cmd,
2459     struct pci_nvme_blockstore *nvstore,
2460     struct pci_nvme_ioreq *req,
2461     uint16_t *status)
2462 {
2463         uint64_t lba, nblocks, bytes;
2464         size_t offset;
2465         bool is_write = cmd->opc == NVME_OPC_WRITE;
2466         bool pending = false;
2467
2468         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2469         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2470
2471         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2472                 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2473                     __func__, lba, nblocks);
2474                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2475                 goto out;
2476         }
2477
2478         bytes  = nblocks << nvstore->sectsz_bits;
2479         if (bytes > NVME_MAX_DATA_SIZE) {
2480                 WPRINTF("%s command would exceed MDTS", __func__);
2481                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2482                 goto out;
2483         }
2484
2485         offset = lba << nvstore->sectsz_bits;
2486
2487         req->bytes = bytes;
2488         req->io_req.br_offset = lba;
2489
2490         /* PRP bits 1:0 must be zero */
2491         cmd->prp1 &= ~0x3UL;
2492         cmd->prp2 &= ~0x3UL;
2493
2494         if (nvstore->type == NVME_STOR_RAM) {
2495                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2496                     cmd->prp2, offset, bytes, is_write);
2497         } else {
2498                 *status = nvme_write_read_blockif(sc, nvstore, req,
2499                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2500
2501                 if (*status == NVME_NO_STATUS)
2502                         pending = true;
2503         }
2504 out:
2505         if (!pending)
2506                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2507
2508         return (pending);
2509 }
2510
2511 static void
2512 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2513 {
2514         struct pci_nvme_ioreq *req = br->br_param;
2515         struct pci_nvme_softc *sc = req->sc;
2516         bool done = true;
2517         uint16_t status;
2518
2519         if (err) {
2520                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2521         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2522                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2523         } else {
2524                 struct iovec *iov = req->io_req.br_iov;
2525
2526                 req->prev_gpaddr++;
2527                 iov += req->prev_gpaddr;
2528
2529                 /* The iov_* values already include the sector size */
2530                 req->io_req.br_offset = (off_t)iov->iov_base;
2531                 req->io_req.br_resid = iov->iov_len;
2532                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2533                         pci_nvme_status_genc(&status,
2534                             NVME_SC_INTERNAL_DEVICE_ERROR);
2535                 } else
2536                         done = false;
2537         }
2538
2539         if (done) {
2540                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2541                     req->cid, 0, status);
2542                 pci_nvme_release_ioreq(sc, req);
2543         }
2544 }
2545
2546 static bool
2547 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2548     struct nvme_command *cmd,
2549     struct pci_nvme_blockstore *nvstore,
2550     struct pci_nvme_ioreq *req,
2551     uint16_t *status)
2552 {
2553         struct nvme_dsm_range *range;
2554         uint32_t nr, r, non_zero, dr;
2555         int err;
2556         bool pending = false;
2557
2558         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2559                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2560                 goto out;
2561         }
2562
2563         nr = cmd->cdw10 & 0xff;
2564
2565         /* copy locally because a range entry could straddle PRPs */
2566         range = calloc(1, NVME_MAX_DSM_TRIM);
2567         if (range == NULL) {
2568                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2569                 goto out;
2570         }
2571         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2572             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2573
2574         /* Check for invalid ranges and the number of non-zero lengths */
2575         non_zero = 0;
2576         for (r = 0; r <= nr; r++) {
2577                 if (pci_nvme_out_of_range(nvstore,
2578                     range[r].starting_lba, range[r].length)) {
2579                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2580                         goto out;
2581                 }
2582                 if (range[r].length != 0)
2583                         non_zero++;
2584         }
2585
2586         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2587                 size_t offset, bytes;
2588                 int sectsz_bits = sc->nvstore.sectsz_bits;
2589
2590                 /*
2591                  * DSM calls are advisory only, and compliant controllers
2592          * may choose to take no action (i.e. return Success).
2593                  */
2594                 if (!nvstore->deallocate) {
2595                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2596                         goto out;
2597                 }
2598
2599                 /* If all ranges have a zero length, return Success */
2600                 if (non_zero == 0) {
2601                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2602                         goto out;
2603                 }
2604
2605                 if (req == NULL) {
2606                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2607                         goto out;
2608                 }
2609
2610                 offset = range[0].starting_lba << sectsz_bits;
2611                 bytes = range[0].length << sectsz_bits;
2612
2613                 /*
2614                  * If the request is for more than a single range, store
2615                  * the ranges in the br_iov. Optimize for the common case
2616                  * of a single range.
2617                  *
2618                  * Note that NVMe Number of Ranges is a zero based value
2619                  */
2620                 req->io_req.br_iovcnt = 0;
2621                 req->io_req.br_offset = offset;
2622                 req->io_req.br_resid = bytes;
2623
2624                 if (nr == 0) {
2625                         req->io_req.br_callback = pci_nvme_io_done;
2626                 } else {
2627                         struct iovec *iov = req->io_req.br_iov;
2628
2629                         for (r = 0, dr = 0; r <= nr; r++) {
2630                                 offset = range[r].starting_lba << sectsz_bits;
2631                                 bytes = range[r].length << sectsz_bits;
2632                                 if (bytes == 0)
2633                                         continue;
2634
2635                                 if ((nvstore->size - offset) < bytes) {
2636                                         pci_nvme_status_genc(status,
2637                                             NVME_SC_LBA_OUT_OF_RANGE);
2638                                         goto out;
2639                                 }
2640                                 iov[dr].iov_base = (void *)offset;
2641                                 iov[dr].iov_len = bytes;
2642                                 dr++;
2643                         }
2644                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2645
2646                         /*
2647                          * Use prev_gpaddr to track the current entry and
2648                          * prev_size to track the number of entries
2649                          */
2650                         req->prev_gpaddr = 0;
2651                         req->prev_size = dr;
2652                 }
2653
2654                 err = blockif_delete(nvstore->ctx, &req->io_req);
2655                 if (err)
2656                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2657                 else
2658                         pending = true;
2659         }
2660 out:
2661         free(range);
2662         return (pending);
2663 }
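
/*
 * Example of the deallocate path above: a DSM command with nr = 1 (i.e. two
 * ranges) and the Deallocate attribute set stores both ranges in br_iov as
 * byte offset/length pairs, issues blockif_delete() for the first range, and
 * lets pci_nvme_dealloc_sm() walk the remaining entries (prev_gpaddr holds
 * the index of the range just completed, prev_size the total count). A
 * single range (nr = 0) skips the state machine and completes via
 * pci_nvme_io_done().
 */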
2664
2665 static void
2666 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2667 {
2668         struct nvme_submission_queue *sq;
2669         uint16_t status;
2670         uint16_t sqhead;
2671
2672         /* handle all submissions up to sq->tail index */
2673         sq = &sc->submit_queues[idx];
2674
2675         pthread_mutex_lock(&sq->mtx);
2676
2677         sqhead = sq->head;
2678         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2679                  idx, sqhead, sq->tail, sq->qbase);
2680
2681         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2682                 struct nvme_command *cmd;
2683                 struct pci_nvme_ioreq *req;
2684                 uint32_t nsid;
2685                 bool pending;
2686
2687                 pending = false;
2688                 req = NULL;
2689                 status = 0;
2690
2691                 cmd = &sq->qbase[sqhead];
2692                 sqhead = (sqhead + 1) % sq->size;
2693
2694                 nsid = le32toh(cmd->nsid);
2695                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2696                         pci_nvme_status_genc(&status,
2697                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2698                         status |=
2699                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2700                         goto complete;
2701                 }
2702
2703                 req = pci_nvme_get_ioreq(sc);
2704                 if (req == NULL) {
2705                         pci_nvme_status_genc(&status,
2706                             NVME_SC_INTERNAL_DEVICE_ERROR);
2707                         WPRINTF("%s: unable to allocate IO req", __func__);
2708                         goto complete;
2709                 }
2710                 req->nvme_sq = sq;
2711                 req->sqid = idx;
2712                 req->opc = cmd->opc;
2713                 req->cid = cmd->cid;
2714                 req->nsid = cmd->nsid;
2715
2716                 switch (cmd->opc) {
2717                 case NVME_OPC_FLUSH:
2718                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2719                             req, &status);
2720                         break;
2721                 case NVME_OPC_WRITE:
2722                 case NVME_OPC_READ:
2723                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2724                             req, &status);
2725                         break;
2726                 case NVME_OPC_WRITE_ZEROES:
2727                         /* TODO: write zeroes
2728                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2729                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2730                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2731                         break;
2732                 case NVME_OPC_DATASET_MANAGEMENT:
2733                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2734                             req, &status);
2735                         break;
2736                 default:
2737                         WPRINTF("%s unhandled io command 0x%x",
2738                             __func__, cmd->opc);
2739                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2740                 }
2741 complete:
2742                 if (!pending) {
2743                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2744                             status);
2745                         if (req != NULL)
2746                                 pci_nvme_release_ioreq(sc, req);
2747                 }
2748         }
2749
2750         sq->head = sqhead;
2751
2752         pthread_mutex_unlock(&sq->mtx);
2753 }
2754
2755 static void
2756 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2757         uint64_t idx, int is_sq, uint64_t value)
2758 {
2759         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2760                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2761
2762         if (is_sq) {
2763                 if (idx > sc->num_squeues) {
2764                         WPRINTF("%s queue index %lu overflow from "
2765                                  "guest (max %u)",
2766                                  __func__, idx, sc->num_squeues);
2767                         return;
2768                 }
2769
2770                 atomic_store_short(&sc->submit_queues[idx].tail,
2771                                    (uint16_t)value);
2772
2773                 if (idx == 0) {
2774                         pci_nvme_handle_admin_cmd(sc, value);
2775                 } else {
2776                         /* submission queue; handle new entries in SQ */
2777                         if (idx > sc->num_squeues) {
2778                                 WPRINTF("%s SQ index %lu overflow from "
2779                                          "guest (max %u)",
2780                                          __func__, idx, sc->num_squeues);
2781                                 return;
2782                         }
2783                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2784                 }
2785         } else {
2786                 if (idx > sc->num_cqueues) {
2787                         WPRINTF("%s queue index %lu overflow from "
2788                                  "guest (max %u)",
2789                                  __func__, idx, sc->num_cqueues);
2790                         return;
2791                 }
2792
2793                 atomic_store_short(&sc->compl_queues[idx].head,
2794                                 (uint16_t)value);
2795         }
2796 }
2797
2798 static void
2799 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2800 {
2801         const char *s = iswrite ? "WRITE" : "READ";
2802
2803         switch (offset) {
2804         case NVME_CR_CAP_LOW:
2805                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2806                 break;
2807         case NVME_CR_CAP_HI:
2808                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2809                 break;
2810         case NVME_CR_VS:
2811                 DPRINTF("%s %s NVME_CR_VS", func, s);
2812                 break;
2813         case NVME_CR_INTMS:
2814                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2815                 break;
2816         case NVME_CR_INTMC:
2817                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2818                 break;
2819         case NVME_CR_CC:
2820                 DPRINTF("%s %s NVME_CR_CC", func, s);
2821                 break;
2822         case NVME_CR_CSTS:
2823                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2824                 break;
2825         case NVME_CR_NSSR:
2826                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2827                 break;
2828         case NVME_CR_AQA:
2829                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2830                 break;
2831         case NVME_CR_ASQ_LOW:
2832                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2833                 break;
2834         case NVME_CR_ASQ_HI:
2835                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2836                 break;
2837         case NVME_CR_ACQ_LOW:
2838                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2839                 break;
2840         case NVME_CR_ACQ_HI:
2841                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2842                 break;
2843         default:
2844                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2845         }
2846
2847 }
2848
2849 static void
2850 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2851         uint64_t offset, int size, uint64_t value)
2852 {
2853         uint32_t ccreg;
2854
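        /*
         * Doorbell layout in BAR0 (stride of 8 bytes, i.e. two 4 byte
         * registers per queue pair): NVME_DOORBELL_OFFSET + 0 is the admin
         * SQ tail, +4 the admin CQ head, +8 the SQ1 tail, +12 the CQ1 head,
         * and so on. Hence idx = belloffset / 8, and the low half of each
         * pair selects a submission queue.
         */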
2855         if (offset >= NVME_DOORBELL_OFFSET) {
2856                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2857                 uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2858                 int is_sq = (belloffset % 8) < 4;
2859
2860                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2861                         WPRINTF("guest attempted an overflow write offset "
2862                                  "0x%lx, val 0x%lx in %s",
2863                                  offset, value, __func__);
2864                         return;
2865                 }
2866
2867                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2868                 return;
2869         }
2870
2871         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2872                 offset, size, value);
2873
2874         if (size != 4) {
2875                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2876                          "val 0x%lx) to bar0 in %s",
2877                          size, offset, value, __func__);
2878                 /* TODO: shutdown device */
2879                 return;
2880         }
2881
2882         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2883
2884         pthread_mutex_lock(&sc->mtx);
2885
2886         switch (offset) {
2887         case NVME_CR_CAP_LOW:
2888         case NVME_CR_CAP_HI:
2889                 /* readonly */
2890                 break;
2891         case NVME_CR_VS:
2892                 /* readonly */
2893                 break;
2894         case NVME_CR_INTMS:
2895                 /* MSI-X, so ignore */
2896                 break;
2897         case NVME_CR_INTMC:
2898                 /* MSI-X, so ignore */
2899                 break;
2900         case NVME_CR_CC:
2901                 ccreg = (uint32_t)value;
2902
2903                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2904                          "iocqes %u",
2905                         __func__,
2906                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2907                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2908                          NVME_CC_GET_IOCQES(ccreg));
2909
2910                 if (NVME_CC_GET_SHN(ccreg)) {
2911                         /* perform shutdown - flush out data to backend */
2912                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2913                             NVME_CSTS_REG_SHST_SHIFT);
2914                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2915                             NVME_CSTS_REG_SHST_SHIFT;
2916                 }
2917                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2918                         if (NVME_CC_GET_EN(ccreg) == 0)
2919                         /* transition 1->0 causes controller reset */
2920                                 pci_nvme_reset_locked(sc);
2921                         else
2922                                 pci_nvme_init_controller(ctx, sc);
2923                 }
2924
2925                 /* Insert the iocqes, iosqes and en bits from the write */
2926                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2927                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2928                 if (NVME_CC_GET_EN(ccreg) == 0) {
2929                         /* Insert the ams, mps and css bit fields */
2930                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2931                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2932                         sc->regs.csts &= ~NVME_CSTS_RDY;
2933                 } else if (sc->pending_ios == 0) {
2934                         sc->regs.csts |= NVME_CSTS_RDY;
2935                 }
2936                 break;
2937         case NVME_CR_CSTS:
2938                 break;
2939         case NVME_CR_NSSR:
2940                 /* ignore writes; don't support subsystem reset */
2941                 break;
2942         case NVME_CR_AQA:
2943                 sc->regs.aqa = (uint32_t)value;
2944                 break;
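             /*
              * The admin queue base addresses must be 4KiB aligned, so only
              * bits 31:12 of the low dword are kept (0xFFFFF000); the high
              * dword is merged into the upper 32 bits unchanged.
              */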
2945         case NVME_CR_ASQ_LOW:
2946                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2947                                (0xFFFFF000 & value);
2948                 break;
2949         case NVME_CR_ASQ_HI:
2950                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2951                                (value << 32);
2952                 break;
2953         case NVME_CR_ACQ_LOW:
2954                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2955                                (0xFFFFF000 & value);
2956                 break;
2957         case NVME_CR_ACQ_HI:
2958                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2959                                (value << 32);
2960                 break;
2961         default:
2962                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2963                          __func__, offset, value, size);
2964         }
2965         pthread_mutex_unlock(&sc->mtx);
2966 }
2967
2968 static void
2969 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2970                 int baridx, uint64_t offset, int size, uint64_t value)
2971 {
2972         struct pci_nvme_softc* sc = pi->pi_arg;
2973
2974         if (baridx == pci_msix_table_bar(pi) ||
2975             baridx == pci_msix_pba_bar(pi)) {
2976                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2977                          "value 0x%lx", baridx, offset, size, value);
2978
2979                 pci_emul_msix_twrite(pi, offset, size, value);
2980                 return;
2981         }
2982
2983         switch (baridx) {
2984         case 0:
2985                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2986                 break;
2987
2988         default:
2989                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2990                          __func__, baridx, value);
2991         }
2992 }
2993
2994 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2995         uint64_t offset, int size)
2996 {
2997         uint64_t value;
2998
2999         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3000
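             /*
              * Reads below the doorbell area are served from the register
              * shadow in sc->regs, whose layout follows struct nvme_registers
              * and thus mirrors the BAR0 offsets the guest uses; the memcpy
              * below relies on that one-to-one layout.
              */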
3001         if (offset < NVME_DOORBELL_OFFSET) {
3002                 void *p = &(sc->regs);
3003                 pthread_mutex_lock(&sc->mtx);
3004                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
3005                 pthread_mutex_unlock(&sc->mtx);
3006         } else {
3007                 value = 0;
3008                 WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
3009         }
3010
3011         switch (size) {
3012         case 1:
3013                 value &= 0xFF;
3014                 break;
3015         case 2:
3016                 value &= 0xFFFF;
3017                 break;
3018         case 4:
3019                 value &= 0xFFFFFFFF;
3020                 break;
3021         }
3022
3023         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3024                  offset, size, (uint32_t)value);
3025
3026         return (value);
3027 }
3028
3029
3030
3031 static uint64_t
3032 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
3033     uint64_t offset, int size)
3034 {
3035         struct pci_nvme_softc* sc = pi->pi_arg;
3036
3037         if (baridx == pci_msix_table_bar(pi) ||
3038             baridx == pci_msix_pba_bar(pi)) {
3039                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3040                         baridx, offset, size);
3041
3042                 return (pci_emul_msix_tread(pi, offset, size));
3043         }
3044
3045         switch (baridx) {
3046         case 0:
3047                 return (pci_nvme_read_bar_0(sc, offset, size));
3048
3049         default:
3050                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3051         }
3052
3053         return (0);
3054 }
3055
3056 static int
3057 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3058 {
3059         char bident[sizeof("XX:X:X")];
3060         const char *value;
3061         uint32_t sectsz;
3062
3063         sc->max_queues = NVME_QUEUES;
3064         sc->max_qentries = NVME_MAX_QENTRIES;
3065         sc->ioslots = NVME_IOSLOTS;
3066         sc->num_squeues = sc->max_queues;
3067         sc->num_cqueues = sc->max_queues;
3068         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3069         sectsz = 0;
3070         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3071                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3072
3073         value = get_config_value_node(nvl, "maxq");
3074         if (value != NULL)
3075                 sc->max_queues = atoi(value);
3076         value = get_config_value_node(nvl, "qsz");
3077         if (value != NULL) {
3078                 sc->max_qentries = atoi(value);
3079                 if (sc->max_qentries <= 0) {
3080                         EPRINTLN("nvme: Invalid qsz option %d",
3081                             sc->max_qentries);
3082                         return (-1);
3083                 }
3084         }
3085         value = get_config_value_node(nvl, "ioslots");
3086         if (value != NULL) {
3087                 sc->ioslots = atoi(value);
3088                 if (sc->ioslots <= 0) {
3089                         EPRINTLN("nvme: Invalid ioslots option %d", sc->ioslots);
3090                         return (-1);
3091                 }
3092         }
3093         value = get_config_value_node(nvl, "sectsz");
3094         if (value != NULL)
3095                 sectsz = atoi(value);
3096         value = get_config_value_node(nvl, "ser");
3097         if (value != NULL) {
3098                 /*
3099                  * This field indicates the Product Serial Number in
3100                  * 7-bit ASCII, unused bytes should be space characters.
3101                  * Ref: NVMe v1.3c.
3102                  */
3103                 cpywithpad((char *)sc->ctrldata.sn,
3104                     sizeof(sc->ctrldata.sn), value, ' ');
3105         }
3106         value = get_config_value_node(nvl, "eui64");
3107         if (value != NULL)
3108                 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3109         value = get_config_value_node(nvl, "dsm");
3110         if (value != NULL) {
3111                 if (strcmp(value, "auto") == 0)
3112                         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3113                 else if (strcmp(value, "enable") == 0)
3114                         sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3115                 else if (strcmp(value, "disable") == 0)
3116                         sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3117         }
3118
3119         value = get_config_value_node(nvl, "ram");
3120         if (value != NULL) {
3121                 uint64_t sz = strtoull(value, NULL, 10);
3122
3123                 sc->nvstore.type = NVME_STOR_RAM;
3124                 sc->nvstore.size = sz * 1024 * 1024;
3125                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3126                 sc->nvstore.sectsz = 4096;
3127                 sc->nvstore.sectsz_bits = 12;
3128                 if (sc->nvstore.ctx == NULL) {
3129                         EPRINTLN("nvme: Unable to allocate RAM");
3130                         return (-1);
3131                 }
3132         } else {
3133                 snprintf(bident, sizeof(bident), "%d:%d",
3134                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3135                 sc->nvstore.ctx = blockif_open(nvl, bident);
3136                 if (sc->nvstore.ctx == NULL) {
3137                         EPRINTLN("nvme: Could not open backing file: %s",
3138                             strerror(errno));
3139                         return (-1);
3140                 }
3141                 sc->nvstore.type = NVME_STOR_BLOCKIF;
3142                 sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3143         }
3144
3145         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3146                 sc->nvstore.sectsz = sectsz;
3147         else if (sc->nvstore.type != NVME_STOR_RAM)
3148                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
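             /*
              * Derive sectsz_bits = log2(sectsz): start at 9 (512 bytes) and
              * increment until 1 << sectsz_bits reaches sectsz, e.g.
              * 512 -> 9, 4096 -> 12, 8192 -> 13.
              */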
3149         for (sc->nvstore.sectsz_bits = 9;
3150              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3151              sc->nvstore.sectsz_bits++);
3152
3153         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3154                 sc->max_queues = NVME_QUEUES;
3155
3156         return (0);
3157 }
3158
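     /*
      * Backing-store resize callback, registered with blockif from
      * pci_nvme_init(): refresh the namespace size fields, record NSID 1 as
      * the only entry in the changed-namespace log, and post a Namespace
      * Attribute Changed asynchronous event notice so the guest re-reads the
      * namespace data.
      */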
3159 static void
3160 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
3161 {
3162         struct pci_nvme_softc *sc;
3163         struct pci_nvme_blockstore *nvstore;
3164         struct nvme_namespace_data *nd;
3165
3166         sc = arg;
3167         nvstore = &sc->nvstore;
3168         nd = &sc->nsdata;
3169
3170         nvstore->size = new_size;
3171         pci_nvme_init_nsdata_size(nvstore, nd);
3172
3173         /* Add changed NSID to list */
3174         sc->ns_log.ns[0] = 1;
3175         sc->ns_log.ns[1] = 0;
3176
3177         pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3178             PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3179 }
3180
3181 static int
3182 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3183 {
3184         struct pci_nvme_softc *sc;
3185         uint32_t pci_membar_sz;
3186         int     error;
3187
3188         error = 0;
3189
3190         sc = calloc(1, sizeof(struct pci_nvme_softc));
3191         pi->pi_arg = sc;
3192         sc->nsc_pi = pi;
3193
3194         error = pci_nvme_parse_config(sc, nvl);
3195         if (error < 0)
3196                 goto done;
3197         else
3198                 error = 0;
3199
3200         STAILQ_INIT(&sc->ioreqs_free);
3201         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3202         for (int i = 0; i < sc->ioslots; i++) {
3203                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3204         }
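             /*
              * Fixed pool of "ioslots" request trackers threaded onto a free
              * list; iosemlock, initialized with the same count further down,
              * throttles command processing so the pool cannot be
              * over-committed and no allocations are needed on the I/O path.
              */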
3205
3206         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3207         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3208         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3209         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3210         pci_set_cfgdata8(pi, PCIR_PROGIF,
3211                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3212
3213         /*
3214          * Allocate size of NVMe registers + doorbell space for all queues.
3215          *
3216          * The specification requires a minimum memory I/O window size of 16K.
3217          * The Windows driver will refuse to start a device with a smaller
3218          * window.
3219          */
3220         pci_membar_sz = sizeof(struct nvme_registers) +
3221             2 * sizeof(uint32_t) * (sc->max_queues + 1);
3222         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
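             /*
              * For the default queue counts the computed size (struct
              * nvme_registers plus two doorbell dwords per queue pair) stays
              * well below the 16KiB floor, so NVME_MMIO_SPACE_MIN normally
              * decides the BAR size unless maxq is made very large.
              */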
3223
3224         DPRINTF("nvme membar size: %u", pci_membar_sz);
3225
3226         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3227         if (error) {
3228                 WPRINTF("%s pci alloc mem bar failed", __func__);
3229                 goto done;
3230         }
3231
3232         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3233         if (error) {
3234                 WPRINTF("%s pci add msixcap failed", __func__);
3235                 goto done;
3236         }
3237
3238         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3239         if (error) {
3240                 WPRINTF("%s pci add Express capability failed", __func__);
3241                 goto done;
3242         }
3243
3244         pthread_mutex_init(&sc->mtx, NULL);
3245         sem_init(&sc->iosemlock, 0, sc->ioslots);
3246         blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3247
3248         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3249         /*
3250          * Controller data depends on Namespace data so initialize Namespace
3251          * data first.
3252          */
3253         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3254         pci_nvme_init_ctrldata(sc);
3255         pci_nvme_init_logpages(sc);
3256         pci_nvme_init_features(sc);
3257
3258         pci_nvme_aer_init(sc);
3259         pci_nvme_aen_init(sc);
3260
3261         pci_nvme_reset(sc);
3262
3263         pci_lintr_request(pi);
3264
3265 done:
3266         return (error);
3267 }
3268
3269 static int
3270 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3271 {
3272         char *cp, *ram;
3273
3274         if (opts == NULL)
3275                 return (0);
3276
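             /*
              * Two legacy forms are accepted: "ram=<MiB>[,<key=value>,...]",
              * handled here by turning the size into a "ram" config node and
              * handing any remaining options to the generic legacy parser,
              * and anything else, which is assumed to name a blockif backing
              * store.  As a hypothetical example, "ram=1024,ser=NVME001"
              * would set ram=1024 and leave ser=NVME001 for
              * pci_parse_legacy_config() to turn into a config node.
              */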
3277         if (strncmp(opts, "ram=", 4) == 0) {
3278                 cp = strchr(opts, ',');
3279                 if (cp == NULL) {
3280                         set_config_value_node(nvl, "ram", opts + 4);
3281                         return (0);
3282                 }
3283                 ram = strndup(opts + 4, cp - opts - 4);
3284                 set_config_value_node(nvl, "ram", ram);
3285                 free(ram);
3286                 return (pci_parse_legacy_config(nvl, cp + 1));
3287         } else
3288                 return (blockif_legacy_config(nvl, opts));
3289 }
3290
3291 static const struct pci_devemu pci_de_nvme = {
3292         .pe_emu =       "nvme",
3293         .pe_init =      pci_nvme_init,
3294         .pe_legacy_config = pci_nvme_legacy_config,
3295         .pe_barwrite =  pci_nvme_write,
3296         .pe_barread =   pci_nvme_read
3297 };
3298 PCI_EMUL_SET(pci_de_nvme);