bhyve: fix NVMe Get Log Page command
usr.sbin/bhyve/pci_nvme.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 */
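/*
 * Illustrative invocation (an example, not from this source): attaching the
 * emulation to PCI slot 4 with a 512 MiB RAM-backed namespace might look like
 *
 *   bhyve -s 4,nvme,ram=512,maxq=4,qsz=256,ioslots=8,ser=BHYVE001 ... <vmname>
 *
 * where the remaining bhyve arguments (CPUs, memory, other devices) are
 * elided and depend on the rest of the VM configuration.
 */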

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
/* Wrapped in do/while so DPRINTF() is safe inside unbraced if/else bodies */
#define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define NVME_MSIX_BAR           4

#define NVME_IOSLOTS            8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN     (1 << 14)

#define NVME_QUEUES             16
#define NVME_MAX_QENTRIES       2048

#define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
#define NVME_MAX_BLOCKIOVS      512

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS          0xffff
#define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)         ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)         ((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
        ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
        (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
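/*
 * Worked example of the encoding above: with num_squeues = 4 and
 * num_cqueues = 2, the macro yields (3 & 0xffff) | ((1 & 0xffff) << 16) =
 * 0x00010003, i.e. NSQA = 3 and NCQA = 1 in the zero-based form that
 * Set/Get Features (Number of Queues) reports in Dword 0.
 */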

#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
        NVME_CR_CAP_LOW = 0x00,
        NVME_CR_CAP_HI  = 0x04,
        NVME_CR_VS      = 0x08,
        NVME_CR_INTMS   = 0x0c,
        NVME_CR_INTMC   = 0x10,
        NVME_CR_CC      = 0x14,
        NVME_CR_CSTS    = 0x1c,
        NVME_CR_NSSR    = 0x20,
        NVME_CR_AQA     = 0x24,
        NVME_CR_ASQ_LOW = 0x28,
        NVME_CR_ASQ_HI  = 0x2c,
        NVME_CR_ACQ_LOW = 0x30,
        NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
        NVME_CMD_CDW11_PC  = 0x0001,
        NVME_CMD_CDW11_IEN = 0x0002,
        NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
        NVME_COPY_TO_PRP,
        NVME_COPY_FROM_PRP,
};

#define NVME_CQ_INTEN   0x01
#define NVME_CQ_INTCOAL 0x02

struct nvme_completion_queue {
        struct nvme_completion *qbase;
        pthread_mutex_t mtx;
        uint32_t        size;
        uint16_t        tail; /* nvme progress */
        uint16_t        head; /* guest progress */
        uint16_t        intr_vec;
        uint32_t        intr_en;
};

struct nvme_submission_queue {
        struct nvme_command *qbase;
        pthread_mutex_t mtx;
        uint32_t        size;
        uint16_t        head; /* nvme progress */
        uint16_t        tail; /* guest progress */
        uint16_t        cqid; /* completion queue id */
        int             qpriority;
};

enum nvme_storage_type {
        NVME_STOR_BLOCKIF = 0,
        NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
        enum nvme_storage_type type;
        void            *ctx;
        uint64_t        size;
        uint32_t        sectsz;
        uint32_t        sectsz_bits;
        uint64_t        eui64;
        uint32_t        deallocate:1;
};

struct pci_nvme_ioreq {
        struct pci_nvme_softc *sc;
        STAILQ_ENTRY(pci_nvme_ioreq) link;
        struct nvme_submission_queue *nvme_sq;
        uint16_t        sqid;

        /* command information */
        uint16_t        opc;
        uint16_t        cid;
        uint32_t        nsid;

        uint64_t        prev_gpaddr;
        size_t          prev_size;

        /*
         * lock if all iovs consumed (big IO);
         * complete transaction before continuing
         */
        pthread_mutex_t mtx;
        pthread_cond_t  cv;

        struct blockif_req io_req;

        /* pad to fit up to 512 page descriptors from guest IO request */
        struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

enum nvme_dsm_type {
        /* Dataset Management bit in ONCS reflects backing storage capability */
        NVME_DATASET_MANAGEMENT_AUTO,
        /* Unconditionally set Dataset Management bit in ONCS */
        NVME_DATASET_MANAGEMENT_ENABLE,
        /* Unconditionally clear Dataset Management bit in ONCS */
        NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc {
        struct pci_devinst *nsc_pi;

        pthread_mutex_t mtx;

        struct nvme_registers regs;

        struct nvme_namespace_data  nsdata;
        struct nvme_controller_data ctrldata;
        struct nvme_error_information_entry err_log;
        struct nvme_health_information_page health_log;
        struct nvme_firmware_page fw_log;

        struct pci_nvme_blockstore nvstore;

        uint16_t        max_qentries;   /* max entries per queue */
        uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
        uint32_t        num_cqueues;
        uint32_t        num_squeues;

        struct pci_nvme_ioreq *ioreqs;
        STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
        uint32_t        pending_ios;
        uint32_t        ioslots;
        sem_t           iosemlock;

        /*
         * Memory mapped Submission and Completion queues
         * Each array includes both Admin and IO queues
         */
        struct nvme_completion_queue *compl_queues;
        struct nvme_submission_queue *submit_queues;

        /* controller features */
        uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
        uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
        uint32_t        async_ev_config;         /* 0x0B: async event config */

        enum nvme_dsm_type dataset_management;
};


static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
        ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
        ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
        ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
        ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
        ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
        ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
         (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
         (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
        ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
         (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
         (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
        ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
        ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
         (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
        NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
        size_t len;

        len = strnlen(src, dst_size);
        memset(dst, pad, dst_size);
        memcpy(dst, src, len);
}
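/*
 * Usage example: the Identify Controller Model Number field is a 40-byte
 * ASCII field, so cpywithpad((char *)cd->mn, 40, "bhyve-NVMe", ' ') copies
 * the 10-byte string and space-fills the remaining 30 bytes, matching the
 * spec's requirement that ASCII fields be space-padded, not NUL-terminated.
 */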

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

        *status &= ~NVME_STATUS_MASK;
        *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
                (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

        pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
        uint32_t i;

        /*
         * Allocate and initialize the Submission Queues
         */
        if (nsq > NVME_QUEUES) {
                WPRINTF("%s: clamping number of SQ from %u to %u",
                                        __func__, nsq, NVME_QUEUES);
                nsq = NVME_QUEUES;
        }

        sc->num_squeues = nsq;

        sc->submit_queues = calloc(sc->num_squeues + 1,
                                sizeof(struct nvme_submission_queue));
        if (sc->submit_queues == NULL) {
                WPRINTF("%s: SQ allocation failed", __func__);
                sc->num_squeues = 0;
        } else {
                struct nvme_submission_queue *sq = sc->submit_queues;

                for (i = 0; i < sc->num_squeues; i++)
                        pthread_mutex_init(&sq[i].mtx, NULL);
        }

        /*
         * Allocate and initialize the Completion Queues
         */
        if (ncq > NVME_QUEUES) {
                WPRINTF("%s: clamping number of CQ from %u to %u",
                                        __func__, ncq, NVME_QUEUES);
                ncq = NVME_QUEUES;
        }

        sc->num_cqueues = ncq;

        sc->compl_queues = calloc(sc->num_cqueues + 1,
                                sizeof(struct nvme_completion_queue));
        if (sc->compl_queues == NULL) {
                WPRINTF("%s: CQ allocation failed", __func__);
                sc->num_cqueues = 0;
        } else {
                struct nvme_completion_queue *cq = sc->compl_queues;

                for (i = 0; i < sc->num_cqueues; i++)
                        pthread_mutex_init(&cq[i].mtx, NULL);
        }
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
        struct nvme_controller_data *cd = &sc->ctrldata;

        cd->vid = 0xFB5D;
        cd->ssvid = 0x0000;

        cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
        cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

        /* Recommended Arbitration Burst: 2^rab commands at a time */
        cd->rab   = 4;

        /* FreeBSD OUI */
        cd->ieee[0] = 0x58;
        cd->ieee[1] = 0x9c;
        cd->ieee[2] = 0xfc;

        cd->mic = 0;

        cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */

        cd->ver = 0x00010300;   /* NVMe v1.3 */

        cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
        cd->acl = 2;
        cd->aerl = 4;

        cd->lpa = 0;    /* TODO: support some simple things like SMART */
        cd->elpe = 0;   /* max error log page entries */
        cd->npss = 1;   /* number of power states supported */

        /* Warning Composite Temperature Threshold */
        cd->wctemp = 0x0157;

        cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
            (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
        cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
            (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
        cd->nn = 1;     /* number of namespaces */

        cd->oncs = 0;
        switch (sc->dataset_management) {
        case NVME_DATASET_MANAGEMENT_AUTO:
                if (sc->nvstore.deallocate)
                        cd->oncs |= NVME_ONCS_DSM;
                break;
        case NVME_DATASET_MANAGEMENT_ENABLE:
                cd->oncs |= NVME_ONCS_DSM;
                break;
        default:
                break;
        }

        cd->fna = 0x03;

        cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
        const unsigned char *cp = buffer;
        /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
        static uint16_t const crc16_table[256] = {
                0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
                0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
                0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
                0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
                0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
                0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
                0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
                0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
                0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
                0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
                0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
                0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
                0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
                0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
                0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
                0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
                0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
                0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
                0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
                0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
                0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
                0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
                0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
                0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
                0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
                0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
                0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
                0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
                0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
                0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
                0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
                0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
        };

        while (len--)
                crc = (((crc >> 8) & 0xffU) ^
                    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
        return crc;
}
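/*
 * Sanity check (stated here as an assumption, not asserted in the code):
 * this is the reflected CRC-16/ARC variant, so crc16(0, "123456789", 9)
 * should yield the standard check value 0xBB3D.
 */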

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

        /* Get capacity and block size information from backing store */
        nd->nsze = nvstore->size / nvstore->sectsz;
        nd->ncap = nd->nsze;
        nd->nuse = nd->nsze;

        if (nvstore->type == NVME_STOR_BLOCKIF)
                nvstore->deallocate = blockif_candelete(nvstore->ctx);

        nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
        nd->flbas = 0;

        /* Create an EUI-64 if user did not provide one */
        if (nvstore->eui64 == 0) {
                char *data = NULL;
                uint64_t eui64 = nvstore->eui64;

                asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
                    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

                if (data != NULL) {
                        eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
                        free(data);
                }
                nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
        }
        be64enc(nd->eui64, nvstore->eui64);

        /* LBA data-sz = 2^lbads */
        nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
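/*
 * Layout of the generated EUI-64 (a reading of the code above, for
 * reference): the FreeBSD OUI-derived prefix OR'd with a CRC-16 of
 * "<vmname><bus><slot><func>" occupies the upper 48 bits after the
 * shift, and the low 16 bits carry the namespace ID, keeping multiple
 * namespaces on the same controller unique.
 */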

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

        memset(&sc->err_log, 0, sizeof(sc->err_log));
        memset(&sc->health_log, 0, sizeof(sc->health_log));
        memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
        uint32_t i;

        DPRINTF("%s", __func__);

        sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
            (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
            (60 << NVME_CAP_LO_REG_TO_SHIFT);

        sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

        sc->regs.vs = 0x00010300;       /* NVMe v1.3 */

        sc->regs.cc = 0;
        sc->regs.csts = 0;

        assert(sc->submit_queues != NULL);

        for (i = 0; i < sc->num_squeues + 1; i++) {
                sc->submit_queues[i].qbase = NULL;
                sc->submit_queues[i].size = 0;
                sc->submit_queues[i].cqid = 0;
                sc->submit_queues[i].tail = 0;
                sc->submit_queues[i].head = 0;
        }

        assert(sc->compl_queues != NULL);

        for (i = 0; i < sc->num_cqueues + 1; i++) {
                sc->compl_queues[i].qbase = NULL;
                sc->compl_queues[i].size = 0;
                sc->compl_queues[i].tail = 0;
                sc->compl_queues[i].head = 0;
        }
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
        pthread_mutex_lock(&sc->mtx);
        pci_nvme_reset_locked(sc);
        pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
        uint16_t acqs, asqs;

        DPRINTF("%s", __func__);

        asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
        sc->submit_queues[0].size = asqs;
        sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
                    sizeof(struct nvme_command) * asqs);

        DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
                __func__, sc->regs.asq, sc->submit_queues[0].qbase);

        acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
            NVME_AQA_REG_ACQS_MASK) + 1;
        sc->compl_queues[0].size = acqs;
        sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
                 sizeof(struct nvme_completion) * acqs);

        DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
                __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
        size_t len, enum nvme_copy_dir dir)
{
        uint8_t *p;
        size_t bytes;

        /* This routine handles at most two PRP entries (i.e. no PRP lists) */
        if (len > (8 * 1024)) {
                return (-1);
        }

        /* Copy from the start of prp1 to the end of the physical page */
        bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
        bytes = MIN(bytes, len);

        p = vm_map_gpa(ctx, prp1, bytes);
        if (p == NULL) {
                return (-1);
        }

        if (dir == NVME_COPY_TO_PRP)
                memcpy(p, b, bytes);
        else
                memcpy(b, p, bytes);

        b += bytes;

        len -= bytes;
        if (len == 0) {
                return (0);
        }

        len = MIN(len, PAGE_SIZE);

        p = vm_map_gpa(ctx, prp2, len);
        if (p == NULL) {
                return (-1);
        }

        if (dir == NVME_COPY_TO_PRP)
                memcpy(p, b, len);
        else
                memcpy(b, p, len);

        return (0);
}
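/*
 * Worked example of the split above (assuming PAGE_SIZE = 4096): for a
 * 6144-byte transfer with prp1 ending in 0x800, the first chunk is
 * 4096 - 0x800 = 2048 bytes from prp1's page, and the remaining 4096
 * bytes come from the page addressed by prp2.
 */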

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
                struct nvme_completion_queue *cq,
                uint32_t cdw0,
                uint16_t cid,
                uint16_t sqid,
                uint16_t status)
{
        struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
        struct nvme_completion *cqe;

        assert(cq->qbase != NULL);

        pthread_mutex_lock(&cq->mtx);

        cqe = &cq->qbase[cq->tail];

        /* Flip the phase bit */
        status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

        cqe->cdw0 = cdw0;
        cqe->sqhd = sq->head;
        cqe->sqid = sqid;
        cqe->cid = cid;
        cqe->status = status;

        cq->tail++;
        if (cq->tail >= cq->size) {
                cq->tail = 0;
        }

        pthread_mutex_unlock(&cq->mtx);
}
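/*
 * Guest-side view (an illustrative sketch, not code from this file): a
 * driver decides whether cq->qbase[head] holds a fresh completion by
 * comparing the Phase Tag against the value it expects for the current
 * pass around the ring, e.g.
 *
 *   if ((cqe->status & NVME_STATUS_P) == expected_phase)
 *           consume(cqe);   // hypothetical: then advance head, ring doorbell
 *
 * The XOR above flips the Phase Tag relative to the stale entry being
 * overwritten, so each pass presents the opposite phase and no shared
 * producer index is needed.
 */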

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
        if (qid == 0 || qid > sc->num_squeues) {
                WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
                        __func__, qid, sc->num_squeues);
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->submit_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_submission_queue *nsq;

                if ((qid == 0) || (qid > sc->num_squeues)) {
                        WPRINTF("%s queue index %u > num_squeues %u",
                                __func__, qid, sc->num_squeues);
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                nsq = &sc->submit_queues[qid];
                nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                              sizeof(struct nvme_command) * (size_t)nsq->size);
                nsq->cqid = (command->cdw11 >> 16) & 0xffff;
                nsq->qpriority = (command->cdw11 >> 1) & 0x03;

                DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
                        qid, nsq->size, nsq->qbase, nsq->cqid);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

                DPRINTF("%s completed creating IOSQ qid %u",
                         __func__, qid);
        } else {
                /*
                 * Guest sent a non-contiguous submission queue request.
                 * This setting is unsupported by this emulation.
                 */
                WPRINTF("%s unsupported non-contig (list-based) "
                         "create i/o submission queue", __func__);

                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        }
        return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
        if (qid == 0 || qid > sc->num_cqueues) {
                WPRINTF("%s queue index %u / num_cqueues %u",
                        __func__, qid, sc->num_cqueues);
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->compl_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{

        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_completion_queue *ncq;

                if ((qid == 0) || (qid > sc->num_cqueues)) {
                        WPRINTF("%s queue index %u > num_cqueues %u",
                                __func__, qid, sc->num_cqueues);
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                ncq = &sc->compl_queues[qid];
                ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
                ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
                ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                /* CQ entries are 16 bytes (struct nvme_completion), not
                 * 64-byte commands; map only what the queue needs */
                ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
                             command->prp1,
                             sizeof(struct nvme_completion) * (size_t)ncq->size);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        } else {
                /*
                 * Non-contig completion queue unsupported.
                 */
                WPRINTF("%s unsupported non-contig (list-based) "
                         "create i/o completion queue",
                         __func__);

                /* 0x12 = Invalid Use of Controller Memory Buffer */
                pci_nvme_status_genc(&compl->status, 0x12);
        }

        return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint32_t logsize;
        uint8_t logpage = command->cdw10 & 0xFF;

        /*
         * Command specifies the number of dwords to return in fields NUMDU
         * and NUMDL. This is a zero-based value. Compute it before logging
         * so the DPRINTF below does not read an uninitialized logsize.
         */
        logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
        logsize *= sizeof(uint32_t);

        DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        switch (logpage) {
        case NVME_LOG_ERROR:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->err_log,
                    MIN(logsize, sizeof(sc->err_log)),
                    NVME_COPY_TO_PRP);
                break;
        case NVME_LOG_HEALTH_INFORMATION:
                /* TODO: present some smart info */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->health_log,
                    MIN(logsize, sizeof(sc->health_log)),
                    NVME_COPY_TO_PRP);
                break;
        case NVME_LOG_FIRMWARE_SLOT:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->fw_log,
                    MIN(logsize, sizeof(sc->fw_log)),
                    NVME_COPY_TO_PRP);
                break;
        default:
                DPRINTF("%s get log page %x command not supported",
                        __func__, logpage);

                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_LOG_PAGE);
        }

        return (1);
}
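/*
 * Worked example of the NUMD decoding (values chosen for illustration): a
 * guest asking for 512 dwords of the health log sets NUMDL = 0x01FF in
 * cdw10[31:16] and NUMDU = 0 in cdw11, so logsize = (0x1FF + 1) * 4 = 2048
 * bytes, clamped by sizeof(sc->health_log) before the PRP copy above.
 */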

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        void *dest;
        uint16_t status;

        DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
                command->cdw10 & 0xFF, command->nsid);

        pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

        switch (command->cdw10 & 0xFF) {
        case 0x00: /* return Identify Namespace data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
                    NVME_COPY_TO_PRP);
                break;
        case 0x01: /* return Identify Controller data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->ctrldata,
                    sizeof(sc->ctrldata),
                    NVME_COPY_TO_PRP);
                break;
        case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(uint32_t) * 1024);
                ((uint32_t *)dest)[0] = 1;
                ((uint32_t *)dest)[1] = 0;
                break;
        case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
                if (command->nsid != 1) {
                        pci_nvme_status_genc(&status,
                            NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
                        break;
                }
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(uint32_t) * 1024);
                /* All bytes after the descriptor shall be zero */
                bzero(dest, sizeof(uint32_t) * 1024);

                /* Return NIDT=1 (i.e. EUI64) descriptor */
                ((uint8_t *)dest)[0] = 1;
                ((uint8_t *)dest)[1] = sizeof(uint64_t);
                bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
                break;
        default:
                DPRINTF("%s unsupported identify command requested 0x%x",
                         __func__, command->cdw10 & 0xFF);
                pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
                /* fall through to store the error status in the completion */
                break;
        }

        compl->status = status;
        return (1);
}

static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t nqr;   /* Number of Queues Requested */

        nqr = command->cdw11 & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_squeues = ONE_BASED(nqr);
        if (sc->num_squeues > sc->max_queues) {
                DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
                                        sc->max_queues);
                sc->num_squeues = sc->max_queues;
        }

        nqr = (command->cdw11 >> 16) & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_cqueues = ONE_BASED(nqr);
        if (sc->num_cqueues > sc->max_queues) {
                DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
                                        sc->max_queues);
                sc->num_cqueues = sc->max_queues;
        }

        compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

        return (0);
}

static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;
        uint32_t iv;

        DPRINTF("%s feature 0x%x", __func__, feature);
        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF("  arbitration 0x%x", command->cdw11);
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF("  power management 0x%x", command->cdw11);
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF("  lba range 0x%x", command->cdw11);
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF("  temperature threshold 0x%x", command->cdw11);
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF("  error recovery 0x%x", command->cdw11);
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF("  volatile write cache 0x%x", command->cdw11);
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                /* Preserve the error status if the queue counts were bad */
                if (nvme_set_feature_queues(sc, command, compl) != 0)
                        return (1);
                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF("  interrupt coalescing 0x%x", command->cdw11);

                /* in uS */
                sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

                sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                iv = command->cdw11 & 0xFFFF;

                DPRINTF("  interrupt vector configuration 0x%x",
                        command->cdw11);

                for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
                        if (sc->compl_queues[i].intr_vec == iv) {
                                if (command->cdw11 & (1 << 16))
                                        sc->compl_queues[i].intr_en |=
                                                              NVME_CQ_INTCOAL;
                                else
                                        sc->compl_queues[i].intr_en &=
                                                             ~NVME_CQ_INTCOAL;
                        }
                }
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF("  write atomicity 0x%x", command->cdw11);
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF("  async event configuration 0x%x",
                        command->cdw11);
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF("  software progress marker 0x%x",
                        command->cdw11);
                break;
        case 0x0C:
                DPRINTF("  autonomous power state transition 0x%x",
                        command->cdw11);
                break;
        default:
                WPRINTF("%s invalid feature 0x%x", __func__, feature);
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;

        DPRINTF("%s feature 0x%x", __func__, feature);

        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF("  arbitration");
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF("  power management");
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF("  lba range");
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF("  temperature threshold");
                switch ((command->cdw11 >> 20) & 0x3) {
                case 0:
                        /* Over temp threshold */
                        compl->cdw0 = 0xFFFF;
                        break;
                case 1:
                        /* Under temp threshold */
                        compl->cdw0 = 0;
                        break;
                default:
                        WPRINTF("  invalid threshold type select");
                        pci_nvme_status_genc(&compl->status,
                            NVME_SC_INVALID_FIELD);
                        return (1);
                }
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF("  error recovery");
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF("  volatile write cache");
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

                DPRINTF("  number of queues (submit %u, completion %u)",
                        compl->cdw0 & 0xFFFF,
                        (compl->cdw0 >> 16) & 0xFFFF);

                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF("  interrupt coalescing");
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                DPRINTF("  interrupt vector configuration");
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF("  write atomicity");
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF("  async event configuration");
                /* Get Features must report the current value, not change it */
                compl->cdw0 = sc->async_ev_config;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF("  software progress marker");
                break;
        case 0x0C:
                DPRINTF("  autonomous power state transition");
                break;
        default:
                WPRINTF("%s invalid feature 0x%x", __func__, feature);
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
                command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

        /* TODO: search for the command ID and abort it */

        compl->cdw0 = 1;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
        struct nvme_command* command, struct nvme_completion* compl)
{
        DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

        /*
         * TODO: raise events when they happen based on the Set Features cmd.
         * These events happen async, so only set completion successful if
         * there is an event reflective of the request to get event.
         */
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
        return (0);
}

static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
        struct nvme_completion compl;
        struct nvme_command *cmd;
        struct nvme_submission_queue *sq;
        struct nvme_completion_queue *cq;
        uint16_t sqhead;

        DPRINTF("%s index %u", __func__, (uint32_t)value);

        sq = &sc->submit_queues[0];
        cq = &sc->compl_queues[0];

        pthread_mutex_lock(&sq->mtx);

        sqhead = sq->head;
        DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                cmd = &(sq->qbase)[sqhead];
                compl.cdw0 = 0;
                compl.status = 0;

                switch (cmd->opc) {
                case NVME_OPC_DELETE_IO_SQ:
                        DPRINTF("%s command DELETE_IO_SQ", __func__);
                        nvme_opc_delete_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_SQ:
                        DPRINTF("%s command CREATE_IO_SQ", __func__);
                        nvme_opc_create_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_DELETE_IO_CQ:
                        DPRINTF("%s command DELETE_IO_CQ", __func__);
                        nvme_opc_delete_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_CQ:
                        DPRINTF("%s command CREATE_IO_CQ", __func__);
                        nvme_opc_create_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_LOG_PAGE:
                        DPRINTF("%s command GET_LOG_PAGE", __func__);
                        nvme_opc_get_log_page(sc, cmd, &compl);
                        break;
                case NVME_OPC_IDENTIFY:
                        DPRINTF("%s command IDENTIFY", __func__);
                        nvme_opc_identify(sc, cmd, &compl);
                        break;
                case NVME_OPC_ABORT:
                        DPRINTF("%s command ABORT", __func__);
                        nvme_opc_abort(sc, cmd, &compl);
                        break;
                case NVME_OPC_SET_FEATURES:
                        DPRINTF("%s command SET_FEATURES", __func__);
                        nvme_opc_set_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_FEATURES:
                        DPRINTF("%s command GET_FEATURES", __func__);
                        nvme_opc_get_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_ASYNC_EVENT_REQUEST:
                        DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
                        /* XXX don't care, unhandled for now
                        nvme_opc_async_event_req(sc, cmd, &compl);
                        */
                        compl.status = NVME_NO_STATUS;
                        break;
                default:
                        WPRINTF("0x%x command is not implemented",
                            cmd->opc);
                        pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                }
                sqhead = (sqhead + 1) % sq->size;

                if (NVME_COMPLETION_VALID(compl)) {
                        pci_nvme_cq_update(sc, &sc->compl_queues[0],
                            compl.cdw0,
                            cmd->cid,
                            0,          /* SQID */
                            compl.status);
                }
        }

        DPRINTF("setting sqhead %u", sqhead);
        sq->head = sqhead;

        if (cq->head != cq->tail)
                pci_generate_msix(sc->nsc_pi, 0);

        pthread_mutex_unlock(&sq->mtx);
}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
        uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
        int iovidx;

        if (req != NULL) {
                /* concatenate contig block-iovs to minimize number of iovs */
                if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
                        iovidx = req->io_req.br_iovcnt - 1;

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             req->prev_gpaddr, size);

                        req->prev_size += size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iov[iovidx].iov_len = req->prev_size;
                } else {
                        pthread_mutex_lock(&req->mtx);

                        iovidx = req->io_req.br_iovcnt;
                        if (iovidx == NVME_MAX_BLOCKIOVS) {
                                int err = 0;

                                DPRINTF("large I/O, doing partial req");

                                iovidx = 0;
                                req->io_req.br_iovcnt = 0;

                                req->io_req.br_callback = pci_nvme_io_partial;

                                if (!do_write)
                                        err = blockif_read(sc->nvstore.ctx,
                                                           &req->io_req);
                                else
                                        err = blockif_write(sc->nvstore.ctx,
                                                            &req->io_req);

                                /* wait until req completes before cont */
                                if (err == 0)
                                        pthread_cond_wait(&req->cv, &req->mtx);
                        }
                        if (iovidx == 0) {
                                req->io_req.br_offset = lba;
                                req->io_req.br_resid = 0;
                                req->io_req.br_param = req;
                        }

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             gpaddr, size);

                        req->io_req.br_iov[iovidx].iov_len = size;

                        req->prev_gpaddr = gpaddr;
                        req->prev_size = size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iovcnt++;

                        pthread_mutex_unlock(&req->mtx);
                }
        } else {
                /* RAM buffer: read/write directly */
                void *p = sc->nvstore.ctx;
                void *gptr;

                if ((lba + size) > sc->nvstore.size) {
                        WPRINTF("%s write would overflow RAM", __func__);
                        return (-1);
                }

                p = (void *)((uintptr_t)p + (uintptr_t)lba);
                gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
                if (do_write)
                        memcpy(p, gptr, size);
                else
                        memcpy(gptr, p, size);
        }
        return (0);
}
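/*
 * Worked example of the concatenation above (addresses chosen for
 * illustration): a guest I/O whose PRPs name two adjacent 4 KiB pages,
 * 0x10000 and 0x11000, produces a single 8 KiB iov entry rather than two,
 * because the second call sees prev_gpaddr + prev_size == gpaddr and
 * simply extends the previous iov.
 */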

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
        struct nvme_submission_queue *sq, int sqid, uint16_t cid,
        uint32_t cdw0, uint16_t status)
{
        struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

        DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
                 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
                 NVME_STATUS_GET_SC(status));

        /* Pass the caller's CDW0 through instead of dropping it */
        pci_nvme_cq_update(sc, cq,
            cdw0,
            cid,
            sqid,
            status);

        if (cq->head != cq->tail) {
                if (cq->intr_en & NVME_CQ_INTEN) {
                        pci_generate_msix(sc->nsc_pi, cq->intr_vec);
                } else {
                        DPRINTF("%s: CQ%u interrupt disabled",
                                                __func__, sq->cqid);
                }
        }
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
        req->sc = NULL;
        req->nvme_sq = NULL;
        req->sqid = 0;

        pthread_mutex_lock(&sc->mtx);

        STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
        sc->pending_ios--;

        /* when no more IO pending, can set to ready if device reset/enabled */
        if (sc->pending_ios == 0 &&
            NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
                sc->regs.csts |= NVME_CSTS_RDY;

        pthread_mutex_unlock(&sc->mtx);

        sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
        struct pci_nvme_ioreq *req = NULL;

        sem_wait(&sc->iosemlock);
        pthread_mutex_lock(&sc->mtx);

        req = STAILQ_FIRST(&sc->ioreqs_free);
        assert(req != NULL);
        STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

        req->sc = sc;

        sc->pending_ios++;

        pthread_mutex_unlock(&sc->mtx);

        req->io_req.br_iovcnt = 0;
        req->io_req.br_offset = 0;
        req->io_req.br_resid = 0;
        req->io_req.br_param = req;
        req->prev_gpaddr = 0;
        req->prev_size = 0;

        return (req);
}
1421
1422 static void
1423 pci_nvme_io_done(struct blockif_req *br, int err)
1424 {
1425         struct pci_nvme_ioreq *req = br->br_param;
1426         struct nvme_submission_queue *sq = req->nvme_sq;
1427         uint16_t code, status;
1428
1429         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1430
1431         /* TODO return correct error */
1432         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1433         pci_nvme_status_genc(&status, code);
1434
1435         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1436         pci_nvme_release_ioreq(req->sc, req);
1437 }
1438
1439 static void
1440 pci_nvme_io_partial(struct blockif_req *br, int err)
1441 {
1442         struct pci_nvme_ioreq *req = br->br_param;
1443
1444         DPRINTF("%s error %d %s", __func__, err, strerror(err));
1445
1446         pthread_cond_signal(&req->cv);
1447 }
1448
1449 /*
1450  * Implements the Flush command. The specification states:
1451  *    If a volatile write cache is not present, Flush commands complete
1452  *    successfully and have no effect
1453  * in the description of the Volatile Write Cache (VWC) field of the Identify
1454  * Controller data. Therefore, set status to Success if flush is not
1455  * supported (i.e. RAM-backed storage, or the blockif reports EOPNOTSUPP).
1456  */
1457 static bool
1458 nvme_opc_flush(struct pci_nvme_softc *sc,
1459     struct nvme_command *cmd,
1460     struct pci_nvme_blockstore *nvstore,
1461     struct pci_nvme_ioreq *req,
1462     uint16_t *status)
1463 {
1464         bool pending = false;
1465
1466         if (nvstore->type == NVME_STOR_RAM) {
1467                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1468         } else {
1469                 int err;
1470
1471                 req->io_req.br_callback = pci_nvme_io_done;
1472
1473                 err = blockif_flush(nvstore->ctx, &req->io_req);
1474                 switch (err) {
1475                 case 0:
1476                         pending = true;
1477                         break;
1478                 case EOPNOTSUPP:
1479                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1480                         break;
1481                 default:
1482                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1483                 }
1484         }
1485
1486         return (pending);
1487 }
1488
1489 static bool
1490 nvme_opc_write_read(struct pci_nvme_softc *sc,
1491     struct nvme_command *cmd,
1492     struct pci_nvme_blockstore *nvstore,
1493     struct pci_nvme_ioreq *req,
1494     uint16_t *status)
1495 {
1496         uint64_t lba, nblocks, bytes;
1497         size_t offset;
1498         bool is_write = cmd->opc == NVME_OPC_WRITE;
1499         bool pending = false;
1500
1501         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1502         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1503
1504         offset = lba * nvstore->sectsz;
1505         bytes  = nblocks * nvstore->sectsz;
1506
1507         if ((offset + bytes) > nvstore->size) {
1508                 WPRINTF("%s command would exceed LBA range", __func__);
1509                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
1510                 goto out;
1511         }
1512
1513         req->io_req.br_offset = offset;    /* blockif offsets are in bytes */
1514
1515         /* PRP bits 1:0 must be zero */
1516         cmd->prp1 &= ~0x3UL;
1517         cmd->prp2 &= ~0x3UL;
1518
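             /*
              * Per the NVMe spec, PRP1 maps the first (possibly unaligned)
              * page of the transfer. PRP2 is unused, a second data pointer,
              * or a pointer to a PRP list, depending on the transfer size.
              * For example, with 4KiB pages, a 12KiB transfer whose PRP1 is
              * 0x800 bytes into a page moves 2KiB via PRP1 and needs a PRP
              * list for the remainder.
              */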
1519         if (nvstore->type == NVME_STOR_RAM) {
1520                 uint8_t *buf = nvstore->ctx;
1521                 enum nvme_copy_dir dir;
1522
1523                 if (is_write)
1524                         dir = NVME_COPY_TO_PRP;
1525                 else
1526                         dir = NVME_COPY_FROM_PRP;
1527
1528                 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1529                     buf + offset, bytes, dir))
1530                         pci_nvme_status_genc(status,
1531                             NVME_SC_DATA_TRANSFER_ERROR);
1532                 else
1533                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1534         } else {
1535                 uint64_t size;
1536                 int err;
1537
1538                 size = MIN(PAGE_SIZE - (cmd->prp1 % PAGE_SIZE), bytes);
1539                 if (pci_nvme_append_iov_req(sc, req, cmd->prp1,
1540                     size, is_write, offset)) {
1541                         pci_nvme_status_genc(status,
1542                             NVME_SC_DATA_TRANSFER_ERROR);
1543                         goto out;
1544                 }
1545
1546                 offset += size;
1547                 bytes  -= size;
1548
1549                 if (bytes == 0) {
1550                         ;
1551                 } else if (bytes <= PAGE_SIZE) {
1552                         size = bytes;
1553                         if (pci_nvme_append_iov_req(sc, req, cmd->prp2,
1554                             size, is_write, offset)) {
1555                                 pci_nvme_status_genc(status,
1556                                     NVME_SC_DATA_TRANSFER_ERROR);
1557                                 goto out;
1558                         }
1559                 } else {
1560                         void *vmctx = sc->nsc_pi->pi_vmctx;
1561                         uint64_t *prp_list = &cmd->prp2;
1562                         uint64_t *last = prp_list;
1563
1564                         /* PRP2 is pointer to a physical region page list */
1565                         while (bytes) {
1566                                 /* Last entry in list points to the next list */
1567                                 if (prp_list == last) {
1568                                         uint64_t prp = *prp_list;
1569
1570                                         prp_list = paddr_guest2host(vmctx, prp,
1571                                             PAGE_SIZE - (prp % PAGE_SIZE));
1572                                         last = prp_list + (NVME_PRP2_ITEMS - 1);
1573                                 }
1574
1575                                 size = MIN(bytes, PAGE_SIZE);
1576
1577                                 if (pci_nvme_append_iov_req(sc, req, *prp_list,
1578                                     size, is_write, offset)) {
1579                                         pci_nvme_status_genc(status,
1580                                             NVME_SC_DATA_TRANSFER_ERROR);
1581                                         goto out;
1582                                 }
1583
1584                                 offset += size;
1585                                 bytes  -= size;
1586
1587                                 prp_list++;
1588                         }
1589                 }
1590                 req->io_req.br_callback = pci_nvme_io_done;
1591                 if (is_write)
1592                         err = blockif_write(nvstore->ctx, &req->io_req);
1593                 else
1594                         err = blockif_read(nvstore->ctx, &req->io_req);
1595
1596                 if (err)
1597                         pci_nvme_status_genc(status, NVME_SC_DATA_TRANSFER_ERROR);
1598                 else
1599                         pending = true;
1600         }
1601 out:
1602         return (pending);
1603 }
1604
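     /*
      * Completion state machine for multi-range Dataset Management
      * deallocates: prev_gpaddr indexes the current range and prev_size
      * holds the total range count (set up in nvme_opc_dataset_mgmt()),
      * so each callback either issues the next blockif_delete() or
      * completes the command.
      */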
1605 static void
1606 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1607 {
1608         struct pci_nvme_ioreq *req = br->br_param;
1609         struct pci_nvme_softc *sc = req->sc;
1610         bool done = true;
1611         uint16_t status;
1612
1613         if (err) {
1614                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1615         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1616                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1617         } else {
1618                 struct iovec *iov = req->io_req.br_iov;
1619
1620                 req->prev_gpaddr++;
1621                 iov += req->prev_gpaddr;
1622
1623                 /* The iov_* values already include the sector size */
1624                 req->io_req.br_offset = (off_t)iov->iov_base;
1625                 req->io_req.br_resid = iov->iov_len;
1626                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1627                         pci_nvme_status_genc(&status,
1628                             NVME_SC_INTERNAL_DEVICE_ERROR);
1629                 } else
1630                         done = false;
1631         }
1632
1633         if (done) {
1634                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1635                     req->cid, 0, status);
1636                 pci_nvme_release_ioreq(sc, req);
1637         }
1638 }
1639
1640 static bool
1641 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1642     struct nvme_command *cmd,
1643     struct pci_nvme_blockstore *nvstore,
1644     struct pci_nvme_ioreq *req,
1645     uint16_t *status)
1646 {
1647         int err;
1648         bool pending = false;
1649
1650         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1651                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1652                 goto out;
1653         }
1654
1655         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1656                 struct nvme_dsm_range *range;
1657                 uint32_t nr, r;
1658                 int sectsz = sc->nvstore.sectsz;
1659
1660                 /*
1661                  * DSM calls are advisory only, and compliant controllers
1662                  * may choose to take no actions (i.e. return Success).
1663                  */
1664                 if (!nvstore->deallocate) {
1665                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1666                         goto out;
1667                 }
1668
1669                 if (req == NULL) {
1670                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1671                         goto out;
1672                 }
1673
1674                 /* copy locally because a range entry could straddle PRPs */
1675                 range = calloc(1, NVME_MAX_DSM_TRIM);
1676                 if (range == NULL) {
1677                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1678                         goto out;
1679                 }
1680                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1681                     (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1682
1683                 /*
1684                  * If the request is for more than a single range, store
1685                  * the ranges in the br_iov. Optimize for the common case
1686                  * of a single range.
1687                  *
1688                  * Note that the NVMe Number of Ranges is a zero-based value
1689                  */
1690                 nr = cmd->cdw10 & 0xff;
1691
1692                 req->io_req.br_iovcnt = 0;
1693                 req->io_req.br_offset = range[0].starting_lba * sectsz;
1694                 req->io_req.br_resid = range[0].length * sectsz;
1695
1696                 if (nr == 0) {
1697                         req->io_req.br_callback = pci_nvme_io_done;
1698                 } else {
1699                         struct iovec *iov = req->io_req.br_iov;
1700
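                             /*
                              * Stash each range's byte offset and length in
                              * iov_base/iov_len; pci_nvme_dealloc_sm() reads
                              * them back when issuing the follow-on deletes.
                              */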
1701                         for (r = 0; r <= nr; r++) {
1702                                 iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
1703                                 iov[r].iov_len = range[r].length * sectsz;
1704                         }
1705                         req->io_req.br_callback = pci_nvme_dealloc_sm;
1706
1707                         /*
1708                          * Use prev_gpaddr to track the current entry and
1709                          * prev_size to track the number of entries
1710                          */
1711                         req->prev_gpaddr = 0;
1712                         req->prev_size = r;
1713                 }
1714
1715                 err = blockif_delete(nvstore->ctx, &req->io_req);
1716                 if (err)
1717                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1718                 else
1719                         pending = true;
1720
1721                 free(range);
1722         }
1723 out:
1724         return (pending);
1725 }
1726
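     /*
      * Drain an I/O submission queue: consume entries from the stored head
      * up to the tail most recently written through the doorbell. Commands
      * that complete asynchronously leave 'pending' set and are completed
      * later from their blockif callback.
      */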
1727 static void
1728 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1729 {
1730         struct nvme_submission_queue *sq;
1731         uint16_t status;
1732         uint16_t sqhead;
1733
1734         /* handle all submissions up to sq->tail index */
1735         sq = &sc->submit_queues[idx];
1736
1737         pthread_mutex_lock(&sq->mtx);
1738
1739         sqhead = sq->head;
1740         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1741                  idx, sqhead, sq->tail, sq->qbase);
1742
1743         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1744                 struct nvme_command *cmd;
1745                 struct pci_nvme_ioreq *req;
1746                 uint32_t nsid;
1747                 bool pending;
1748
1749                 pending = false;
1750                 req = NULL;
1751                 status = 0;
1752
1753                 cmd = &sq->qbase[sqhead];
1754                 sqhead = (sqhead + 1) % sq->size;
1755
1756                 nsid = le32toh(cmd->nsid);
1757                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
1758                         pci_nvme_status_genc(&status,
1759                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1760                         status |=
1761                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
1762                         goto complete;
1763                 }
1764
1765                 req = pci_nvme_get_ioreq(sc);
1766                 if (req == NULL) {
1767                         pci_nvme_status_genc(&status,
1768                             NVME_SC_INTERNAL_DEVICE_ERROR);
1769                         WPRINTF("%s: unable to allocate IO req", __func__);
1770                         goto complete;
1771                 }
1772                 req->nvme_sq = sq;
1773                 req->sqid = idx;
1774                 req->opc = cmd->opc;
1775                 req->cid = cmd->cid;
1776                 req->nsid = cmd->nsid;
1777
1778                 switch (cmd->opc) {
1779                 case NVME_OPC_FLUSH:
1780                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
1781                             req, &status);
1782                         break;
1783                 case NVME_OPC_WRITE:
1784                 case NVME_OPC_READ:
1785                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
1786                             req, &status);
1787                         break;
1788                 case NVME_OPC_WRITE_ZEROES:
1789                         /* TODO: write zeroes
1790                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
1791                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
1792                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1793                         break;
1794                 case NVME_OPC_DATASET_MANAGEMENT:
1795                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
1796                             req, &status);
1797                         break;
1798                 default:
1799                         WPRINTF("%s unhandled io command 0x%x",
1800                             __func__, cmd->opc);
1801                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
1802                 }
1803 complete:
1804                 if (!pending) {
1805                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1806                             status);
1807                         if (req != NULL)
1808                                 pci_nvme_release_ioreq(sc, req);
1809                 }
1810         }
1811
1812         sq->head = sqhead;
1813
1814         pthread_mutex_unlock(&sq->mtx);
1815 }
1816
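     /*
      * Doorbell demultiplexer: idx selects the queue pair and is_sq selects
      * the SQ tail versus the CQ head register. Queue 0 is the admin queue.
      */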
1817 static void
1818 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1819         uint64_t idx, int is_sq, uint64_t value)
1820 {
1821         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
1822                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
1823
1824         if (is_sq) {
1825                 if (idx > sc->num_squeues) {
1826                         WPRINTF("%s queue index %lu overflow from "
1827                                  "guest (max %u)",
1828                                  __func__, idx, sc->num_squeues);
1829                         return;
1830                 }
1831
1832                 atomic_store_short(&sc->submit_queues[idx].tail,
1833                                    (uint16_t)value);
1834
1835                 if (idx == 0) {
1836                         pci_nvme_handle_admin_cmd(sc, value);
1837                 } else {
1838                         /* I/O submission queue; idx was bounds-checked above */
1845                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1846                 }
1847         } else {
1848                 if (idx > sc->num_cqueues) {
1849                         WPRINTF("%s queue index %lu overflow from "
1850                                  "guest (max %u)",
1851                                  __func__, idx, sc->num_cqueues);
1852                         return;
1853                 }
1854
1855                 atomic_store_short(&sc->compl_queues[idx].head,
1856                                 (uint16_t)value);
1857         }
1858 }
1859
1860 static void
1861 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1862 {
1863         const char *s = iswrite ? "WRITE" : "READ";
1864
1865         switch (offset) {
1866         case NVME_CR_CAP_LOW:
1867                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
1868                 break;
1869         case NVME_CR_CAP_HI:
1870                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
1871                 break;
1872         case NVME_CR_VS:
1873                 DPRINTF("%s %s NVME_CR_VS", func, s);
1874                 break;
1875         case NVME_CR_INTMS:
1876                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
1877                 break;
1878         case NVME_CR_INTMC:
1879                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
1880                 break;
1881         case NVME_CR_CC:
1882                 DPRINTF("%s %s NVME_CR_CC", func, s);
1883                 break;
1884         case NVME_CR_CSTS:
1885                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
1886                 break;
1887         case NVME_CR_NSSR:
1888                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
1889                 break;
1890         case NVME_CR_AQA:
1891                 DPRINTF("%s %s NVME_CR_AQA", func, s);
1892                 break;
1893         case NVME_CR_ASQ_LOW:
1894                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
1895                 break;
1896         case NVME_CR_ASQ_HI:
1897                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
1898                 break;
1899         case NVME_CR_ACQ_LOW:
1900                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
1901                 break;
1902         case NVME_CR_ACQ_HI:
1903                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
1904                 break;
1905         default:
1906                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
1907         }
1908
1909 }
1910
1911 static void
1912 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1913         uint64_t offset, int size, uint64_t value)
1914 {
1915         uint32_t ccreg;
1916
1917         if (offset >= NVME_DOORBELL_OFFSET) {
1918                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1919                 uint64_t idx = belloffset / 8; /* 8 bytes per queue pair */
1920                 int is_sq = (belloffset % 8) < 4; /* SQ tail, then CQ head */
1921
1922                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1923                         WPRINTF("guest attempted an overflow write offset "
1924                                  "0x%lx, val 0x%lx in %s",
1925                                  offset, value, __func__);
1926                         return;
1927                 }
1928
1929                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1930                 return;
1931         }
1932
1933         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
1934                 offset, size, value);
1935
1936         if (size != 4) {
1937                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
1938                          "val 0x%lx) to bar0 in %s",
1939                          size, offset, value, __func__);
1940                 /* TODO: shutdown device */
1941                 return;
1942         }
1943
1944         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1945
1946         pthread_mutex_lock(&sc->mtx);
1947
1948         switch (offset) {
1949         case NVME_CR_CAP_LOW:
1950         case NVME_CR_CAP_HI:
1951                 /* readonly */
1952                 break;
1953         case NVME_CR_VS:
1954                 /* readonly */
1955                 break;
1956         case NVME_CR_INTMS:
1957                 /* MSI-X, so ignore */
1958                 break;
1959         case NVME_CR_INTMC:
1960                 /* MSI-X, so ignore */
1961                 break;
1962         case NVME_CR_CC:
1963                 ccreg = (uint32_t)value;
1964
1965                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1966                          "iocqes %u",
1967                         __func__,
1968                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1969                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1970                          NVME_CC_GET_IOCQES(ccreg));
1971
1972                 if (NVME_CC_GET_SHN(ccreg)) {
1973                         /* acknowledge shutdown request; report processing complete */
1974                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1975                             NVME_CSTS_REG_SHST_SHIFT);
1976                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1977                             NVME_CSTS_REG_SHST_SHIFT;
1978                 }
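                     /*
                      * EN transitions: 1->0 resets the controller; 0->1
                      * initializes it (see pci_nvme_init_controller()).
                      */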
1979                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1980                         if (NVME_CC_GET_EN(ccreg) == 0)
1981                                 /* transition 1->0 causes controller reset */
1982                                 pci_nvme_reset_locked(sc);
1983                         else
1984                                 pci_nvme_init_controller(ctx, sc);
1985                 }
1986
1987                 /* Insert the iocqes, iosqes and en bits from the write */
1988                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1989                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1990                 if (NVME_CC_GET_EN(ccreg) == 0) {
1991                         /* Insert the ams, mps and css bit fields */
1992                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1993                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1994                         sc->regs.csts &= ~NVME_CSTS_RDY;
1995                 } else if (sc->pending_ios == 0) {
1996                         sc->regs.csts |= NVME_CSTS_RDY;
1997                 }
1998                 break;
1999         case NVME_CR_CSTS:
2000                 break;
2001         case NVME_CR_NSSR:
2002                 /* ignore writes; don't support subsystem reset */
2003                 break;
2004         case NVME_CR_AQA:
2005                 sc->regs.aqa = (uint32_t)value;
2006                 break;
2007         case NVME_CR_ASQ_LOW:
2008                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2009                                (0xFFFFF000 & value);
2010                 break;
2011         case NVME_CR_ASQ_HI:
2012                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2013                                (value << 32);
2014                 break;
2015         case NVME_CR_ACQ_LOW:
2016                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2017                                (0xFFFFF000 & value);
2018                 break;
2019         case NVME_CR_ACQ_HI:
2020                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2021                                (value << 32);
2022                 break;
2023         default:
2024                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2025                          __func__, offset, value, size);
2026         }
2027         pthread_mutex_unlock(&sc->mtx);
2028 }
2029
2030 static void
2031 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2032                 int baridx, uint64_t offset, int size, uint64_t value)
2033 {
2034         struct pci_nvme_softc* sc = pi->pi_arg;
2035
2036         if (baridx == pci_msix_table_bar(pi) ||
2037             baridx == pci_msix_pba_bar(pi)) {
2038                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2039                          " value 0x%lx", baridx, offset, size, value);
2040
2041                 pci_emul_msix_twrite(pi, offset, size, value);
2042                 return;
2043         }
2044
2045         switch (baridx) {
2046         case 0:
2047                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2048                 break;
2049
2050         default:
2051                 DPRINTF("%s unknown baridx %d, val 0x%lx",
2052                          __func__, baridx, value);
2053         }
2054 }
2055
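     /*
      * BAR0 reads below the doorbell region return a byte-wise copy of the
      * register file, masked down to the access size.
      */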
2056 static uint64_t
2057 pci_nvme_read_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size)
2058 {
2059         uint64_t value;
2060
2061         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2062
2063         if (offset < NVME_DOORBELL_OFFSET) {
2064                 void *p = &(sc->regs);
2065                 pthread_mutex_lock(&sc->mtx);
2066                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2067                 pthread_mutex_unlock(&sc->mtx);
2068         } else {
2069                 value = 0;
2070                 WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
2071         }
2072
2073         switch (size) {
2074         case 1:
2075                 value &= 0xFF;
2076                 break;
2077         case 2:
2078                 value &= 0xFFFF;
2079                 break;
2080         case 4:
2081                 value &= 0xFFFFFFFF;
2082                 break;
2083         }
2084
2085         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2086                  offset, size, (uint32_t)value);
2087
2088         return (value);
2089 }
2090
2093 static uint64_t
2094 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2095     uint64_t offset, int size)
2096 {
2097         struct pci_nvme_softc* sc = pi->pi_arg;
2098
2099         if (baridx == pci_msix_table_bar(pi) ||
2100             baridx == pci_msix_pba_bar(pi)) {
2101                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2102                         baridx, offset, size);
2103
2104                 return pci_emul_msix_tread(pi, offset, size);
2105         }
2106
2107         switch (baridx) {
2108         case 0:
2109                 return pci_nvme_read_bar_0(sc, offset, size);
2110
2111         default:
2112                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2113         }
2114
2115         return (0);
2116 }
2117
2118
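     /*
      * Parse the comma-separated option string (see the option summary at
      * the top of this file). The first token that is not a recognized
      * key=value pair is treated as the backing store path, e.g. a
      * hypothetical invocation:
      *   -s 4,nvme,/dev/zvol/tank/vm-disk,maxq=8,qsz=1024,ioslots=16
      */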
2119 static int
2120 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2121 {
2122         char bident[sizeof("XX:X:X")];
2123         char    *uopt, *xopts, *config;
2124         uint32_t sectsz;
2125         int optidx;
2126
2127         sc->max_queues = NVME_QUEUES;
2128         sc->max_qentries = NVME_MAX_QENTRIES;
2129         sc->ioslots = NVME_IOSLOTS;
2130         sc->num_squeues = sc->max_queues;
2131         sc->num_cqueues = sc->max_queues;
2132         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2133         sectsz = 0;
2134
2135         uopt = strdup(opts);
2136         optidx = 0;
2137         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2138                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2139         for (xopts = strtok(uopt, ",");
2140              xopts != NULL;
2141              xopts = strtok(NULL, ",")) {
2142
2143                 if ((config = strchr(xopts, '=')) != NULL)
2144                         *config++ = '\0';
2145
2146                 if (!strcmp("maxq", xopts)) {
2147                         sc->max_queues = atoi(config);
2148                 } else if (!strcmp("qsz", xopts)) {
2149                         sc->max_qentries = atoi(config);
2150                 } else if (!strcmp("ioslots", xopts)) {
2151                         sc->ioslots = atoi(config);
2152                 } else if (!strcmp("sectsz", xopts)) {
2153                         sectsz = atoi(config);
2154                 } else if (!strcmp("ser", xopts)) {
2155                         /*
2156                          * This field indicates the Product Serial Number in
2157                          * 7-bit ASCII; unused bytes should be space characters.
2158                          * Ref: NVMe v1.3c.
2159                          */
2160                         cpywithpad((char *)sc->ctrldata.sn,
2161                                    sizeof(sc->ctrldata.sn), config, ' ');
2162                 } else if (!strcmp("ram", xopts)) {
2163                         uint64_t sz = strtoull(config, NULL, 10); /* MiB */
2164
2165                         sc->nvstore.type = NVME_STOR_RAM;
2166                         sc->nvstore.size = sz * 1024 * 1024;
2167                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2168                         sc->nvstore.sectsz = 4096;
2169                         sc->nvstore.sectsz_bits = 12;
2170                         if (sc->nvstore.ctx == NULL) {
2171                                 perror("Unable to allocate RAM");
2172                                 free(uopt);
2173                                 return (-1);
2174                         }
2175                 } else if (!strcmp("eui64", xopts)) {
2176                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2177                 } else if (!strcmp("dsm", xopts)) {
2178                         if (!strcmp("auto", config))
2179                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2180                         else if (!strcmp("enable", config))
2181                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2182                         else if (!strcmp("disable", config))
2183                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2184                 } else if (optidx == 0) {
2185                         snprintf(bident, sizeof(bident), "%d:%d",
2186                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2187                         sc->nvstore.ctx = blockif_open(xopts, bident);
2188                         if (sc->nvstore.ctx == NULL) {
2189                                 perror("Could not open backing file");
2190                                 free(uopt);
2191                                 return (-1);
2192                         }
2193                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2194                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2195                 } else {
2196                         EPRINTLN("Invalid option %s", xopts);
2197                         free(uopt);
2198                         return (-1);
2199                 }
2200
2201                 optidx++;
2202         }
2203         free(uopt);
2204
2205         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2206                 EPRINTLN("backing store not specified");
2207                 return (-1);
2208         }
2209         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2210                 sc->nvstore.sectsz = sectsz;
2211         else if (sc->nvstore.type != NVME_STOR_RAM)
2212                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
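             /* derive sectsz_bits = log2(sectsz); sector sizes are powers of two */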
2213         for (sc->nvstore.sectsz_bits = 9;
2214              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2215              sc->nvstore.sectsz_bits++);
2216
2217         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2218                 sc->max_queues = NVME_QUEUES;
2219
2220         if (sc->max_qentries <= 0) {
2221                 EPRINTLN("Invalid qsz option");
2222                 return (-1);
2223         }
2224         if (sc->ioslots <= 0) {
2225                 EPRINTLN("Invalid ioslots option");
2226                 return (-1);
2227         }
2228
2229         return (0);
2230 }
2231
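     /*
      * Device model init: parse options, build the ioreq free list, program
      * PCI config space, allocate BAR0 and MSI-X resources, and reset the
      * controller to its power-on state.
      */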
2232 static int
2233 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2234 {
2235         struct pci_nvme_softc *sc;
2236         uint32_t pci_membar_sz;
2237         int     error;
2238
2239         error = 0;
2240
2241         sc = calloc(1, sizeof(struct pci_nvme_softc));
2242         pi->pi_arg = sc;
2243         sc->nsc_pi = pi;
2244
2245         error = pci_nvme_parse_opts(sc, opts);
2246         if (error < 0)
2247                 goto done;
2248         else
2249                 error = 0;
2250
2251         STAILQ_INIT(&sc->ioreqs_free);
2252         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2253         for (int i = 0; i < sc->ioslots; i++) {
2254                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2255                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
2256                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
2257         }
2258         sc->intr_coales_aggr_thresh = 1;
2259
2260         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2261         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2262         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2263         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2264         pci_set_cfgdata8(pi, PCIR_PROGIF,
2265                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2266
2267         /*
2268          * Allocate size of NVMe registers + doorbell space for all queues.
2269          *
2270          * The specification requires a minimum memory I/O window size of 16K.
2271          * The Windows driver will refuse to start a device with a smaller
2272          * window.
2273          */
2274         pci_membar_sz = sizeof(struct nvme_registers) +
2275             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2276         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2277
2278         DPRINTF("nvme membar size: %u", pci_membar_sz);
2279
2280         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2281         if (error) {
2282                 WPRINTF("%s pci alloc mem bar failed", __func__);
2283                 goto done;
2284         }
2285
2286         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2287         if (error) {
2288                 WPRINTF("%s pci add msixcap failed", __func__);
2289                 goto done;
2290         }
2291
2292         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2293         if (error) {
2294                 WPRINTF("%s pci add Express capability failed", __func__);
2295                 goto done;
2296         }
2297
2298         pthread_mutex_init(&sc->mtx, NULL);
2299         sem_init(&sc->iosemlock, 0, sc->ioslots);
2300
2301         pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2302         /*
2303          * Controller data depends on Namespace data so initialize Namespace
2304          * data first.
2305          */
2306         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2307         pci_nvme_init_ctrldata(sc);
2308         pci_nvme_init_logpages(sc);
2309
2310         pci_nvme_reset(sc);
2311
2312         pci_lintr_request(pi);
2313
2314 done:
2315         return (error);
2316 }
2317
2318
2319 struct pci_devemu pci_de_nvme = {
2320         .pe_emu =       "nvme",
2321         .pe_init =      pci_nvme_init,
2322         .pe_barwrite =  pci_nvme_write,
2323         .pe_barread =   pci_nvme_read
2324 };
2325 PCI_EMUL_SET(pci_de_nvme);