]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - usr.sbin/bhyve/pci_nvme.c
sys/{x86,amd64}: remove one of doubled ;s
[FreeBSD/FreeBSD.git] / usr.sbin / bhyve / pci_nvme.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Function crc16 Copyright (c) 2017, Fedor Uporov 
8  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31
32 /*
33  * bhyve PCIe-NVMe device emulation.
34  *
35  * options:
36  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
37  *
38  *  accepted devpath:
39  *    /dev/blockdev
40  *    /path/to/image
41  *    ram=size_in_MiB
42  *
43  *  maxq    = max number of queues
44  *  qsz     = max elements in each queue
45  *  ioslots = max number of concurrent io requests
46  *  sectsz  = sector size (defaults to blockif sector size)
47  *  ser     = serial number (20-chars max)
48  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
49  *
50  */
51
52 /* TODO:
53     - create async event for smart and log
54     - intr coalesce
55  */
56
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
59
60 #include <sys/types.h>
61 #include <net/ieee_oui.h>
62
63 #include <assert.h>
64 #include <pthread.h>
65 #include <semaphore.h>
66 #include <stdbool.h>
67 #include <stddef.h>
68 #include <stdint.h>
69 #include <stdio.h>
70 #include <stdlib.h>
71 #include <string.h>
72
73 #include <machine/atomic.h>
74 #include <machine/vmm.h>
75 #include <vmmapi.h>
76
77 #include <dev/nvme/nvme.h>
78
79 #include "bhyverun.h"
80 #include "block_if.h"
81 #include "pci_emul.h"
82
83
84 static int nvme_debug = 0;
85 #define DPRINTF(params) if (nvme_debug) printf params
86 #define WPRINTF(params) printf params
87
88 /* defaults; can be overridden */
89 #define NVME_MSIX_BAR           4
90
91 #define NVME_IOSLOTS            8
92
93 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
94 #define NVME_MMIO_SPACE_MIN     (1 << 14)
95
96 #define NVME_QUEUES             16
97 #define NVME_MAX_QENTRIES       2048
98
99 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
100 #define NVME_MAX_BLOCKIOVS      512
101
102 /* This is a synthetic status code to indicate there is no status */
103 #define NVME_NO_STATUS          0xffff
104 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
105
106 /* helpers */
107
108 /* Convert a zero-based value into a one-based value */
109 #define ONE_BASED(zero)         ((zero) + 1)
110 /* Convert a one-based value into a zero-based value */
111 #define ZERO_BASED(one)         ((one)  - 1)
112
113 /* Encode number of SQ's and CQ's for Set/Get Features */
114 #define NVME_FEATURE_NUM_QUEUES(sc) \
115         (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
116         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
117
118 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
119
120 enum nvme_controller_register_offsets {
121         NVME_CR_CAP_LOW = 0x00,
122         NVME_CR_CAP_HI  = 0x04,
123         NVME_CR_VS      = 0x08,
124         NVME_CR_INTMS   = 0x0c,
125         NVME_CR_INTMC   = 0x10,
126         NVME_CR_CC      = 0x14,
127         NVME_CR_CSTS    = 0x1c,
128         NVME_CR_NSSR    = 0x20,
129         NVME_CR_AQA     = 0x24,
130         NVME_CR_ASQ_LOW = 0x28,
131         NVME_CR_ASQ_HI  = 0x2c,
132         NVME_CR_ACQ_LOW = 0x30,
133         NVME_CR_ACQ_HI  = 0x34,
134 };
135
136 enum nvme_cmd_cdw11 {
137         NVME_CMD_CDW11_PC  = 0x0001,
138         NVME_CMD_CDW11_IEN = 0x0002,
139         NVME_CMD_CDW11_IV  = 0xFFFF0000,
140 };
141
142 #define NVME_CQ_INTEN   0x01
143 #define NVME_CQ_INTCOAL 0x02
144
145 struct nvme_completion_queue {
146         struct nvme_completion *qbase;
147         uint32_t        size;
148         uint16_t        tail; /* nvme progress */
149         uint16_t        head; /* guest progress */
150         uint16_t        intr_vec;
151         uint32_t        intr_en;
152         pthread_mutex_t mtx;
153 };
154
155 struct nvme_submission_queue {
156         struct nvme_command *qbase;
157         uint32_t        size;
158         uint16_t        head; /* nvme progress */
159         uint16_t        tail; /* guest progress */
160         uint16_t        cqid; /* completion queue id */
161         int             busy; /* queue is being processed */
162         int             qpriority;
163 };
164
165 enum nvme_storage_type {
166         NVME_STOR_BLOCKIF = 0,
167         NVME_STOR_RAM = 1,
168 };
169
170 struct pci_nvme_blockstore {
171         enum nvme_storage_type type;
172         void            *ctx;
173         uint64_t        size;
174         uint32_t        sectsz;
175         uint32_t        sectsz_bits;
176         uint64_t        eui64;
177 };
178
179 struct pci_nvme_ioreq {
180         struct pci_nvme_softc *sc;
181         struct pci_nvme_ioreq *next;
182         struct nvme_submission_queue *nvme_sq;
183         uint16_t        sqid;
184
185         /* command information */
186         uint16_t        opc;
187         uint16_t        cid;
188         uint32_t        nsid;
189
190         uint64_t        prev_gpaddr;
191         size_t          prev_size;
192
193         /*
194          * lock if all iovs consumed (big IO);
195          * complete transaction before continuing
196          */
197         pthread_mutex_t mtx;
198         pthread_cond_t  cv;
199
200         struct blockif_req io_req;
201
202         /* pad to fit up to 512 page descriptors from guest IO request */
203         struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
204 };
205
206 struct pci_nvme_softc {
207         struct pci_devinst *nsc_pi;
208
209         pthread_mutex_t mtx;
210
211         struct nvme_registers regs;
212
213         struct nvme_namespace_data  nsdata;
214         struct nvme_controller_data ctrldata;
215         struct nvme_error_information_entry err_log;
216         struct nvme_health_information_page health_log;
217         struct nvme_firmware_page fw_log;
218
219         struct pci_nvme_blockstore nvstore;
220
221         uint16_t        max_qentries;   /* max entries per queue */
222         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
223         uint32_t        num_cqueues;
224         uint32_t        num_squeues;
225
226         struct pci_nvme_ioreq *ioreqs;
227         struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
228         uint32_t        pending_ios;
229         uint32_t        ioslots;
230         sem_t           iosemlock;
231
232         /*
233          * Memory mapped Submission and Completion queues
234          * Each array includes both Admin and IO queues
235          */
236         struct nvme_completion_queue *compl_queues;
237         struct nvme_submission_queue *submit_queues;
238
239         /* controller features */
240         uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
241         uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
242         uint32_t        async_ev_config;         /* 0x0B: async event config */
243 };
244
245
246 static void pci_nvme_io_partial(struct blockif_req *br, int err);
247
248 /* Controller Configuration utils */
249 #define NVME_CC_GET_EN(cc) \
250         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
251 #define NVME_CC_GET_CSS(cc) \
252         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
253 #define NVME_CC_GET_SHN(cc) \
254         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
255 #define NVME_CC_GET_IOSQES(cc) \
256         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
257 #define NVME_CC_GET_IOCQES(cc) \
258         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
259
260 #define NVME_CC_WRITE_MASK \
261         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
262          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
263          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
264
265 #define NVME_CC_NEN_WRITE_MASK \
266         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
267          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
268          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
269
270 /* Controller Status utils */
271 #define NVME_CSTS_GET_RDY(sts) \
272         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
273
274 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
275
276 /* Completion Queue status word utils */
277 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
278 #define NVME_STATUS_MASK \
279         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
280          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
281
282 static __inline void
283 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
284 {
285         size_t len;
286
287         len = strnlen(src, dst_size);
288         memset(dst, pad, dst_size);
289         memcpy(dst, src, len);
290 }
291
292 static __inline void
293 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
294 {
295
296         *status &= ~NVME_STATUS_MASK;
297         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
298                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
299 }
300
301 static __inline void
302 pci_nvme_status_genc(uint16_t *status, uint16_t code)
303 {
304
305         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
306 }
307
308 static __inline void
309 pci_nvme_toggle_phase(uint16_t *status, int prev)
310 {
311
312         if (prev)
313                 *status &= ~NVME_STATUS_P;
314         else
315                 *status |= NVME_STATUS_P;
316 }
317
318 static void
319 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
320 {
321         struct nvme_controller_data *cd = &sc->ctrldata;
322
323         cd->vid = 0xFB5D;
324         cd->ssvid = 0x0000;
325
326         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
327         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
328
329         /* Num of submission commands that we can handle at a time (2^rab) */
330         cd->rab   = 4;
331
332         /* FreeBSD OUI */
333         cd->ieee[0] = 0x58;
334         cd->ieee[1] = 0x9c;
335         cd->ieee[2] = 0xfc;
336
337         cd->mic = 0;
338
339         cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
340
341         cd->ver = 0x00010300;
342
343         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
344         cd->acl = 2;
345         cd->aerl = 4;
346
347         cd->lpa = 0;    /* TODO: support some simple things like SMART */
348         cd->elpe = 0;   /* max error log page entries */
349         cd->npss = 1;   /* number of power states support */
350
351         /* Warning Composite Temperature Threshold */
352         cd->wctemp = 0x0157;
353
354         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
355             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
356         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
357             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
358         cd->nn = 1;     /* number of namespaces */
359
360         cd->fna = 0x03;
361
362         cd->power_state[0].mp = 10;
363 }
364
365 /*
366  * Calculate the CRC-16 of the given buffer
367  * See copyright attribution at top of file
368  */
369 static uint16_t
370 crc16(uint16_t crc, const void *buffer, unsigned int len)
371 {
372         const unsigned char *cp = buffer;
373         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
374         static uint16_t const crc16_table[256] = {
375                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
376                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
377                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
378                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
379                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
380                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
381                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
382                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
383                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
384                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
385                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
386                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
387                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
388                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
389                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
390                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
391                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
392                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
393                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
394                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
395                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
396                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
397                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
398                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
399                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
400                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
401                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
402                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
403                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
404                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
405                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
406                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
407         };
408
409         while (len--)
410                 crc = (((crc >> 8) & 0xffU) ^
411                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
412         return crc;
413 }
414
415 static void
416 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
417     struct nvme_namespace_data *nd, uint32_t nsid,
418     uint64_t eui64)
419 {
420
421         nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
422         nd->ncap = nd->nsze;
423         nd->nuse = nd->nsze;
424
425         /* Get LBA and backstore information from backing store */
426         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
427         nd->flbas = 0;
428
429         /* Create an EUI-64 if user did not provide one */
430         if (eui64 == 0) {
431                 char *data = NULL;
432
433                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
434                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
435
436                 if (data != NULL) {
437                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
438                         free(data);
439                 }
440                 eui64 = (eui64 << 16) | (nsid & 0xffff);
441         }
442         be64enc(nd->eui64, eui64);
443
444         /* LBA data-sz = 2^lbads */
445         nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
446 }
447
448 static void
449 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
450 {
451
452         memset(&sc->err_log, 0, sizeof(sc->err_log));
453         memset(&sc->health_log, 0, sizeof(sc->health_log));
454         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
455 }
456
457 static void
458 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
459 {
460         DPRINTF(("%s\r\n", __func__));
461
462         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
463             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
464             (60 << NVME_CAP_LO_REG_TO_SHIFT);
465
466         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
467
468         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
469
470         sc->regs.cc = 0;
471         sc->regs.csts = 0;
472
473         sc->num_cqueues = sc->num_squeues = sc->max_queues;
474         if (sc->submit_queues != NULL) {
475                 for (int i = 0; i < sc->num_squeues + 1; i++) {
476                         /*
477                          * The Admin Submission Queue is at index 0.
478                          * It must not be changed at reset otherwise the
479                          * emulation will be out of sync with the guest.
480                          */
481                         if (i != 0) {
482                                 sc->submit_queues[i].qbase = NULL;
483                                 sc->submit_queues[i].size = 0;
484                                 sc->submit_queues[i].cqid = 0;
485                         }
486                         sc->submit_queues[i].tail = 0;
487                         sc->submit_queues[i].head = 0;
488                         sc->submit_queues[i].busy = 0;
489                 }
490         } else
491                 sc->submit_queues = calloc(sc->num_squeues + 1,
492                                         sizeof(struct nvme_submission_queue));
493
494         if (sc->compl_queues != NULL) {
495                 for (int i = 0; i < sc->num_cqueues + 1; i++) {
496                         /* See Admin Submission Queue note above */
497                         if (i != 0) {
498                                 sc->compl_queues[i].qbase = NULL;
499                                 sc->compl_queues[i].size = 0;
500                         }
501
502                         sc->compl_queues[i].tail = 0;
503                         sc->compl_queues[i].head = 0;
504                 }
505         } else {
506                 sc->compl_queues = calloc(sc->num_cqueues + 1,
507                                         sizeof(struct nvme_completion_queue));
508
509                 for (int i = 0; i < sc->num_cqueues + 1; i++)
510                         pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
511         }
512 }
513
514 static void
515 pci_nvme_reset(struct pci_nvme_softc *sc)
516 {
517         pthread_mutex_lock(&sc->mtx);
518         pci_nvme_reset_locked(sc);
519         pthread_mutex_unlock(&sc->mtx);
520 }
521
522 static void
523 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
524 {
525         uint16_t acqs, asqs;
526
527         DPRINTF(("%s\r\n", __func__));
528
529         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
530         sc->submit_queues[0].size = asqs;
531         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
532                     sizeof(struct nvme_command) * asqs);
533
534         DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
535                 __func__, sc->regs.asq, sc->submit_queues[0].qbase));
536
537         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
538             NVME_AQA_REG_ACQS_MASK) + 1;
539         sc->compl_queues[0].size = acqs;
540         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
541                  sizeof(struct nvme_completion) * acqs);
542         DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
543                 __func__, sc->regs.acq, sc->compl_queues[0].qbase));
544 }
545
546 static int
547 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
548         size_t len)
549 {
550         uint8_t *dst;
551         size_t bytes;
552
553         if (len > (8 * 1024)) {
554                 return (-1);
555         }
556
557         /* Copy from the start of prp1 to the end of the physical page */
558         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
559         bytes = MIN(bytes, len);
560
561         dst = vm_map_gpa(ctx, prp1, bytes);
562         if (dst == NULL) {
563                 return (-1);
564         }
565
566         memcpy(dst, src, bytes);
567
568         src += bytes;
569
570         len -= bytes;
571         if (len == 0) {
572                 return (0);
573         }
574
575         len = MIN(len, PAGE_SIZE);
576
577         dst = vm_map_gpa(ctx, prp2, len);
578         if (dst == NULL) {
579                 return (-1);
580         }
581
582         memcpy(dst, src, len);
583
584         return (0);
585 }
586
587 static int
588 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
589         struct nvme_completion* compl)
590 {
591         uint16_t qid = command->cdw10 & 0xffff;
592
593         DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
594         if (qid == 0 || qid > sc->num_squeues) {
595                 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
596                         __func__, qid, sc->num_squeues));
597                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
598                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
599                 return (1);
600         }
601
602         sc->submit_queues[qid].qbase = NULL;
603         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
604         return (1);
605 }
606
607 static int
608 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
609         struct nvme_completion* compl)
610 {
611         if (command->cdw11 & NVME_CMD_CDW11_PC) {
612                 uint16_t qid = command->cdw10 & 0xffff;
613                 struct nvme_submission_queue *nsq;
614
615                 if ((qid == 0) || (qid > sc->num_squeues)) {
616                         WPRINTF(("%s queue index %u > num_squeues %u\r\n",
617                                 __func__, qid, sc->num_squeues));
618                         pci_nvme_status_tc(&compl->status,
619                             NVME_SCT_COMMAND_SPECIFIC,
620                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
621                         return (1);
622                 }
623
624                 nsq = &sc->submit_queues[qid];
625                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
626
627                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
628                               sizeof(struct nvme_command) * (size_t)nsq->size);
629                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
630                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
631
632                 DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
633                         qid, nsq->size, nsq->qbase, nsq->cqid));
634
635                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
636
637                 DPRINTF(("%s completed creating IOSQ qid %u\r\n",
638                          __func__, qid));
639         } else {
640                 /* 
641                  * Guest sent non-cont submission queue request.
642                  * This setting is unsupported by this emulation.
643                  */
644                 WPRINTF(("%s unsupported non-contig (list-based) "
645                          "create i/o submission queue\r\n", __func__));
646
647                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
648         }
649         return (1);
650 }
651
652 static int
653 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
654         struct nvme_completion* compl)
655 {
656         uint16_t qid = command->cdw10 & 0xffff;
657
658         DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
659         if (qid == 0 || qid > sc->num_cqueues) {
660                 WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
661                         __func__, qid, sc->num_cqueues));
662                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
663                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
664                 return (1);
665         }
666
667         sc->compl_queues[qid].qbase = NULL;
668         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
669         return (1);
670 }
671
672 static int
673 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
674         struct nvme_completion* compl)
675 {
676         if (command->cdw11 & NVME_CMD_CDW11_PC) {
677                 uint16_t qid = command->cdw10 & 0xffff;
678                 struct nvme_completion_queue *ncq;
679
680                 if ((qid == 0) || (qid > sc->num_cqueues)) {
681                         WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
682                                 __func__, qid, sc->num_cqueues));
683                         pci_nvme_status_tc(&compl->status,
684                             NVME_SCT_COMMAND_SPECIFIC,
685                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
686                         return (1);
687                 }
688
689                 ncq = &sc->compl_queues[qid];
690                 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
691                 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
692                 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
693
694                 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
695                              command->prp1,
696                              sizeof(struct nvme_command) * (size_t)ncq->size);
697
698                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
699         } else {
700                 /* 
701                  * Non-contig completion queue unsupported.
702                  */
703                 WPRINTF(("%s unsupported non-contig (list-based) "
704                          "create i/o completion queue\r\n",
705                          __func__));
706
707                 /* 0x12 = Invalid Use of Controller Memory Buffer */
708                 pci_nvme_status_genc(&compl->status, 0x12);
709         }
710
711         return (1);
712 }
713
714 static int
715 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
716         struct nvme_completion* compl)
717 {
718         uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
719         uint8_t logpage = command->cdw10 & 0xFF;
720
721         DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
722
723         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
724
725         switch (logpage) {
726         case NVME_LOG_ERROR:
727                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
728                     command->prp2, (uint8_t *)&sc->err_log, logsize);
729                 break;
730         case NVME_LOG_HEALTH_INFORMATION:
731                 /* TODO: present some smart info */
732                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
733                     command->prp2, (uint8_t *)&sc->health_log, logsize);
734                 break;
735         case NVME_LOG_FIRMWARE_SLOT:
736                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
737                     command->prp2, (uint8_t *)&sc->fw_log, logsize);
738                 break;
739         default:
740                 WPRINTF(("%s get log page %x command not supported\r\n",
741                         __func__, logpage));
742
743                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
744                     NVME_SC_INVALID_LOG_PAGE);
745         }
746
747         return (1);
748 }
749
750 static int
751 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
752         struct nvme_completion* compl)
753 {
754         void *dest;
755
756         DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
757                 command->cdw10 & 0xFF, command->nsid));
758
759         switch (command->cdw10 & 0xFF) {
760         case 0x00: /* return Identify Namespace data structure */
761                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
762                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
763                 break;
764         case 0x01: /* return Identify Controller data structure */
765                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
766                     command->prp2, (uint8_t *)&sc->ctrldata,
767                     sizeof(sc->ctrldata));
768                 break;
769         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
770                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
771                                   sizeof(uint32_t) * 1024);
772                 ((uint32_t *)dest)[0] = 1;
773                 ((uint32_t *)dest)[1] = 0;
774                 break;
775         case 0x11:
776                 pci_nvme_status_genc(&compl->status,
777                     NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
778                 return (1);
779         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
780         case 0x10:
781         case 0x12:
782         case 0x13:
783         case 0x14:
784         case 0x15:
785         default:
786                 DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
787                          __func__, command->cdw10 & 0xFF));
788                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
789                 return (1);
790         }
791
792         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
793         return (1);
794 }
795
796 static int
797 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
798         struct nvme_completion* compl)
799 {
800         uint16_t nqr;   /* Number of Queues Requested */
801
802         nqr = command->cdw11 & 0xFFFF;
803         if (nqr == 0xffff) {
804                 WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
805                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
806                 return (-1);
807         }
808
809         sc->num_squeues = ONE_BASED(nqr);
810         if (sc->num_squeues > sc->max_queues) {
811                 DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
812                                         sc->max_queues));
813                 sc->num_squeues = sc->max_queues;
814         }
815
816         nqr = (command->cdw11 >> 16) & 0xFFFF;
817         if (nqr == 0xffff) {
818                 WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
819                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
820                 return (-1);
821         }
822
823         sc->num_cqueues = ONE_BASED(nqr);
824         if (sc->num_cqueues > sc->max_queues) {
825                 DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
826                                         sc->max_queues));
827                 sc->num_cqueues = sc->max_queues;
828         }
829
830         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
831
832         return (0);
833 }
834
835 static int
836 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
837         struct nvme_completion* compl)
838 {
839         int feature = command->cdw10 & 0xFF;
840         uint32_t iv;
841
842         DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
843         compl->cdw0 = 0;
844
845         switch (feature) {
846         case NVME_FEAT_ARBITRATION:
847                 DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
848                 break;
849         case NVME_FEAT_POWER_MANAGEMENT:
850                 DPRINTF(("  power management 0x%x\r\n", command->cdw11));
851                 break;
852         case NVME_FEAT_LBA_RANGE_TYPE:
853                 DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
854                 break;
855         case NVME_FEAT_TEMPERATURE_THRESHOLD:
856                 DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
857                 break;
858         case NVME_FEAT_ERROR_RECOVERY:
859                 DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
860                 break;
861         case NVME_FEAT_VOLATILE_WRITE_CACHE:
862                 DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
863                 break;
864         case NVME_FEAT_NUMBER_OF_QUEUES:
865                 nvme_set_feature_queues(sc, command, compl);
866                 break;
867         case NVME_FEAT_INTERRUPT_COALESCING:
868                 DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
869
870                 /* in uS */
871                 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
872
873                 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
874                 break;
875         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
876                 iv = command->cdw11 & 0xFFFF;
877
878                 DPRINTF(("  interrupt vector configuration 0x%x\r\n",
879                         command->cdw11));
880
881                 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
882                         if (sc->compl_queues[i].intr_vec == iv) {
883                                 if (command->cdw11 & (1 << 16))
884                                         sc->compl_queues[i].intr_en |=
885                                                               NVME_CQ_INTCOAL;  
886                                 else
887                                         sc->compl_queues[i].intr_en &=
888                                                              ~NVME_CQ_INTCOAL;  
889                         }
890                 }
891                 break;
892         case NVME_FEAT_WRITE_ATOMICITY:
893                 DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
894                 break;
895         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
896                 DPRINTF(("  async event configuration 0x%x\r\n",
897                         command->cdw11));
898                 sc->async_ev_config = command->cdw11;
899                 break;
900         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
901                 DPRINTF(("  software progress marker 0x%x\r\n",
902                         command->cdw11));
903                 break;
904         case 0x0C:
905                 DPRINTF(("  autonomous power state transition 0x%x\r\n",
906                         command->cdw11));
907                 break;
908         default:
909                 WPRINTF(("%s invalid feature\r\n", __func__));
910                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
911                 return (1);
912         }
913
914         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
915         return (1);
916 }
917
918 static int
919 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
920         struct nvme_completion* compl)
921 {
922         int feature = command->cdw10 & 0xFF;
923
924         DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
925
926         compl->cdw0 = 0;
927
928         switch (feature) {
929         case NVME_FEAT_ARBITRATION:
930                 DPRINTF(("  arbitration\r\n"));
931                 break;
932         case NVME_FEAT_POWER_MANAGEMENT:
933                 DPRINTF(("  power management\r\n"));
934                 break;
935         case NVME_FEAT_LBA_RANGE_TYPE:
936                 DPRINTF(("  lba range\r\n"));
937                 break;
938         case NVME_FEAT_TEMPERATURE_THRESHOLD:
939                 DPRINTF(("  temperature threshold\r\n"));
940                 switch ((command->cdw11 >> 20) & 0x3) {
941                 case 0:
942                         /* Over temp threshold */
943                         compl->cdw0 = 0xFFFF;
944                         break;
945                 case 1:
946                         /* Under temp threshold */
947                         compl->cdw0 = 0;
948                         break;
949                 default:
950                         WPRINTF(("  invalid threshold type select\r\n"));
951                         pci_nvme_status_genc(&compl->status,
952                             NVME_SC_INVALID_FIELD);
953                         return (1);
954                 }
955                 break;
956         case NVME_FEAT_ERROR_RECOVERY:
957                 DPRINTF(("  error recovery\r\n"));
958                 break;
959         case NVME_FEAT_VOLATILE_WRITE_CACHE:
960                 DPRINTF(("  volatile write cache\r\n"));
961                 break;
962         case NVME_FEAT_NUMBER_OF_QUEUES:
963                 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
964
965                 DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
966                         compl->cdw0 & 0xFFFF,
967                         (compl->cdw0 >> 16) & 0xFFFF));
968
969                 break;
970         case NVME_FEAT_INTERRUPT_COALESCING:
971                 DPRINTF(("  interrupt coalescing\r\n"));
972                 break;
973         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
974                 DPRINTF(("  interrupt vector configuration\r\n"));
975                 break;
976         case NVME_FEAT_WRITE_ATOMICITY:
977                 DPRINTF(("  write atomicity\r\n"));
978                 break;
979         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
980                 DPRINTF(("  async event configuration\r\n"));
981                 sc->async_ev_config = command->cdw11;
982                 break;
983         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
984                 DPRINTF(("  software progress marker\r\n"));
985                 break;
986         case 0x0C:
987                 DPRINTF(("  autonomous power state transition\r\n"));
988                 break;
989         default:
990                 WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
991                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
992                 return (1);
993         }
994
995         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
996         return (1);
997 }
998
999 static int
1000 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1001         struct nvme_completion* compl)
1002 {
1003         DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
1004                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1005
1006         /* TODO: search for the command ID and abort it */
1007
1008         compl->cdw0 = 1;
1009         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1010         return (1);
1011 }
1012
1013 static int
1014 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1015         struct nvme_command* command, struct nvme_completion* compl)
1016 {
1017         DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
1018
1019         /*
1020          * TODO: raise events when they happen based on the Set Features cmd.
1021          * These events happen async, so only set completion successful if
1022          * there is an event reflective of the request to get event.
1023          */
1024         pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1025             NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1026         return (0);
1027 }
1028
1029 static void
1030 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1031 {
1032         struct nvme_completion compl;
1033         struct nvme_command *cmd;
1034         struct nvme_submission_queue *sq;
1035         struct nvme_completion_queue *cq;
1036         int do_intr = 0;
1037         uint16_t sqhead;
1038
1039         DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
1040
1041         sq = &sc->submit_queues[0];
1042
1043         sqhead = atomic_load_acq_short(&sq->head);
1044
1045         if (atomic_testandset_int(&sq->busy, 1)) {
1046                 DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
1047                         __func__, sqhead, sq->tail));
1048                 return;
1049         }
1050
1051         DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
1052         
1053         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1054                 cmd = &(sq->qbase)[sqhead];
1055                 compl.cdw0 = 0;
1056                 compl.status = 0;
1057
1058                 switch (cmd->opc) {
1059                 case NVME_OPC_DELETE_IO_SQ:
1060                         DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
1061                         do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
1062                         break;
1063                 case NVME_OPC_CREATE_IO_SQ:
1064                         DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
1065                         do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
1066                         break;
1067                 case NVME_OPC_DELETE_IO_CQ:
1068                         DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
1069                         do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
1070                         break;
1071                 case NVME_OPC_CREATE_IO_CQ:
1072                         DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
1073                         do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
1074                         break;
1075                 case NVME_OPC_GET_LOG_PAGE:
1076                         DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
1077                         do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
1078                         break;
1079                 case NVME_OPC_IDENTIFY:
1080                         DPRINTF(("%s command IDENTIFY\r\n", __func__));
1081                         do_intr |= nvme_opc_identify(sc, cmd, &compl);
1082                         break;
1083                 case NVME_OPC_ABORT:
1084                         DPRINTF(("%s command ABORT\r\n", __func__));
1085                         do_intr |= nvme_opc_abort(sc, cmd, &compl);
1086                         break;
1087                 case NVME_OPC_SET_FEATURES:
1088                         DPRINTF(("%s command SET_FEATURES\r\n", __func__));
1089                         do_intr |= nvme_opc_set_features(sc, cmd, &compl);
1090                         break;
1091                 case NVME_OPC_GET_FEATURES:
1092                         DPRINTF(("%s command GET_FEATURES\r\n", __func__));
1093                         do_intr |= nvme_opc_get_features(sc, cmd, &compl);
1094                         break;
1095                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1096                         DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
1097                         /* XXX dont care, unhandled for now
1098                         do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
1099                         */
1100                         compl.status = NVME_NO_STATUS;
1101                         break;
1102                 default:
1103                         WPRINTF(("0x%x command is not implemented\r\n",
1104                             cmd->opc));
1105                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1106                         do_intr |= 1;
1107                 }
1108         
1109                 if (NVME_COMPLETION_VALID(compl)) {
1110                         struct nvme_completion *cp;
1111                         int phase;
1112
1113                         cq = &sc->compl_queues[0];
1114
1115                         cp = &(cq->qbase)[cq->tail];
1116                         cp->cdw0 = compl.cdw0;
1117                         cp->sqid = 0;
1118                         cp->sqhd = sqhead;
1119                         cp->cid = cmd->cid;
1120
1121                         phase = NVME_STATUS_GET_P(cp->status);
1122                         cp->status = compl.status;
1123                         pci_nvme_toggle_phase(&cp->status, phase);
1124
1125                         cq->tail = (cq->tail + 1) % cq->size;
1126                 }
1127                 sqhead = (sqhead + 1) % sq->size;
1128         }
1129
1130         DPRINTF(("setting sqhead %u\r\n", sqhead));
1131         atomic_store_short(&sq->head, sqhead);
1132         atomic_store_int(&sq->busy, 0);
1133
1134         if (do_intr)
1135                 pci_generate_msix(sc->nsc_pi, 0);
1136
1137 }
1138
1139 static int
1140 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1141         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1142 {
1143         int iovidx;
1144
1145         if (req != NULL) {
1146                 /* concatenate contig block-iovs to minimize number of iovs */
1147                 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1148                         iovidx = req->io_req.br_iovcnt - 1;
1149
1150                         req->io_req.br_iov[iovidx].iov_base =
1151                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1152                                              req->prev_gpaddr, size);
1153
1154                         req->prev_size += size;
1155                         req->io_req.br_resid += size;
1156
1157                         req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1158                 } else {
1159                         pthread_mutex_lock(&req->mtx);
1160
1161                         iovidx = req->io_req.br_iovcnt;
1162                         if (iovidx == NVME_MAX_BLOCKIOVS) {
1163                                 int err = 0;
1164
1165                                 DPRINTF(("large I/O, doing partial req\r\n"));
1166
1167                                 iovidx = 0;
1168                                 req->io_req.br_iovcnt = 0;
1169
1170                                 req->io_req.br_callback = pci_nvme_io_partial;
1171
1172                                 if (!do_write)
1173                                         err = blockif_read(sc->nvstore.ctx,
1174                                                            &req->io_req);
1175                                 else
1176                                         err = blockif_write(sc->nvstore.ctx,
1177                                                             &req->io_req);
1178
1179                                 /* wait until req completes before cont */
1180                                 if (err == 0)
1181                                         pthread_cond_wait(&req->cv, &req->mtx);
1182                         }
1183                         if (iovidx == 0) {
1184                                 req->io_req.br_offset = lba;
1185                                 req->io_req.br_resid = 0;
1186                                 req->io_req.br_param = req;
1187                         }
1188
1189                         req->io_req.br_iov[iovidx].iov_base =
1190                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1191                                              gpaddr, size);
1192
1193                         req->io_req.br_iov[iovidx].iov_len = size;
1194
1195                         req->prev_gpaddr = gpaddr;
1196                         req->prev_size = size;
1197                         req->io_req.br_resid += size;
1198
1199                         req->io_req.br_iovcnt++;
1200
1201                         pthread_mutex_unlock(&req->mtx);
1202                 }
1203         } else {
1204                 /* RAM buffer: read/write directly */
1205                 void *p = sc->nvstore.ctx;
1206                 void *gptr;
1207
1208                 if ((lba + size) > sc->nvstore.size) {
1209                         WPRINTF(("%s write would overflow RAM\r\n", __func__));
1210                         return (-1);
1211                 }
1212
1213                 p = (void *)((uintptr_t)p + (uintptr_t)lba);
1214                 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1215                 if (do_write) 
1216                         memcpy(p, gptr, size);
1217                 else
1218                         memcpy(gptr, p, size);
1219         }
1220         return (0);
1221 }
1222
1223 static void
1224 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1225         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1226         uint32_t cdw0, uint16_t status, int ignore_busy)
1227 {
1228         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1229         struct nvme_completion *compl;
1230         int do_intr = 0;
1231         int phase;
1232
1233         DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1234                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1235                  NVME_STATUS_GET_SC(status)));
1236
1237         pthread_mutex_lock(&cq->mtx);
1238
1239         assert(cq->qbase != NULL);
1240
1241         compl = &cq->qbase[cq->tail];
1242
1243         compl->sqhd = atomic_load_acq_short(&sq->head);
1244         compl->sqid = sqid;
1245         compl->cid = cid;
1246
1247         // toggle phase
1248         phase = NVME_STATUS_GET_P(compl->status);
1249         compl->status = status;
1250         pci_nvme_toggle_phase(&compl->status, phase);
1251
1252         cq->tail = (cq->tail + 1) % cq->size;
1253
1254         if (cq->intr_en & NVME_CQ_INTEN)
1255                 do_intr = 1;
1256
1257         pthread_mutex_unlock(&cq->mtx);
1258
1259         if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1260                 if (do_intr)
1261                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1262 }
1263
1264 static void
1265 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1266 {
1267         req->sc = NULL;
1268         req->nvme_sq = NULL;
1269         req->sqid = 0;
1270
1271         pthread_mutex_lock(&sc->mtx);
1272
1273         req->next = sc->ioreqs_free;
1274         sc->ioreqs_free = req;
1275         sc->pending_ios--;
1276
1277         /* when no more IO pending, can set to ready if device reset/enabled */
1278         if (sc->pending_ios == 0 &&
1279             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1280                 sc->regs.csts |= NVME_CSTS_RDY;
1281
1282         pthread_mutex_unlock(&sc->mtx);
1283
1284         sem_post(&sc->iosemlock);
1285 }
1286
1287 static struct pci_nvme_ioreq *
1288 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1289 {
1290         struct pci_nvme_ioreq *req = NULL;;
1291
1292         sem_wait(&sc->iosemlock);
1293         pthread_mutex_lock(&sc->mtx);
1294
1295         req = sc->ioreqs_free;
1296         assert(req != NULL);
1297
1298         sc->ioreqs_free = req->next;
1299
1300         req->next = NULL;
1301         req->sc = sc;
1302
1303         sc->pending_ios++;
1304
1305         pthread_mutex_unlock(&sc->mtx);
1306
1307         req->io_req.br_iovcnt = 0;
1308         req->io_req.br_offset = 0;
1309         req->io_req.br_resid = 0;
1310         req->io_req.br_param = req;
1311         req->prev_gpaddr = 0;
1312         req->prev_size = 0;
1313
1314         return req;
1315 }
1316
1317 static void
1318 pci_nvme_io_done(struct blockif_req *br, int err)
1319 {
1320         struct pci_nvme_ioreq *req = br->br_param;
1321         struct nvme_submission_queue *sq = req->nvme_sq;
1322         uint16_t code, status;
1323
1324         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1325         
1326         /* TODO return correct error */
1327         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1328         pci_nvme_status_genc(&status, code);
1329
1330         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1331         pci_nvme_release_ioreq(req->sc, req);
1332 }
1333
1334 static void
1335 pci_nvme_io_partial(struct blockif_req *br, int err)
1336 {
1337         struct pci_nvme_ioreq *req = br->br_param;
1338
1339         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1340
1341         pthread_cond_signal(&req->cv);
1342 }
1343
1344
1345 static void
1346 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1347 {
1348         struct nvme_submission_queue *sq;
1349         uint16_t status;
1350         uint16_t sqhead;
1351         int err;
1352
1353         /* handle all submissions up to sq->tail index */
1354         sq = &sc->submit_queues[idx];
1355
1356         if (atomic_testandset_int(&sq->busy, 1)) {
1357                 DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1358                 return;
1359         }
1360
1361         sqhead = atomic_load_acq_short(&sq->head);
1362
1363         DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1364                  idx, sqhead, sq->tail, sq->qbase));
1365
1366         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1367                 struct nvme_command *cmd;
1368                 struct pci_nvme_ioreq *req = NULL;
1369                 uint64_t lba;
1370                 uint64_t nblocks, bytes, size, cpsz;
1371
1372                 /* TODO: support scatter gather list handling */
1373
1374                 cmd = &sq->qbase[sqhead];
1375                 sqhead = (sqhead + 1) % sq->size;
1376
1377                 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1378
1379                 if (cmd->opc == NVME_OPC_FLUSH) {
1380                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1381                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1382                                                 status, 1);
1383
1384                         continue;
1385                 } else if (cmd->opc == 0x08) {
1386                         /* TODO: write zeroes */
1387                         WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1388                                 __func__, lba, cmd->cdw12 & 0xFFFF));
1389                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1390                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1391                                                 status, 1);
1392
1393                         continue;
1394                 }
1395
1396                 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1397
1398                 bytes = nblocks * sc->nvstore.sectsz;
1399
1400                 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1401                         req = pci_nvme_get_ioreq(sc);
1402                         req->nvme_sq = sq;
1403                         req->sqid = idx;
1404                 }
1405
1406                 /*
1407                  * If data starts mid-page and flows into the next page, then
1408                  * increase page count
1409                  */
1410
1411                 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1412                          "(%lu-bytes)\r\n",
1413                          sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1414                          cmd->opc == NVME_OPC_WRITE ?
1415                              "WRITE" : "READ",
1416                          lba, nblocks, bytes));
1417
1418                 cmd->prp1 &= ~(0x03UL);
1419                 cmd->prp2 &= ~(0x03UL);
1420
1421                 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1422
1423                 size = bytes;
1424                 lba *= sc->nvstore.sectsz;
1425
1426                 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1427
1428                 if (cpsz > bytes)
1429                         cpsz = bytes;
1430
1431                 if (req != NULL) {
1432                         req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1433                                                 cmd->cdw10;
1434                         req->opc = cmd->opc;
1435                         req->cid = cmd->cid;
1436                         req->nsid = cmd->nsid;
1437                 }
1438
1439                 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1440                     cmd->opc == NVME_OPC_WRITE, lba);
1441                 lba += cpsz;
1442                 size -= cpsz;
1443
1444                 if (size == 0)
1445                         goto iodone;
1446
1447                 if (size <= PAGE_SIZE) {
1448                         /* prp2 is second (and final) page in transfer */
1449
1450                         err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1451                             size,
1452                             cmd->opc == NVME_OPC_WRITE,
1453                             lba);
1454                 } else {
1455                         uint64_t *prp_list;
1456                         int i;
1457
1458                         /* prp2 is pointer to a physical region page list */
1459                         prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1460                                                     cmd->prp2, PAGE_SIZE);
1461
1462                         i = 0;
1463                         while (size != 0) {
1464                                 cpsz = MIN(size, PAGE_SIZE);
1465
1466                                 /*
1467                                  * Move to linked physical region page list
1468                                  * in last item.
1469                                  */ 
1470                                 if (i == (NVME_PRP2_ITEMS-1) &&
1471                                     size > PAGE_SIZE) {
1472                                         assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1473                                         prp_list = paddr_guest2host(
1474                                                       sc->nsc_pi->pi_vmctx,
1475                                                       prp_list[i], PAGE_SIZE);
1476                                         i = 0;
1477                                 }
1478                                 if (prp_list[i] == 0) {
1479                                         WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1480                                         err = 1;
1481                                         break;
1482                                 }
1483
1484                                 err = pci_nvme_append_iov_req(sc, req,
1485                                     prp_list[i], cpsz,
1486                                     cmd->opc == NVME_OPC_WRITE, lba);
1487                                 if (err)
1488                                         break;
1489
1490                                 lba += cpsz;
1491                                 size -= cpsz;
1492                                 i++;
1493                         }
1494                 }
1495
1496 iodone:
1497                 if (sc->nvstore.type == NVME_STOR_RAM) {
1498                         uint16_t code, status;
1499
1500                         code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1501                             NVME_SC_SUCCESS;
1502                         pci_nvme_status_genc(&status, code);
1503
1504                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1505                                                 status, 1);
1506
1507                         continue;
1508                 }
1509
1510
1511                 if (err)
1512                         goto do_error;
1513
1514                 req->io_req.br_callback = pci_nvme_io_done;
1515
1516                 err = 0;
1517                 switch (cmd->opc) {
1518                 case NVME_OPC_READ:
1519                         err = blockif_read(sc->nvstore.ctx, &req->io_req);
1520                         break;
1521                 case NVME_OPC_WRITE:
1522                         err = blockif_write(sc->nvstore.ctx, &req->io_req);
1523                         break;
1524                 default:
1525                         WPRINTF(("%s unhandled io command 0x%x\r\n",
1526                                  __func__, cmd->opc));
1527                         err = 1;
1528                 }
1529
1530 do_error:
1531                 if (err) {
1532                         uint16_t status;
1533
1534                         pci_nvme_status_genc(&status,
1535                             NVME_SC_DATA_TRANSFER_ERROR);
1536
1537                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1538                                                 status, 1);
1539                         pci_nvme_release_ioreq(sc, req);
1540                 }
1541         }
1542
1543         atomic_store_short(&sq->head, sqhead);
1544         atomic_store_int(&sq->busy, 0);
1545 }
1546
1547 static void
1548 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1549         uint64_t idx, int is_sq, uint64_t value)
1550 {
1551         DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1552                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1553
1554         if (is_sq) {
1555                 atomic_store_short(&sc->submit_queues[idx].tail,
1556                                    (uint16_t)value);
1557
1558                 if (idx == 0) {
1559                         pci_nvme_handle_admin_cmd(sc, value);
1560                 } else {
1561                         /* submission queue; handle new entries in SQ */
1562                         if (idx > sc->num_squeues) {
1563                                 WPRINTF(("%s SQ index %lu overflow from "
1564                                          "guest (max %u)\r\n",
1565                                          __func__, idx, sc->num_squeues));
1566                                 return;
1567                         }
1568                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1569                 }
1570         } else {
1571                 if (idx > sc->num_cqueues) {
1572                         WPRINTF(("%s queue index %lu overflow from "
1573                                  "guest (max %u)\r\n",
1574                                  __func__, idx, sc->num_cqueues));
1575                         return;
1576                 }
1577
1578                 sc->compl_queues[idx].head = (uint16_t)value;
1579         }
1580 }
1581
1582 static void
1583 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1584 {
1585         const char *s = iswrite ? "WRITE" : "READ";
1586
1587         switch (offset) {
1588         case NVME_CR_CAP_LOW:
1589                 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1590                 break;
1591         case NVME_CR_CAP_HI:
1592                 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1593                 break;
1594         case NVME_CR_VS:
1595                 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1596                 break;
1597         case NVME_CR_INTMS:
1598                 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1599                 break;
1600         case NVME_CR_INTMC:
1601                 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1602                 break;
1603         case NVME_CR_CC:
1604                 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1605                 break;
1606         case NVME_CR_CSTS:
1607                 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1608                 break;
1609         case NVME_CR_NSSR:
1610                 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1611                 break;
1612         case NVME_CR_AQA:
1613                 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1614                 break;
1615         case NVME_CR_ASQ_LOW:
1616                 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1617                 break;
1618         case NVME_CR_ASQ_HI:
1619                 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1620                 break;
1621         case NVME_CR_ACQ_LOW:
1622                 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1623                 break;
1624         case NVME_CR_ACQ_HI:
1625                 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1626                 break;
1627         default:
1628                 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1629         }
1630
1631 }
1632
1633 static void
1634 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1635         uint64_t offset, int size, uint64_t value)
1636 {
1637         uint32_t ccreg;
1638
1639         if (offset >= NVME_DOORBELL_OFFSET) {
1640                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1641                 uint64_t idx = belloffset / 8; /* door bell size = 2*int */
1642                 int is_sq = (belloffset % 8) < 4;
1643
1644                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1645                         WPRINTF(("guest attempted an overflow write offset "
1646                                  "0x%lx, val 0x%lx in %s",
1647                                  offset, value, __func__));
1648                         return;
1649                 }
1650
1651                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1652                 return;
1653         }
1654
1655         DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1656                 offset, size, value));
1657
1658         if (size != 4) {
1659                 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1660                          "val 0x%lx) to bar0 in %s",
1661                          size, offset, value, __func__));
1662                 /* TODO: shutdown device */
1663                 return;
1664         }
1665
1666         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1667
1668         pthread_mutex_lock(&sc->mtx);
1669
1670         switch (offset) {
1671         case NVME_CR_CAP_LOW:
1672         case NVME_CR_CAP_HI:
1673                 /* readonly */
1674                 break;
1675         case NVME_CR_VS:
1676                 /* readonly */
1677                 break;
1678         case NVME_CR_INTMS:
1679                 /* MSI-X, so ignore */
1680                 break;
1681         case NVME_CR_INTMC:
1682                 /* MSI-X, so ignore */
1683                 break;
1684         case NVME_CR_CC:
1685                 ccreg = (uint32_t)value;
1686
1687                 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1688                          "iocqes %u\r\n",
1689                         __func__,
1690                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1691                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1692                          NVME_CC_GET_IOCQES(ccreg)));
1693
1694                 if (NVME_CC_GET_SHN(ccreg)) {
1695                         /* perform shutdown - flush out data to backend */
1696                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1697                             NVME_CSTS_REG_SHST_SHIFT);
1698                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1699                             NVME_CSTS_REG_SHST_SHIFT;
1700                 }
1701                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1702                         if (NVME_CC_GET_EN(ccreg) == 0)
1703                                 /* transition 1-> causes controller reset */
1704                                 pci_nvme_reset_locked(sc);
1705                         else
1706                                 pci_nvme_init_controller(ctx, sc);
1707                 }
1708
1709                 /* Insert the iocqes, iosqes and en bits from the write */
1710                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1711                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1712                 if (NVME_CC_GET_EN(ccreg) == 0) {
1713                         /* Insert the ams, mps and css bit fields */
1714                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1715                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1716                         sc->regs.csts &= ~NVME_CSTS_RDY;
1717                 } else if (sc->pending_ios == 0) {
1718                         sc->regs.csts |= NVME_CSTS_RDY;
1719                 }
1720                 break;
1721         case NVME_CR_CSTS:
1722                 break;
1723         case NVME_CR_NSSR:
1724                 /* ignore writes; don't support subsystem reset */
1725                 break;
1726         case NVME_CR_AQA:
1727                 sc->regs.aqa = (uint32_t)value;
1728                 break;
1729         case NVME_CR_ASQ_LOW:
1730                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1731                                (0xFFFFF000 & value);
1732                 break;
1733         case NVME_CR_ASQ_HI:
1734                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1735                                (value << 32);
1736                 break;
1737         case NVME_CR_ACQ_LOW:
1738                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1739                                (0xFFFFF000 & value);
1740                 break;
1741         case NVME_CR_ACQ_HI:
1742                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1743                                (value << 32);
1744                 break;
1745         default:
1746                 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1747                          __func__, offset, value, size));
1748         }
1749         pthread_mutex_unlock(&sc->mtx);
1750 }
1751
1752 static void
1753 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1754                 int baridx, uint64_t offset, int size, uint64_t value)
1755 {
1756         struct pci_nvme_softc* sc = pi->pi_arg;
1757
1758         if (baridx == pci_msix_table_bar(pi) ||
1759             baridx == pci_msix_pba_bar(pi)) {
1760                 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1761                          " value 0x%lx\r\n", baridx, offset, size, value));
1762
1763                 pci_emul_msix_twrite(pi, offset, size, value);
1764                 return;
1765         }
1766
1767         switch (baridx) {
1768         case 0:
1769                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1770                 break;
1771
1772         default:
1773                 DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1774                          __func__, baridx, value));
1775         }
1776 }
1777
1778 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1779         uint64_t offset, int size)
1780 {
1781         uint64_t value;
1782
1783         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1784
1785         if (offset < NVME_DOORBELL_OFFSET) {
1786                 void *p = &(sc->regs);
1787                 pthread_mutex_lock(&sc->mtx);
1788                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
1789                 pthread_mutex_unlock(&sc->mtx);
1790         } else {
1791                 value = 0;
1792                 WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
1793         }
1794
1795         switch (size) {
1796         case 1:
1797                 value &= 0xFF;
1798                 break;
1799         case 2:
1800                 value &= 0xFFFF;
1801                 break;
1802         case 4:
1803                 value &= 0xFFFFFFFF;
1804                 break;
1805         }
1806
1807         DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1808                  offset, size, (uint32_t)value));
1809
1810         return (value);
1811 }
1812
1813
1814
1815 static uint64_t
1816 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1817     uint64_t offset, int size)
1818 {
1819         struct pci_nvme_softc* sc = pi->pi_arg;
1820
1821         if (baridx == pci_msix_table_bar(pi) ||
1822             baridx == pci_msix_pba_bar(pi)) {
1823                 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1824                         baridx, offset, size));
1825
1826                 return pci_emul_msix_tread(pi, offset, size);
1827         }
1828
1829         switch (baridx) {
1830         case 0:
1831                 return pci_nvme_read_bar_0(sc, offset, size);
1832
1833         default:
1834                 DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1835         }
1836
1837         return (0);
1838 }
1839
1840
1841 static int
1842 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1843 {
1844         char bident[sizeof("XX:X:X")];
1845         char    *uopt, *xopts, *config;
1846         uint32_t sectsz;
1847         int optidx;
1848
1849         sc->max_queues = NVME_QUEUES;
1850         sc->max_qentries = NVME_MAX_QENTRIES;
1851         sc->ioslots = NVME_IOSLOTS;
1852         sc->num_squeues = sc->max_queues;
1853         sc->num_cqueues = sc->max_queues;
1854         sectsz = 0;
1855
1856         uopt = strdup(opts);
1857         optidx = 0;
1858         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1859                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1860         for (xopts = strtok(uopt, ",");
1861              xopts != NULL;
1862              xopts = strtok(NULL, ",")) {
1863
1864                 if ((config = strchr(xopts, '=')) != NULL)
1865                         *config++ = '\0';
1866
1867                 if (!strcmp("maxq", xopts)) {
1868                         sc->max_queues = atoi(config);
1869                 } else if (!strcmp("qsz", xopts)) {
1870                         sc->max_qentries = atoi(config);
1871                 } else if (!strcmp("ioslots", xopts)) {
1872                         sc->ioslots = atoi(config);
1873                 } else if (!strcmp("sectsz", xopts)) {
1874                         sectsz = atoi(config);
1875                 } else if (!strcmp("ser", xopts)) {
1876                         /*
1877                          * This field indicates the Product Serial Number in
1878                          * 7-bit ASCII, unused bytes should be space characters.
1879                          * Ref: NVMe v1.3c.
1880                          */
1881                         cpywithpad((char *)sc->ctrldata.sn,
1882                                    sizeof(sc->ctrldata.sn), config, ' ');
1883                 } else if (!strcmp("ram", xopts)) {
1884                         uint64_t sz = strtoull(&xopts[4], NULL, 10);
1885
1886                         sc->nvstore.type = NVME_STOR_RAM;
1887                         sc->nvstore.size = sz * 1024 * 1024;
1888                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1889                         sc->nvstore.sectsz = 4096;
1890                         sc->nvstore.sectsz_bits = 12;
1891                         if (sc->nvstore.ctx == NULL) {
1892                                 perror("Unable to allocate RAM");
1893                                 free(uopt);
1894                                 return (-1);
1895                         }
1896                 } else if (!strcmp("eui64", xopts)) {
1897                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
1898                 } else if (optidx == 0) {
1899                         snprintf(bident, sizeof(bident), "%d:%d",
1900                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1901                         sc->nvstore.ctx = blockif_open(xopts, bident);
1902                         if (sc->nvstore.ctx == NULL) {
1903                                 perror("Could not open backing file");
1904                                 free(uopt);
1905                                 return (-1);
1906                         }
1907                         sc->nvstore.type = NVME_STOR_BLOCKIF;
1908                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1909                 } else {
1910                         fprintf(stderr, "Invalid option %s\n", xopts);
1911                         free(uopt);
1912                         return (-1);
1913                 }
1914
1915                 optidx++;
1916         }
1917         free(uopt);
1918
1919         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1920                 fprintf(stderr, "backing store not specified\n");
1921                 return (-1);
1922         }
1923         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1924                 sc->nvstore.sectsz = sectsz;
1925         else if (sc->nvstore.type != NVME_STOR_RAM)
1926                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
1927         for (sc->nvstore.sectsz_bits = 9;
1928              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1929              sc->nvstore.sectsz_bits++);
1930
1931         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1932                 sc->max_queues = NVME_QUEUES;
1933
1934         if (sc->max_qentries <= 0) {
1935                 fprintf(stderr, "Invalid qsz option\n");
1936                 return (-1);
1937         }
1938         if (sc->ioslots <= 0) {
1939                 fprintf(stderr, "Invalid ioslots option\n");
1940                 return (-1);
1941         }
1942
1943         return (0);
1944 }
1945
1946 static int
1947 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1948 {
1949         struct pci_nvme_softc *sc;
1950         uint32_t pci_membar_sz;
1951         int     error;
1952
1953         error = 0;
1954
1955         sc = calloc(1, sizeof(struct pci_nvme_softc));
1956         pi->pi_arg = sc;
1957         sc->nsc_pi = pi;
1958
1959         error = pci_nvme_parse_opts(sc, opts);
1960         if (error < 0)
1961                 goto done;
1962         else
1963                 error = 0;
1964
1965         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1966         for (int i = 0; i < sc->ioslots; i++) {
1967                 if (i < (sc->ioslots-1))
1968                         sc->ioreqs[i].next = &sc->ioreqs[i+1];
1969                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1970                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1971         }
1972         sc->ioreqs_free = sc->ioreqs;
1973         sc->intr_coales_aggr_thresh = 1;
1974
1975         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1976         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1977         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1978         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1979         pci_set_cfgdata8(pi, PCIR_PROGIF,
1980                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1981
1982         /*
1983          * Allocate size of NVMe registers + doorbell space for all queues.
1984          *
1985          * The specification requires a minimum memory I/O window size of 16K.
1986          * The Windows driver will refuse to start a device with a smaller
1987          * window.
1988          */
1989         pci_membar_sz = sizeof(struct nvme_registers) +
1990             2 * sizeof(uint32_t) * (sc->max_queues + 1);
1991         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
1992
1993         DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1994
1995         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1996         if (error) {
1997                 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1998                 goto done;
1999         }
2000
2001         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2002         if (error) {
2003                 WPRINTF(("%s pci add msixcap failed\r\n", __func__));
2004                 goto done;
2005         }
2006
2007         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2008         if (error) {
2009                 WPRINTF(("%s pci add Express capability failed\r\n", __func__));
2010                 goto done;
2011         }
2012
2013         pthread_mutex_init(&sc->mtx, NULL);
2014         sem_init(&sc->iosemlock, 0, sc->ioslots);
2015
2016         pci_nvme_reset(sc);
2017         pci_nvme_init_ctrldata(sc);
2018         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
2019         pci_nvme_init_logpages(sc);
2020
2021         pci_lintr_request(pi);
2022
2023 done:
2024         return (error);
2025 }
2026
2027
2028 struct pci_devemu pci_de_nvme = {
2029         .pe_emu =       "nvme",
2030         .pe_init =      pci_nvme_init,
2031         .pe_barwrite =  pci_nvme_write,
2032         .pe_barread =   pci_nvme_read
2033 };
2034 PCI_EMUL_SET(pci_de_nvme);