/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *
 */
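
/*
 * Example invocations (illustrative only; the slot number, zvol path,
 * and sizes below are hypothetical):
 *
 *   bhyve ... -s 4,nvme,/dev/zvol/tank/guest0,maxq=4,qsz=512,ioslots=16 ...
 *   bhyve ... -s 4,nvme,ram=512,ser=NVME0001 ...
 */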

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define DPRINTF(params) if (nvme_debug) printf params
#define WPRINTF(params) printf params

/* defaults; can be overridden */
#define NVME_MSIX_BAR           4

#define NVME_IOSLOTS            8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN     (1 << 14)

#define NVME_QUEUES             16
#define NVME_MAX_QENTRIES       2048

#define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
#define NVME_MAX_BLOCKIOVS      512

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS          0xffff
#define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)         ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)         ((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
        ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
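
/*
 * Worked example: with num_squeues = num_cqueues = 16, each field encodes
 * ZERO_BASED(16) = 15 (0x000f), so the feature dword is 0x000f000f
 * (completion queue count in bits 31:16, submission queue count in 15:0).
 */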

#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
        NVME_CR_CAP_LOW = 0x00,
        NVME_CR_CAP_HI  = 0x04,
        NVME_CR_VS      = 0x08,
        NVME_CR_INTMS   = 0x0c,
        NVME_CR_INTMC   = 0x10,
        NVME_CR_CC      = 0x14,
        NVME_CR_CSTS    = 0x1c,
        NVME_CR_NSSR    = 0x20,
        NVME_CR_AQA     = 0x24,
        NVME_CR_ASQ_LOW = 0x28,
        NVME_CR_ASQ_HI  = 0x2c,
        NVME_CR_ACQ_LOW = 0x30,
        NVME_CR_ACQ_HI  = 0x34,
};
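
/*
 * These offsets follow the NVMe 1.3 controller register map at the start
 * of BAR0 (CAP at 0x00, CC at 0x14, CSTS at 0x1c, admin queue attributes
 * and base addresses at 0x24-0x37); the queue doorbells start at
 * NVME_DOORBELL_OFFSET (0x1000 in the spec's register layout).
 */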

enum nvme_cmd_cdw11 {
        NVME_CMD_CDW11_PC  = 0x0001,
        NVME_CMD_CDW11_IEN = 0x0002,
        NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define NVME_CQ_INTEN   0x01
#define NVME_CQ_INTCOAL 0x02

struct nvme_completion_queue {
        struct nvme_completion *qbase;
        uint32_t        size;
        uint16_t        tail; /* nvme progress */
        uint16_t        head; /* guest progress */
        uint16_t        intr_vec;
        uint32_t        intr_en;
        pthread_mutex_t mtx;
};

struct nvme_submission_queue {
        struct nvme_command *qbase;
        uint32_t        size;
        uint16_t        head; /* nvme progress */
        uint16_t        tail; /* guest progress */
        uint16_t        cqid; /* completion queue id */
        int             busy; /* queue is being processed */
        int             qpriority;
};

enum nvme_storage_type {
        NVME_STOR_BLOCKIF = 0,
        NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
        enum nvme_storage_type type;
        void            *ctx;
        uint64_t        size;
        uint32_t        sectsz;
        uint32_t        sectsz_bits;
        uint64_t        eui64;
};

struct pci_nvme_ioreq {
        struct pci_nvme_softc *sc;
        struct pci_nvme_ioreq *next;
        struct nvme_submission_queue *nvme_sq;
        uint16_t        sqid;

        /* command information */
        uint16_t        opc;
        uint16_t        cid;
        uint32_t        nsid;

        uint64_t        prev_gpaddr;
        size_t          prev_size;

        /*
         * lock if all iovs consumed (big IO);
         * complete transaction before continuing
         */
        pthread_mutex_t mtx;
        pthread_cond_t  cv;

        struct blockif_req io_req;

        /* pad to fit up to 512 page descriptors from guest IO request */
        struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

struct pci_nvme_softc {
        struct pci_devinst *nsc_pi;

        pthread_mutex_t mtx;

        struct nvme_registers regs;

        struct nvme_namespace_data  nsdata;
        struct nvme_controller_data ctrldata;
        struct nvme_error_information_entry err_log;
        struct nvme_health_information_page health_log;
        struct nvme_firmware_page fw_log;

        struct pci_nvme_blockstore nvstore;

        uint16_t        max_qentries;   /* max entries per queue */
        uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
        uint32_t        num_cqueues;
        uint32_t        num_squeues;

        struct pci_nvme_ioreq *ioreqs;
        struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
        uint32_t        pending_ios;
        uint32_t        ioslots;
        sem_t           iosemlock;

        /*
         * Memory mapped Submission and Completion queues
         * Each array includes both Admin and IO queues
         */
        struct nvme_completion_queue *compl_queues;
        struct nvme_submission_queue *submit_queues;

        /* controller features */
        uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
        uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
        uint32_t        async_ev_config;         /* 0x0B: async event config */
};


static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
        ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
        ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
        ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
        ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
        ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
        ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
         (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
         (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
        ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
         (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
         (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
        ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
        ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
         (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
        size_t len;

        len = strnlen(src, dst_size);
        memset(dst, pad, dst_size);
        memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

        *status &= ~NVME_STATUS_MASK;
        *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
                (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

        pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

static __inline void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

        if (prev)
                *status &= ~NVME_STATUS_P;
        else
                *status |= NVME_STATUS_P;
}
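
/*
 * The guest discovers new completions by watching the Phase Tag rather
 * than the (invisible) controller tail pointer: each entry posted here
 * carries the inverse of the P bit left in that slot, so e.g. the first
 * pass through a zeroed queue posts entries with P=1, the next pass with
 * P=0, and so on.
 */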

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
        struct nvme_controller_data *cd = &sc->ctrldata;

        cd->vid = 0xFB5D;
        cd->ssvid = 0x0000;

        cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
        cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

        /* Num of submission commands that we can handle at a time (2^rab) */
        cd->rab   = 4;

        /* FreeBSD OUI */
        cd->ieee[0] = 0x58;
        cd->ieee[1] = 0x9c;
        cd->ieee[2] = 0xfc;

        cd->mic = 0;

        cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */

        cd->ver = 0x00010300;

        cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
        cd->acl = 2;
        cd->aerl = 4;

        cd->lpa = 0;    /* TODO: support some simple things like SMART */
        cd->elpe = 0;   /* max error log page entries */
        cd->npss = 1;   /* number of power states supported */

        /* Warning Composite Temperature Threshold */
        cd->wctemp = 0x0157;

        cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
            (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
        cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
            (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
        cd->nn = 1;     /* number of namespaces */

        cd->fna = 0x03;

        cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
        const unsigned char *cp = buffer;
        /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
        static uint16_t const crc16_table[256] = {
                0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
                0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
                0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
                0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
                0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
                0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
                0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
                0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
                0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
                0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
                0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
                0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
                0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
                0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
                0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
                0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
                0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
                0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
                0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
                0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
                0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
                0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
                0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
                0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
                0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
                0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
                0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
                0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
                0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
                0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
                0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
                0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
        };

        while (len--)
                crc = (((crc >> 8) & 0xffU) ^
                    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
        return crc;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    uint64_t eui64)
{

        nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
        nd->ncap = nd->nsze;
        nd->nuse = nd->nsze;

        /* Get LBA and backstore information from backing store */
        nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
        nd->flbas = 0;

        /* Create an EUI-64 if user did not provide one */
        if (eui64 == 0) {
                char *data = NULL;

                asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
                    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

                if (data != NULL) {
                        eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
                        free(data);
                }
                eui64 = (eui64 << 16) | (nsid & 0xffff);
        }
        be64enc(nd->eui64, eui64);

        /* LBA data-sz = 2^lbads */
        nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
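
/*
 * Worked example: a 512-byte-sector backing store has sectsz_bits = 9, so
 * lbaf[0] encodes LBADS = 9 and the guest derives an LBA data size of
 * 2^9 = 512 bytes; a 1 GiB store then reports nsze = ncap = nuse =
 * 2097152 blocks.
 */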

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

        memset(&sc->err_log, 0, sizeof(sc->err_log));
        memset(&sc->health_log, 0, sizeof(sc->health_log));
        memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
        DPRINTF(("%s\r\n", __func__));

        sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
            (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
            (60 << NVME_CAP_LO_REG_TO_SHIFT);

        sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

        sc->regs.vs = 0x00010300;       /* NVMe v1.3 */

        sc->regs.cc = 0;
        sc->regs.csts = 0;

        sc->num_cqueues = sc->num_squeues = sc->max_queues;
        if (sc->submit_queues != NULL) {
                for (int i = 0; i < sc->num_squeues + 1; i++) {
                        /*
                         * The Admin Submission Queue is at index 0.
                         * It must not be changed at reset otherwise the
                         * emulation will be out of sync with the guest.
                         */
                        if (i != 0) {
                                sc->submit_queues[i].qbase = NULL;
                                sc->submit_queues[i].size = 0;
                                sc->submit_queues[i].cqid = 0;
                        }
                        sc->submit_queues[i].tail = 0;
                        sc->submit_queues[i].head = 0;
                        sc->submit_queues[i].busy = 0;
                }
        } else
                sc->submit_queues = calloc(sc->num_squeues + 1,
                                        sizeof(struct nvme_submission_queue));

        if (sc->compl_queues != NULL) {
                for (int i = 0; i < sc->num_cqueues + 1; i++) {
                        /* See Admin Submission Queue note above */
                        if (i != 0) {
                                sc->compl_queues[i].qbase = NULL;
                                sc->compl_queues[i].size = 0;
                        }

                        sc->compl_queues[i].tail = 0;
                        sc->compl_queues[i].head = 0;
                }
        } else {
                sc->compl_queues = calloc(sc->num_cqueues + 1,
                                        sizeof(struct nvme_completion_queue));

                for (int i = 0; i < sc->num_cqueues + 1; i++)
                        pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
        }
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
        pthread_mutex_lock(&sc->mtx);
        pci_nvme_reset_locked(sc);
        pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
        uint16_t acqs, asqs;

        DPRINTF(("%s\r\n", __func__));

        asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
        sc->submit_queues[0].size = asqs;
        sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
                    sizeof(struct nvme_command) * asqs);

        DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.asq, sc->submit_queues[0].qbase));

        acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
            NVME_AQA_REG_ACQS_MASK) + 1;
        sc->compl_queues[0].size = acqs;
        sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
                 sizeof(struct nvme_completion) * acqs);
        DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
        size_t len)
{
        uint8_t *dst;
        size_t bytes;

        if (len > (8 * 1024)) {
                return (-1);
        }

        /* Copy from the start of prp1 to the end of the physical page */
        bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
        bytes = MIN(bytes, len);

        dst = vm_map_gpa(ctx, prp1, bytes);
        if (dst == NULL) {
                return (-1);
        }

        memcpy(dst, src, bytes);

        src += bytes;

        len -= bytes;
        if (len == 0) {
                return (0);
        }

        len = MIN(len, PAGE_SIZE);

        dst = vm_map_gpa(ctx, prp2, len);
        if (dst == NULL) {
                return (-1);
        }

        memcpy(dst, src, len);

        return (0);
}
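
/*
 * Note that this helper only ever touches the page at PRP1 plus a single
 * page at PRP2 (no PRP list walk), which is enough for the admin payloads
 * it serves below (log pages, identify data).  For example, a 4 KiB
 * identify structure with a page-aligned PRP1 is copied with one memcpy;
 * if PRP1 points mid-page, the remainder lands in the page at PRP2.
 */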

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_squeues) {
                WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
                        __func__, qid, sc->num_squeues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->submit_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_submission_queue *nsq;

                if ((qid == 0) || (qid > sc->num_squeues)) {
                        WPRINTF(("%s queue index %u > num_squeues %u\r\n",
                                __func__, qid, sc->num_squeues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                nsq = &sc->submit_queues[qid];
                nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                              sizeof(struct nvme_command) * (size_t)nsq->size);
                nsq->cqid = (command->cdw11 >> 16) & 0xffff;
                nsq->qpriority = (command->cdw11 >> 1) & 0x03;

                DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
                        qid, nsq->size, nsq->qbase, nsq->cqid));

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

                DPRINTF(("%s completed creating IOSQ qid %u\r\n",
                         __func__, qid));
        } else {
                /*
                 * Guest sent a non-contiguous submission queue request.
                 * This setting is unsupported by this emulation.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o submission queue\r\n", __func__));

                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        }
        return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_cqueues) {
                WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
                        __func__, qid, sc->num_cqueues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->compl_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_completion_queue *ncq;

                if ((qid == 0) || (qid > sc->num_cqueues)) {
                        WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
                                __func__, qid, sc->num_cqueues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                ncq = &sc->compl_queues[qid];
                ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
                ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
                ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                /* A CQ holds nvme_completion entries, not nvme_command */
                ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
                             command->prp1,
                             sizeof(struct nvme_completion) * (size_t)ncq->size);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        } else {
                /*
                 * Non-contiguous completion queues are unsupported.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o completion queue\r\n",
                         __func__));

                /* 0x12 = Invalid Use of Controller Memory Buffer */
                pci_nvme_status_genc(&compl->status, 0x12);
        }

        return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
        uint8_t logpage = command->cdw10 & 0xFF;

        DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        switch (logpage) {
        case NVME_LOG_ERROR:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->err_log, logsize);
                break;
        case NVME_LOG_HEALTH_INFORMATION:
                /* TODO: present some smart info */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->health_log, logsize);
                break;
        case NVME_LOG_FIRMWARE_SLOT:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->fw_log, logsize);
                break;
        default:
                WPRINTF(("%s get log page %x command not supported\r\n",
                        __func__, logpage));

                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_LOG_PAGE);
        }

        return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        void *dest;

        DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
                command->cdw10 & 0xFF, command->nsid));

        switch (command->cdw10 & 0xFF) {
        case 0x00: /* return Identify Namespace data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
                break;
        case 0x01: /* return Identify Controller data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->ctrldata,
                    sizeof(sc->ctrldata));
                break;
        case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(uint32_t) * 1024);
                ((uint32_t *)dest)[0] = 1;
                ((uint32_t *)dest)[1] = 0;
                break;
        case 0x11:
                pci_nvme_status_genc(&compl->status,
                    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
                return (1);
        case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
        case 0x10:
        case 0x12:
        case 0x13:
        case 0x14:
        case 0x15:
        default:
                DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
                         __func__, command->cdw10 & 0xFF));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t nqr;   /* Number of Queues Requested */

        nqr = command->cdw11 & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_squeues = ONE_BASED(nqr);
        if (sc->num_squeues > sc->max_queues) {
                DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
                                        sc->max_queues));
                sc->num_squeues = sc->max_queues;
        }

        nqr = (command->cdw11 >> 16) & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_cqueues = ONE_BASED(nqr);
        if (sc->num_cqueues > sc->max_queues) {
                DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
                                        sc->max_queues));
                sc->num_cqueues = sc->max_queues;
        }

        compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

        return (0);
}
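
/*
 * Worked example (hypothetical guest request): asking for 32 submission
 * and 32 completion queues means writing cdw11 = 0x001f001f (the values
 * are zero-based).  With max_queues = 16 both counts are clamped, and the
 * completion dword reports 0x000f000f back to the guest.
 */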

static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;
        uint32_t iv;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                nvme_set_feature_queues(sc, command, compl);
                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));

                /* in uS */
                sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

                sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                iv = command->cdw11 & 0xFFFF;

                DPRINTF(("  interrupt vector configuration 0x%x\r\n",
                        command->cdw11));

                for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
                        if (sc->compl_queues[i].intr_vec == iv) {
                                if (command->cdw11 & (1 << 16))
                                        sc->compl_queues[i].intr_en |=
                                                              NVME_CQ_INTCOAL;
                                else
                                        sc->compl_queues[i].intr_en &=
                                                             ~NVME_CQ_INTCOAL;
                        }
                }
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration 0x%x\r\n",
                        command->cdw11));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker 0x%x\r\n",
                        command->cdw11));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition 0x%x\r\n",
                        command->cdw11));
                break;
        default:
                WPRINTF(("%s invalid feature\r\n", __func__));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));

        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration\r\n"));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management\r\n"));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range\r\n"));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold\r\n"));
                switch ((command->cdw11 >> 20) & 0x3) {
                case 0:
                        /* Over temp threshold */
                        compl->cdw0 = 0xFFFF;
                        break;
                case 1:
                        /* Under temp threshold */
                        compl->cdw0 = 0;
                        break;
                default:
                        WPRINTF(("  invalid threshold type select\r\n"));
                        pci_nvme_status_genc(&compl->status,
                            NVME_SC_INVALID_FIELD);
                        return (1);
                }
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery\r\n"));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache\r\n"));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

                DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
                        compl->cdw0 & 0xFFFF,
                        (compl->cdw0 >> 16) & 0xFFFF));

                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing\r\n"));
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                DPRINTF(("  interrupt vector configuration\r\n"));
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity\r\n"));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration\r\n"));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker\r\n"));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition\r\n"));
                break;
        default:
                WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
                command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));

        /* TODO: search for the command ID and abort it */

        compl->cdw0 = 1;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
        struct nvme_command* command, struct nvme_completion* compl)
{
        DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));

        /*
         * TODO: raise events when they happen based on the Set Features cmd.
         * These events happen async, so only set completion successful if
         * there is an event reflective of the request to get event.
         */
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
        return (0);
}

static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
        struct nvme_completion compl;
        struct nvme_command *cmd;
        struct nvme_submission_queue *sq;
        struct nvme_completion_queue *cq;
        int do_intr = 0;
        uint16_t sqhead;

        DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));

        sq = &sc->submit_queues[0];

        sqhead = atomic_load_acq_short(&sq->head);

        if (atomic_testandset_int(&sq->busy, 1)) {
                DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
                        __func__, sqhead, sq->tail));
                return;
        }

        DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                cmd = &(sq->qbase)[sqhead];
                compl.status = 0;

                switch (cmd->opc) {
                case NVME_OPC_DELETE_IO_SQ:
                        DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_SQ:
                        DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_DELETE_IO_CQ:
                        DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_CQ:
                        DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_LOG_PAGE:
                        DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
                        do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
                        break;
                case NVME_OPC_IDENTIFY:
                        DPRINTF(("%s command IDENTIFY\r\n", __func__));
                        do_intr |= nvme_opc_identify(sc, cmd, &compl);
                        break;
                case NVME_OPC_ABORT:
                        DPRINTF(("%s command ABORT\r\n", __func__));
                        do_intr |= nvme_opc_abort(sc, cmd, &compl);
                        break;
                case NVME_OPC_SET_FEATURES:
                        DPRINTF(("%s command SET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_set_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_FEATURES:
                        DPRINTF(("%s command GET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_get_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_ASYNC_EVENT_REQUEST:
                        DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
                        /* XXX don't care, unhandled for now
                        do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
                        */
                        compl.status = NVME_NO_STATUS;
                        break;
                default:
                        WPRINTF(("0x%x command is not implemented\r\n",
                            cmd->opc));
                        pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                        do_intr |= 1;
                }

                if (NVME_COMPLETION_VALID(compl)) {
                        struct nvme_completion *cp;
                        int phase;

                        cq = &sc->compl_queues[0];

                        cp = &(cq->qbase)[cq->tail];
                        cp->cdw0 = compl.cdw0;
                        cp->sqid = 0;
                        cp->sqhd = sqhead;
                        cp->cid = cmd->cid;

                        phase = NVME_STATUS_GET_P(cp->status);
                        cp->status = compl.status;
                        pci_nvme_toggle_phase(&cp->status, phase);

                        cq->tail = (cq->tail + 1) % cq->size;
                }
                sqhead = (sqhead + 1) % sq->size;
        }

        DPRINTF(("setting sqhead %u\r\n", sqhead));
        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);

        if (do_intr)
                pci_generate_msix(sc->nsc_pi, 0);

}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
        uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
        int iovidx;

        if (req != NULL) {
                /* concatenate contig block-iovs to minimize number of iovs */
                if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
                        iovidx = req->io_req.br_iovcnt - 1;

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             req->prev_gpaddr, size);

                        req->prev_size += size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iov[iovidx].iov_len = req->prev_size;
                } else {
                        pthread_mutex_lock(&req->mtx);

                        iovidx = req->io_req.br_iovcnt;
                        if (iovidx == NVME_MAX_BLOCKIOVS) {
                                int err = 0;

                                DPRINTF(("large I/O, doing partial req\r\n"));

                                iovidx = 0;
                                req->io_req.br_iovcnt = 0;

                                req->io_req.br_callback = pci_nvme_io_partial;

                                if (!do_write)
                                        err = blockif_read(sc->nvstore.ctx,
                                                           &req->io_req);
                                else
                                        err = blockif_write(sc->nvstore.ctx,
                                                            &req->io_req);

                                /* wait until req completes before cont */
                                if (err == 0)
                                        pthread_cond_wait(&req->cv, &req->mtx);
                        }
                        if (iovidx == 0) {
                                req->io_req.br_offset = lba;
                                req->io_req.br_resid = 0;
                                req->io_req.br_param = req;
                        }

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             gpaddr, size);

                        req->io_req.br_iov[iovidx].iov_len = size;

                        req->prev_gpaddr = gpaddr;
                        req->prev_size = size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iovcnt++;

                        pthread_mutex_unlock(&req->mtx);
                }
        } else {
                /* RAM buffer: read/write directly */
                void *p = sc->nvstore.ctx;
                void *gptr;

                if ((lba + size) > sc->nvstore.size) {
                        WPRINTF(("%s write would overflow RAM\r\n", __func__));
                        return (-1);
                }

                p = (void *)((uintptr_t)p + (uintptr_t)lba);
                gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
                if (do_write)
                        memcpy(p, gptr, size);
                else
                        memcpy(gptr, p, size);
        }
        return (0);
}
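
/*
 * Example of the iov concatenation above: if the guest hands us PRP
 * entries for two physically contiguous 4 KiB pages, say 0x10000 and
 * 0x11000, the second call extends the previous iov into a single 8 KiB
 * entry instead of consuming another of the NVME_MAX_BLOCKIOVS slots.
 */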

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
        struct nvme_submission_queue *sq, int sqid, uint16_t cid,
        uint32_t cdw0, uint16_t status, int ignore_busy)
{
        struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
        struct nvme_completion *compl;
        int do_intr = 0;
        int phase;

        DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
                 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
                 NVME_STATUS_GET_SC(status)));

        pthread_mutex_lock(&cq->mtx);

        assert(cq->qbase != NULL);

        compl = &cq->qbase[cq->tail];

        compl->sqhd = atomic_load_acq_short(&sq->head);
        compl->sqid = sqid;
        compl->cid = cid;

        /* toggle phase */
        phase = NVME_STATUS_GET_P(compl->status);
        compl->status = status;
        pci_nvme_toggle_phase(&compl->status, phase);

        cq->tail = (cq->tail + 1) % cq->size;

        if (cq->intr_en & NVME_CQ_INTEN)
                do_intr = 1;

        pthread_mutex_unlock(&cq->mtx);

        if (ignore_busy || !atomic_load_acq_int(&sq->busy))
                if (do_intr)
                        pci_generate_msix(sc->nsc_pi, cq->intr_vec);
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
        req->sc = NULL;
        req->nvme_sq = NULL;
        req->sqid = 0;

        pthread_mutex_lock(&sc->mtx);

        req->next = sc->ioreqs_free;
        sc->ioreqs_free = req;
        sc->pending_ios--;

        /* when no more IO pending, can set to ready if device reset/enabled */
        if (sc->pending_ios == 0 &&
            NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
                sc->regs.csts |= NVME_CSTS_RDY;

        pthread_mutex_unlock(&sc->mtx);

        sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
        struct pci_nvme_ioreq *req = NULL;

        sem_wait(&sc->iosemlock);
        pthread_mutex_lock(&sc->mtx);

        req = sc->ioreqs_free;
        assert(req != NULL);

        sc->ioreqs_free = req->next;

        req->next = NULL;
        req->sc = sc;

        sc->pending_ios++;

        pthread_mutex_unlock(&sc->mtx);

        req->io_req.br_iovcnt = 0;
        req->io_req.br_offset = 0;
        req->io_req.br_resid = 0;
        req->io_req.br_param = req;
        req->prev_gpaddr = 0;
        req->prev_size = 0;

        return req;
}
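
/*
 * The ioreq pool is a singly-linked free list guarded by sc->mtx, with
 * iosemlock counting free slots: once all "ioslots" requests are in
 * flight a caller blocks in sem_wait() above until
 * pci_nvme_release_ioreq() posts the semaphore.
 */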
1315
1316 static void
1317 pci_nvme_io_done(struct blockif_req *br, int err)
1318 {
1319         struct pci_nvme_ioreq *req = br->br_param;
1320         struct nvme_submission_queue *sq = req->nvme_sq;
1321         uint16_t code, status;
1322
1323         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1324         
1325         /* TODO return correct error */
1326         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1327         pci_nvme_status_genc(&status, code);
1328
1329         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1330         pci_nvme_release_ioreq(req->sc, req);
1331 }
1332
1333 static void
1334 pci_nvme_io_partial(struct blockif_req *br, int err)
1335 {
1336         struct pci_nvme_ioreq *req = br->br_param;
1337
1338         DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1339
1340         pthread_cond_signal(&req->cv);
1341 }
1342
1343
1344 static void
1345 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1346 {
1347         struct nvme_submission_queue *sq;
1348         uint16_t status;
1349         uint16_t sqhead;
1350         int err;
1351
1352         /* handle all submissions up to sq->tail index */
1353         sq = &sc->submit_queues[idx];
1354
1355         if (atomic_testandset_int(&sq->busy, 1)) {
1356                 DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1357                 return;
1358         }
1359
1360         sqhead = atomic_load_acq_short(&sq->head);
1361
1362         DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1363                  idx, sqhead, sq->tail, sq->qbase));
1364
1365         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1366                 struct nvme_command *cmd;
1367                 struct pci_nvme_ioreq *req = NULL;
1368                 uint64_t lba;
1369                 uint64_t nblocks, bytes, size, cpsz;
1370
1371                 /* TODO: support scatter gather list handling */
1372
1373                 cmd = &sq->qbase[sqhead];
1374                 sqhead = (sqhead + 1) % sq->size;
1375
1376                 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1377
1378                 if (cmd->opc == NVME_OPC_FLUSH) {
1379                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1380                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1381                                                 status, 1);
1382
1383                         continue;
1384                 } else if (cmd->opc == 0x08) {
1385                         /* TODO: write zeroes */
1386                         WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1387                                 __func__, lba, cmd->cdw12 & 0xFFFF));
1388                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1389                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1390                                                 status, 1);
1391
1392                         continue;
1393                 }
1394
1395                 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1396
1397                 bytes = nblocks * sc->nvstore.sectsz;
1398
1399                 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1400                         req = pci_nvme_get_ioreq(sc);
1401                         req->nvme_sq = sq;
1402                         req->sqid = idx;
1403                 }
1404
1405                 /*
1406                  * If data starts mid-page and flows into the next page, then
1407                  * increase page count
1408                  */
1409
1410                 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1411                          "(%lu-bytes)\r\n",
1412                          sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1413                          cmd->opc == NVME_OPC_WRITE ?
1414                              "WRITE" : "READ",
1415                          lba, nblocks, bytes));
1416
1417                 cmd->prp1 &= ~(0x03UL);
1418                 cmd->prp2 &= ~(0x03UL);
1419
1420                 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1421
1422                 size = bytes;
1423                 lba *= sc->nvstore.sectsz;
1424
1425                 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1426
1427                 if (cpsz > bytes)
1428                         cpsz = bytes;
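                /*
                 * Worked example, assuming PAGE_SIZE == 4096: for a 16KiB
                 * read with prp1 ending in 0x200, cpsz = 4096 - 0x200 = 3584
                 * covers the first chunk, and the remaining 12800 bytes are
                 * described by prp2 (here, a PRP list, since more than one
                 * page is left).
                 */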
1429
1430                 if (req != NULL) {
1431                         req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1432                                                 cmd->cdw10;
1433                         req->opc = cmd->opc;
1434                         req->cid = cmd->cid;
1435                         req->nsid = cmd->nsid;
1436                 }
1437
1438                 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1439                     cmd->opc == NVME_OPC_WRITE, lba);
1440                 lba += cpsz;
1441                 size -= cpsz;
1442
1443                 if (size == 0)
1444                         goto iodone;
1445
1446                 if (size <= PAGE_SIZE) {
1447                         /* prp2 is second (and final) page in transfer */
1448
1449                         err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1450                             size,
1451                             cmd->opc == NVME_OPC_WRITE,
1452                             lba);
1453                 } else {
1454                         uint64_t *prp_list;
1455                         int i;
1456
1457                         /* prp2 is pointer to a physical region page list */
1458                         prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1459                                                     cmd->prp2, PAGE_SIZE);
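                        /*
                         * paddr_guest2host() yields a host-virtual mapping of
                         * the guest-physical PRP list page; entries are read
                         * in place rather than copied.
                         */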
1460
1461                         i = 0;
1462                         while (size != 0) {
1463                                 cpsz = MIN(size, PAGE_SIZE);
1464
1465                                 /*
1466                                  * The last entry links to the next PRP
1467                                  * list page; follow it and restart at 0.
1468                                  */
1469                                 if (i == (NVME_PRP2_ITEMS-1) &&
1470                                     size > PAGE_SIZE) {
1471                                         assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1472                                         prp_list = paddr_guest2host(
1473                                                       sc->nsc_pi->pi_vmctx,
1474                                                       prp_list[i], PAGE_SIZE);
1475                                         i = 0;
1476                                 }
1477                                 if (prp_list[i] == 0) {
1478                                         WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1479                                         err = 1;
1480                                         break;
1481                                 }
1482
1483                                 err = pci_nvme_append_iov_req(sc, req,
1484                                     prp_list[i], cpsz,
1485                                     cmd->opc == NVME_OPC_WRITE, lba);
1486                                 if (err)
1487                                         break;
1488
1489                                 lba += cpsz;
1490                                 size -= cpsz;
1491                                 i++;
1492                         }
1493                 }
1494
1495 iodone:
1496                 if (sc->nvstore.type == NVME_STOR_RAM) {
1497                         uint16_t code, status;
1498
1499                         code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1500                             NVME_SC_SUCCESS;
1501                         pci_nvme_status_genc(&status, code);
1502
1503                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1504                                                 status, 1);
1505
1506                         continue;
1507                 }
1508
1510                 if (err)
1511                         goto do_error;
1512
1513                 req->io_req.br_callback = pci_nvme_io_done;
1514
1515                 err = 0;
1516                 switch (cmd->opc) {
1517                 case NVME_OPC_READ:
1518                         err = blockif_read(sc->nvstore.ctx, &req->io_req);
1519                         break;
1520                 case NVME_OPC_WRITE:
1521                         err = blockif_write(sc->nvstore.ctx, &req->io_req);
1522                         break;
1523                 default:
1524                         WPRINTF(("%s unhandled io command 0x%x\r\n",
1525                                  __func__, cmd->opc));
1526                         err = 1;
1527                 }
1528
1529 do_error:
1530                 if (err) {
1531                         uint16_t status;
1532
1533                         pci_nvme_status_genc(&status,
1534                             NVME_SC_DATA_TRANSFER_ERROR);
1535
1536                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1537                                                 status, 1);
1538                         pci_nvme_release_ioreq(sc, req);
1539                 }
1540         }
1541
1542         atomic_store_short(&sq->head, sqhead);
1543         atomic_store_int(&sq->busy, 0);
1544 }
1545
1546 static void
1547 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1548         uint64_t idx, int is_sq, uint64_t value)
1549 {
1550         DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1551                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1552
1553         if (is_sq) {
1554                 atomic_store_short(&sc->submit_queues[idx].tail,
1555                                    (uint16_t)value);
1556
1557                 if (idx == 0) {
1558                         pci_nvme_handle_admin_cmd(sc, value);
1559                 } else {
1560                         /* I/O submission queue: validate index, drain new entries */
1561                         if (idx > sc->num_squeues) {
1562                                 WPRINTF(("%s SQ index %lu overflow from "
1563                                          "guest (max %u)\r\n",
1564                                          __func__, idx, sc->num_squeues));
1565                                 return;
1566                         }
1567                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1568                 }
1569         } else {
1570                 if (idx > sc->num_cqueues) {
1571                         WPRINTF(("%s queue index %lu overflow from "
1572                                  "guest (max %u)\r\n",
1573                                  __func__, idx, sc->num_cqueues));
1574                         return;
1575                 }
1576
1577                 sc->compl_queues[idx].head = (uint16_t)value;
1578         }
1579 }
1580
1581 static void
1582 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1583 {
1584         const char *s = iswrite ? "WRITE" : "READ";
1585
1586         switch (offset) {
1587         case NVME_CR_CAP_LOW:
1588                 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1589                 break;
1590         case NVME_CR_CAP_HI:
1591                 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1592                 break;
1593         case NVME_CR_VS:
1594                 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1595                 break;
1596         case NVME_CR_INTMS:
1597                 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1598                 break;
1599         case NVME_CR_INTMC:
1600                 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1601                 break;
1602         case NVME_CR_CC:
1603                 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1604                 break;
1605         case NVME_CR_CSTS:
1606                 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1607                 break;
1608         case NVME_CR_NSSR:
1609                 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1610                 break;
1611         case NVME_CR_AQA:
1612                 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1613                 break;
1614         case NVME_CR_ASQ_LOW:
1615                 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1616                 break;
1617         case NVME_CR_ASQ_HI:
1618                 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1619                 break;
1620         case NVME_CR_ACQ_LOW:
1621                 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1622                 break;
1623         case NVME_CR_ACQ_HI:
1624                 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1625                 break;
1626         default:
1627                 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1628         }
1629
1630 }
1631
1632 static void
1633 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1634         uint64_t offset, int size, uint64_t value)
1635 {
1636         uint32_t ccreg;
1637
1638         if (offset >= NVME_DOORBELL_OFFSET) {
1639                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1640                 uint64_t idx = belloffset / 8; /* doorbell pair: 4-byte SQ tail + 4-byte CQ head */
1641                 int is_sq = (belloffset % 8) < 4;
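                /*
                 * Example, assuming the standard 0x1000 doorbell base: a
                 * write at BAR0 offset 0x1008 gives belloffset 8, so idx 1
                 * with is_sq true (SQ 1 tail doorbell); offset 0x100C would
                 * be CQ 1's head doorbell.
                 */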
1642
1643                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1644                         WPRINTF(("guest attempted an overflow write offset "
1645                                  "0x%lx, val 0x%lx in %s\r\n",
1646                                  offset, value, __func__));
1647                         return;
1648                 }
1649
1650                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1651                 return;
1652         }
1653
1654         DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1655                 offset, size, value));
1656
1657         if (size != 4) {
1658                 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1659                          "val 0x%lx) to bar0 in %s\r\n",
1660                          size, offset, value, __func__));
1661                 /* TODO: shutdown device */
1662                 return;
1663         }
1664
1665         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1666
1667         pthread_mutex_lock(&sc->mtx);
1668
1669         switch (offset) {
1670         case NVME_CR_CAP_LOW:
1671         case NVME_CR_CAP_HI:
1672                 /* readonly */
1673                 break;
1674         case NVME_CR_VS:
1675                 /* readonly */
1676                 break;
1677         case NVME_CR_INTMS:
1678                 /* MSI-X, so ignore */
1679                 break;
1680         case NVME_CR_INTMC:
1681                 /* MSI-X, so ignore */
1682                 break;
1683         case NVME_CR_CC:
1684                 ccreg = (uint32_t)value;
1685
1686                 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1687                          "iocqes %u\r\n",
1688                         __func__,
1689                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1690                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1691                          NVME_CC_GET_IOCQES(ccreg)));
1692
1693                 if (NVME_CC_GET_SHN(ccreg)) {
1694                         /* perform shutdown - flush out data to backend */
1695                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1696                             NVME_CSTS_REG_SHST_SHIFT);
1697                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1698                             NVME_CSTS_REG_SHST_SHIFT;
1699                 }
1700                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1701                         if (NVME_CC_GET_EN(ccreg) == 0)
1702                                 /* transition 1->0 causes controller reset */
1703                                 pci_nvme_reset_locked(sc);
1704                         else
1705                                 pci_nvme_init_controller(ctx, sc);
1706                 }
1707
1708                 /* Insert the iocqes, iosqes and en bits from the write */
1709                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1710                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1711                 if (NVME_CC_GET_EN(ccreg) == 0) {
1712                         /* Insert the ams, mps and css bit fields */
1713                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1714                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1715                         sc->regs.csts &= ~NVME_CSTS_RDY;
1716                 } else if (sc->pending_ios == 0) {
1717                         sc->regs.csts |= NVME_CSTS_RDY;
1718                 }
1719                 break;
1720         case NVME_CR_CSTS:
1721                 break;
1722         case NVME_CR_NSSR:
1723                 /* ignore writes; don't support subsystem reset */
1724                 break;
1725         case NVME_CR_AQA:
1726                 sc->regs.aqa = (uint32_t)value;
1727                 break;
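        /*
         * The admin queue base addresses must be 4KiB-aligned: the
         * 0xFFFFF000 mask below drops the reserved low 12 bits of the low
         * dword, while the high dword is written separately.
         */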
1728         case NVME_CR_ASQ_LOW:
1729                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1730                                (0xFFFFF000 & value);
1731                 break;
1732         case NVME_CR_ASQ_HI:
1733                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1734                                (value << 32);
1735                 break;
1736         case NVME_CR_ACQ_LOW:
1737                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1738                                (0xFFFFF000 & value);
1739                 break;
1740         case NVME_CR_ACQ_HI:
1741                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1742                                (value << 32);
1743                 break;
1744         default:
1745                 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1746                          __func__, offset, value, size));
1747         }
1748         pthread_mutex_unlock(&sc->mtx);
1749 }
1750
1751 static void
1752 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1753                 int baridx, uint64_t offset, int size, uint64_t value)
1754 {
1755         struct pci_nvme_softc* sc = pi->pi_arg;
1756
1757         if (baridx == pci_msix_table_bar(pi) ||
1758             baridx == pci_msix_pba_bar(pi)) {
1759                 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1760                          "value 0x%lx\r\n", baridx, offset, size, value));
1761
1762                 pci_emul_msix_twrite(pi, offset, size, value);
1763                 return;
1764         }
1765
1766         switch (baridx) {
1767         case 0:
1768                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1769                 break;
1770
1771         default:
1772                 DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1773                          __func__, baridx, value));
1774         }
1775 }
1776
1777 static uint64_t
1778 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
1779 {
1780         uint64_t value;
1781
1782         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1783
1784         if (offset < NVME_DOORBELL_OFFSET) {
1785                 void *p = &(sc->regs);
1786                 pthread_mutex_lock(&sc->mtx);
1787                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
1788                 pthread_mutex_unlock(&sc->mtx);
1789         } else {
1790                 value = 0;
1791                 WPRINTF(("pci_nvme: read invalid offset %lu\r\n", offset));
1792         }
1793
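        /*
         * Reads are served byte-wise from the shadow register file in
         * sc->regs, so 1/2/4-byte accesses at any sub-offset work; the
         * masking below trims the result to the access width.
         */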
1794         switch (size) {
1795         case 1:
1796                 value &= 0xFF;
1797                 break;
1798         case 2:
1799                 value &= 0xFFFF;
1800                 break;
1801         case 4:
1802                 value &= 0xFFFFFFFF;
1803                 break;
1804         }
1805
1806         DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1807                  offset, size, (uint32_t)value));
1808
1809         return (value);
1810 }
1811
1814 static uint64_t
1815 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1816     uint64_t offset, int size)
1817 {
1818         struct pci_nvme_softc* sc = pi->pi_arg;
1819
1820         if (baridx == pci_msix_table_bar(pi) ||
1821             baridx == pci_msix_pba_bar(pi)) {
1822                 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1823                         baridx, offset, size));
1824
1825                 return pci_emul_msix_tread(pi, offset, size);
1826         }
1827
1828         switch (baridx) {
1829         case 0:
1830                 return pci_nvme_read_bar_0(sc, offset, size);
1831
1832         default:
1833                 DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1834         }
1835
1836         return (0);
1837 }
1838
1839
1840 static int
1841 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1842 {
1843         char bident[sizeof("XX:X:X")];
1844         char    *uopt, *xopts, *config;
1845         uint32_t sectsz;
1846         int optidx;
1847
1848         sc->max_queues = NVME_QUEUES;
1849         sc->max_qentries = NVME_MAX_QENTRIES;
1850         sc->ioslots = NVME_IOSLOTS;
1851         sc->num_squeues = sc->max_queues;
1852         sc->num_cqueues = sc->max_queues;
1853         sectsz = 0;
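        /*
         * A hypothetical opts string, for illustration:
         *   "/dev/zvol/tank/nvme0,maxq=4,qsz=256,sectsz=4096,ser=SN0001"
         * An unrecognized first token is treated as the backing-store path.
         */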
1854
1855         uopt = strdup(opts);
1856         optidx = 0;
1857         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1858                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1859         for (xopts = strtok(uopt, ",");
1860              xopts != NULL;
1861              xopts = strtok(NULL, ",")) {
1862
1863                 if ((config = strchr(xopts, '=')) != NULL)
1864                         *config++ = '\0';
1865
1866                 if (!strcmp("maxq", xopts)) {
1867                         sc->max_queues = atoi(config);
1868                 } else if (!strcmp("qsz", xopts)) {
1869                         sc->max_qentries = atoi(config);
1870                 } else if (!strcmp("ioslots", xopts)) {
1871                         sc->ioslots = atoi(config);
1872                 } else if (!strcmp("sectsz", xopts)) {
1873                         sectsz = atoi(config);
1874                 } else if (!strcmp("ser", xopts)) {
1875                         /*
1876                          * This field indicates the Product Serial Number in
1877                          * 7-bit ASCII, unused bytes should be space characters.
1878                          * Ref: NVMe v1.3c.
1879                          */
1880                         cpywithpad((char *)sc->ctrldata.sn,
1881                                    sizeof(sc->ctrldata.sn), config, ' ');
1882                 } else if (!strcmp("ram", xopts)) {
1883                         uint64_t sz = strtoull(config, NULL, 10);
1884
1885                         sc->nvstore.type = NVME_STOR_RAM;
1886                         sc->nvstore.size = sz * 1024 * 1024;
1887                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1888                         sc->nvstore.sectsz = 4096;
1889                         sc->nvstore.sectsz_bits = 12;
1890                         if (sc->nvstore.ctx == NULL) {
1891                                 perror("Unable to allocate RAM");
1892                                 free(uopt);
1893                                 return (-1);
1894                         }
1895                 } else if (!strcmp("eui64", xopts)) {
1896                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
1897                 } else if (optidx == 0) {
1898                         snprintf(bident, sizeof(bident), "%d:%d",
1899                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1900                         sc->nvstore.ctx = blockif_open(xopts, bident);
1901                         if (sc->nvstore.ctx == NULL) {
1902                                 perror("Could not open backing file");
1903                                 free(uopt);
1904                                 return (-1);
1905                         }
1906                         sc->nvstore.type = NVME_STOR_BLOCKIF;
1907                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1908                 } else {
1909                         fprintf(stderr, "Invalid option %s\n", xopts);
1910                         free(uopt);
1911                         return (-1);
1912                 }
1913
1914                 optidx++;
1915         }
1916         free(uopt);
1917
1918         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1919                 fprintf(stderr, "backing store not specified\n");
1920                 return (-1);
1921         }
1922         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1923                 sc->nvstore.sectsz = sectsz;
1924         else if (sc->nvstore.type != NVME_STOR_RAM)
1925                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
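        /* Derive sectsz_bits = ceil(log2(sectsz)), e.g. 512 -> 9, 4096 -> 12. */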
1926         for (sc->nvstore.sectsz_bits = 9;
1927              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1928              sc->nvstore.sectsz_bits++);
1929
1930         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1931                 sc->max_queues = NVME_QUEUES;
1932
1933         if (sc->max_qentries <= 0) {
1934                 fprintf(stderr, "Invalid qsz option\n");
1935                 return (-1);
1936         }
1937         if (sc->ioslots <= 0) {
1938                 fprintf(stderr, "Invalid ioslots option\n");
1939                 return (-1);
1940         }
1941
1942         return (0);
1943 }
1944
1945 static int
1946 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1947 {
1948         struct pci_nvme_softc *sc;
1949         uint32_t pci_membar_sz;
1950         int     error;
1951
1952         error = 0;
1953
1954         sc = calloc(1, sizeof(struct pci_nvme_softc));
1955         pi->pi_arg = sc;
1956         sc->nsc_pi = pi;
1957
1958         error = pci_nvme_parse_opts(sc, opts);
1959         if (error < 0)
1960                 goto done;
1961         else
1962                 error = 0;
1963
1964         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
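        /*
         * Chain the preallocated ioreqs into a singly linked free list;
         * pci_nvme_get_ioreq() pops entries from ioreqs_free, throttled by
         * the iosemlock counting semaphore initialized to ioslots below.
         */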
1965         for (int i = 0; i < sc->ioslots; i++) {
1966                 if (i < (sc->ioslots-1))
1967                         sc->ioreqs[i].next = &sc->ioreqs[i+1];
1968                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1969                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1970         }
1971         sc->ioreqs_free = sc->ioreqs;
1972         sc->intr_coales_aggr_thresh = 1;
1973
1974         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1975         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1976         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1977         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1978         pci_set_cfgdata8(pi, PCIR_PROGIF,
1979                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1980
1981         /*
1982          * Allocate size of NVMe registers + doorbell space for all queues.
1983          *
1984          * The specification requires a minimum memory I/O window size of 16K.
1985          * The Windows driver will refuse to start a device with a smaller
1986          * window.
1987          */
1988         pci_membar_sz = sizeof(struct nvme_registers) +
1989             2 * sizeof(uint32_t) * (sc->max_queues + 1);
1990         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
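        /*
         * Example: with the default 16 queues, the doorbell area adds only
         * 2 * 4 * 17 = 136 bytes to the register file, so (assuming the
         * register file itself stays well under 16K) the MAX() above keeps
         * the BAR at NVME_MMIO_SPACE_MIN.
         */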
1991
1992         DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1993
1994         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1995         if (error) {
1996                 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1997                 goto done;
1998         }
1999
2000         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2001         if (error) {
2002                 WPRINTF(("%s pci add msixcap failed\r\n", __func__));
2003                 goto done;
2004         }
2005
2006         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2007         if (error) {
2008                 WPRINTF(("%s pci add Express capability failed\r\n", __func__));
2009                 goto done;
2010         }
2011
2012         pthread_mutex_init(&sc->mtx, NULL);
2013         sem_init(&sc->iosemlock, 0, sc->ioslots);
2014
2015         pci_nvme_reset(sc);
2016         pci_nvme_init_ctrldata(sc);
2017         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
2018         pci_nvme_init_logpages(sc);
2019
2020         pci_lintr_request(pi);
2021
2022 done:
2023         return (error);
2024 }
2025
2026
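/*
 * PCI_EMUL_SET() places this emulation in the pci_devemu linker set, so a
 * "-s <slot>,nvme,<opts>" bhyve argument resolves to the callbacks below.
 */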
2027 struct pci_devemu pci_de_nvme = {
2028         .pe_emu =       "nvme",
2029         .pe_init =      pci_nvme_init,
2030         .pe_barwrite =  pci_nvme_write,
2031         .pe_barread =   pci_nvme_read
2032 };
2033 PCI_EMUL_SET(pci_de_nvme);