1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Function crc16 Copyright (c) 2017, Fedor Uporov 
8  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31
32 /*
33  * bhyve PCIe-NVMe device emulation.
34  *
35  * options:
36  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
37  *
38  *  accepted devpath:
39  *    /dev/blockdev
40  *    /path/to/image
41  *    ram=size_in_MiB
42  *
43  *  maxq    = max number of queues
44  *  qsz     = max elements in each queue
45  *  ioslots = max number of concurrent io requests
46  *  sectsz  = sector size (defaults to blockif sector size)
47  *  ser     = serial number (20-chars max)
48  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
49  *
50  */
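
/*
 * Example invocations (hypothetical slot number, device path and sizes,
 * shown only to illustrate the option syntax above):
 *
 *   -s 4,nvme,/dev/zvol/zroot/nvmedisk,maxq=4,qsz=1024,ioslots=16,ser=BHYVENVME
 *   -s 4,nvme,ram=1024
 */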
51
52 /* TODO:
53     - create async event for smart and log
54     - intr coalesce
55  */
56
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
59
60 #include <sys/types.h>
61 #include <net/ieee_oui.h>
62
63 #include <assert.h>
64 #include <pthread.h>
65 #include <semaphore.h>
66 #include <stdbool.h>
67 #include <stddef.h>
68 #include <stdint.h>
69 #include <stdio.h>
70 #include <stdlib.h>
71 #include <string.h>
72
73 #include <machine/atomic.h>
74 #include <machine/vmm.h>
75 #include <vmmapi.h>
76
77 #include <dev/nvme/nvme.h>
78
79 #include "bhyverun.h"
80 #include "block_if.h"
81 #include "debug.h"
82 #include "pci_emul.h"
83
84
85 static int nvme_debug = 0;
86 #define DPRINTF(params) do { if (nvme_debug) PRINTLN params; } while (0)
87 #define WPRINTF(params) PRINTLN params
88
89 /* defaults; can be overridden */
90 #define NVME_MSIX_BAR           4
91
92 #define NVME_IOSLOTS            8
93
94 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
95 #define NVME_MMIO_SPACE_MIN     (1 << 14)
96
97 #define NVME_QUEUES             16
98 #define NVME_MAX_QENTRIES       2048
99
100 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
101 #define NVME_MAX_BLOCKIOVS      512
102
103 /* This is a synthetic status code to indicate there is no status */
104 #define NVME_NO_STATUS          0xffff
105 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
106
107 /* helpers */
108
109 /* Convert a zero-based value into a one-based value */
110 #define ONE_BASED(zero)         ((zero) + 1)
111 /* Convert a one-based value into a zero-based value */
112 #define ZERO_BASED(one)         ((one)  - 1)
113
114 /* Encode number of SQ's and CQ's for Set/Get Features */
115 #define NVME_FEATURE_NUM_QUEUES(sc) \
116         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
117         ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
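
/*
 * e.g. with num_squeues == 3 and num_cqueues == 3, the zero-based encoding
 * above yields 0x00020002 (the value 2 in each 16-bit half).
 */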
118
119 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
120
121 enum nvme_controller_register_offsets {
122         NVME_CR_CAP_LOW = 0x00,
123         NVME_CR_CAP_HI  = 0x04,
124         NVME_CR_VS      = 0x08,
125         NVME_CR_INTMS   = 0x0c,
126         NVME_CR_INTMC   = 0x10,
127         NVME_CR_CC      = 0x14,
128         NVME_CR_CSTS    = 0x1c,
129         NVME_CR_NSSR    = 0x20,
130         NVME_CR_AQA     = 0x24,
131         NVME_CR_ASQ_LOW = 0x28,
132         NVME_CR_ASQ_HI  = 0x2c,
133         NVME_CR_ACQ_LOW = 0x30,
134         NVME_CR_ACQ_HI  = 0x34,
135 };
136
137 enum nvme_cmd_cdw11 {
138         NVME_CMD_CDW11_PC  = 0x0001,
139         NVME_CMD_CDW11_IEN = 0x0002,
140         NVME_CMD_CDW11_IV  = 0xFFFF0000,
141 };
142
143 enum nvme_copy_dir {
144         NVME_COPY_TO_PRP,
145         NVME_COPY_FROM_PRP,
146 };
147
148 #define NVME_CQ_INTEN   0x01
149 #define NVME_CQ_INTCOAL 0x02
150
151 struct nvme_completion_queue {
152         struct nvme_completion *qbase;
153         uint32_t        size;
154         uint16_t        tail; /* nvme progress */
155         uint16_t        head; /* guest progress */
156         uint16_t        intr_vec;
157         uint32_t        intr_en;
158         pthread_mutex_t mtx;
159 };
160
161 struct nvme_submission_queue {
162         struct nvme_command *qbase;
163         uint32_t        size;
164         uint16_t        head; /* nvme progress */
165         uint16_t        tail; /* guest progress */
166         uint16_t        cqid; /* completion queue id */
167         int             busy; /* queue is being processed */
168         int             qpriority;
169 };
170
171 enum nvme_storage_type {
172         NVME_STOR_BLOCKIF = 0,
173         NVME_STOR_RAM = 1,
174 };
175
176 struct pci_nvme_blockstore {
177         enum nvme_storage_type type;
178         void            *ctx;
179         uint64_t        size;
180         uint32_t        sectsz;
181         uint32_t        sectsz_bits;
182         uint64_t        eui64;
183         uint32_t        deallocate:1;
184 };
185
186 struct pci_nvme_ioreq {
187         struct pci_nvme_softc *sc;
188         STAILQ_ENTRY(pci_nvme_ioreq) link;
189         struct nvme_submission_queue *nvme_sq;
190         uint16_t        sqid;
191
192         /* command information */
193         uint16_t        opc;
194         uint16_t        cid;
195         uint32_t        nsid;
196
197         uint64_t        prev_gpaddr;
198         size_t          prev_size;
199
200         /*
201          * lock if all iovs consumed (big IO);
202          * complete transaction before continuing
203          */
204         pthread_mutex_t mtx;
205         pthread_cond_t  cv;
206
207         struct blockif_req io_req;
208
209         /* pad to fit up to 512 page descriptors from guest IO request */
210         struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
211 };
212
213 enum nvme_dsm_type {
214         /* Dataset Management bit in ONCS reflects backing storage capability */
215         NVME_DATASET_MANAGEMENT_AUTO,
216         /* Unconditionally set Dataset Management bit in ONCS */
217         NVME_DATASET_MANAGEMENT_ENABLE,
218         /* Unconditionally clear Dataset Management bit in ONCS */
219         NVME_DATASET_MANAGEMENT_DISABLE,
220 };
221
222 struct pci_nvme_softc {
223         struct pci_devinst *nsc_pi;
224
225         pthread_mutex_t mtx;
226
227         struct nvme_registers regs;
228
229         struct nvme_namespace_data  nsdata;
230         struct nvme_controller_data ctrldata;
231         struct nvme_error_information_entry err_log;
232         struct nvme_health_information_page health_log;
233         struct nvme_firmware_page fw_log;
234
235         struct pci_nvme_blockstore nvstore;
236
237         uint16_t        max_qentries;   /* max entries per queue */
238         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
239         uint32_t        num_cqueues;
240         uint32_t        num_squeues;
241
242         struct pci_nvme_ioreq *ioreqs;
243         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
244         uint32_t        pending_ios;
245         uint32_t        ioslots;
246         sem_t           iosemlock;
247
248         /*
249          * Memory mapped Submission and Completion queues
250          * Each array includes both Admin and IO queues
251          */
252         struct nvme_completion_queue *compl_queues;
253         struct nvme_submission_queue *submit_queues;
254
255         /* controller features */
256         uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
257         uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
258         uint32_t        async_ev_config;         /* 0x0B: async event config */
259
260         enum nvme_dsm_type dataset_management;
261 };
262
263
264 static void pci_nvme_io_partial(struct blockif_req *br, int err);
265
266 /* Controller Configuration utils */
267 #define NVME_CC_GET_EN(cc) \
268         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
269 #define NVME_CC_GET_CSS(cc) \
270         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
271 #define NVME_CC_GET_SHN(cc) \
272         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
273 #define NVME_CC_GET_IOSQES(cc) \
274         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
275 #define NVME_CC_GET_IOCQES(cc) \
276         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
277
278 #define NVME_CC_WRITE_MASK \
279         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
280          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
281          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
282
283 #define NVME_CC_NEN_WRITE_MASK \
284         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
285          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
286          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
287
288 /* Controller Status utils */
289 #define NVME_CSTS_GET_RDY(sts) \
290         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
291
292 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
293
294 /* Completion Queue status word utils */
295 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
296 #define NVME_STATUS_MASK \
297         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
298          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
299
300 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
301         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
302
303 static __inline void
304 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
305 {
306         size_t len;
307
308         len = strnlen(src, dst_size);
309         memset(dst, pad, dst_size);
310         memcpy(dst, src, len);
311 }
312
313 static __inline void
314 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
315 {
316
317         *status &= ~NVME_STATUS_MASK;
318         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
319                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
320 }
321
322 static __inline void
323 pci_nvme_status_genc(uint16_t *status, uint16_t code)
324 {
325
326         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
327 }
328
329 static __inline void
330 pci_nvme_toggle_phase(uint16_t *status, int prev)
331 {
332
333         if (prev)
334                 *status &= ~NVME_STATUS_P;
335         else
336                 *status |= NVME_STATUS_P;
337 }
338
339 static void
340 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
341 {
342         struct nvme_controller_data *cd = &sc->ctrldata;
343
344         cd->vid = 0xFB5D;
345         cd->ssvid = 0x0000;
346
347         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
348         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
349
350         /* Num of submission commands that we can handle at a time (2^rab) */
351         cd->rab   = 4;
352
353         /* FreeBSD OUI */
354         cd->ieee[0] = 0x58;
355         cd->ieee[1] = 0x9c;
356         cd->ieee[2] = 0xfc;
357
358         cd->mic = 0;
359
360         cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
361
362         cd->ver = 0x00010300;
363
364         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
365         cd->acl = 2;
366         cd->aerl = 4;
367
368         cd->lpa = 0;    /* TODO: support some simple things like SMART */
369         cd->elpe = 0;   /* max error log page entries */
370         cd->npss = 1;   /* number of power states support */
371
372         /* Warning Composite Temperature Threshold */
373         cd->wctemp = 0x0157;
374
375         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
376             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
377         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
378             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
379         cd->nn = 1;     /* number of namespaces */
380
381         cd->oncs = 0;
382         switch (sc->dataset_management) {
383         case NVME_DATASET_MANAGEMENT_AUTO:
384                 if (sc->nvstore.deallocate)
385                         cd->oncs |= NVME_ONCS_DSM;
386                 break;
387         case NVME_DATASET_MANAGEMENT_ENABLE:
388                 cd->oncs |= NVME_ONCS_DSM;
389                 break;
390         default:
391                 break;
392         }
393
394         cd->fna = 0x03;
395
396         cd->power_state[0].mp = 10;
397 }
398
399 /*
400  * Calculate the CRC-16 of the given buffer
401  * See copyright attribution at top of file
402  */
403 static uint16_t
404 crc16(uint16_t crc, const void *buffer, unsigned int len)
405 {
406         const unsigned char *cp = buffer;
407         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
408         static uint16_t const crc16_table[256] = {
409                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
410                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
411                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
412                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
413                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
414                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
415                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
416                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
417                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
418                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
419                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
420                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
421                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
422                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
423                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
424                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
425                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
426                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
427                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
428                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
429                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
430                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
431                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
432                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
433                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
434                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
435                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
436                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
437                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
438                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
439                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
440                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
441         };
442
443         while (len--)
444                 crc = (((crc >> 8) & 0xffU) ^
445                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
446         return crc;
447 }
448
449 static void
450 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
451     struct nvme_namespace_data *nd, uint32_t nsid,
452     struct pci_nvme_blockstore *nvstore)
453 {
454
455         /* Get capacity and block size information from backing store */
456         nd->nsze = nvstore->size / nvstore->sectsz;
457         nd->ncap = nd->nsze;
458         nd->nuse = nd->nsze;
459
460         if (nvstore->type == NVME_STOR_BLOCKIF)
461                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
462
463         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
464         nd->flbas = 0;
465
466         /* Create an EUI-64 if user did not provide one */
467         if (nvstore->eui64 == 0) {
468                 char *data = NULL;
469                 uint64_t eui64 = nvstore->eui64;
470
471                 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
472                     sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
473
474                 if (data != NULL) {
475                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
476                         free(data);
477                 }
478                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
479         }
480         be64enc(nd->eui64, nvstore->eui64);
481
482         /* LBA data-sz = 2^lbads */
483         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
484 }
485
486 static void
487 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
488 {
489
490         memset(&sc->err_log, 0, sizeof(sc->err_log));
491         memset(&sc->health_log, 0, sizeof(sc->health_log));
492         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
493 }
494
495 static void
496 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
497 {
498         DPRINTF(("%s", __func__));
499
500         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
501             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
502             (60 << NVME_CAP_LO_REG_TO_SHIFT);
503
504         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
505
506         sc->regs.vs = 0x00010300;       /* NVMe v1.3 */
507
508         sc->regs.cc = 0;
509         sc->regs.csts = 0;
510
511         sc->num_cqueues = sc->num_squeues = sc->max_queues;
512         if (sc->submit_queues != NULL) {
513                 for (int i = 0; i < sc->num_squeues + 1; i++) {
514                         /*
515                          * The Admin Submission Queue is at index 0.
516                          * It must not be changed at reset otherwise the
517                          * emulation will be out of sync with the guest.
518                          */
519                         if (i != 0) {
520                                 sc->submit_queues[i].qbase = NULL;
521                                 sc->submit_queues[i].size = 0;
522                                 sc->submit_queues[i].cqid = 0;
523                         }
524                         sc->submit_queues[i].tail = 0;
525                         sc->submit_queues[i].head = 0;
526                         sc->submit_queues[i].busy = 0;
527                 }
528         } else
529                 sc->submit_queues = calloc(sc->num_squeues + 1,
530                                         sizeof(struct nvme_submission_queue));
531
532         if (sc->compl_queues != NULL) {
533                 for (int i = 0; i < sc->num_cqueues + 1; i++) {
534                         /* See Admin Submission Queue note above */
535                         if (i != 0) {
536                                 sc->compl_queues[i].qbase = NULL;
537                                 sc->compl_queues[i].size = 0;
538                         }
539
540                         sc->compl_queues[i].tail = 0;
541                         sc->compl_queues[i].head = 0;
542                 }
543         } else {
544                 sc->compl_queues = calloc(sc->num_cqueues + 1,
545                                         sizeof(struct nvme_completion_queue));
546
547                 for (int i = 0; i < sc->num_cqueues + 1; i++)
548                         pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
549         }
550 }
551
552 static void
553 pci_nvme_reset(struct pci_nvme_softc *sc)
554 {
555         pthread_mutex_lock(&sc->mtx);
556         pci_nvme_reset_locked(sc);
557         pthread_mutex_unlock(&sc->mtx);
558 }
559
560 static void
561 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
562 {
563         uint16_t acqs, asqs;
564
565         DPRINTF(("%s", __func__));
566
567         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
568         sc->submit_queues[0].size = asqs;
569         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
570                     sizeof(struct nvme_command) * asqs);
571
572         DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
573                 __func__, sc->regs.asq, sc->submit_queues[0].qbase));
574
575         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 
576             NVME_AQA_REG_ACQS_MASK) + 1;
577         sc->compl_queues[0].size = acqs;
578         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
579                  sizeof(struct nvme_completion) * acqs);
580         DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
581                 __func__, sc->regs.acq, sc->compl_queues[0].qbase));
582 }
583
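/*
 * Copy 'len' bytes between a host buffer and a guest buffer described by a
 * bare PRP pair: the transfer starts at prp1, continues into the single
 * page at prp2 if needed, and transfers larger than 8KB (i.e. requiring a
 * PRP list) are rejected.
 */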
584 static int
585 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
586         size_t len, enum nvme_copy_dir dir)
587 {
588         uint8_t *p;
589         size_t bytes;
590
591         if (len > (8 * 1024)) {
592                 return (-1);
593         }
594
595         /* Copy from the start of prp1 to the end of the physical page */
596         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
597         bytes = MIN(bytes, len);
598
599         p = vm_map_gpa(ctx, prp1, bytes);
600         if (p == NULL) {
601                 return (-1);
602         }
603
604         if (dir == NVME_COPY_TO_PRP)
605                 memcpy(p, b, bytes);
606         else
607                 memcpy(b, p, bytes);
608
609         b += bytes;
610
611         len -= bytes;
612         if (len == 0) {
613                 return (0);
614         }
615
616         len = MIN(len, PAGE_SIZE);
617
618         p = vm_map_gpa(ctx, prp2, len);
619         if (p == NULL) {
620                 return (-1);
621         }
622
623         if (dir == NVME_COPY_TO_PRP)
624                 memcpy(p, b, len);
625         else
626                 memcpy(b, p, len);
627
628         return (0);
629 }
630
631 static int
632 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
633         struct nvme_completion* compl)
634 {
635         uint16_t qid = command->cdw10 & 0xffff;
636
637         DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
638         if (qid == 0 || qid > sc->num_squeues) {
639                 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
640                         __func__, qid, sc->num_squeues));
641                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
642                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
643                 return (1);
644         }
645
646         sc->submit_queues[qid].qbase = NULL;
647         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
648         return (1);
649 }
650
651 static int
652 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
653         struct nvme_completion* compl)
654 {
655         if (command->cdw11 & NVME_CMD_CDW11_PC) {
656                 uint16_t qid = command->cdw10 & 0xffff;
657                 struct nvme_submission_queue *nsq;
658
659                 if ((qid == 0) || (qid > sc->num_squeues)) {
660                         WPRINTF(("%s queue index %u > num_squeues %u",
661                                 __func__, qid, sc->num_squeues));
662                         pci_nvme_status_tc(&compl->status,
663                             NVME_SCT_COMMAND_SPECIFIC,
664                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
665                         return (1);
666                 }
667
668                 nsq = &sc->submit_queues[qid];
669                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
670
671                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
672                               sizeof(struct nvme_command) * (size_t)nsq->size);
673                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
674                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
675
676                 DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
677                         qid, nsq->size, nsq->qbase, nsq->cqid));
678
679                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
680
681                 DPRINTF(("%s completed creating IOSQ qid %u",
682                          __func__, qid));
683         } else {
684                 /* 
685                  * Guest sent non-cont submission queue request.
686                  * This setting is unsupported by this emulation.
687                  */
688                 WPRINTF(("%s unsupported non-contig (list-based) "
689                          "create i/o submission queue", __func__));
690
691                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
692         }
693         return (1);
694 }
695
696 static int
697 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
698         struct nvme_completion* compl)
699 {
700         uint16_t qid = command->cdw10 & 0xffff;
701
702         DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
703         if (qid == 0 || qid > sc->num_cqueues) {
704                 WPRINTF(("%s queue index %u / num_cqueues %u",
705                         __func__, qid, sc->num_cqueues));
706                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
707                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
708                 return (1);
709         }
710
711         sc->compl_queues[qid].qbase = NULL;
712         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
713         return (1);
714 }
715
716 static int
717 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
718         struct nvme_completion* compl)
719 {
720         if (command->cdw11 & NVME_CMD_CDW11_PC) {
721                 uint16_t qid = command->cdw10 & 0xffff;
722                 struct nvme_completion_queue *ncq;
723
724                 if ((qid == 0) || (qid > sc->num_cqueues)) {
725                         WPRINTF(("%s queue index %u > num_cqueues %u",
726                                 __func__, qid, sc->num_cqueues));
727                         pci_nvme_status_tc(&compl->status,
728                             NVME_SCT_COMMAND_SPECIFIC,
729                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
730                         return (1);
731                 }
732
733                 ncq = &sc->compl_queues[qid];
734                 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
735                 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
736                 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
737
738                 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
739                              command->prp1,
740                              sizeof(struct nvme_completion) * (size_t)ncq->size);
741
742                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
743         } else {
744                 /* 
745                  * Non-contig completion queue unsupported.
746                  */
747                 WPRINTF(("%s unsupported non-contig (list-based) "
748                          "create i/o completion queue",
749                          __func__));
750
751                 /* 0x12 = Invalid Use of Controller Memory Buffer */
752                 pci_nvme_status_genc(&compl->status, 0x12);
753         }
754
755         return (1);
756 }
757
758 static int
759 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
760         struct nvme_completion* compl)
761 {
762         uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
763         uint8_t logpage = command->cdw10 & 0xFF;
764
765         DPRINTF(("%s log page %u len %u", __func__, logpage, logsize));
766
767         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
768
769         switch (logpage) {
770         case NVME_LOG_ERROR:
771                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
772                     command->prp2, (uint8_t *)&sc->err_log, logsize,
773                     NVME_COPY_TO_PRP);
774                 break;
775         case NVME_LOG_HEALTH_INFORMATION:
776                 /* TODO: present some smart info */
777                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
778                     command->prp2, (uint8_t *)&sc->health_log, logsize,
779                     NVME_COPY_TO_PRP);
780                 break;
781         case NVME_LOG_FIRMWARE_SLOT:
782                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
783                     command->prp2, (uint8_t *)&sc->fw_log, logsize,
784                     NVME_COPY_TO_PRP);
785                 break;
786         default:
787                 WPRINTF(("%s get log page %x command not supported",
788                         __func__, logpage));
789
790                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
791                     NVME_SC_INVALID_LOG_PAGE);
792         }
793
794         return (1);
795 }
796
797 static int
798 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
799         struct nvme_completion* compl)
800 {
801         void *dest;
802
803         DPRINTF(("%s identify 0x%x nsid 0x%x", __func__,
804                 command->cdw10 & 0xFF, command->nsid));
805
806         switch (command->cdw10 & 0xFF) {
807         case 0x00: /* return Identify Namespace data structure */
808                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
809                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
810                     NVME_COPY_TO_PRP);
811                 break;
812         case 0x01: /* return Identify Controller data structure */
813                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
814                     command->prp2, (uint8_t *)&sc->ctrldata,
815                     sizeof(sc->ctrldata),
816                     NVME_COPY_TO_PRP);
817                 break;
818         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
819                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
820                                   sizeof(uint32_t) * 1024);
821                 ((uint32_t *)dest)[0] = 1;
822                 ((uint32_t *)dest)[1] = 0;
823                 break;
824         case 0x11:
825                 pci_nvme_status_genc(&compl->status,
826                     NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
827                 return (1);
828         case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
829         case 0x10:
830         case 0x12:
831         case 0x13:
832         case 0x14:
833         case 0x15:
834         default:
835                 DPRINTF(("%s unsupported identify command requested 0x%x",
836                          __func__, command->cdw10 & 0xFF));
837                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
838                 return (1);
839         }
840
841         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
842         return (1);
843 }
844
845 static int
846 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
847         struct nvme_completion* compl)
848 {
849         uint16_t nqr;   /* Number of Queues Requested */
850
851         nqr = command->cdw11 & 0xFFFF;
852         if (nqr == 0xffff) {
853                 WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr));
854                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
855                 return (-1);
856         }
857
858         sc->num_squeues = ONE_BASED(nqr);
859         if (sc->num_squeues > sc->max_queues) {
860                 DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues,
861                                         sc->max_queues));
862                 sc->num_squeues = sc->max_queues;
863         }
864
865         nqr = (command->cdw11 >> 16) & 0xFFFF;
866         if (nqr == 0xffff) {
867                 WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr));
868                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
869                 return (-1);
870         }
871
872         sc->num_cqueues = ONE_BASED(nqr);
873         if (sc->num_cqueues > sc->max_queues) {
874                 DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues,
875                                         sc->max_queues));
876                 sc->num_cqueues = sc->max_queues;
877         }
878
879         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
880
881         return (0);
882 }
883
884 static int
885 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
886         struct nvme_completion* compl)
887 {
888         int feature = command->cdw10 & 0xFF;
889         uint32_t iv;
890
891         DPRINTF(("%s feature 0x%x", __func__, feature));
892         compl->cdw0 = 0;
893
894         switch (feature) {
895         case NVME_FEAT_ARBITRATION:
896                 DPRINTF(("  arbitration 0x%x", command->cdw11));
897                 break;
898         case NVME_FEAT_POWER_MANAGEMENT:
899                 DPRINTF(("  power management 0x%x", command->cdw11));
900                 break;
901         case NVME_FEAT_LBA_RANGE_TYPE:
902                 DPRINTF(("  lba range 0x%x", command->cdw11));
903                 break;
904         case NVME_FEAT_TEMPERATURE_THRESHOLD:
905                 DPRINTF(("  temperature threshold 0x%x", command->cdw11));
906                 break;
907         case NVME_FEAT_ERROR_RECOVERY:
908                 DPRINTF(("  error recovery 0x%x", command->cdw11));
909                 break;
910         case NVME_FEAT_VOLATILE_WRITE_CACHE:
911                 DPRINTF(("  volatile write cache 0x%x", command->cdw11));
912                 break;
913         case NVME_FEAT_NUMBER_OF_QUEUES:
914                 nvme_set_feature_queues(sc, command, compl);
915                 break;
916         case NVME_FEAT_INTERRUPT_COALESCING:
917                 DPRINTF(("  interrupt coalescing 0x%x", command->cdw11));
918
919                 /* in uS */
920                 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
921
922                 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
923                 break;
924         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
925                 iv = command->cdw11 & 0xFFFF;
926
927                 DPRINTF(("  interrupt vector configuration 0x%x",
928                         command->cdw11));
929
930                 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
931                         if (sc->compl_queues[i].intr_vec == iv) {
932                                 if (command->cdw11 & (1 << 16))
933                                         sc->compl_queues[i].intr_en |=
934                                                               NVME_CQ_INTCOAL;  
935                                 else
936                                         sc->compl_queues[i].intr_en &=
937                                                              ~NVME_CQ_INTCOAL;  
938                         }
939                 }
940                 break;
941         case NVME_FEAT_WRITE_ATOMICITY:
942                 DPRINTF(("  write atomicity 0x%x", command->cdw11));
943                 break;
944         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
945                 DPRINTF(("  async event configuration 0x%x",
946                         command->cdw11));
947                 sc->async_ev_config = command->cdw11;
948                 break;
949         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
950                 DPRINTF(("  software progress marker 0x%x",
951                         command->cdw11));
952                 break;
953         case 0x0C:
954                 DPRINTF(("  autonomous power state transition 0x%x",
955                         command->cdw11));
956                 break;
957         default:
958                 WPRINTF(("%s invalid feature", __func__));
959                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
960                 return (1);
961         }
962
963         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
964         return (1);
965 }
966
967 static int
968 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
969         struct nvme_completion* compl)
970 {
971         int feature = command->cdw10 & 0xFF;
972
973         DPRINTF(("%s feature 0x%x", __func__, feature));
974
975         compl->cdw0 = 0;
976
977         switch (feature) {
978         case NVME_FEAT_ARBITRATION:
979                 DPRINTF(("  arbitration"));
980                 break;
981         case NVME_FEAT_POWER_MANAGEMENT:
982                 DPRINTF(("  power management"));
983                 break;
984         case NVME_FEAT_LBA_RANGE_TYPE:
985                 DPRINTF(("  lba range"));
986                 break;
987         case NVME_FEAT_TEMPERATURE_THRESHOLD:
988                 DPRINTF(("  temperature threshold"));
989                 switch ((command->cdw11 >> 20) & 0x3) {
990                 case 0:
991                         /* Over temp threshold */
992                         compl->cdw0 = 0xFFFF;
993                         break;
994                 case 1:
995                         /* Under temp threshold */
996                         compl->cdw0 = 0;
997                         break;
998                 default:
999                         WPRINTF(("  invalid threshold type select"));
1000                         pci_nvme_status_genc(&compl->status,
1001                             NVME_SC_INVALID_FIELD);
1002                         return (1);
1003                 }
1004                 break;
1005         case NVME_FEAT_ERROR_RECOVERY:
1006                 DPRINTF(("  error recovery"));
1007                 break;
1008         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1009                 DPRINTF(("  volatile write cache"));
1010                 break;
1011         case NVME_FEAT_NUMBER_OF_QUEUES:
1012                 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1013
1014                 DPRINTF(("  number of queues (submit %u, completion %u)",
1015                         compl->cdw0 & 0xFFFF,
1016                         (compl->cdw0 >> 16) & 0xFFFF));
1017
1018                 break;
1019         case NVME_FEAT_INTERRUPT_COALESCING:
1020                 DPRINTF(("  interrupt coalescing"));
1021                 break;
1022         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1023                 DPRINTF(("  interrupt vector configuration"));
1024                 break;
1025         case NVME_FEAT_WRITE_ATOMICITY:
1026                 DPRINTF(("  write atomicity"));
1027                 break;
1028         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1029                 DPRINTF(("  async event configuration"));
1030                 sc->async_ev_config = command->cdw11;
1031                 break;
1032         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1033                 DPRINTF(("  software progress marker"));
1034                 break;
1035         case 0x0C:
1036                 DPRINTF(("  autonomous power state transition"));
1037                 break;
1038         default:
1039                 WPRINTF(("%s invalid feature 0x%x", __func__, feature));
1040                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1041                 return (1);
1042         }
1043
1044         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1045         return (1);
1046 }
1047
1048 static int
1049 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1050         struct nvme_completion* compl)
1051 {
1052         DPRINTF(("%s submission queue %u, command ID 0x%x", __func__,
1053                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1054
1055         /* TODO: search for the command ID and abort it */
1056
1057         compl->cdw0 = 1;
1058         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1059         return (1);
1060 }
1061
1062 static int
1063 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1064         struct nvme_command* command, struct nvme_completion* compl)
1065 {
1066         DPRINTF(("%s async event request 0x%x", __func__, command->cdw11));
1067
1068         /*
1069          * TODO: raise events when they happen based on the Set Features cmd.
1070          * These events happen async, so only set completion successful if
1071          * there is an event reflective of the request to get event.
1072          */
1073         pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1074             NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1075         return (0);
1076 }
1077
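/*
 * Process Admin Submission Queue entries from the current head up to the
 * doorbell tail, post completions (with phase toggle) to the Admin CQ, and
 * assert MSI-X vector 0 if the Admin CQ is non-empty.
 */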
1078 static void
1079 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1080 {
1081         struct nvme_completion compl;
1082         struct nvme_command *cmd;
1083         struct nvme_submission_queue *sq;
1084         struct nvme_completion_queue *cq;
1085         uint16_t sqhead;
1086
1087         DPRINTF(("%s index %u", __func__, (uint32_t)value));
1088
1089         sq = &sc->submit_queues[0];
1090         cq = &sc->compl_queues[0];
1091
1092         sqhead = atomic_load_acq_short(&sq->head);
1093
1094         if (atomic_testandset_int(&sq->busy, 1)) {
1095                 DPRINTF(("%s SQ busy, head %u, tail %u",
1096                         __func__, sqhead, sq->tail));
1097                 return;
1098         }
1099
1100         DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail));
1101         
1102         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1103                 cmd = &(sq->qbase)[sqhead];
1104                 compl.cdw0 = 0;
1105                 compl.status = 0;
1106
1107                 switch (cmd->opc) {
1108                 case NVME_OPC_DELETE_IO_SQ:
1109                         DPRINTF(("%s command DELETE_IO_SQ", __func__));
1110                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1111                         break;
1112                 case NVME_OPC_CREATE_IO_SQ:
1113                         DPRINTF(("%s command CREATE_IO_SQ", __func__));
1114                         nvme_opc_create_io_sq(sc, cmd, &compl);
1115                         break;
1116                 case NVME_OPC_DELETE_IO_CQ:
1117                         DPRINTF(("%s command DELETE_IO_CQ", __func__));
1118                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1119                         break;
1120                 case NVME_OPC_CREATE_IO_CQ:
1121                         DPRINTF(("%s command CREATE_IO_CQ", __func__));
1122                         nvme_opc_create_io_cq(sc, cmd, &compl);
1123                         break;
1124                 case NVME_OPC_GET_LOG_PAGE:
1125                         DPRINTF(("%s command GET_LOG_PAGE", __func__));
1126                         nvme_opc_get_log_page(sc, cmd, &compl);
1127                         break;
1128                 case NVME_OPC_IDENTIFY:
1129                         DPRINTF(("%s command IDENTIFY", __func__));
1130                         nvme_opc_identify(sc, cmd, &compl);
1131                         break;
1132                 case NVME_OPC_ABORT:
1133                         DPRINTF(("%s command ABORT", __func__));
1134                         nvme_opc_abort(sc, cmd, &compl);
1135                         break;
1136                 case NVME_OPC_SET_FEATURES:
1137                         DPRINTF(("%s command SET_FEATURES", __func__));
1138                         nvme_opc_set_features(sc, cmd, &compl);
1139                         break;
1140                 case NVME_OPC_GET_FEATURES:
1141                         DPRINTF(("%s command GET_FEATURES", __func__));
1142                         nvme_opc_get_features(sc, cmd, &compl);
1143                         break;
1144                 case NVME_OPC_ASYNC_EVENT_REQUEST:
1145                         DPRINTF(("%s command ASYNC_EVENT_REQ", __func__));
1146                         /* XXX don't care, unhandled for now
1147                         nvme_opc_async_event_req(sc, cmd, &compl);
1148                         */
1149                         compl.status = NVME_NO_STATUS;
1150                         break;
1151                 default:
1152                         WPRINTF(("0x%x command is not implemented",
1153                             cmd->opc));
1154                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1155                 }
1156                 sqhead = (sqhead + 1) % sq->size;
1157
1158                 if (NVME_COMPLETION_VALID(compl)) {
1159                         struct nvme_completion *cp;
1160                         int phase;
1161
1162                         cp = &(cq->qbase)[cq->tail];
1163                         cp->cdw0 = compl.cdw0;
1164                         cp->sqid = 0;
1165                         cp->sqhd = sqhead;
1166                         cp->cid = cmd->cid;
1167
1168                         phase = NVME_STATUS_GET_P(cp->status);
1169                         cp->status = compl.status;
1170                         pci_nvme_toggle_phase(&cp->status, phase);
1171
1172                         cq->tail = (cq->tail + 1) % cq->size;
1173                 }
1174         }
1175
1176         DPRINTF(("setting sqhead %u", sqhead));
1177         atomic_store_short(&sq->head, sqhead);
1178         atomic_store_int(&sq->busy, 0);
1179
1180         if (cq->head != cq->tail)
1181                 pci_generate_msix(sc->nsc_pi, 0);
1182
1183 }
1184
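/*
 * Add one guest data page to the blockif request, merging it with the
 * previous iov when the guest addresses are contiguous.  If the request
 * already holds NVME_MAX_BLOCKIOVS entries, the accumulated iovs are
 * submitted as a partial I/O and the caller waits for it to complete
 * before continuing.  For RAM-backed storage the data is copied directly.
 */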
1185 static int
1186 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1187         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1188 {
1189         int iovidx;
1190
1191         if (req != NULL) {
1192                 /* concatenate contig block-iovs to minimize number of iovs */
1193                 if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1194                         iovidx = req->io_req.br_iovcnt - 1;
1195
1196                         req->io_req.br_iov[iovidx].iov_base =
1197                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1198                                              req->prev_gpaddr, size);
1199
1200                         req->prev_size += size;
1201                         req->io_req.br_resid += size;
1202
1203                         req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1204                 } else {
1205                         pthread_mutex_lock(&req->mtx);
1206
1207                         iovidx = req->io_req.br_iovcnt;
1208                         if (iovidx == NVME_MAX_BLOCKIOVS) {
1209                                 int err = 0;
1210
1211                                 DPRINTF(("large I/O, doing partial req"));
1212
1213                                 iovidx = 0;
1214                                 req->io_req.br_iovcnt = 0;
1215
1216                                 req->io_req.br_callback = pci_nvme_io_partial;
1217
1218                                 if (!do_write)
1219                                         err = blockif_read(sc->nvstore.ctx,
1220                                                            &req->io_req);
1221                                 else
1222                                         err = blockif_write(sc->nvstore.ctx,
1223                                                             &req->io_req);
1224
1225                                 /* wait until req completes before cont */
1226                                 if (err == 0)
1227                                         pthread_cond_wait(&req->cv, &req->mtx);
1228                         }
1229                         if (iovidx == 0) {
1230                                 req->io_req.br_offset = lba;
1231                                 req->io_req.br_resid = 0;
1232                                 req->io_req.br_param = req;
1233                         }
1234
1235                         req->io_req.br_iov[iovidx].iov_base =
1236                             paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1237                                              gpaddr, size);
1238
1239                         req->io_req.br_iov[iovidx].iov_len = size;
1240
1241                         req->prev_gpaddr = gpaddr;
1242                         req->prev_size = size;
1243                         req->io_req.br_resid += size;
1244
1245                         req->io_req.br_iovcnt++;
1246
1247                         pthread_mutex_unlock(&req->mtx);
1248                 }
1249         } else {
1250                 /* RAM buffer: read/write directly */
1251                 void *p = sc->nvstore.ctx;
1252                 void *gptr;
1253
1254                 if ((lba + size) > sc->nvstore.size) {
1255                         WPRINTF(("%s write would overflow RAM", __func__));
1256                         return (-1);
1257                 }
1258
1259                 p = (void *)((uintptr_t)p + (uintptr_t)lba);
1260                 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1261                 if (do_write) 
1262                         memcpy(p, gptr, size);
1263                 else
1264                         memcpy(gptr, p, size);
1265         }
1266         return (0);
1267 }
1268
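/*
 * Post a completion entry for the given submission queue to its associated
 * completion queue, toggling the phase bit, and raise the queue's MSI-X
 * vector if interrupts are enabled for that CQ.
 */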
1269 static void
1270 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1271         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1272         uint32_t cdw0, uint16_t status, int ignore_busy)
1273 {
1274         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1275         struct nvme_completion *compl;
1276         int phase;
1277
1278         DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1279                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1280                  NVME_STATUS_GET_SC(status)));
1281
1282         pthread_mutex_lock(&cq->mtx);
1283
1284         assert(cq->qbase != NULL);
1285
1286         compl = &cq->qbase[cq->tail];
1287
1288         compl->cdw0 = cdw0;
1289         compl->sqid = sqid;
1290         compl->sqhd = atomic_load_acq_short(&sq->head);
1291         compl->cid = cid;
1292
1293         /* toggle phase */
1294         phase = NVME_STATUS_GET_P(compl->status);
1295         compl->status = status;
1296         pci_nvme_toggle_phase(&compl->status, phase);
1297
1298         cq->tail = (cq->tail + 1) % cq->size;
1299
1300         pthread_mutex_unlock(&cq->mtx);
1301
1302         if (cq->head != cq->tail) {
1303                 if (cq->intr_en & NVME_CQ_INTEN) {
1304                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1305                 } else {
1306                         DPRINTF(("%s: CQ%u interrupt disabled\n",
1307                                                 __func__, sq->cqid));
1308                 }
1309         }
1310 }
1311
1312 static void
1313 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1314 {
1315         req->sc = NULL;
1316         req->nvme_sq = NULL;
1317         req->sqid = 0;
1318
1319         pthread_mutex_lock(&sc->mtx);
1320
1321         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1322         sc->pending_ios--;
1323
1324         /* when no more IO pending, can set to ready if device reset/enabled */
1325         if (sc->pending_ios == 0 &&
1326             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1327                 sc->regs.csts |= NVME_CSTS_RDY;
1328
1329         pthread_mutex_unlock(&sc->mtx);
1330
1331         sem_post(&sc->iosemlock);
1332 }
1333
1334 static struct pci_nvme_ioreq *
1335 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1336 {
1337         struct pci_nvme_ioreq *req = NULL;
1338
1339         sem_wait(&sc->iosemlock);
1340         pthread_mutex_lock(&sc->mtx);
1341
1342         req = STAILQ_FIRST(&sc->ioreqs_free);
1343         assert(req != NULL);
1344         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1345
1346         req->sc = sc;
1347
1348         sc->pending_ios++;
1349
1350         pthread_mutex_unlock(&sc->mtx);
1351
1352         req->io_req.br_iovcnt = 0;
1353         req->io_req.br_offset = 0;
1354         req->io_req.br_resid = 0;
1355         req->io_req.br_param = req;
1356         req->prev_gpaddr = 0;
1357         req->prev_size = 0;
1358
1359         return req;
1360 }
1361
1362 static void
1363 pci_nvme_io_done(struct blockif_req *br, int err)
1364 {
1365         struct pci_nvme_ioreq *req = br->br_param;
1366         struct nvme_submission_queue *sq = req->nvme_sq;
1367         uint16_t code, status;
1368
1369         DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1370
1371         /* TODO return correct error */
1372         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1373         pci_nvme_status_genc(&status, code);
1374
1375         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1376         pci_nvme_release_ioreq(req->sc, req);
1377 }
1378
1379 static void
1380 pci_nvme_io_partial(struct blockif_req *br, int err)
1381 {
1382         struct pci_nvme_ioreq *req = br->br_param;
1383
1384         DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1385
1386         pthread_cond_signal(&req->cv);
1387 }
1388
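/*
 * Completion callback used to step through a multi-range Dataset Management
 * (deallocate) request one range at a time: prev_gpaddr tracks the current
 * range index and prev_size the total number of ranges (see
 * nvme_opc_dataset_mgmt()).
 */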
1389 static void
1390 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1391 {
1392         struct pci_nvme_ioreq *req = br->br_param;
1393         struct pci_nvme_softc *sc = req->sc;
1394         bool done = true;
1395         uint16_t status;
1396
1397         if (err) {
1398                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1399         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1400                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1401         } else {
1402                 struct iovec *iov = req->io_req.br_iov;
1403
1404                 req->prev_gpaddr++;
1405                 iov += req->prev_gpaddr;
1406
1407                 /* The iov_* values already include the sector size */
1408                 req->io_req.br_offset = (off_t)iov->iov_base;
1409                 req->io_req.br_resid = iov->iov_len;
1410                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1411                         pci_nvme_status_genc(&status,
1412                             NVME_SC_INTERNAL_DEVICE_ERROR);
1413                 } else
1414                         done = false;
1415         }
1416
1417         if (done) {
1418                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1419                     req->cid, 0, status, 0);
1420                 pci_nvme_release_ioreq(sc, req);
1421         }
1422 }
1423
1424 static int
1425 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1426     struct nvme_command *cmd,
1427     struct pci_nvme_blockstore *nvstore,
1428     struct pci_nvme_ioreq *req,
1429     uint16_t *status)
1430 {
1431         int err = -1;
1432
1433         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1434                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1435                 goto out;
1436         }
1437
1438         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1439                 struct nvme_dsm_range *range;
1440                 uint32_t nr, r;
1441                 int sectsz = sc->nvstore.sectsz;
1442
1443                 /*
1444                  * DSM calls are advisory only, and compliant controllers
1445                  * may choose to take no actions (i.e. return Success).
1446                  */
1447                 if (!nvstore->deallocate) {
1448                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1449                         goto out;
1450                 }
1451
1452                 if (req == NULL) {
1453                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1454                         goto out;
1455                 }
1456
1457                 /* copy locally because a range entry could straddle PRPs */
1458                 range = calloc(1, NVME_MAX_DSM_TRIM);
1459                 if (range == NULL) {
1460                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1461                         goto out;
1462                 }
1463                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1464                     (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1465
1466                 req->opc = cmd->opc;
1467                 req->cid = cmd->cid;
1468                 req->nsid = cmd->nsid;
1469                 /*
1470                  * If the request is for more than a single range, store
1471                  * the ranges in the br_iov. Optimize for the common case
1472                  * of a single range.
1473                  *
1474                  * Note that NVMe Number of Ranges is a zero based value
1475                  */
1476                 nr = cmd->cdw10 & 0xff;
1477
1478                 req->io_req.br_iovcnt = 0;
1479                 req->io_req.br_offset = range[0].starting_lba * sectsz;
1480                 req->io_req.br_resid = range[0].length * sectsz;
1481
1482                 if (nr == 0) {
1483                         req->io_req.br_callback = pci_nvme_io_done;
1484                 } else {
1485                         struct iovec *iov = req->io_req.br_iov;
1486
1487                         for (r = 0; r <= nr; r++) {
1488                                 iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
1489                                 iov[r].iov_len = range[r].length * sectsz;
1490                         }
1491                         req->io_req.br_callback = pci_nvme_dealloc_sm;
1492
1493                         /*
1494                          * Use prev_gpaddr to track the current entry and
1495                          * prev_size to track the number of entries
1496                          */
1497                         req->prev_gpaddr = 0;
1498                         req->prev_size = r;
1499                 }
1500
1501                 err = blockif_delete(nvstore->ctx, &req->io_req);
1502                 if (err)
1503                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1504
1505                 free(range);
1506         }
1507 out:
1508         return (err);
1509 }
1510
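/*
 * Process entries on an I/O submission queue from the current head up to
 * the tail written via the doorbell. Flush and Write Zeroes (still a TODO)
 * are completed immediately; Dataset Management, Reads, and Writes are
 * translated into blockif requests, or completed directly for RAM-backed
 * storage.
 */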
1511 static void
1512 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1513 {
1514         struct nvme_submission_queue *sq;
1515         uint16_t status;
1516         uint16_t sqhead;
1517         int err;
1518
1519         /* handle all submissions up to sq->tail index */
1520         sq = &sc->submit_queues[idx];
1521
1522         if (atomic_testandset_int(&sq->busy, 1)) {
1523                 DPRINTF(("%s sqid %u busy", __func__, idx));
1524                 return;
1525         }
1526
1527         sqhead = atomic_load_acq_short(&sq->head);
1528
1529         DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1530                  idx, sqhead, sq->tail, sq->qbase));
1531
1532         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1533                 struct nvme_command *cmd;
1534                 struct pci_nvme_ioreq *req = NULL;
1535                 uint64_t lba;
1536                 uint64_t nblocks, bytes, size, cpsz;
1537
1538                 /* TODO: support scatter gather list handling */
1539
1540                 cmd = &sq->qbase[sqhead];
1541                 sqhead = (sqhead + 1) % sq->size;
1542
1543                 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1544
1545                 if (cmd->opc == NVME_OPC_FLUSH) {
1546                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1547                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1548                                                 status, 1);
1549
1550                         continue;
1551                 } else if (cmd->opc == NVME_OPC_WRITE_ZEROES) {
1552                         /* TODO: write zeroes */
1553                         WPRINTF(("%s write zeroes lba 0x%lx blocks %u",
1554                                 __func__, lba, cmd->cdw12 & 0xFFFF));
1555                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1556                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1557                                                 status, 1);
1558
1559                         continue;
1560                 }
1561
1562                 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1563                         req = pci_nvme_get_ioreq(sc);
1564                         req->nvme_sq = sq;
1565                         req->sqid = idx;
1566                 }
1567
1568                 if (cmd->opc == NVME_OPC_DATASET_MANAGEMENT) {
1569                         if (nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req,
1570                             &status)) {
1571                                 pci_nvme_set_completion(sc, sq, idx, cmd->cid,
1572                                     0, status, 1);
1573                                 if (req)
1574                                         pci_nvme_release_ioreq(sc, req);
1575                         }
1576                         continue;
1577                 }
1578
1579                 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1580
1581                 bytes = nblocks * sc->nvstore.sectsz;
1582
1583                 /*
1584                  * The transfer may begin mid-page; the first PRP entry then
1585                  * covers only the rest of that page (see the cpsz calculation below).
1586                  */
1587
1588                 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1589                          "(%lu-bytes)",
1590                          sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1591                          cmd->opc == NVME_OPC_WRITE ?
1592                              "WRITE" : "READ",
1593                          lba, nblocks, bytes));
1594
1595                 cmd->prp1 &= ~(0x03UL);
1596                 cmd->prp2 &= ~(0x03UL);
1597
1598                 DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2));
1599
1600                 size = bytes;
1601                 lba *= sc->nvstore.sectsz;
1602
1603                 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1604
1605                 if (cpsz > bytes)
1606                         cpsz = bytes;
1607
1608                 if (req != NULL) {
1609                         req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1610                                                 cmd->cdw10;
1611                         req->opc = cmd->opc;
1612                         req->cid = cmd->cid;
1613                         req->nsid = cmd->nsid;
1614                 }
1615
1616                 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1617                     cmd->opc == NVME_OPC_WRITE, lba);
1618                 lba += cpsz;
1619                 size -= cpsz;
1620
1621                 if (size == 0)
1622                         goto iodone;
1623
1624                 if (size <= PAGE_SIZE) {
1625                         /* prp2 is second (and final) page in transfer */
1626
1627                         err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1628                             size,
1629                             cmd->opc == NVME_OPC_WRITE,
1630                             lba);
1631                 } else {
1632                         uint64_t *prp_list;
1633                         int i;
1634
1635                         /* prp2 is pointer to a physical region page list */
1636                         prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1637                                                     cmd->prp2, PAGE_SIZE);
1638
1639                         i = 0;
1640                         while (size != 0) {
1641                                 cpsz = MIN(size, PAGE_SIZE);
1642
1643                                 /*
1644                                  * Move to linked physical region page list
1645                                  * in last item.
1646                                  */ 
1647                                 if (i == (NVME_PRP2_ITEMS-1) &&
1648                                     size > PAGE_SIZE) {
1649                                         assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1650                                         prp_list = paddr_guest2host(
1651                                                       sc->nsc_pi->pi_vmctx,
1652                                                       prp_list[i], PAGE_SIZE);
1653                                         i = 0;
1654                                 }
1655                                 if (prp_list[i] == 0) {
1656                                         WPRINTF(("PRP2[%d] = 0 !!!", i));
1657                                         err = 1;
1658                                         break;
1659                                 }
1660
1661                                 err = pci_nvme_append_iov_req(sc, req,
1662                                     prp_list[i], cpsz,
1663                                     cmd->opc == NVME_OPC_WRITE, lba);
1664                                 if (err)
1665                                         break;
1666
1667                                 lba += cpsz;
1668                                 size -= cpsz;
1669                                 i++;
1670                         }
1671                 }
1672
1673 iodone:
1674                 if (sc->nvstore.type == NVME_STOR_RAM) {
1675                         uint16_t code, status;
1676
1677                         code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1678                             NVME_SC_SUCCESS;
1679                         pci_nvme_status_genc(&status, code);
1680
1681                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1682                                                 status, 1);
1683
1684                         continue;
1685                 }
1686
1687
1688                 if (err)
1689                         goto do_error;
1690
1691                 req->io_req.br_callback = pci_nvme_io_done;
1692
1693                 err = 0;
1694                 switch (cmd->opc) {
1695                 case NVME_OPC_READ:
1696                         err = blockif_read(sc->nvstore.ctx, &req->io_req);
1697                         break;
1698                 case NVME_OPC_WRITE:
1699                         err = blockif_write(sc->nvstore.ctx, &req->io_req);
1700                         break;
1701                 default:
1702                         WPRINTF(("%s unhandled io command 0x%x",
1703                                  __func__, cmd->opc));
1704                         err = 1;
1705                 }
1706
1707 do_error:
1708                 if (err) {
1709                         uint16_t status;
1710
1711                         pci_nvme_status_genc(&status,
1712                             NVME_SC_DATA_TRANSFER_ERROR);
1713
1714                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1715                                                 status, 1);
1716                         pci_nvme_release_ioreq(sc, req);
1717                 }
1718         }
1719
1720         atomic_store_short(&sq->head, sqhead);
1721         atomic_store_int(&sq->busy, 0);
1722 }
1723
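/*
 * Doorbell write handler. Submission queue doorbells update the tail and
 * trigger command processing (admin commands for queue 0, I/O commands
 * otherwise); completion queue doorbells record the new head. Queue
 * indices beyond what the guest was given are logged and ignored.
 */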
1724 static void
1725 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1726         uint64_t idx, int is_sq, uint64_t value)
1727 {
1728         DPRINTF(("nvme doorbell %lu, %s, val 0x%lx",
1729                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1730
1731         if (is_sq) {
1732                 atomic_store_short(&sc->submit_queues[idx].tail,
1733                                    (uint16_t)value);
1734
1735                 if (idx == 0) {
1736                         pci_nvme_handle_admin_cmd(sc, value);
1737                 } else {
1738                         /* submission queue; handle new entries in SQ */
1739                         if (idx > sc->num_squeues) {
1740                                 WPRINTF(("%s SQ index %lu overflow from "
1741                                          "guest (max %u)",
1742                                          __func__, idx, sc->num_squeues));
1743                                 return;
1744                         }
1745                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1746                 }
1747         } else {
1748                 if (idx > sc->num_cqueues) {
1749                         WPRINTF(("%s queue index %lu overflow from "
1750                                  "guest (max %u)",
1751                                  __func__, idx, sc->num_cqueues));
1752                         return;
1753                 }
1754
1755                 sc->compl_queues[idx].head = (uint16_t)value;
1756         }
1757 }
1758
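/* Debug helper: log the name of the BAR0 register being accessed. */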
1759 static void
1760 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1761 {
1762         const char *s = iswrite ? "WRITE" : "READ";
1763
1764         switch (offset) {
1765         case NVME_CR_CAP_LOW:
1766                 DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s));
1767                 break;
1768         case NVME_CR_CAP_HI:
1769                 DPRINTF(("%s %s NVME_CR_CAP_HI", func, s));
1770                 break;
1771         case NVME_CR_VS:
1772                 DPRINTF(("%s %s NVME_CR_VS", func, s));
1773                 break;
1774         case NVME_CR_INTMS:
1775                 DPRINTF(("%s %s NVME_CR_INTMS", func, s));
1776                 break;
1777         case NVME_CR_INTMC:
1778                 DPRINTF(("%s %s NVME_CR_INTMC", func, s));
1779                 break;
1780         case NVME_CR_CC:
1781                 DPRINTF(("%s %s NVME_CR_CC", func, s));
1782                 break;
1783         case NVME_CR_CSTS:
1784                 DPRINTF(("%s %s NVME_CR_CSTS", func, s));
1785                 break;
1786         case NVME_CR_NSSR:
1787                 DPRINTF(("%s %s NVME_CR_NSSR", func, s));
1788                 break;
1789         case NVME_CR_AQA:
1790                 DPRINTF(("%s %s NVME_CR_AQA", func, s));
1791                 break;
1792         case NVME_CR_ASQ_LOW:
1793                 DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s));
1794                 break;
1795         case NVME_CR_ASQ_HI:
1796                 DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s));
1797                 break;
1798         case NVME_CR_ACQ_LOW:
1799                 DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s));
1800                 break;
1801         case NVME_CR_ACQ_HI:
1802                 DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s));
1803                 break;
1804         default:
1805                 DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset));
1806         }
1807
1808 }
1809
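/*
 * BAR0 write handler. Doorbell-region writes are dispatched immediately;
 * controller register writes must be 4 bytes wide and are applied while
 * holding the softc mutex.
 */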
1810 static void
1811 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1812         uint64_t offset, int size, uint64_t value)
1813 {
1814         uint32_t ccreg;
1815
1816         if (offset >= NVME_DOORBELL_OFFSET) {
1817                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1818                 uint64_t idx = belloffset / 8; /* 4-byte SQ + 4-byte CQ doorbell per queue */
1819                 int is_sq = (belloffset % 8) < 4;
1820
1821                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1822                         WPRINTF(("guest attempted an overflow write offset "
1823                                  "0x%lx, val 0x%lx in %s",
1824                                  offset, value, __func__));
1825                         return;
1826                 }
1827
1828                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1829                 return;
1830         }
1831
1832         DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx",
1833                 offset, size, value));
1834
1835         if (size != 4) {
1836                 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1837                          "val 0x%lx) to bar0 in %s",
1838                          size, offset, value, __func__));
1839                 /* TODO: shutdown device */
1840                 return;
1841         }
1842
1843         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1844
1845         pthread_mutex_lock(&sc->mtx);
1846
1847         switch (offset) {
1848         case NVME_CR_CAP_LOW:
1849         case NVME_CR_CAP_HI:
1850                 /* readonly */
1851                 break;
1852         case NVME_CR_VS:
1853                 /* readonly */
1854                 break;
1855         case NVME_CR_INTMS:
1856                 /* MSI-X, so ignore */
1857                 break;
1858         case NVME_CR_INTMC:
1859                 /* MSI-X, so ignore */
1860                 break;
1861         case NVME_CR_CC:
1862                 ccreg = (uint32_t)value;
1863
1864                 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1865                          "iocqes %u",
1866                         __func__,
1867                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1868                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1869                          NVME_CC_GET_IOCQES(ccreg)));
1870
1871                 if (NVME_CC_GET_SHN(ccreg)) {
1872                         /* perform shutdown - flush out data to backend */
1873                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1874                             NVME_CSTS_REG_SHST_SHIFT);
1875                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1876                             NVME_CSTS_REG_SHST_SHIFT;
1877                 }
1878                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1879                         if (NVME_CC_GET_EN(ccreg) == 0)
1880                                 /* transition 1->0 causes controller reset */
1881                                 pci_nvme_reset_locked(sc);
1882                         else
1883                                 pci_nvme_init_controller(ctx, sc);
1884                 }
1885
1886                 /* Insert the iocqes, iosqes and en bits from the write */
1887                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1888                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1889                 if (NVME_CC_GET_EN(ccreg) == 0) {
1890                         /* Insert the ams, mps and css bit fields */
1891                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1892                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1893                         sc->regs.csts &= ~NVME_CSTS_RDY;
1894                 } else if (sc->pending_ios == 0) {
1895                         sc->regs.csts |= NVME_CSTS_RDY;
1896                 }
1897                 break;
1898         case NVME_CR_CSTS:
1899                 break;
1900         case NVME_CR_NSSR:
1901                 /* ignore writes; don't support subsystem reset */
1902                 break;
1903         case NVME_CR_AQA:
1904                 sc->regs.aqa = (uint32_t)value;
1905                 break;
1906         case NVME_CR_ASQ_LOW:
1907                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1908                                (0xFFFFF000 & value);
1909                 break;
1910         case NVME_CR_ASQ_HI:
1911                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1912                                (value << 32);
1913                 break;
1914         case NVME_CR_ACQ_LOW:
1915                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1916                                (0xFFFFF000 & value);
1917                 break;
1918         case NVME_CR_ACQ_HI:
1919                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1920                                (value << 32);
1921                 break;
1922         default:
1923                 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d",
1924                          __func__, offset, value, size));
1925         }
1926         pthread_mutex_unlock(&sc->mtx);
1927 }
1928
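/*
 * PCI BAR write entry point: MSI-X table/PBA accesses are forwarded to the
 * MSI-X emulation, and BAR0 accesses go to pci_nvme_write_bar_0().
 */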
1929 static void
1930 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1931                 int baridx, uint64_t offset, int size, uint64_t value)
1932 {
1933         struct pci_nvme_softc* sc = pi->pi_arg;
1934
1935         if (baridx == pci_msix_table_bar(pi) ||
1936             baridx == pci_msix_pba_bar(pi)) {
1937                 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1938                          " value 0x%lx", baridx, offset, size, value));
1939
1940                 pci_emul_msix_twrite(pi, offset, size, value);
1941                 return;
1942         }
1943
1944         switch (baridx) {
1945         case 0:
1946                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1947                 break;
1948
1949         default:
1950                 DPRINTF(("%s unknown baridx %d, val 0x%lx",
1951                          __func__, baridx, value));
1952         }
1953 }
1954
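/*
 * BAR0 read handler: register reads are copied from the shadow controller
 * registers under the softc mutex; reads in the doorbell region return zero
 * and are flagged as invalid.
 */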
1955 static uint64_t
1956 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
1957 {
1958         uint64_t value;
1959
1960         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1961
1962         if (offset < NVME_DOORBELL_OFFSET) {
1963                 void *p = &(sc->regs);
1964                 pthread_mutex_lock(&sc->mtx);
1965                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
1966                 pthread_mutex_unlock(&sc->mtx);
1967         } else {
1968                 value = 0;
1969                 WPRINTF(("pci_nvme: read invalid offset 0x%lx", offset));
1970         }
1971
1972         switch (size) {
1973         case 1:
1974                 value &= 0xFF;
1975                 break;
1976         case 2:
1977                 value &= 0xFFFF;
1978                 break;
1979         case 4:
1980                 value &= 0xFFFFFFFF;
1981                 break;
1982         }
1983
1984         DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x",
1985                  offset, size, (uint32_t)value));
1986
1987         return (value);
1988 }
1989
1990
1991
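/*
 * PCI BAR read entry point: MSI-X table/PBA accesses are forwarded to the
 * MSI-X emulation, and BAR0 accesses go to pci_nvme_read_bar_0().
 */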
1992 static uint64_t
1993 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1994     uint64_t offset, int size)
1995 {
1996         struct pci_nvme_softc* sc = pi->pi_arg;
1997
1998         if (baridx == pci_msix_table_bar(pi) ||
1999             baridx == pci_msix_pba_bar(pi)) {
2000                 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2001                         baridx, offset, size));
2002
2003                 return pci_emul_msix_tread(pi, offset, size);
2004         }
2005
2006         switch (baridx) {
2007         case 0:
2008                 return pci_nvme_read_bar_0(sc, offset, size);
2009
2010         default:
2011                 DPRINTF(("unknown bar %d, 0x%lx", baridx, offset));
2012         }
2013
2014         return (0);
2015 }
2016
2017
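/*
 * Parse the comma-separated option string described in the comment at the
 * top of this file. Named options (maxq, qsz, ioslots, sectsz, ser, ram,
 * eui64, dsm) may appear in any order; an unrecognized first option is
 * treated as the backing store path and opened via blockif.
 */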
2018 static int
2019 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2020 {
2021         char bident[sizeof("XX:X:X")];
2022         char    *uopt, *xopts, *config;
2023         uint32_t sectsz;
2024         int optidx;
2025
2026         sc->max_queues = NVME_QUEUES;
2027         sc->max_qentries = NVME_MAX_QENTRIES;
2028         sc->ioslots = NVME_IOSLOTS;
2029         sc->num_squeues = sc->max_queues;
2030         sc->num_cqueues = sc->max_queues;
2031         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2032         sectsz = 0;
2033
2034         uopt = strdup(opts);
2035         optidx = 0;
2036         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2037                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2038         for (xopts = strtok(uopt, ",");
2039              xopts != NULL;
2040              xopts = strtok(NULL, ",")) {
2041
2042                 if ((config = strchr(xopts, '=')) != NULL)
2043                         *config++ = '\0';
2044
2045                 if (!strcmp("maxq", xopts)) {
2046                         sc->max_queues = atoi(config);
2047                 } else if (!strcmp("qsz", xopts)) {
2048                         sc->max_qentries = atoi(config);
2049                 } else if (!strcmp("ioslots", xopts)) {
2050                         sc->ioslots = atoi(config);
2051                 } else if (!strcmp("sectsz", xopts)) {
2052                         sectsz = atoi(config);
2053                 } else if (!strcmp("ser", xopts)) {
2054                         /*
2055                          * This field indicates the Product Serial Number in
2056                          * 7-bit ASCII, unused bytes should be space characters.
2057                          * Ref: NVMe v1.3c.
2058                          */
2059                         cpywithpad((char *)sc->ctrldata.sn,
2060                                    sizeof(sc->ctrldata.sn), config, ' ');
2061                 } else if (!strcmp("ram", xopts)) {
2062                         uint64_t sz = strtoull(&xopts[4], NULL, 10); /* size in MiB after "ram=" */
2063
2064                         sc->nvstore.type = NVME_STOR_RAM;
2065                         sc->nvstore.size = sz * 1024 * 1024;
2066                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2067                         sc->nvstore.sectsz = 4096;
2068                         sc->nvstore.sectsz_bits = 12;
2069                         if (sc->nvstore.ctx == NULL) {
2070                                 perror("Unable to allocate RAM");
2071                                 free(uopt);
2072                                 return (-1);
2073                         }
2074                 } else if (!strcmp("eui64", xopts)) {
2075                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2076                 } else if (!strcmp("dsm", xopts)) {
2077                         if (!strcmp("auto", config))
2078                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2079                         else if (!strcmp("enable", config))
2080                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2081                         else if (!strcmp("disable", config))
2082                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2083                 } else if (optidx == 0) {
2084                         snprintf(bident, sizeof(bident), "%d:%d",
2085                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2086                         sc->nvstore.ctx = blockif_open(xopts, bident);
2087                         if (sc->nvstore.ctx == NULL) {
2088                                 perror("Could not open backing file");
2089                                 free(uopt);
2090                                 return (-1);
2091                         }
2092                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2093                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2094                 } else {
2095                         EPRINTLN("Invalid option %s", xopts);
2096                         free(uopt);
2097                         return (-1);
2098                 }
2099
2100                 optidx++;
2101         }
2102         free(uopt);
2103
2104         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2105                 EPRINTLN("backing store not specified");
2106                 return (-1);
2107         }
2108         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2109                 sc->nvstore.sectsz = sectsz;
2110         else if (sc->nvstore.type != NVME_STOR_RAM)
2111                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2112         for (sc->nvstore.sectsz_bits = 9;
2113              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2114              sc->nvstore.sectsz_bits++);
2115
2116         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2117                 sc->max_queues = NVME_QUEUES;
2118
2119         if (sc->max_qentries <= 0) {
2120                 EPRINTLN("Invalid qsz option");
2121                 return (-1);
2122         }
2123         if (sc->ioslots <= 0) {
2124                 EPRINTLN("Invalid ioslots option");
2125                 return (-1);
2126         }
2127
2128         return (0);
2129 }
2130
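/*
 * Device model init: parse the options, allocate the I/O request slots and
 * their synchronization primitives, program PCI config space, allocate the
 * 64-bit memory BAR plus MSI-X and PCIe capabilities, and reset the
 * emulated controller to its initial state.
 */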
2131 static int
2132 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2133 {
2134         struct pci_nvme_softc *sc;
2135         uint32_t pci_membar_sz;
2136         int     error;
2137
2138         error = 0;
2139
2140         sc = calloc(1, sizeof(struct pci_nvme_softc));
2141         pi->pi_arg = sc;
2142         sc->nsc_pi = pi;
2143
2144         error = pci_nvme_parse_opts(sc, opts);
2145         if (error < 0)
2146                 goto done;
2147         else
2148                 error = 0;
2149
2150         STAILQ_INIT(&sc->ioreqs_free);
2151         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2152         for (int i = 0; i < sc->ioslots; i++) {
2153                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2154                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
2155                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
2156         }
2157         sc->intr_coales_aggr_thresh = 1;
2158
2159         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2160         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2161         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2162         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2163         pci_set_cfgdata8(pi, PCIR_PROGIF,
2164                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2165
2166         /*
2167          * Allocate size of NVMe registers + doorbell space for all queues.
2168          *
2169          * The specification requires a minimum memory I/O window size of 16K.
2170          * The Windows driver will refuse to start a device with a smaller
2171          * window.
2172          */
2173         pci_membar_sz = sizeof(struct nvme_registers) +
2174             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2175         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2176
2177         DPRINTF(("nvme membar size: %u", pci_membar_sz));
2178
2179         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2180         if (error) {
2181                 WPRINTF(("%s pci alloc mem bar failed", __func__));
2182                 goto done;
2183         }
2184
2185         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2186         if (error) {
2187                 WPRINTF(("%s pci add msixcap failed", __func__));
2188                 goto done;
2189         }
2190
2191         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2192         if (error) {
2193                 WPRINTF(("%s pci add Express capability failed", __func__));
2194                 goto done;
2195         }
2196
2197         pthread_mutex_init(&sc->mtx, NULL);
2198         sem_init(&sc->iosemlock, 0, sc->ioslots);
2199
2200         pci_nvme_reset(sc);
2201         /*
2202          * Controller data depends on Namespace data so initialize Namespace
2203          * data first.
2204          */
2205         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2206         pci_nvme_init_ctrldata(sc);
2207         pci_nvme_init_logpages(sc);
2208
2209         pci_lintr_request(pi);
2210
2211 done:
2212         return (error);
2213 }
2214
2215
2216 struct pci_devemu pci_de_nvme = {
2217         .pe_emu =       "nvme",
2218         .pe_init =      pci_nvme_init,
2219         .pe_barwrite =  pci_nvme_write,
2220         .pe_barread =   pci_nvme_read
2221 };
2222 PCI_EMUL_SET(pci_de_nvme);