/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *
 */
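
/*
 * Example invocations (illustrative values, not defaults):
 *
 *  -s 4,nvme,/dev/zvol/tank/nvme0,maxq=8,qsz=1024,ioslots=16,ser=BHYVE001
 *  -s 4,nvme,ram=1024
 */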

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define NVME_MSIX_BAR           4

#define NVME_IOSLOTS            8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN     (1 << 14)

#define NVME_QUEUES             16
#define NVME_MAX_QENTRIES       2048

#define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
#define NVME_MAX_BLOCKIOVS      512

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS          0xffff
#define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)         ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)         ((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
        ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
         (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)

#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
        NVME_CR_CAP_LOW = 0x00,
        NVME_CR_CAP_HI  = 0x04,
        NVME_CR_VS      = 0x08,
        NVME_CR_INTMS   = 0x0c,
        NVME_CR_INTMC   = 0x10,
        NVME_CR_CC      = 0x14,
        NVME_CR_CSTS    = 0x1c,
        NVME_CR_NSSR    = 0x20,
        NVME_CR_AQA     = 0x24,
        NVME_CR_ASQ_LOW = 0x28,
        NVME_CR_ASQ_HI  = 0x2c,
        NVME_CR_ACQ_LOW = 0x30,
        NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
        NVME_CMD_CDW11_PC  = 0x0001,
        NVME_CMD_CDW11_IEN = 0x0002,
        NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
        NVME_COPY_TO_PRP,
        NVME_COPY_FROM_PRP,
};

#define NVME_CQ_INTEN   0x01
#define NVME_CQ_INTCOAL 0x02

struct nvme_completion_queue {
        struct nvme_completion *qbase;
        uint32_t        size;
        uint16_t        tail; /* nvme progress */
        uint16_t        head; /* guest progress */
        uint16_t        intr_vec;
        uint32_t        intr_en;
        pthread_mutex_t mtx;
};

struct nvme_submission_queue {
        struct nvme_command *qbase;
        uint32_t        size;
        uint16_t        head; /* nvme progress */
        uint16_t        tail; /* guest progress */
        uint16_t        cqid; /* completion queue id */
        int             busy; /* queue is being processed */
        int             qpriority;
};

enum nvme_storage_type {
        NVME_STOR_BLOCKIF = 0,
        NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
        enum nvme_storage_type type;
        void            *ctx;
        uint64_t        size;
        uint32_t        sectsz;
        uint32_t        sectsz_bits;
        uint64_t        eui64;
        uint32_t        deallocate:1;
};

struct pci_nvme_ioreq {
        struct pci_nvme_softc *sc;
        STAILQ_ENTRY(pci_nvme_ioreq) link;
        struct nvme_submission_queue *nvme_sq;
        uint16_t        sqid;

        /* command information */
        uint16_t        opc;
        uint16_t        cid;
        uint32_t        nsid;

        uint64_t        prev_gpaddr;
        size_t          prev_size;

        /*
         * lock if all iovs consumed (big IO);
         * complete transaction before continuing
         */
        pthread_mutex_t mtx;
        pthread_cond_t  cv;

        struct blockif_req io_req;

        /* pad to fit up to 512 page descriptors from guest IO request */
        struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

enum nvme_dsm_type {
        /* Dataset Management bit in ONCS reflects backing storage capability */
        NVME_DATASET_MANAGEMENT_AUTO,
        /* Unconditionally set Dataset Management bit in ONCS */
        NVME_DATASET_MANAGEMENT_ENABLE,
        /* Unconditionally clear Dataset Management bit in ONCS */
        NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc {
        struct pci_devinst *nsc_pi;

        pthread_mutex_t mtx;

        struct nvme_registers regs;

        struct nvme_namespace_data  nsdata;
        struct nvme_controller_data ctrldata;
        struct nvme_error_information_entry err_log;
        struct nvme_health_information_page health_log;
        struct nvme_firmware_page fw_log;

        struct pci_nvme_blockstore nvstore;

        uint16_t        max_qentries;   /* max entries per queue */
        uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
        uint32_t        num_cqueues;
        uint32_t        num_squeues;

        struct pci_nvme_ioreq *ioreqs;
        STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
        uint32_t        pending_ios;
        uint32_t        ioslots;
        sem_t           iosemlock;

        /*
         * Memory mapped Submission and Completion queues
         * Each array includes both Admin and IO queues
         */
        struct nvme_completion_queue *compl_queues;
        struct nvme_submission_queue *submit_queues;

        /* controller features */
        uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
        uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
        uint32_t        async_ev_config;         /* 0x0B: async event config */

        enum nvme_dsm_type dataset_management;
};


static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
        ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
        ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
        ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
        ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
        ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
        ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
         (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
         (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
        ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
         (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
         (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
        ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
        ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
         (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
        NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
        size_t len;

        len = strnlen(src, dst_size);
        memset(dst, pad, dst_size);
        memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

        *status &= ~NVME_STATUS_MASK;
        *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
                (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

        pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

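/*
 * Toggle the Phase Tag in a completion status word. The controller inverts
 * the Phase Tag each time it wraps around a completion queue, which is how
 * the guest driver detects newly posted entries.
 */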
static __inline void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

        if (prev)
                *status &= ~NVME_STATUS_P;
        else
                *status |= NVME_STATUS_P;
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
        struct nvme_controller_data *cd = &sc->ctrldata;

        cd->vid = 0xFB5D;
        cd->ssvid = 0x0000;

        cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
        cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

        /* Num of submission commands that we can handle at a time (2^rab) */
        cd->rab   = 4;

        /* FreeBSD OUI */
        cd->ieee[0] = 0x58;
        cd->ieee[1] = 0x9c;
        cd->ieee[2] = 0xfc;

        cd->mic = 0;

        cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */

        cd->ver = 0x00010300;

        cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
        cd->acl = 2;
        cd->aerl = 4;

        cd->lpa = 0;    /* TODO: support some simple things like SMART */
        cd->elpe = 0;   /* max error log page entries */
        cd->npss = 1;   /* number of power states supported */

        /* Warning Composite Temperature Threshold */
        cd->wctemp = 0x0157;

        cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
            (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
        cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
            (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
        cd->nn = 1;     /* number of namespaces */

        cd->oncs = 0;
        switch (sc->dataset_management) {
        case NVME_DATASET_MANAGEMENT_AUTO:
                if (sc->nvstore.deallocate)
                        cd->oncs |= NVME_ONCS_DSM;
                break;
        case NVME_DATASET_MANAGEMENT_ENABLE:
                cd->oncs |= NVME_ONCS_DSM;
                break;
        default:
                break;
        }

        cd->fna = 0x03;

        cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
        const unsigned char *cp = buffer;
        /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
        static uint16_t const crc16_table[256] = {
                0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
                0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
                0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
                0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
                0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
                0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
                0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
                0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
                0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
                0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
                0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
                0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
                0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
                0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
                0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
                0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
                0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
                0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
                0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
                0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
                0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
                0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
                0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
                0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
                0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
                0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
                0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
                0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
                0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
                0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
                0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
                0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
        };

        while (len--)
                crc = (((crc >> 8) & 0xffU) ^
                    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
        return crc;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

        /* Get capacity and block size information from backing store */
        nd->nsze = nvstore->size / nvstore->sectsz;
        nd->ncap = nd->nsze;
        nd->nuse = nd->nsze;

        if (nvstore->type == NVME_STOR_BLOCKIF)
                nvstore->deallocate = blockif_candelete(nvstore->ctx);

        nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
        nd->flbas = 0;

        /* Create an EUI-64 if user did not provide one */
        if (nvstore->eui64 == 0) {
                char *data = NULL;
                uint64_t eui64 = nvstore->eui64;

                asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
                    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

                if (data != NULL) {
                        eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
                        free(data);
                }
                nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
        }
        be64enc(nd->eui64, nvstore->eui64);

        /* LBA data-sz = 2^lbads */
        nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

        memset(&sc->err_log, 0, sizeof(sc->err_log));
        memset(&sc->health_log, 0, sizeof(sc->health_log));
        memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
        DPRINTF("%s", __func__);

        sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
            (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
            (60 << NVME_CAP_LO_REG_TO_SHIFT);

        sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

        sc->regs.vs = 0x00010300;       /* NVMe v1.3 */

        sc->regs.cc = 0;
        sc->regs.csts = 0;

        sc->num_cqueues = sc->num_squeues = sc->max_queues;
        if (sc->submit_queues != NULL) {
                for (int i = 0; i < sc->num_squeues + 1; i++) {
                        /*
                         * The Admin Submission Queue is at index 0.
                         * It must not be changed at reset otherwise the
                         * emulation will be out of sync with the guest.
                         */
                        if (i != 0) {
                                sc->submit_queues[i].qbase = NULL;
                                sc->submit_queues[i].size = 0;
                                sc->submit_queues[i].cqid = 0;
                        }
                        sc->submit_queues[i].tail = 0;
                        sc->submit_queues[i].head = 0;
                        sc->submit_queues[i].busy = 0;
                }
        } else
                sc->submit_queues = calloc(sc->num_squeues + 1,
                                        sizeof(struct nvme_submission_queue));

        if (sc->compl_queues != NULL) {
                for (int i = 0; i < sc->num_cqueues + 1; i++) {
                        /* See Admin Submission Queue note above */
                        if (i != 0) {
                                sc->compl_queues[i].qbase = NULL;
                                sc->compl_queues[i].size = 0;
                        }

                        sc->compl_queues[i].tail = 0;
                        sc->compl_queues[i].head = 0;
                }
        } else {
                sc->compl_queues = calloc(sc->num_cqueues + 1,
                                        sizeof(struct nvme_completion_queue));

                for (int i = 0; i < sc->num_cqueues + 1; i++)
                        pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
        }
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
        pthread_mutex_lock(&sc->mtx);
        pci_nvme_reset_locked(sc);
        pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
        uint16_t acqs, asqs;

        DPRINTF("%s", __func__);

        asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
        sc->submit_queues[0].size = asqs;
        sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
                    sizeof(struct nvme_command) * asqs);

        DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
                __func__, sc->regs.asq, sc->submit_queues[0].qbase);

        acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
            NVME_AQA_REG_ACQS_MASK) + 1;
        sc->compl_queues[0].size = acqs;
        sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
                 sizeof(struct nvme_completion) * acqs);
        DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
                __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}

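/*
 * Copy data between a host buffer and a guest buffer described by PRP1 and
 * PRP2. Transfers are limited to two pages (8KiB), so PRP2, when used, is
 * treated as a second page pointer rather than a pointer to a PRP list.
 */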
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
        size_t len, enum nvme_copy_dir dir)
{
        uint8_t *p;
        size_t bytes;

        if (len > (8 * 1024)) {
                return (-1);
        }

        /* Copy from the start of prp1 to the end of the physical page */
        bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
        bytes = MIN(bytes, len);

        p = vm_map_gpa(ctx, prp1, bytes);
        if (p == NULL) {
                return (-1);
        }

        if (dir == NVME_COPY_TO_PRP)
                memcpy(p, b, bytes);
        else
                memcpy(b, p, bytes);

        b += bytes;

        len -= bytes;
        if (len == 0) {
                return (0);
        }

        len = MIN(len, PAGE_SIZE);

        p = vm_map_gpa(ctx, prp2, len);
        if (p == NULL) {
                return (-1);
        }

        if (dir == NVME_COPY_TO_PRP)
                memcpy(p, b, len);
        else
                memcpy(b, p, len);

        return (0);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
        if (qid == 0 || qid > sc->num_squeues) {
                WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
                        __func__, qid, sc->num_squeues);
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->submit_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_submission_queue *nsq;

                if ((qid == 0) || (qid > sc->num_squeues)) {
                        WPRINTF("%s queue index %u > num_squeues %u",
                                __func__, qid, sc->num_squeues);
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                nsq = &sc->submit_queues[qid];
                nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                              sizeof(struct nvme_command) * (size_t)nsq->size);
                nsq->cqid = (command->cdw11 >> 16) & 0xffff;
                nsq->qpriority = (command->cdw11 >> 1) & 0x03;

                DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
                        qid, nsq->size, nsq->qbase, nsq->cqid);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

                DPRINTF("%s completed creating IOSQ qid %u",
                         __func__, qid);
        } else {
                /*
                 * Guest sent a non-contiguous submission queue request.
                 * This setting is unsupported by this emulation.
                 */
                WPRINTF("%s unsupported non-contig (list-based) "
                         "create i/o submission queue", __func__);

                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        }
        return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
        if (qid == 0 || qid > sc->num_cqueues) {
                WPRINTF("%s queue index %u / num_cqueues %u",
                        __func__, qid, sc->num_cqueues);
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->compl_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_completion_queue *ncq;

                if ((qid == 0) || (qid > sc->num_cqueues)) {
                        WPRINTF("%s queue index %u > num_cqueues %u",
                                __func__, qid, sc->num_cqueues);
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                ncq = &sc->compl_queues[qid];
                ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
                ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
                ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
                             command->prp1,
                             sizeof(struct nvme_completion) * (size_t)ncq->size);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        } else {
                /*
                 * Non-contiguous completion queues are unsupported.
                 */
                WPRINTF("%s unsupported non-contig (list-based) "
                         "create i/o completion queue",
                         __func__);

                /* 0x12 = Invalid Use of Controller Memory Buffer */
                pci_nvme_status_genc(&compl->status, 0x12);
        }

        return (1);
}

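/*
 * Returns the requested log page. Note that the length in NUMD (CDW10 bits
 * 27:16) is a 0's based count of dwords, so the transfer size in bytes is
 * (NUMD + 1) * sizeof(uint32_t).
 */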
static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) *
            sizeof(uint32_t);
        uint8_t logpage = command->cdw10 & 0xFF;

        DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        switch (logpage) {
        case NVME_LOG_ERROR:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->err_log, logsize,
                    NVME_COPY_TO_PRP);
                break;
        case NVME_LOG_HEALTH_INFORMATION:
                /* TODO: present some smart info */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->health_log, logsize,
                    NVME_COPY_TO_PRP);
                break;
        case NVME_LOG_FIRMWARE_SLOT:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->fw_log, logsize,
                    NVME_COPY_TO_PRP);
                break;
        default:
                WPRINTF("%s get log page %x command not supported",
                        __func__, logpage);

                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_LOG_PAGE);
        }

        return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        void *dest;

        DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
                command->cdw10 & 0xFF, command->nsid);

        switch (command->cdw10 & 0xFF) {
        case 0x00: /* return Identify Namespace data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
                    NVME_COPY_TO_PRP);
                break;
        case 0x01: /* return Identify Controller data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->ctrldata,
                    sizeof(sc->ctrldata),
                    NVME_COPY_TO_PRP);
                break;
        case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(uint32_t) * 1024);
                ((uint32_t *)dest)[0] = 1;
                ((uint32_t *)dest)[1] = 0;
                break;
        case 0x11:
                pci_nvme_status_genc(&compl->status,
                    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
                return (1);
        case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
        case 0x10:
        case 0x12:
        case 0x13:
        case 0x14:
        case 0x15:
        default:
                DPRINTF("%s unsupported identify command requested 0x%x",
                         __func__, command->cdw10 & 0xFF);
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t nqr;   /* Number of Queues Requested */

        nqr = command->cdw11 & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_squeues = ONE_BASED(nqr);
        if (sc->num_squeues > sc->max_queues) {
                DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
                                        sc->max_queues);
                sc->num_squeues = sc->max_queues;
        }

        nqr = (command->cdw11 >> 16) & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_cqueues = ONE_BASED(nqr);
        if (sc->num_cqueues > sc->max_queues) {
                DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
                                        sc->max_queues);
                sc->num_cqueues = sc->max_queues;
        }

        compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

        return (0);
}

static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;
        uint32_t iv;

        DPRINTF("%s feature 0x%x", __func__, feature);
        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF("  arbitration 0x%x", command->cdw11);
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF("  power management 0x%x", command->cdw11);
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF("  lba range 0x%x", command->cdw11);
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF("  temperature threshold 0x%x", command->cdw11);
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF("  error recovery 0x%x", command->cdw11);
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF("  volatile write cache 0x%x", command->cdw11);
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                nvme_set_feature_queues(sc, command, compl);
                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF("  interrupt coalescing 0x%x", command->cdw11);

                /* in uS */
                sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

                sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                iv = command->cdw11 & 0xFFFF;

                DPRINTF("  interrupt vector configuration 0x%x",
                        command->cdw11);

                for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
                        if (sc->compl_queues[i].intr_vec == iv) {
                                if (command->cdw11 & (1 << 16))
                                        sc->compl_queues[i].intr_en |=
                                                              NVME_CQ_INTCOAL;
                                else
                                        sc->compl_queues[i].intr_en &=
                                                             ~NVME_CQ_INTCOAL;
                        }
                }
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF("  write atomicity 0x%x", command->cdw11);
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF("  async event configuration 0x%x",
                        command->cdw11);
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF("  software progress marker 0x%x",
                        command->cdw11);
                break;
        case 0x0C:
                DPRINTF("  autonomous power state transition 0x%x",
                        command->cdw11);
                break;
        default:
                WPRINTF("%s invalid feature", __func__);
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;

        DPRINTF("%s feature 0x%x", __func__, feature);

        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF("  arbitration");
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF("  power management");
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF("  lba range");
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF("  temperature threshold");
                switch ((command->cdw11 >> 20) & 0x3) {
                case 0:
                        /* Over temp threshold */
                        compl->cdw0 = 0xFFFF;
                        break;
                case 1:
                        /* Under temp threshold */
                        compl->cdw0 = 0;
                        break;
                default:
                        WPRINTF("  invalid threshold type select");
                        pci_nvme_status_genc(&compl->status,
                            NVME_SC_INVALID_FIELD);
                        return (1);
                }
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF("  error recovery");
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF("  volatile write cache");
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

                DPRINTF("  number of queues (submit %u, completion %u)",
                        compl->cdw0 & 0xFFFF,
                        (compl->cdw0 >> 16) & 0xFFFF);

                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF("  interrupt coalescing");
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                DPRINTF("  interrupt vector configuration");
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF("  write atomicity");
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF("  async event configuration");
                /* Get Features must not modify state; report current value */
                compl->cdw0 = sc->async_ev_config;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF("  software progress marker");
                break;
        case 0x0C:
                DPRINTF("  autonomous power state transition");
                break;
        default:
                WPRINTF("%s invalid feature 0x%x", __func__, feature);
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
                command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

        /* TODO: search for the command ID and abort it */

        compl->cdw0 = 1;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
        struct nvme_command* command, struct nvme_completion* compl)
{
        DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

        /*
         * TODO: raise events when they happen based on the Set Features cmd.
         * These events happen async, so only set completion successful if
         * there is an event reflective of the request to get event.
         */
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
        return (0);
}

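/*
 * Process any new entries on the Admin Submission Queue and post the
 * corresponding completions to the Admin Completion Queue.
 */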
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
        struct nvme_completion compl;
        struct nvme_command *cmd;
        struct nvme_submission_queue *sq;
        struct nvme_completion_queue *cq;
        uint16_t sqhead;

        DPRINTF("%s index %u", __func__, (uint32_t)value);

        sq = &sc->submit_queues[0];
        cq = &sc->compl_queues[0];

        sqhead = atomic_load_acq_short(&sq->head);

        if (atomic_testandset_int(&sq->busy, 1)) {
                DPRINTF("%s SQ busy, head %u, tail %u",
                        __func__, sqhead, sq->tail);
                return;
        }

        DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                cmd = &(sq->qbase)[sqhead];
                compl.status = 0;

                switch (cmd->opc) {
                case NVME_OPC_DELETE_IO_SQ:
                        DPRINTF("%s command DELETE_IO_SQ", __func__);
                        nvme_opc_delete_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_SQ:
                        DPRINTF("%s command CREATE_IO_SQ", __func__);
                        nvme_opc_create_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_DELETE_IO_CQ:
                        DPRINTF("%s command DELETE_IO_CQ", __func__);
                        nvme_opc_delete_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_CQ:
                        DPRINTF("%s command CREATE_IO_CQ", __func__);
                        nvme_opc_create_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_LOG_PAGE:
                        DPRINTF("%s command GET_LOG_PAGE", __func__);
                        nvme_opc_get_log_page(sc, cmd, &compl);
                        break;
                case NVME_OPC_IDENTIFY:
                        DPRINTF("%s command IDENTIFY", __func__);
                        nvme_opc_identify(sc, cmd, &compl);
                        break;
                case NVME_OPC_ABORT:
                        DPRINTF("%s command ABORT", __func__);
                        nvme_opc_abort(sc, cmd, &compl);
                        break;
                case NVME_OPC_SET_FEATURES:
                        DPRINTF("%s command SET_FEATURES", __func__);
                        nvme_opc_set_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_FEATURES:
                        DPRINTF("%s command GET_FEATURES", __func__);
                        nvme_opc_get_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_ASYNC_EVENT_REQUEST:
                        DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
                        /* XXX don't care, unhandled for now
                        nvme_opc_async_event_req(sc, cmd, &compl);
                        */
                        compl.status = NVME_NO_STATUS;
                        break;
                default:
                        WPRINTF("0x%x command is not implemented",
                            cmd->opc);
                        pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                }
                sqhead = (sqhead + 1) % sq->size;

                if (NVME_COMPLETION_VALID(compl)) {
                        struct nvme_completion *cp;
                        int phase;

                        cp = &(cq->qbase)[cq->tail];
                        cp->cdw0 = compl.cdw0;
                        cp->sqid = 0;
                        cp->sqhd = sqhead;
                        cp->cid = cmd->cid;

                        phase = NVME_STATUS_GET_P(cp->status);
                        cp->status = compl.status;
                        pci_nvme_toggle_phase(&cp->status, phase);

                        cq->tail = (cq->tail + 1) % cq->size;
                }
        }

        DPRINTF("setting sqhead %u", sqhead);
        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);

        if (cq->head != cq->tail)
                pci_generate_msix(sc->nsc_pi, 0);

}

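/*
 * Append a guest physical address range to the blockif I/O vector for a
 * request, merging it with the previous entry when the ranges are
 * contiguous. If the request is backed by RAM rather than a blockif, the
 * data is copied directly. Requests needing more than NVME_MAX_BLOCKIOVS
 * entries are split and issued in partial chunks.
 */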
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
        uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
        int iovidx;

        if (req != NULL) {
                /* concatenate contig block-iovs to minimize number of iovs */
                if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
                        iovidx = req->io_req.br_iovcnt - 1;

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             req->prev_gpaddr, size);

                        req->prev_size += size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iov[iovidx].iov_len = req->prev_size;
                } else {
                        pthread_mutex_lock(&req->mtx);

                        iovidx = req->io_req.br_iovcnt;
                        if (iovidx == NVME_MAX_BLOCKIOVS) {
                                int err = 0;

                                DPRINTF("large I/O, doing partial req");

                                iovidx = 0;
                                req->io_req.br_iovcnt = 0;

                                req->io_req.br_callback = pci_nvme_io_partial;

                                if (!do_write)
                                        err = blockif_read(sc->nvstore.ctx,
                                                           &req->io_req);
                                else
                                        err = blockif_write(sc->nvstore.ctx,
                                                            &req->io_req);

                                /* wait until req completes before cont */
                                if (err == 0)
                                        pthread_cond_wait(&req->cv, &req->mtx);
                        }
                        if (iovidx == 0) {
                                req->io_req.br_offset = lba;
                                req->io_req.br_resid = 0;
                                req->io_req.br_param = req;
                        }

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             gpaddr, size);

                        req->io_req.br_iov[iovidx].iov_len = size;

                        req->prev_gpaddr = gpaddr;
                        req->prev_size = size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iovcnt++;

                        pthread_mutex_unlock(&req->mtx);
                }
        } else {
                /* RAM buffer: read/write directly */
                void *p = sc->nvstore.ctx;
                void *gptr;

                if ((lba + size) > sc->nvstore.size) {
                        WPRINTF("%s request would overflow RAM", __func__);
                        return (-1);
                }

                p = (void *)((uintptr_t)p + (uintptr_t)lba);
                gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
                if (do_write)
                        memcpy(p, gptr, size);
                else
                        memcpy(gptr, p, size);
        }
        return (0);
}

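/*
 * Post a completion entry to the completion queue associated with the given
 * submission queue, and generate an MSI-X interrupt if one is enabled for
 * that queue.
 */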
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
        struct nvme_submission_queue *sq, int sqid, uint16_t cid,
        uint32_t cdw0, uint16_t status, int ignore_busy)
{
        struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
        struct nvme_completion *compl;
        int phase;

        DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
                 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
                 NVME_STATUS_GET_SC(status));

        pthread_mutex_lock(&cq->mtx);

        assert(cq->qbase != NULL);

        compl = &cq->qbase[cq->tail];

        compl->cdw0 = cdw0;
        compl->sqid = sqid;
        compl->sqhd = atomic_load_acq_short(&sq->head);
        compl->cid = cid;

        /* toggle phase */
        phase = NVME_STATUS_GET_P(compl->status);
        compl->status = status;
        pci_nvme_toggle_phase(&compl->status, phase);

        cq->tail = (cq->tail + 1) % cq->size;

        pthread_mutex_unlock(&cq->mtx);

        if (cq->head != cq->tail) {
                if (cq->intr_en & NVME_CQ_INTEN) {
                        pci_generate_msix(sc->nsc_pi, cq->intr_vec);
                } else {
                        DPRINTF("%s: CQ%u interrupt disabled",
                                                __func__, sq->cqid);
                }
        }
}

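/*
 * Return a completed ioreq to the free list and release its I/O slot,
 * waking any thread blocked in pci_nvme_get_ioreq().
 */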
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
        req->sc = NULL;
        req->nvme_sq = NULL;
        req->sqid = 0;

        pthread_mutex_lock(&sc->mtx);

        STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
        sc->pending_ios--;

        /* when no more IO pending, can set to ready if device reset/enabled */
        if (sc->pending_ios == 0 &&
            NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
                sc->regs.csts |= NVME_CSTS_RDY;

        pthread_mutex_unlock(&sc->mtx);

        sem_post(&sc->iosemlock);
}

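/*
 * Allocate an ioreq from the free list, blocking on the ioslots semaphore
 * until a slot is available.
 */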
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
        struct pci_nvme_ioreq *req = NULL;

        sem_wait(&sc->iosemlock);
        pthread_mutex_lock(&sc->mtx);

        req = STAILQ_FIRST(&sc->ioreqs_free);
        assert(req != NULL);
        STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

        req->sc = sc;

        sc->pending_ios++;

        pthread_mutex_unlock(&sc->mtx);

        req->io_req.br_iovcnt = 0;
        req->io_req.br_offset = 0;
        req->io_req.br_resid = 0;
        req->io_req.br_param = req;
        req->prev_gpaddr = 0;
        req->prev_size = 0;

        return (req);
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
        struct pci_nvme_ioreq *req = br->br_param;
        struct nvme_submission_queue *sq = req->nvme_sq;
        uint16_t code, status;

        DPRINTF("%s error %d %s", __func__, err, strerror(err));

        /* TODO return correct error */
        code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
        pci_nvme_status_genc(&status, code);

        pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
        pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
        struct pci_nvme_ioreq *req = br->br_param;

        DPRINTF("%s error %d %s", __func__, err, strerror(err));

        pthread_cond_signal(&req->cv);
}

/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
        bool pending = false;

        if (nvstore->type == NVME_STOR_RAM) {
                pci_nvme_status_genc(status, NVME_SC_SUCCESS);
        } else {
                int err;

                req->io_req.br_callback = pci_nvme_io_done;

                err = blockif_flush(nvstore->ctx, &req->io_req);
                switch (err) {
                case 0:
                        pending = true;
                        break;
                case EOPNOTSUPP:
                        pci_nvme_status_genc(status, NVME_SC_SUCCESS);
                        break;
                default:
                        pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
                }
        }

        return (pending);
}

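/*
 * Implements the NVM Read and Write commands. For a RAM backing store the
 * data is copied directly via the PRP entries; for a blockif backing store
 * the PRP entries (or the PRP list pointed to by PRP2) are translated into
 * an iovec list and handed to blockif as an asynchronous request.
 */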
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
        uint64_t lba, nblocks, bytes;
        size_t offset;
        bool is_write = cmd->opc == NVME_OPC_WRITE;
        bool pending = false;

        lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
        nblocks = (cmd->cdw12 & 0xFFFF) + 1;

        offset = lba * nvstore->sectsz;
        bytes  = nblocks * nvstore->sectsz;

        if ((offset + bytes) > nvstore->size) {
                WPRINTF("%s command would exceed LBA range", __func__);
                pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
                goto out;
        }

        req->io_req.br_offset = lba;

        /* PRP bits 1:0 must be zero */
        cmd->prp1 &= ~0x3UL;
        cmd->prp2 &= ~0x3UL;

        if (nvstore->type == NVME_STOR_RAM) {
                uint8_t *buf = nvstore->ctx;
                enum nvme_copy_dir dir;

                if (is_write)
                        dir = NVME_COPY_TO_PRP;
                else
                        dir = NVME_COPY_FROM_PRP;

                if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
                    buf + offset, bytes, dir))
                        pci_nvme_status_genc(status,
                            NVME_SC_DATA_TRANSFER_ERROR);
                else
                        pci_nvme_status_genc(status, NVME_SC_SUCCESS);
        } else {
                uint64_t size;
                int err;

                size = MIN(PAGE_SIZE - (cmd->prp1 % PAGE_SIZE), bytes);
                if (pci_nvme_append_iov_req(sc, req, cmd->prp1,
                    size, is_write, offset)) {
                        pci_nvme_status_genc(status,
                            NVME_SC_DATA_TRANSFER_ERROR);
                        goto out;
                }

                offset += size;
                bytes  -= size;

                if (bytes == 0) {
                        ;
                } else if (bytes <= PAGE_SIZE) {
                        size = bytes;
                        if (pci_nvme_append_iov_req(sc, req, cmd->prp2,
                            size, is_write, offset)) {
                                pci_nvme_status_genc(status,
                                    NVME_SC_DATA_TRANSFER_ERROR);
                                goto out;
                        }
                } else {
                        void *vmctx = sc->nsc_pi->pi_vmctx;
                        uint64_t *prp_list = &cmd->prp2;
                        uint64_t *last = prp_list;

                        /* PRP2 is pointer to a physical region page list */
                        while (bytes) {
                                /* Last entry in list points to the next list */
                                if (prp_list == last) {
                                        uint64_t prp = *prp_list;

                                        prp_list = paddr_guest2host(vmctx, prp,
                                            PAGE_SIZE - (prp % PAGE_SIZE));
                                        last = prp_list + (NVME_PRP2_ITEMS - 1);
                                }

                                size = MIN(bytes, PAGE_SIZE);

                                if (pci_nvme_append_iov_req(sc, req, *prp_list,
                                    size, is_write, offset)) {
                                        pci_nvme_status_genc(status,
                                            NVME_SC_DATA_TRANSFER_ERROR);
                                        goto out;
1523                                 }
1524
1525                                 offset += size;
1526                                 bytes  -= size;
1527
1528                                 prp_list++;
1529                         }
1530                 }
1531                 req->io_req.br_callback = pci_nvme_io_done;
1532                 if (is_write)
1533                         err = blockif_write(nvstore->ctx, &req->io_req);
1534                 else
1535                         err = blockif_read(nvstore->ctx, &req->io_req);
1536
1537                 if (err)
1538                         pci_nvme_status_genc(status, NVME_SC_DATA_TRANSFER_ERROR);
1539                 else
1540                         pending = true;
1541         }
1542 out:
1543         return (pending);
1544 }
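
/*
 * Worked example of the PRP handling above, assuming a 4096-byte
 * PAGE_SIZE: for a 16384-byte transfer with prp1 ending in 0xe00,
 * the first segment is 4096 - 0xe00 = 512 bytes.  The 15872 bytes
 * remaining exceed one page, so prp2 names a PRP list that is walked
 * in page-sized chunks (4096 + 4096 + 4096 + 3584).  The
 * (prp_list == last) test maps that list on the first pass and, when
 * a list fills all NVME_PRP2_ITEMS slots, reloads the final slot as
 * the pointer to the next list page.
 */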
1545
1546 static void
1547 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1548 {
1549         struct pci_nvme_ioreq *req = br->br_param;
1550         struct pci_nvme_softc *sc = req->sc;
1551         bool done = true;
1552         uint16_t status;
1553
1554         if (err) {
1555                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1556         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1557                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1558         } else {
1559                 struct iovec *iov = req->io_req.br_iov;
1560
1561                 req->prev_gpaddr++;
1562                 iov += req->prev_gpaddr;
1563
1564                 /* The iov_* values are already in bytes (scaled by the sector size) */
1565                 req->io_req.br_offset = (off_t)iov->iov_base;
1566                 req->io_req.br_resid = iov->iov_len;
1567                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1568                         pci_nvme_status_genc(&status,
1569                             NVME_SC_INTERNAL_DEVICE_ERROR);
1570                 } else
1571                         done = false;
1572         }
1573
1574         if (done) {
1575                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1576                     req->cid, 0, status, 0);
1577                 pci_nvme_release_ioreq(sc, req);
1578         }
1579 }
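
/*
 * Example of the state machine above for a three-range deallocate:
 * nvme_opc_dataset_mgmt() issues the delete for range 0 with
 * prev_gpaddr = 0 and prev_size = 3.  Each completion re-enters
 * pci_nvme_dealloc_sm(), which advances prev_gpaddr and issues
 * br_iov[1], then br_iov[2]; the final completion satisfies
 * (prev_gpaddr + 1) == prev_size and posts the NVMe status.
 */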
1580
1581 static bool
1582 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1583     struct nvme_command *cmd,
1584     struct pci_nvme_blockstore *nvstore,
1585     struct pci_nvme_ioreq *req,
1586     uint16_t *status)
1587 {
1588         int err;
1589         bool pending = false;
1590
1591         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1592                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1593                 goto out;
1594         }
1595
1596         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1597                 struct nvme_dsm_range *range;
1598                 uint32_t nr, r;
1599                 int sectsz = sc->nvstore.sectsz;
1600
1601                 /*
1602                  * DSM calls are advisory only, and compliant controllers
1603                  * may choose to take no action (i.e. return Success).
1604                  */
1605                 if (!nvstore->deallocate) {
1606                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1607                         goto out;
1608                 }
1609
1610                 if (req == NULL) {
1611                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1612                         goto out;
1613                 }
1614
1615                 /* copy locally because a range entry could straddle PRPs */
1616                 range = calloc(1, NVME_MAX_DSM_TRIM);
1617                 if (range == NULL) {
1618                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1619                         goto out;
1620                 }
1621                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1622                     (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1623
1624                 /*
1625                  * If the request is for more than a single range, store
1626                  * the ranges in the br_iov. Optimize for the common case
1627                  * of a single range.
1628                  *
1629                  * Note that NVMe Number of Ranges is a zero-based value
1630                  */
1631                 nr = cmd->cdw10 & 0xff;
1632
1633                 req->io_req.br_iovcnt = 0;
1634                 req->io_req.br_offset = range[0].starting_lba * sectsz;
1635                 req->io_req.br_resid = range[0].length * sectsz;
1636
1637                 if (nr == 0) {
1638                         req->io_req.br_callback = pci_nvme_io_done;
1639                 } else {
1640                         struct iovec *iov = req->io_req.br_iov;
1641
1642                         for (r = 0; r <= nr; r++) {
1643                                 iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
1644                                 iov[r].iov_len = range[r].length * sectsz;
1645                         }
1646                         req->io_req.br_callback = pci_nvme_dealloc_sm;
1647
1648                         /*
1649                          * Use prev_gpaddr to track the current entry and
1650                          * prev_size to track the number of entries
1651                          */
1652                         req->prev_gpaddr = 0;
1653                         req->prev_size = r;
1654                 }
1655
1656                 err = blockif_delete(nvstore->ctx, &req->io_req);
1657                 if (err)
1658                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1659                 else
1660                         pending = true;
1661
1662                 free(range);
1663         }
1664 out:
1665         return (pending);
1666 }
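
/*
 * For reference, the Dataset Management fields consumed above (NVMe
 * 1.3): cdw10[7:0] is the zero-based Number of Ranges, cdw11 bit 2 is
 * the Deallocate (AD) attribute, and prp1/prp2 describe an array of
 * 16-byte entries:
 *
 *      struct nvme_dsm_range {
 *              uint32_t attributes;
 *              uint32_t length;        (in logical blocks)
 *              uint64_t starting_lba;
 *      };
 *
 * which is why NVME_MAX_DSM_TRIM bytes are copied from guest memory
 * in a single nvme_prp_memcpy() call.
 */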
1667
1668 static void
1669 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1670 {
1671         struct nvme_submission_queue *sq;
1672         uint16_t status;
1673         uint16_t sqhead;
1674
1675         /* handle all submissions up to sq->tail index */
1676         sq = &sc->submit_queues[idx];
1677
1678         if (atomic_testandset_int(&sq->busy, 1)) {
1679                 DPRINTF("%s sqid %u busy", __func__, idx);
1680                 return;
1681         }
1682
1683         sqhead = atomic_load_acq_short(&sq->head);
1684
1685         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1686                  idx, sqhead, sq->tail, sq->qbase);
1687
1688         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1689                 struct nvme_command *cmd;
1690                 struct pci_nvme_ioreq *req;
1691                 uint32_t nsid;
1692                 bool pending;
1693
1694                 pending = false;
1695                 req = NULL;
1696                 status = 0;
1697
1698                 cmd = &sq->qbase[sqhead];
1699                 sqhead = (sqhead + 1) % sq->size;
1700
1701                 nsid = le32toh(cmd->nsid);
1702                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
1703                         pci_nvme_status_genc(&status,
1704                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1705                         status |=
1706                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
1707                         goto complete;
1708                 }
1709
1710                 req = pci_nvme_get_ioreq(sc);
1711                 if (req == NULL) {
1712                         pci_nvme_status_genc(&status,
1713                             NVME_SC_INTERNAL_DEVICE_ERROR);
1714                         WPRINTF("%s: unable to allocate IO req", __func__);
1715                         goto complete;
1716                 }
1717                 req->nvme_sq = sq;
1718                 req->sqid = idx;
1719                 req->opc = cmd->opc;
1720                 req->cid = cmd->cid;
1721                 req->nsid = cmd->nsid;
1722
1723                 switch (cmd->opc) {
1724                 case NVME_OPC_FLUSH:
1725                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
1726                             req, &status);
1727                         break;
1728                 case NVME_OPC_WRITE:
1729                 case NVME_OPC_READ:
1730                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
1731                             req, &status);
1732                         break;
1733                 case NVME_OPC_WRITE_ZEROES:
1734                         /* TODO: write zeroes
1735                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
1736                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
1737                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1738                         break;
1739                 case NVME_OPC_DATASET_MANAGEMENT:
1740                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
1741                             req, &status);
1742                         break;
1743                 default:
1744                         WPRINTF("%s unhandled io command 0x%x",
1745                             __func__, cmd->opc);
1746                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
1747                 }
1748 complete:
1749                 if (!pending) {
1750                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1751                             status, 1);
1752                         if (req != NULL)
1753                                 pci_nvme_release_ioreq(sc, req);
1754                 }
1755         }
1756
1757         atomic_store_short(&sq->head, sqhead);
1758         atomic_store_int(&sq->busy, 0);
1759 }
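
/*
 * Ring arithmetic example for a 4-entry submission queue: with
 * head = 3 and a doorbell write of tail = 1, the loop above consumes
 * qbase[3], wraps to qbase[0] via (sqhead + 1) % sq->size, and stops
 * once head equals tail.  The sq->busy test-and-set keeps a second
 * doorbell write from consuming the same entries concurrently.
 */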
1760
1761 static void
1762 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1763         uint64_t idx, int is_sq, uint64_t value)
1764 {
1765         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
1766                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
1767
1768         if (is_sq) {
1769                 atomic_store_short(&sc->submit_queues[idx].tail,
1770                                    (uint16_t)value);
1771
1772                 if (idx == 0) {
1773                         pci_nvme_handle_admin_cmd(sc, value);
1774                 } else {
1775                         /* submission queue; handle new entries in SQ */
1776                         if (idx > sc->num_squeues) {
1777                                 WPRINTF("%s SQ index %lu overflow from "
1778                                          "guest (max %u)",
1779                                          __func__, idx, sc->num_squeues);
1780                                 return;
1781                         }
1782                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1783                 }
1784         } else {
1785                 if (idx > sc->num_cqueues) {
1786                         WPRINTF("%s queue index %lu overflow from "
1787                                  "guest (max %u)",
1788                                  __func__, idx, sc->num_cqueues);
1789                         return;
1790                 }
1791
1792                 sc->compl_queues[idx].head = (uint16_t)value;
1793         }
1794 }
1795
1796 static void
1797 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1798 {
1799         const char *s = iswrite ? "WRITE" : "READ";
1800
1801         switch (offset) {
1802         case NVME_CR_CAP_LOW:
1803                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
1804                 break;
1805         case NVME_CR_CAP_HI:
1806                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
1807                 break;
1808         case NVME_CR_VS:
1809                 DPRINTF("%s %s NVME_CR_VS", func, s);
1810                 break;
1811         case NVME_CR_INTMS:
1812                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
1813                 break;
1814         case NVME_CR_INTMC:
1815                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
1816                 break;
1817         case NVME_CR_CC:
1818                 DPRINTF("%s %s NVME_CR_CC", func, s);
1819                 break;
1820         case NVME_CR_CSTS:
1821                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
1822                 break;
1823         case NVME_CR_NSSR:
1824                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
1825                 break;
1826         case NVME_CR_AQA:
1827                 DPRINTF("%s %s NVME_CR_AQA", func, s);
1828                 break;
1829         case NVME_CR_ASQ_LOW:
1830                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
1831                 break;
1832         case NVME_CR_ASQ_HI:
1833                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
1834                 break;
1835         case NVME_CR_ACQ_LOW:
1836                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
1837                 break;
1838         case NVME_CR_ACQ_HI:
1839                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
1840                 break;
1841         default:
1842                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
1843         }
1844
1845 }
1846
1847 static void
1848 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1849         uint64_t offset, int size, uint64_t value)
1850 {
1851         uint32_t ccreg;
1852
1853         if (offset >= NVME_DOORBELL_OFFSET) {
1854                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1855                 uint64_t idx = belloffset / 8; /* 8 bytes per queue pair: SQ tail + CQ head */
1856                 int is_sq = (belloffset % 8) < 4;
1857
1858                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1859                         WPRINTF("guest attempted an overflow write offset "
1860                                  "0x%lx, val 0x%lx in %s",
1861                                  offset, value, __func__);
1862                         return;
1863                 }
1864
1865                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1866                 return;
1867         }
1868
1869         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
1870                 offset, size, value);
1871
1872         if (size != 4) {
1873                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
1874                          "val 0x%lx) to bar0 in %s",
1875                          size, offset, value, __func__);
1876                 /* TODO: shutdown device */
1877                 return;
1878         }
1879
1880         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1881
1882         pthread_mutex_lock(&sc->mtx);
1883
1884         switch (offset) {
1885         case NVME_CR_CAP_LOW:
1886         case NVME_CR_CAP_HI:
1887                 /* readonly */
1888                 break;
1889         case NVME_CR_VS:
1890                 /* readonly */
1891                 break;
1892         case NVME_CR_INTMS:
1893                 /* MSI-X, so ignore */
1894                 break;
1895         case NVME_CR_INTMC:
1896                 /* MSI-X, so ignore */
1897                 break;
1898         case NVME_CR_CC:
1899                 ccreg = (uint32_t)value;
1900
1901                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1902                          "iocqes %u",
1903                         __func__,
1904                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1905                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1906                          NVME_CC_GET_IOCQES(ccreg));
1907
1908                 if (NVME_CC_GET_SHN(ccreg)) {
1909                         /* perform shutdown - flush out data to backend */
1910                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1911                             NVME_CSTS_REG_SHST_SHIFT);
1912                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1913                             NVME_CSTS_REG_SHST_SHIFT;
1914                 }
1915                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1916                         if (NVME_CC_GET_EN(ccreg) == 0)
1917                                 /* transition 1->0 causes controller reset */
1918                                 pci_nvme_reset_locked(sc);
1919                         else
1920                                 pci_nvme_init_controller(ctx, sc);
1921                 }
1922
1923                 /* Insert the iocqes, iosqes and en bits from the write */
1924                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1925                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1926                 if (NVME_CC_GET_EN(ccreg) == 0) {
1927                         /* Insert the ams, mps and css bit fields */
1928                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1929                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1930                         sc->regs.csts &= ~NVME_CSTS_RDY;
1931                 } else if (sc->pending_ios == 0) {
1932                         sc->regs.csts |= NVME_CSTS_RDY;
1933                 }
1934                 break;
1935         case NVME_CR_CSTS:
1936                 break;
1937         case NVME_CR_NSSR:
1938                 /* ignore writes; don't support subsystem reset */
1939                 break;
1940         case NVME_CR_AQA:
1941                 sc->regs.aqa = (uint32_t)value;
1942                 break;
1943         case NVME_CR_ASQ_LOW:
1944                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1945                                (0xFFFFF000 & value);
1946                 break;
1947         case NVME_CR_ASQ_HI:
1948                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1949                                (value << 32);
1950                 break;
1951         case NVME_CR_ACQ_LOW:
1952                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1953                                (0xFFFFF000 & value);
1954                 break;
1955         case NVME_CR_ACQ_HI:
1956                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1957                                (value << 32);
1958                 break;
1959         default:
1960                 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
1961                          __func__, offset, value, size);
1962         }
1963         pthread_mutex_unlock(&sc->mtx);
1964 }
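
/*
 * Doorbell decoding example: each queue pair owns 8 bytes past
 * NVME_DOORBELL_OFFSET (SQ tail at +0, CQ head at +4, i.e. a
 * CAP.DSTRD of 0).  A 4-byte write at belloffset 0x14 yields
 * idx = 0x14 / 8 = 2, and (0x14 % 8) < 4 is false, so it updates the
 * head of completion queue 2.
 */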
1965
1966 static void
1967 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1968                 int baridx, uint64_t offset, int size, uint64_t value)
1969 {
1970         struct pci_nvme_softc* sc = pi->pi_arg;
1971
1972         if (baridx == pci_msix_table_bar(pi) ||
1973             baridx == pci_msix_pba_bar(pi)) {
1974                 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1975                          "value 0x%lx", baridx, offset, size, value);
1976
1977                 pci_emul_msix_twrite(pi, offset, size, value);
1978                 return;
1979         }
1980
1981         switch (baridx) {
1982         case 0:
1983                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1984                 break;
1985
1986         default:
1987                 DPRINTF("%s unknown baridx %d, val 0x%lx",
1988                          __func__, baridx, value);
1989         }
1990 }
1991
1992 static uint64_t
1993 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
1994 {
1995         uint64_t value;
1996
1997         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1998
1999         if (offset < NVME_DOORBELL_OFFSET) {
2000                 void *p = &(sc->regs);
2001                 pthread_mutex_lock(&sc->mtx);
2002                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
2003                 pthread_mutex_unlock(&sc->mtx);
2004         } else {
2005                 value = 0;
2006                 WPRINTF("pci_nvme: read invalid offset %lu", offset);
2007         }
2008
2009         switch (size) {
2010         case 1:
2011                 value &= 0xFF;
2012                 break;
2013         case 2:
2014                 value &= 0xFFFF;
2015                 break;
2016         case 4:
2017                 value &= 0xFFFFFFFF;
2018                 break;
2019         }
2020
2021         DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2022                  offset, size, (uint32_t)value);
2023
2024         return (value);
2025 }
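
/*
 * Example of the masking above: a 2-byte read of CSTS (offset 0x1c)
 * copies just two bytes into the uninitialized 8-byte value, so the
 * switch masks off the bytes memcpy() never wrote before the result
 * is returned to the guest.
 */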
2026
2027
2028
2029 static uint64_t
2030 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2031     uint64_t offset, int size)
2032 {
2033         struct pci_nvme_softc* sc = pi->pi_arg;
2034
2035         if (baridx == pci_msix_table_bar(pi) ||
2036             baridx == pci_msix_pba_bar(pi)) {
2037                 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2038                         baridx, offset, size);
2039
2040                 return pci_emul_msix_tread(pi, offset, size);
2041         }
2042
2043         switch (baridx) {
2044         case 0:
2045                 return pci_nvme_read_bar_0(sc, offset, size);
2046
2047         default:
2048                 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2049         }
2050
2051         return (0);
2052 }
2053
2054
2055 static int
2056 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2057 {
2058         char bident[sizeof("XX:X:X")];
2059         char    *uopt, *xopts, *config;
2060         uint32_t sectsz;
2061         int optidx;
2062
2063         sc->max_queues = NVME_QUEUES;
2064         sc->max_qentries = NVME_MAX_QENTRIES;
2065         sc->ioslots = NVME_IOSLOTS;
2066         sc->num_squeues = sc->max_queues;
2067         sc->num_cqueues = sc->max_queues;
2068         sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2069         sectsz = 0;
2070
2071         uopt = strdup(opts);
2072         optidx = 0;
2073         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2074                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2075         for (xopts = strtok(uopt, ",");
2076              xopts != NULL;
2077              xopts = strtok(NULL, ",")) {
2078
2079                 if ((config = strchr(xopts, '=')) != NULL)
2080                         *config++ = '\0';
2081
2082                 if (!strcmp("maxq", xopts)) {
2083                         sc->max_queues = atoi(config);
2084                 } else if (!strcmp("qsz", xopts)) {
2085                         sc->max_qentries = atoi(config);
2086                 } else if (!strcmp("ioslots", xopts)) {
2087                         sc->ioslots = atoi(config);
2088                 } else if (!strcmp("sectsz", xopts)) {
2089                         sectsz = atoi(config);
2090                 } else if (!strcmp("ser", xopts)) {
2091                         /*
2092                          * This field indicates the Product Serial Number in
2093                          * 7-bit ASCII; unused bytes should be space characters.
2094                          * Ref: NVMe v1.3c.
2095                          */
2096                         cpywithpad((char *)sc->ctrldata.sn,
2097                                    sizeof(sc->ctrldata.sn), config, ' ');
2098                 } else if (!strcmp("ram", xopts)) {
2099                         uint64_t sz = strtoull(config, NULL, 10);
2100
2101                         sc->nvstore.type = NVME_STOR_RAM;
2102                         sc->nvstore.size = sz * 1024 * 1024;
2103                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2104                         sc->nvstore.sectsz = 4096;
2105                         sc->nvstore.sectsz_bits = 12;
2106                         if (sc->nvstore.ctx == NULL) {
2107                                 perror("Unable to allocate RAM");
2108                                 free(uopt);
2109                                 return (-1);
2110                         }
2111                 } else if (!strcmp("eui64", xopts)) {
2112                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2113                 } else if (!strcmp("dsm", xopts)) {
2114                         if (!strcmp("auto", config))
2115                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2116                         else if (!strcmp("enable", config))
2117                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2118                         else if (!strcmp("disable", config))
2119                                 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2120                 } else if (optidx == 0) {
2121                         snprintf(bident, sizeof(bident), "%d:%d",
2122                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2123                         sc->nvstore.ctx = blockif_open(xopts, bident);
2124                         if (sc->nvstore.ctx == NULL) {
2125                                 perror("Could not open backing file");
2126                                 free(uopt);
2127                                 return (-1);
2128                         }
2129                         sc->nvstore.type = NVME_STOR_BLOCKIF;
2130                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2131                 } else {
2132                         EPRINTLN("Invalid option %s", xopts);
2133                         free(uopt);
2134                         return (-1);
2135                 }
2136
2137                 optidx++;
2138         }
2139         free(uopt);
2140
2141         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2142                 EPRINTLN("backing store not specified");
2143                 return (-1);
2144         }
2145         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2146                 sc->nvstore.sectsz = sectsz;
2147         else if (sc->nvstore.type != NVME_STOR_RAM)
2148                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2149         for (sc->nvstore.sectsz_bits = 9;
2150              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2151              sc->nvstore.sectsz_bits++);
2152
2153         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2154                 sc->max_queues = NVME_QUEUES;
2155
2156         if (sc->max_qentries <= 0) {
2157                 EPRINTLN("Invalid qsz option");
2158                 return (-1);
2159         }
2160         if (sc->ioslots <= 0) {
2161                 EPRINTLN("Invalid ioslots option");
2162                 return (-1);
2163         }
2164
2165         return (0);
2166 }
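
/*
 * Example invocations of the options parsed above (the device paths
 * are illustrative):
 *
 *      -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=4,qsz=1024,ioslots=16
 *      -s 4,nvme,ram=1024,ser=NVME0001
 *
 * A devpath is only accepted as the first option (optidx 0); the
 * named options, including ram=, may appear in any order.
 */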
2167
2168 static int
2169 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2170 {
2171         struct pci_nvme_softc *sc;
2172         uint32_t pci_membar_sz;
2173         int     error;
2174
2175         error = 0;
2176
2177         sc = calloc(1, sizeof(struct pci_nvme_softc));
2178         pi->pi_arg = sc;
2179         sc->nsc_pi = pi;
2180
2181         error = pci_nvme_parse_opts(sc, opts);
2182         if (error < 0)
2183                 goto done;
2184         else
2185                 error = 0;
2186
2187         STAILQ_INIT(&sc->ioreqs_free);
2188         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2189         for (int i = 0; i < sc->ioslots; i++) {
2190                 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2191                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
2192                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
2193         }
2194         sc->intr_coales_aggr_thresh = 1;
2195
2196         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2197         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2198         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2199         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2200         pci_set_cfgdata8(pi, PCIR_PROGIF,
2201                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2202
2203         /*
2204          * Allocate size of NVMe registers + doorbell space for all queues.
2205          *
2206          * The specification requires a minimum memory I/O window size of 16K.
2207          * The Windows driver will refuse to start a device with a smaller
2208          * window.
2209          */
2210         pci_membar_sz = sizeof(struct nvme_registers) +
2211             2 * sizeof(uint32_t) * (sc->max_queues + 1);
2212         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2213
2214         DPRINTF("nvme membar size: %u", pci_membar_sz);
2215
2216         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2217         if (error) {
2218                 WPRINTF("%s pci alloc mem bar failed", __func__);
2219                 goto done;
2220         }
2221
2222         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2223         if (error) {
2224                 WPRINTF("%s pci add msixcap failed", __func__);
2225                 goto done;
2226         }
2227
2228         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2229         if (error) {
2230                 WPRINTF("%s pci add Express capability failed", __func__);
2231                 goto done;
2232         }
2233
2234         pthread_mutex_init(&sc->mtx, NULL);
2235         sem_init(&sc->iosemlock, 0, sc->ioslots);
2236
2237         pci_nvme_reset(sc);
2238         /*
2239          * Controller data depends on Namespace data so initialize Namespace
2240          * data first.
2241          */
2242         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2243         pci_nvme_init_ctrldata(sc);
2244         pci_nvme_init_logpages(sc);
2245
2246         pci_lintr_request(pi);
2247
2248 done:
2249         return (error);
2250 }
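
/*
 * BAR 0 sizing example with the default max_queues of 16: the
 * register file plus 2 * sizeof(uint32_t) * 17 doorbell bytes totals
 * well under NVME_MMIO_SPACE_MIN, so the MAX() above rounds the
 * window up to the 16K minimum the specification (and the Windows
 * driver) requires.
 */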
2251
2252
2253 struct pci_devemu pci_de_nvme = {
2254         .pe_emu =       "nvme",
2255         .pe_init =      pci_nvme_init,
2256         .pe_barwrite =  pci_nvme_write,
2257         .pe_barread =   pci_nvme_read
2258 };
2259 PCI_EMUL_SET(pci_de_nvme);