/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 */
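
/*
 * Example invocation (illustrative; the device path, slot number, and
 * tuning values below are hypothetical):
 *
 *   bhyve ... -s 4,nvme,/dev/zvol/tank/vm0,maxq=4,qsz=512,ioslots=16,
 *       sectsz=512,ser=NVME0001 ...
 *
 * or, backed by a 512 MiB RAM disk:
 *
 *   bhyve ... -s 4,nvme,ram=512 ...
 */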

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define DPRINTF(params) if (nvme_debug) PRINTLN params
#define WPRINTF(params) PRINTLN params

/* defaults; can be overridden */
#define NVME_MSIX_BAR           4

#define NVME_IOSLOTS            8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN     (1 << 14)

#define NVME_QUEUES             16
#define NVME_MAX_QENTRIES       2048

#define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
#define NVME_MAX_BLOCKIOVS      512

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS          0xffff
#define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)         ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)         ((one)  - 1)

/* Encode number of SQs and CQs for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
        ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
         ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
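
/*
 * Worked example (illustrative): with num_squeues = 8 and num_cqueues = 4,
 * the macro evaluates to (7 & 0xffff) | ((3 & 0xffff) << 16) = 0x00030007,
 * i.e. both counts are reported as 0-based values.
 */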

#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
        NVME_CR_CAP_LOW = 0x00,
        NVME_CR_CAP_HI  = 0x04,
        NVME_CR_VS      = 0x08,
        NVME_CR_INTMS   = 0x0c,
        NVME_CR_INTMC   = 0x10,
        NVME_CR_CC      = 0x14,
        NVME_CR_CSTS    = 0x1c,
        NVME_CR_NSSR    = 0x20,
        NVME_CR_AQA     = 0x24,
        NVME_CR_ASQ_LOW = 0x28,
        NVME_CR_ASQ_HI  = 0x2c,
        NVME_CR_ACQ_LOW = 0x30,
        NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
        NVME_CMD_CDW11_PC  = 0x0001,
        NVME_CMD_CDW11_IEN = 0x0002,
        NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define NVME_CQ_INTEN   0x01
#define NVME_CQ_INTCOAL 0x02

struct nvme_completion_queue {
        struct nvme_completion *qbase;
        uint32_t        size;
        uint16_t        tail; /* nvme progress */
        uint16_t        head; /* guest progress */
        uint16_t        intr_vec;
        uint32_t        intr_en;
        pthread_mutex_t mtx;
};

struct nvme_submission_queue {
        struct nvme_command *qbase;
        uint32_t        size;
        uint16_t        head; /* nvme progress */
        uint16_t        tail; /* guest progress */
        uint16_t        cqid; /* completion queue id */
        int             busy; /* queue is being processed */
        int             qpriority;
};

enum nvme_storage_type {
        NVME_STOR_BLOCKIF = 0,
        NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
        enum nvme_storage_type type;
        void            *ctx;
        uint64_t        size;
        uint32_t        sectsz;
        uint32_t        sectsz_bits;
        uint64_t        eui64;
};

struct pci_nvme_ioreq {
        struct pci_nvme_softc *sc;
        struct pci_nvme_ioreq *next;
        struct nvme_submission_queue *nvme_sq;
        uint16_t        sqid;

        /* command information */
        uint16_t        opc;
        uint16_t        cid;
        uint32_t        nsid;

        uint64_t        prev_gpaddr;
        size_t          prev_size;

        /*
         * lock if all iovs consumed (big IO);
         * complete transaction before continuing
         */
        pthread_mutex_t mtx;
        pthread_cond_t  cv;

        struct blockif_req io_req;

        /* pad to fit up to 512 page descriptors from guest IO request */
        struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

struct pci_nvme_softc {
        struct pci_devinst *nsc_pi;

        pthread_mutex_t mtx;

        struct nvme_registers regs;

        struct nvme_namespace_data  nsdata;
        struct nvme_controller_data ctrldata;
        struct nvme_error_information_entry err_log;
        struct nvme_health_information_page health_log;
        struct nvme_firmware_page fw_log;

        struct pci_nvme_blockstore nvstore;

        uint16_t        max_qentries;   /* max entries per queue */
        uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
        uint32_t        num_cqueues;
        uint32_t        num_squeues;

        struct pci_nvme_ioreq *ioreqs;
        struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
        uint32_t        pending_ios;
        uint32_t        ioslots;
        sem_t           iosemlock;

        /*
         * Memory mapped Submission and Completion queues
         * Each array includes both Admin and IO queues
         */
        struct nvme_completion_queue *compl_queues;
        struct nvme_submission_queue *submit_queues;

        /* controller features */
        uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
        uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
        uint32_t        async_ev_config;         /* 0x0B: async event config */
};


static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
        ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
        ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
        ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
        ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
        ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
        ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
         (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
         (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
        ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
         (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
         (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
        ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
        ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
         (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
        size_t len;

        len = strnlen(src, dst_size);
        memset(dst, pad, dst_size);
        memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

        *status &= ~NVME_STATUS_MASK;
        *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
                (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

        pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

static __inline void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

        if (prev)
                *status &= ~NVME_STATUS_P;
        else
                *status |= NVME_STATUS_P;
}
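
/*
 * Note on the Phase Tag (descriptive): the host detects new completion
 * entries by watching the P bit, which the controller inverts on each pass
 * through the circular completion queue. Toggling the slot's previous phase,
 * rather than writing a fixed value, keeps entries from consecutive wraps
 * distinguishable.
 */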

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
        struct nvme_controller_data *cd = &sc->ctrldata;

        cd->vid = 0xFB5D;
        cd->ssvid = 0x0000;

        cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
        cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

        /* Num of submission commands that we can handle at a time (2^rab) */
        cd->rab   = 4;

        /* FreeBSD OUI */
        cd->ieee[0] = 0x58;
        cd->ieee[1] = 0x9c;
        cd->ieee[2] = 0xfc;

        cd->mic = 0;

        cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */

        cd->ver = 0x00010300;

        cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
        cd->acl = 2;
        cd->aerl = 4;

        cd->lpa = 0;    /* TODO: support some simple things like SMART */
        cd->elpe = 0;   /* max error log page entries */
        cd->npss = 1;   /* number of power states supported */

        /* Warning Composite Temperature Threshold */
        cd->wctemp = 0x0157;

        cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
            (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
        cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
            (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
        cd->nn = 1;     /* number of namespaces */

        cd->fna = 0x03;

        cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
        const unsigned char *cp = buffer;
        /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
        static uint16_t const crc16_table[256] = {
                0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
                0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
                0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
                0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
                0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
                0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
                0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
                0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
                0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
                0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
                0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
                0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
                0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
                0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
                0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
                0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
                0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
                0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
                0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
                0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
                0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
                0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
                0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
                0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
                0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
                0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
                0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
                0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
                0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
                0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
                0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
                0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
        };

        while (len--)
                crc = (((crc >> 8) & 0xffU) ^
                    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
        return crc;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    uint64_t eui64)
{

        nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
        nd->ncap = nd->nsze;
        nd->nuse = nd->nsze;

        /* Get LBA and backstore information from backing store */
        nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
        nd->flbas = 0;

        /* Create an EUI-64 if user did not provide one */
        if (eui64 == 0) {
                char *data = NULL;

                asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
                    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

                if (data != NULL) {
                        eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
                        free(data);
                }
                eui64 = (eui64 << 16) | (nsid & 0xffff);
        }
        be64enc(nd->eui64, eui64);

        /* LBA data-sz = 2^lbads */
        nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
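
/*
 * Illustrative layout of the generated EUI-64 (assuming the asprintf()
 * above succeeded): the upper bytes carry OUI_FREEBSD_NVME_LOW plus a
 * crc16() of "<vmname><bus><slot><func>", shifted left by 16, and the low
 * 16 bits carry the namespace id, e.g. nsid 1 contributes a trailing
 * 0x0001. The value is stored big-endian per the NVMe spec.
 */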

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

        memset(&sc->err_log, 0, sizeof(sc->err_log));
        memset(&sc->health_log, 0, sizeof(sc->health_log));
        memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
        DPRINTF(("%s", __func__));

        sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
            (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
            (60 << NVME_CAP_LO_REG_TO_SHIFT);

        sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

        sc->regs.vs = 0x00010300;       /* NVMe v1.3 */

        sc->regs.cc = 0;
        sc->regs.csts = 0;

        sc->num_cqueues = sc->num_squeues = sc->max_queues;
        if (sc->submit_queues != NULL) {
                for (int i = 0; i < sc->num_squeues + 1; i++) {
                        /*
                         * The Admin Submission Queue is at index 0.
                         * It must not be changed at reset otherwise the
                         * emulation will be out of sync with the guest.
                         */
                        if (i != 0) {
                                sc->submit_queues[i].qbase = NULL;
                                sc->submit_queues[i].size = 0;
                                sc->submit_queues[i].cqid = 0;
                        }
                        sc->submit_queues[i].tail = 0;
                        sc->submit_queues[i].head = 0;
                        sc->submit_queues[i].busy = 0;
                }
        } else
                sc->submit_queues = calloc(sc->num_squeues + 1,
                                        sizeof(struct nvme_submission_queue));

        if (sc->compl_queues != NULL) {
                for (int i = 0; i < sc->num_cqueues + 1; i++) {
                        /* See Admin Submission Queue note above */
                        if (i != 0) {
                                sc->compl_queues[i].qbase = NULL;
                                sc->compl_queues[i].size = 0;
                        }

                        sc->compl_queues[i].tail = 0;
                        sc->compl_queues[i].head = 0;
                }
        } else {
                sc->compl_queues = calloc(sc->num_cqueues + 1,
                                        sizeof(struct nvme_completion_queue));

                for (int i = 0; i < sc->num_cqueues + 1; i++)
                        pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
        }
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
        pthread_mutex_lock(&sc->mtx);
        pci_nvme_reset_locked(sc);
        pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
        uint16_t acqs, asqs;

        DPRINTF(("%s", __func__));

        asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
        sc->submit_queues[0].size = asqs;
        sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
                    sizeof(struct nvme_command) * asqs);

        DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
                __func__, sc->regs.asq, sc->submit_queues[0].qbase));

        acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
            NVME_AQA_REG_ACQS_MASK) + 1;
        sc->compl_queues[0].size = acqs;
        sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
                 sizeof(struct nvme_completion) * acqs);
        DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
                __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
        size_t len)
{
        uint8_t *dst;
        size_t bytes;

        if (len > (8 * 1024)) {
                return (-1);
        }

        /* Copy from the start of prp1 to the end of the physical page */
        bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
        bytes = MIN(bytes, len);

        dst = vm_map_gpa(ctx, prp1, bytes);
        if (dst == NULL) {
                return (-1);
        }

        memcpy(dst, src, bytes);

        src += bytes;

        len -= bytes;
        if (len == 0) {
                return (0);
        }

        len = MIN(len, PAGE_SIZE);

        dst = vm_map_gpa(ctx, prp2, len);
        if (dst == NULL) {
                return (-1);
        }

        memcpy(dst, src, len);

        return (0);
}
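
/*
 * Worked example (illustrative, assuming a 4 KiB PAGE_SIZE): for a
 * 6144-byte copy with prp1 offset 0x800 into its page, the first memcpy
 * covers the 2048 bytes remaining in prp1's page and the second covers the
 * remaining 4096 bytes via prp2. Lengths above 8 KiB are rejected outright,
 * and at most PAGE_SIZE bytes are ever copied through prp2.
 */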

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
        if (qid == 0 || qid > sc->num_squeues) {
                WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
                        __func__, qid, sc->num_squeues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->submit_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_submission_queue *nsq;

                if ((qid == 0) || (qid > sc->num_squeues)) {
                        WPRINTF(("%s queue index %u > num_squeues %u",
                                __func__, qid, sc->num_squeues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                nsq = &sc->submit_queues[qid];
                nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                              sizeof(struct nvme_command) * (size_t)nsq->size);
                nsq->cqid = (command->cdw11 >> 16) & 0xffff;
                nsq->qpriority = (command->cdw11 >> 1) & 0x03;

                DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
                        qid, nsq->size, nsq->qbase, nsq->cqid));

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

                DPRINTF(("%s completed creating IOSQ qid %u",
                         __func__, qid));
        } else {
                /*
                 * Guest sent a non-contiguous submission queue request.
                 * This setting is unsupported by this emulation.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o submission queue", __func__));

                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        }
        return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
        if (qid == 0 || qid > sc->num_cqueues) {
                WPRINTF(("%s queue index %u / num_cqueues %u",
                        __func__, qid, sc->num_cqueues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->compl_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_completion_queue *ncq;

                if ((qid == 0) || (qid > sc->num_cqueues)) {
                        WPRINTF(("%s queue index %u > num_cqueues %u",
                                __func__, qid, sc->num_cqueues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                ncq = &sc->compl_queues[qid];
                ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
                ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
                ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                /* map completion entries, not (larger) command entries */
                ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
                             command->prp1,
                             sizeof(struct nvme_completion) * (size_t)ncq->size);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        } else {
                /*
                 * Non-contig completion queue unsupported.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o completion queue",
                         __func__));

                /* 0x12 = Invalid Use of Controller Memory Buffer */
                pci_nvme_status_genc(&compl->status, 0x12);
        }

        return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        /* NUMDL is a 0-based count of dwords (4 bytes each) to return */
        uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * sizeof(uint32_t);
        uint8_t logpage = command->cdw10 & 0xFF;

        DPRINTF(("%s log page %u len %u", __func__, logpage, logsize));

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        switch (logpage) {
        case NVME_LOG_ERROR:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->err_log, logsize);
                break;
        case NVME_LOG_HEALTH_INFORMATION:
                /* TODO: present some smart info */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->health_log, logsize);
                break;
        case NVME_LOG_FIRMWARE_SLOT:
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->fw_log, logsize);
                break;
        default:
                WPRINTF(("%s get log page %x command not supported",
                        __func__, logpage));

                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_LOG_PAGE);
        }

        return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        void *dest;

        DPRINTF(("%s identify 0x%x nsid 0x%x", __func__,
                command->cdw10 & 0xFF, command->nsid));

        switch (command->cdw10 & 0xFF) {
        case 0x00: /* return Identify Namespace data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
                break;
        case 0x01: /* return Identify Controller data structure */
                nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
                    command->prp2, (uint8_t *)&sc->ctrldata,
                    sizeof(sc->ctrldata));
                break;
        case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(uint32_t) * 1024);
                ((uint32_t *)dest)[0] = 1;
                ((uint32_t *)dest)[1] = 0;
                break;
        case 0x11:
                pci_nvme_status_genc(&compl->status,
                    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
                return (1);
        case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
        case 0x10:
        case 0x12:
        case 0x13:
        case 0x14:
        case 0x15:
        default:
                DPRINTF(("%s unsupported identify command requested 0x%x",
                         __func__, command->cdw10 & 0xFF));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t nqr;   /* Number of Queues Requested */

        nqr = command->cdw11 & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_squeues = ONE_BASED(nqr);
        if (sc->num_squeues > sc->max_queues) {
                DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues,
                                        sc->max_queues));
                sc->num_squeues = sc->max_queues;
        }

        nqr = (command->cdw11 >> 16) & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_cqueues = ONE_BASED(nqr);
        if (sc->num_cqueues > sc->max_queues) {
                DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues,
                                        sc->max_queues));
                sc->num_cqueues = sc->max_queues;
        }

        compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

        return (0);
}

static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;
        uint32_t iv;

        DPRINTF(("%s feature 0x%x", __func__, feature));
        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration 0x%x", command->cdw11));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management 0x%x", command->cdw11));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range 0x%x", command->cdw11));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold 0x%x", command->cdw11));
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery 0x%x", command->cdw11));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache 0x%x", command->cdw11));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                nvme_set_feature_queues(sc, command, compl);
                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing 0x%x", command->cdw11));

                /* in uS */
                sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

                sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                iv = command->cdw11 & 0xFFFF;

                DPRINTF(("  interrupt vector configuration 0x%x",
                        command->cdw11));

                for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
                        if (sc->compl_queues[i].intr_vec == iv) {
                                if (command->cdw11 & (1 << 16))
                                        sc->compl_queues[i].intr_en |=
                                                              NVME_CQ_INTCOAL;
                                else
                                        sc->compl_queues[i].intr_en &=
                                                             ~NVME_CQ_INTCOAL;
                        }
                }
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity 0x%x", command->cdw11));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration 0x%x",
                        command->cdw11));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker 0x%x",
                        command->cdw11));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition 0x%x",
                        command->cdw11));
                break;
        default:
                WPRINTF(("%s invalid feature", __func__));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;

        DPRINTF(("%s feature 0x%x", __func__, feature));

        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration"));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management"));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range"));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold"));
                switch ((command->cdw11 >> 20) & 0x3) {
                case 0:
                        /* Over temp threshold */
                        compl->cdw0 = 0xFFFF;
                        break;
                case 1:
                        /* Under temp threshold */
                        compl->cdw0 = 0;
                        break;
                default:
                        WPRINTF(("  invalid threshold type select"));
                        pci_nvme_status_genc(&compl->status,
                            NVME_SC_INVALID_FIELD);
                        return (1);
                }
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery"));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache"));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

                DPRINTF(("  number of queues (submit %u, completion %u)",
                        compl->cdw0 & 0xFFFF,
                        (compl->cdw0 >> 16) & 0xFFFF));

                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing"));
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                DPRINTF(("  interrupt vector configuration"));
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity"));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration"));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker"));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition"));
                break;
        default:
                WPRINTF(("%s invalid feature 0x%x", __func__, feature));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        DPRINTF(("%s submission queue %u, command ID 0x%x", __func__,
                command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));

        /* TODO: search for the command ID and abort it */

        compl->cdw0 = 1;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
        struct nvme_command* command, struct nvme_completion* compl)
{
        DPRINTF(("%s async event request 0x%x", __func__, command->cdw11));

        /*
         * TODO: raise events when they happen based on the Set Features cmd.
         * These events happen async, so only set completion successful if
         * there is an event reflective of the request to get event.
         */
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
        return (0);
}

static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
        struct nvme_completion compl;
        struct nvme_command *cmd;
        struct nvme_submission_queue *sq;
        struct nvme_completion_queue *cq;
        int do_intr = 0;
        uint16_t sqhead;

        DPRINTF(("%s index %u", __func__, (uint32_t)value));

        sq = &sc->submit_queues[0];

        sqhead = atomic_load_acq_short(&sq->head);

        if (atomic_testandset_int(&sq->busy, 1)) {
                DPRINTF(("%s SQ busy, head %u, tail %u",
                        __func__, sqhead, sq->tail));
                return;
        }

        DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail));

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                cmd = &(sq->qbase)[sqhead];
                compl.cdw0 = 0;
                compl.status = 0;

                switch (cmd->opc) {
                case NVME_OPC_DELETE_IO_SQ:
                        DPRINTF(("%s command DELETE_IO_SQ", __func__));
                        do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_SQ:
                        DPRINTF(("%s command CREATE_IO_SQ", __func__));
                        do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_DELETE_IO_CQ:
                        DPRINTF(("%s command DELETE_IO_CQ", __func__));
                        do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_CQ:
                        DPRINTF(("%s command CREATE_IO_CQ", __func__));
                        do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_LOG_PAGE:
                        DPRINTF(("%s command GET_LOG_PAGE", __func__));
                        do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
                        break;
                case NVME_OPC_IDENTIFY:
                        DPRINTF(("%s command IDENTIFY", __func__));
                        do_intr |= nvme_opc_identify(sc, cmd, &compl);
                        break;
                case NVME_OPC_ABORT:
                        DPRINTF(("%s command ABORT", __func__));
                        do_intr |= nvme_opc_abort(sc, cmd, &compl);
                        break;
                case NVME_OPC_SET_FEATURES:
                        DPRINTF(("%s command SET_FEATURES", __func__));
                        do_intr |= nvme_opc_set_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_FEATURES:
                        DPRINTF(("%s command GET_FEATURES", __func__));
                        do_intr |= nvme_opc_get_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_ASYNC_EVENT_REQUEST:
                        DPRINTF(("%s command ASYNC_EVENT_REQ", __func__));
                        /* XXX don't care, unhandled for now:
                        do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
                        */
                        compl.status = NVME_NO_STATUS;
                        break;
                default:
                        WPRINTF(("0x%x command is not implemented",
                            cmd->opc));
                        pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
                        do_intr |= 1;
                }

                if (NVME_COMPLETION_VALID(compl)) {
                        struct nvme_completion *cp;
                        int phase;

                        cq = &sc->compl_queues[0];

                        cp = &(cq->qbase)[cq->tail];
                        cp->cdw0 = compl.cdw0;
                        cp->sqid = 0;
                        cp->sqhd = sqhead;
                        cp->cid = cmd->cid;

                        phase = NVME_STATUS_GET_P(cp->status);
                        cp->status = compl.status;
                        pci_nvme_toggle_phase(&cp->status, phase);

                        cq->tail = (cq->tail + 1) % cq->size;
                }
                sqhead = (sqhead + 1) % sq->size;
        }

        DPRINTF(("setting sqhead %u", sqhead));
        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);

        if (do_intr)
                pci_generate_msix(sc->nsc_pi, 0);

}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
        uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
        int iovidx;

        if (req != NULL) {
                /* concatenate contig block-iovs to minimize number of iovs */
                if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
                        iovidx = req->io_req.br_iovcnt - 1;

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             req->prev_gpaddr, size);

                        req->prev_size += size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iov[iovidx].iov_len = req->prev_size;
                } else {
                        pthread_mutex_lock(&req->mtx);

                        iovidx = req->io_req.br_iovcnt;
                        if (iovidx == NVME_MAX_BLOCKIOVS) {
                                int err = 0;

                                DPRINTF(("large I/O, doing partial req"));

                                iovidx = 0;
                                req->io_req.br_iovcnt = 0;

                                req->io_req.br_callback = pci_nvme_io_partial;

                                if (!do_write)
                                        err = blockif_read(sc->nvstore.ctx,
                                                           &req->io_req);
                                else
                                        err = blockif_write(sc->nvstore.ctx,
                                                            &req->io_req);

                                /* wait until req completes before cont */
                                if (err == 0)
                                        pthread_cond_wait(&req->cv, &req->mtx);
                        }
                        if (iovidx == 0) {
                                req->io_req.br_offset = lba;
                                req->io_req.br_resid = 0;
                                req->io_req.br_param = req;
                        }

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             gpaddr, size);

                        req->io_req.br_iov[iovidx].iov_len = size;

                        req->prev_gpaddr = gpaddr;
                        req->prev_size = size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iovcnt++;

                        pthread_mutex_unlock(&req->mtx);
                }
        } else {
                /* RAM buffer: read/write directly */
                void *p = sc->nvstore.ctx;
                void *gptr;

                if ((lba + size) > sc->nvstore.size) {
                        WPRINTF(("%s write would overflow RAM", __func__));
                        return (-1);
                }

                p = (void *)((uintptr_t)p + (uintptr_t)lba);
                gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
                if (do_write)
                        memcpy(p, gptr, size);
                else
                        memcpy(gptr, p, size);
        }
        return (0);
}
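
/*
 * Note on the coalescing above (descriptive): guest pages that are
 * physically contiguous with the previous entry are folded into the last
 * iov rather than consuming a new slot. Once all NVME_MAX_BLOCKIOVS slots
 * are in use, the accumulated request is issued to blockif as a partial
 * I/O, and appending resumes only after pci_nvme_io_partial() signals
 * req->cv to indicate that the partial transfer has completed.
 */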

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
        struct nvme_submission_queue *sq, int sqid, uint16_t cid,
        uint32_t cdw0, uint16_t status, int ignore_busy)
{
        struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
        struct nvme_completion *compl;
        int do_intr = 0;
        int phase;

        DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
                 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
                 NVME_STATUS_GET_SC(status)));

        pthread_mutex_lock(&cq->mtx);

        assert(cq->qbase != NULL);

        compl = &cq->qbase[cq->tail];

        compl->sqhd = atomic_load_acq_short(&sq->head);
        compl->sqid = sqid;
        compl->cid = cid;

        /* toggle phase */
        phase = NVME_STATUS_GET_P(compl->status);
        compl->status = status;
        pci_nvme_toggle_phase(&compl->status, phase);

        cq->tail = (cq->tail + 1) % cq->size;

        if (cq->intr_en & NVME_CQ_INTEN)
                do_intr = 1;

        pthread_mutex_unlock(&cq->mtx);

        if (ignore_busy || !atomic_load_acq_int(&sq->busy))
                if (do_intr)
                        pci_generate_msix(sc->nsc_pi, cq->intr_vec);
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
        req->sc = NULL;
        req->nvme_sq = NULL;
        req->sqid = 0;

        pthread_mutex_lock(&sc->mtx);

        req->next = sc->ioreqs_free;
        sc->ioreqs_free = req;
        sc->pending_ios--;

        /* when no more IO pending, can set to ready if device reset/enabled */
        if (sc->pending_ios == 0 &&
            NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
                sc->regs.csts |= NVME_CSTS_RDY;

        pthread_mutex_unlock(&sc->mtx);

        sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
        struct pci_nvme_ioreq *req = NULL;

        sem_wait(&sc->iosemlock);
        pthread_mutex_lock(&sc->mtx);

        req = sc->ioreqs_free;
        assert(req != NULL);

        sc->ioreqs_free = req->next;

        req->next = NULL;
        req->sc = sc;

        sc->pending_ios++;

        pthread_mutex_unlock(&sc->mtx);

        req->io_req.br_iovcnt = 0;
        req->io_req.br_offset = 0;
        req->io_req.br_resid = 0;
        req->io_req.br_param = req;
        req->prev_gpaddr = 0;
        req->prev_size = 0;

        return req;
}
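
/*
 * Note (descriptive): ioreqs are drawn from a fixed pool sized by the
 * "ioslots" option. sem_wait() above blocks new submissions once all slots
 * are in flight; pci_nvme_release_ioreq() returns a slot to the free list
 * and posts the semaphore to wake a waiter.
 */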
1317
1318 static void
1319 pci_nvme_io_done(struct blockif_req *br, int err)
1320 {
1321         struct pci_nvme_ioreq *req = br->br_param;
1322         struct nvme_submission_queue *sq = req->nvme_sq;
1323         uint16_t code, status;
1324
1325         DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1326         
1327         /* TODO return correct error */
1328         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1329         pci_nvme_status_genc(&status, code);
1330
1331         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1332         pci_nvme_release_ioreq(req->sc, req);
1333 }
1334
1335 static void
1336 pci_nvme_io_partial(struct blockif_req *br, int err)
1337 {
1338         struct pci_nvme_ioreq *req = br->br_param;
1339
1340         DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1341
1342         pthread_cond_signal(&req->cv);
1343 }
1344
1345
1346 static void
1347 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1348 {
1349         struct nvme_submission_queue *sq;
1350         uint16_t status;
1351         uint16_t sqhead;
1352         int err;
1353
1354         /* handle all submissions up to sq->tail index */
1355         sq = &sc->submit_queues[idx];
1356
1357         if (atomic_testandset_int(&sq->busy, 1)) {
1358                 DPRINTF(("%s sqid %u busy", __func__, idx));
1359                 return;
1360         }
1361
1362         sqhead = atomic_load_acq_short(&sq->head);
1363
1364         DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1365                  idx, sqhead, sq->tail, sq->qbase));
1366
1367         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1368                 struct nvme_command *cmd;
1369                 struct pci_nvme_ioreq *req = NULL;
1370                 uint64_t lba;
1371                 uint64_t nblocks, bytes, size, cpsz;
1372
1373                 /* TODO: support scatter gather list handling */
1374
1375                 cmd = &sq->qbase[sqhead];
1376                 sqhead = (sqhead + 1) % sq->size;
1377
1378                 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1379
1380                 if (cmd->opc == NVME_OPC_FLUSH) {
1381                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1382                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1383                                                 status, 1);
1384
1385                         continue;
1386                 } else if (cmd->opc == 0x08) {
1387                         /* TODO: write zeroes */
1388                         WPRINTF(("%s write zeroes lba 0x%lx blocks %u",
1389                                 __func__, lba, cmd->cdw12 & 0xFFFF));
1390                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1391                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1392                                                 status, 1);
1393
1394                         continue;
1395                 }
1396
1397                 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1398
1399                 bytes = nblocks * sc->nvstore.sectsz;
1400
1401                 if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1402                         req = pci_nvme_get_ioreq(sc);
1403                         req->nvme_sq = sq;
1404                         req->sqid = idx;
1405                 }
1406
1407                 /*
1408                  * If data starts mid-page and flows into the next page, then
1409                  * increase page count
1410                  */
1411
1412                 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1413                          "(%lu-bytes)",
1414                          sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1415                          cmd->opc == NVME_OPC_WRITE ?
1416                              "WRITE" : "READ",
1417                          lba, nblocks, bytes));
1418
1419                 cmd->prp1 &= ~(0x03UL);
1420                 cmd->prp2 &= ~(0x03UL);
1421
1422                 DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2));
1423
1424                 size = bytes;
1425                 lba *= sc->nvstore.sectsz;
1426
1427                 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1428
1429                 if (cpsz > bytes)
1430                         cpsz = bytes;
1431
1432                 if (req != NULL) {
1433                         req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1434                                                 cmd->cdw10;
1435                         req->opc = cmd->opc;
1436                         req->cid = cmd->cid;
1437                         req->nsid = cmd->nsid;
1438                 }
1439
1440                 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1441                     cmd->opc == NVME_OPC_WRITE, lba);
1442                 lba += cpsz;
1443                 size -= cpsz;
1444
1445                 if (size == 0)
1446                         goto iodone;
1447
1448                 if (size <= PAGE_SIZE) {
1449                         /* prp2 is second (and final) page in transfer */
1450
1451                         err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1452                             size,
1453                             cmd->opc == NVME_OPC_WRITE,
1454                             lba);
1455                 } else {
1456                         uint64_t *prp_list;
1457                         int i;
1458
1459                         /* prp2 is pointer to a physical region page list */
1460                         prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1461                                                     cmd->prp2, PAGE_SIZE);
1462
1463                         i = 0;
1464                         while (size != 0) {
1465                                 cpsz = MIN(size, PAGE_SIZE);
1466
1467                                 /*
1468                                  * The last entry of a PRP list may link
1469                                  * to a further PRP list page; follow it.
1470                                  */
1471                                 if (i == (NVME_PRP2_ITEMS-1) &&
1472                                     size > PAGE_SIZE) {
1473                                         assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1474                                         prp_list = paddr_guest2host(
1475                                                       sc->nsc_pi->pi_vmctx,
1476                                                       prp_list[i], PAGE_SIZE);
1477                                         i = 0;
1478                                 }
1479                                 if (prp_list[i] == 0) {
1480                                         WPRINTF(("PRP2[%d] = 0 !!!", i));
1481                                         err = 1;
1482                                         break;
1483                                 }
1484
1485                                 err = pci_nvme_append_iov_req(sc, req,
1486                                     prp_list[i], cpsz,
1487                                     cmd->opc == NVME_OPC_WRITE, lba);
1488                                 if (err)
1489                                         break;
1490
1491                                 lba += cpsz;
1492                                 size -= cpsz;
1493                                 i++;
1494                         }
1495                 }
1496
1497 iodone:
1498                 if (sc->nvstore.type == NVME_STOR_RAM) {
1499                         uint16_t code, status;
1500
1501                         code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1502                             NVME_SC_SUCCESS;
1503                         pci_nvme_status_genc(&status, code);
1504
1505                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1506                                                 status, 1);
1507
1508                         continue;
1509                 }
1510
1511
1512                 if (err)
1513                         goto do_error;
1514
1515                 req->io_req.br_callback = pci_nvme_io_done;
1516
1517                 err = 0;
1518                 switch (cmd->opc) {
1519                 case NVME_OPC_READ:
1520                         err = blockif_read(sc->nvstore.ctx, &req->io_req);
1521                         break;
1522                 case NVME_OPC_WRITE:
1523                         err = blockif_write(sc->nvstore.ctx, &req->io_req);
1524                         break;
1525                 default:
1526                         WPRINTF(("%s unhandled io command 0x%x",
1527                                  __func__, cmd->opc));
1528                         err = 1;
1529                 }
1530
1531 do_error:
1532                 if (err) {
1533                         uint16_t status;
1534
1535                         pci_nvme_status_genc(&status,
1536                             NVME_SC_DATA_TRANSFER_ERROR);
1537
1538                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1539                                                 status, 1);
1540                         pci_nvme_release_ioreq(sc, req);
1541                 }
1542         }
1543
1544         atomic_store_short(&sq->head, sqhead);
1545         atomic_store_int(&sq->busy, 0);
1546 }
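
/*
 * Worked example of the PRP handling above (a sketch, assuming
 * PAGE_SIZE == 4096): a 10240-byte READ whose prp1 is 0x80001200 is
 * split into iovec segments by pci_nvme_append_iov_req() as
 *
 *   prp1        -> 3584 bytes (remainder of the first page)
 *   prp_list[0] -> 4096 bytes (prp2 names the PRP list page)
 *   prp_list[1] -> 2560 bytes
 *
 * When the remainder after the first page fits in a single page,
 * prp2 is instead used directly as the second data pointer.
 */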
1547
1548 static void
1549 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1550         uint64_t idx, int is_sq, uint64_t value)
1551 {
1552         DPRINTF(("nvme doorbell %lu, %s, val 0x%lx",
1553                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1554
1555         if (is_sq) {
1556                 atomic_store_short(&sc->submit_queues[idx].tail,
1557                                    (uint16_t)value);
1558
1559                 if (idx == 0) {
1560                         pci_nvme_handle_admin_cmd(sc, value);
1561                 } else {
1562                         /* submission queue; handle new entries in SQ */
1563                         if (idx > sc->num_squeues) {
1564                                 WPRINTF(("%s SQ index %lu overflow from "
1565                                          "guest (max %u)",
1566                                          __func__, idx, sc->num_squeues));
1567                                 return;
1568                         }
1569                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1570                 }
1571         } else {
1572                 if (idx > sc->num_cqueues) {
1573                         WPRINTF(("%s queue index %lu overflow from "
1574                                  "guest (max %u)",
1575                                  __func__, idx, sc->num_cqueues));
1576                         return;
1577                 }
1578
1579                 sc->compl_queues[idx].head = (uint16_t)value;
1580         }
1581 }
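
/*
 * Doorbell decode example (a sketch, assuming the spec-defined 0x1000
 * doorbell base and CAP.DSTRD == 0): a 4-byte guest write to BAR0
 * offset 0x1008 gives belloffset = 0x8, idx = 1, is_sq = 1, i.e. the
 * SQ 1 tail doorbell, while offset 0x100C decodes to the CQ 1 head.
 */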
1582
1583 static void
1584 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1585 {
1586         const char *s = iswrite ? "WRITE" : "READ";
1587
1588         switch (offset) {
1589         case NVME_CR_CAP_LOW:
1590                 DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s));
1591                 break;
1592         case NVME_CR_CAP_HI:
1593                 DPRINTF(("%s %s NVME_CR_CAP_HI", func, s));
1594                 break;
1595         case NVME_CR_VS:
1596                 DPRINTF(("%s %s NVME_CR_VS", func, s));
1597                 break;
1598         case NVME_CR_INTMS:
1599                 DPRINTF(("%s %s NVME_CR_INTMS", func, s));
1600                 break;
1601         case NVME_CR_INTMC:
1602                 DPRINTF(("%s %s NVME_CR_INTMC", func, s));
1603                 break;
1604         case NVME_CR_CC:
1605                 DPRINTF(("%s %s NVME_CR_CC", func, s));
1606                 break;
1607         case NVME_CR_CSTS:
1608                 DPRINTF(("%s %s NVME_CR_CSTS", func, s));
1609                 break;
1610         case NVME_CR_NSSR:
1611                 DPRINTF(("%s %s NVME_CR_NSSR", func, s));
1612                 break;
1613         case NVME_CR_AQA:
1614                 DPRINTF(("%s %s NVME_CR_AQA", func, s));
1615                 break;
1616         case NVME_CR_ASQ_LOW:
1617                 DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s));
1618                 break;
1619         case NVME_CR_ASQ_HI:
1620                 DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s));
1621                 break;
1622         case NVME_CR_ACQ_LOW:
1623                 DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s));
1624                 break;
1625         case NVME_CR_ACQ_HI:
1626                 DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s));
1627                 break;
1628         default:
1629                 DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset));
1630         }
1631
1632 }
1633
1634 static void
1635 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1636         uint64_t offset, int size, uint64_t value)
1637 {
1638         uint32_t ccreg;
1639
1640         if (offset >= NVME_DOORBELL_OFFSET) {
1641                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1642                 uint64_t idx = belloffset / 8; /* 4-byte SQ tail + CQ head doorbell per queue */
1643                 int is_sq = (belloffset % 8) < 4;
1644
1645                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1646                         WPRINTF(("guest attempted an overflow write offset "
1647                                  "0x%lx, val 0x%lx in %s",
1648                                  offset, value, __func__));
1649                         return;
1650                 }
1651
1652                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1653                 return;
1654         }
1655
1656         DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx",
1657                 offset, size, value));
1658
1659         if (size != 4) {
1660                 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1661                          "val 0x%lx) to bar0 in %s",
1662                          size, offset, value, __func__));
1663                 /* TODO: shutdown device */
1664                 return;
1665         }
1666
1667         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1668
1669         pthread_mutex_lock(&sc->mtx);
1670
1671         switch (offset) {
1672         case NVME_CR_CAP_LOW:
1673         case NVME_CR_CAP_HI:
1674                 /* readonly */
1675                 break;
1676         case NVME_CR_VS:
1677                 /* readonly */
1678                 break;
1679         case NVME_CR_INTMS:
1680                 /* MSI-X, so ignore */
1681                 break;
1682         case NVME_CR_INTMC:
1683                 /* MSI-X, so ignore */
1684                 break;
1685         case NVME_CR_CC:
1686                 ccreg = (uint32_t)value;
1687
1688                 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1689                          "iocqes %u",
1690                         __func__,
1691                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1692                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1693                          NVME_CC_GET_IOCQES(ccreg)));
1694
1695                 if (NVME_CC_GET_SHN(ccreg)) {
1696                         /* perform shutdown - flush out data to backend */
1697                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1698                             NVME_CSTS_REG_SHST_SHIFT);
1699                         sc->regs.csts |= NVME_SHST_COMPLETE <<
1700                             NVME_CSTS_REG_SHST_SHIFT;
1701                 }
1702                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1703                         if (NVME_CC_GET_EN(ccreg) == 0)
1704                                 /* transition 1->0 causes controller reset */
1705                                 pci_nvme_reset_locked(sc);
1706                         else
1707                                 pci_nvme_init_controller(ctx, sc);
1708                 }
1709
1710                 /* Insert the iocqes, iosqes and en bits from the write */
1711                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1712                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1713                 if (NVME_CC_GET_EN(ccreg) == 0) {
1714                         /* Insert the ams, mps and css bit fields */
1715                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1716                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1717                         sc->regs.csts &= ~NVME_CSTS_RDY;
1718                 } else if (sc->pending_ios == 0) {
1719                         sc->regs.csts |= NVME_CSTS_RDY;
1720                 }
1721                 break;
1722         case NVME_CR_CSTS:
1723                 break;
1724         case NVME_CR_NSSR:
1725                 /* ignore writes; don't support subsystem reset */
1726                 break;
1727         case NVME_CR_AQA:
1728                 sc->regs.aqa = (uint32_t)value;
1729                 break;
1730         case NVME_CR_ASQ_LOW:
1731                 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1732                                (0xFFFFF000 & value);
1733                 break;
1734         case NVME_CR_ASQ_HI:
1735                 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1736                                (value << 32);
1737                 break;
1738         case NVME_CR_ACQ_LOW:
1739                 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1740                                (0xFFFFF000 & value);
1741                 break;
1742         case NVME_CR_ACQ_HI:
1743                 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1744                                (value << 32);
1745                 break;
1746         default:
1747                 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d",
1748                          __func__, offset, value, size));
1749         }
1750         pthread_mutex_unlock(&sc->mtx);
1751 }
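
/*
 * CC write example (a sketch of the logic above): a guest enabling the
 * controller typically writes ccreg = 0x00460001, i.e. IOCQES = 4
 * (16-byte CQ entries), IOSQES = 6 (64-byte SQ entries), EN = 1.  The
 * EN 0->1 edge runs pci_nvme_init_controller(), and with no pending
 * I/O CSTS.RDY is set; writing EN back to 0 resets the controller.
 */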
1752
1753 static void
1754 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1755                 int baridx, uint64_t offset, int size, uint64_t value)
1756 {
1757         struct pci_nvme_softc* sc = pi->pi_arg;
1758
1759         if (baridx == pci_msix_table_bar(pi) ||
1760             baridx == pci_msix_pba_bar(pi)) {
1761                 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1762                          " value 0x%lx", baridx, offset, size, value));
1763
1764                 pci_emul_msix_twrite(pi, offset, size, value);
1765                 return;
1766         }
1767
1768         switch (baridx) {
1769         case 0:
1770                 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1771                 break;
1772
1773         default:
1774                 DPRINTF(("%s unknown baridx %d, val 0x%lx",
1775                          __func__, baridx, value));
1776         }
1777 }
1778
1779 static uint64_t
1780 pci_nvme_read_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size)
1781 {
1782         uint64_t value;
1783
1784         pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1785
1786         if (offset < NVME_DOORBELL_OFFSET) {
1787                 void *p = &(sc->regs);
1788                 pthread_mutex_lock(&sc->mtx);
1789                 memcpy(&value, (void *)((uintptr_t)p + offset), size);
1790                 pthread_mutex_unlock(&sc->mtx);
1791         } else {
1792                 value = 0;
1793                 WPRINTF(("pci_nvme: read invalid offset 0x%lx", offset));
1794         }
1795
1796         switch (size) {
1797         case 1:
1798                 value &= 0xFF;
1799                 break;
1800         case 2:
1801                 value &= 0xFFFF;
1802                 break;
1803         case 4:
1804                 value &= 0xFFFFFFFF;
1805                 break;
1806         }
1807
1808         DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x",
1809                  offset, size, (uint32_t)value));
1810
1811         return (value);
1812 }
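
/*
 * Read example (a sketch): a 4-byte guest read at offset 0x1C
 * (NVME_CR_CSTS) copies sc->regs.csts under sc->mtx and masks the
 * result to 32 bits; sub-dword reads are narrowed the same way.
 */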
1813
1814
1815
1816 static uint64_t
1817 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1818     uint64_t offset, int size)
1819 {
1820         struct pci_nvme_softc* sc = pi->pi_arg;
1821
1822         if (baridx == pci_msix_table_bar(pi) ||
1823             baridx == pci_msix_pba_bar(pi)) {
1824                 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
1825                         baridx, offset, size));
1826
1827                 return (pci_emul_msix_tread(pi, offset, size));
1828         }
1829
1830         switch (baridx) {
1831         case 0:
1832                 return (pci_nvme_read_bar_0(sc, offset, size));
1833
1834         default:
1835                 DPRINTF(("unknown bar %d, 0x%lx", baridx, offset));
1836         }
1837
1838         return (0);
1839 }
1840
1841
1842 static int
1843 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1844 {
1845         char bident[sizeof("XX:X:X")];
1846         char    *uopt, *xopts, *config;
1847         uint32_t sectsz;
1848         int optidx;
1849
1850         sc->max_queues = NVME_QUEUES;
1851         sc->max_qentries = NVME_MAX_QENTRIES;
1852         sc->ioslots = NVME_IOSLOTS;
1853         sc->num_squeues = sc->max_queues;
1854         sc->num_cqueues = sc->max_queues;
1855         sectsz = 0;
1856
1857         uopt = strdup(opts);
1858         optidx = 0;
1859         snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1860                  "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1861         for (xopts = strtok(uopt, ",");
1862              xopts != NULL;
1863              xopts = strtok(NULL, ",")) {
1864
1865                 if ((config = strchr(xopts, '=')) != NULL)
1866                         *config++ = '\0';
1867
1868                 if (!strcmp("maxq", xopts)) {
1869                         sc->max_queues = atoi(config);
1870                 } else if (!strcmp("qsz", xopts)) {
1871                         sc->max_qentries = atoi(config);
1872                 } else if (!strcmp("ioslots", xopts)) {
1873                         sc->ioslots = atoi(config);
1874                 } else if (!strcmp("sectsz", xopts)) {
1875                         sectsz = atoi(config);
1876                 } else if (!strcmp("ser", xopts)) {
1877                         /*
1878                          * This field indicates the Product Serial Number in
1879                          * 7-bit ASCII, unused bytes should be space characters.
1880                          * Ref: NVMe v1.3c.
1881                          */
1882                         cpywithpad((char *)sc->ctrldata.sn,
1883                                    sizeof(sc->ctrldata.sn), config, ' ');
1884                 } else if (!strcmp("ram", xopts)) {
1885                         uint64_t sz = strtoull(config, NULL, 10);
1886
1887                         sc->nvstore.type = NVME_STOR_RAM;
1888                         sc->nvstore.size = sz * 1024 * 1024;
1889                         sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1890                         sc->nvstore.sectsz = 4096;
1891                         sc->nvstore.sectsz_bits = 12;
1892                         if (sc->nvstore.ctx == NULL) {
1893                                 perror("Unable to allocate RAM");
1894                                 free(uopt);
1895                                 return (-1);
1896                         }
1897                 } else if (!strcmp("eui64", xopts)) {
1898                         sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
1899                 } else if (optidx == 0) {
1900                         snprintf(bident, sizeof(bident), "%d:%d",
1901                                  sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1902                         sc->nvstore.ctx = blockif_open(xopts, bident);
1903                         if (sc->nvstore.ctx == NULL) {
1904                                 perror("Could not open backing file");
1905                                 free(uopt);
1906                                 return (-1);
1907                         }
1908                         sc->nvstore.type = NVME_STOR_BLOCKIF;
1909                         sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1910                 } else {
1911                         EPRINTLN("Invalid option %s", xopts);
1912                         free(uopt);
1913                         return (-1);
1914                 }
1915
1916                 optidx++;
1917         }
1918         free(uopt);
1919
1920         if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1921                 EPRINTLN("backing store not specified");
1922                 return (-1);
1923         }
1924         if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1925                 sc->nvstore.sectsz = sectsz;
1926         else if (sc->nvstore.type != NVME_STOR_RAM)
1927                 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
1928         for (sc->nvstore.sectsz_bits = 9;
1929              (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1930              sc->nvstore.sectsz_bits++);
1931
1932         if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1933                 sc->max_queues = NVME_QUEUES;
1934
1935         if (sc->max_qentries <= 0) {
1936                 EPRINTLN("Invalid qsz option");
1937                 return (-1);
1938         }
1939         if (sc->ioslots <= 0) {
1940                 EPRINTLN("Invalid ioslots option");
1941                 return (-1);
1942         }
1943
1944         return (0);
1945 }
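
/*
 * Option-string examples accepted by the parser above (paths are
 * hypothetical): "/dev/zvol/tank/nvme0,maxq=8,qsz=1024,sectsz=4096"
 * opens a blockif backing store, while "ram=1024,ser=SN123456" creates
 * a 1 GiB RAM-backed namespace with 4096-byte sectors.  The sectsz_bits
 * loop then derives log2(sectsz), e.g. 4096 -> 12.
 */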
1946
1947 static int
1948 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1949 {
1950         struct pci_nvme_softc *sc;
1951         uint32_t pci_membar_sz;
1952         int     error;
1953
1954         error = 0;
1955
1956         sc = calloc(1, sizeof(struct pci_nvme_softc));
1957         pi->pi_arg = sc;
1958         sc->nsc_pi = pi;
1959
1960         error = pci_nvme_parse_opts(sc, opts);
1961         if (error < 0)
1962                 goto done;
1963         else
1964                 error = 0;
1965
1966         sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1967         for (int i = 0; i < sc->ioslots; i++) {
1968                 if (i < (sc->ioslots-1))
1969                         sc->ioreqs[i].next = &sc->ioreqs[i+1];
1970                 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1971                 pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1972         }
1973         sc->ioreqs_free = sc->ioreqs;
1974         sc->intr_coales_aggr_thresh = 1;
1975
1976         pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1977         pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1978         pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1979         pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1980         pci_set_cfgdata8(pi, PCIR_PROGIF,
1981                          PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1982
1983         /*
1984          * Allocate size of NVMe registers + doorbell space for all queues.
1985          *
1986          * The specification requires a minimum memory I/O window size of 16K.
1987          * The Windows driver will refuse to start a device with a smaller
1988          * window.
1989          */
1990         pci_membar_sz = sizeof(struct nvme_registers) +
1991             2 * sizeof(uint32_t) * (sc->max_queues + 1);
1992         pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
1993
1994         DPRINTF(("nvme membar size: %u", pci_membar_sz));
1995
1996         error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1997         if (error) {
1998                 WPRINTF(("%s pci alloc mem bar failed", __func__));
1999                 goto done;
2000         }
2001
2002         error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2003         if (error) {
2004                 WPRINTF(("%s pci add msixcap failed", __func__));
2005                 goto done;
2006         }
2007
2008         error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2009         if (error) {
2010                 WPRINTF(("%s pci add Express capability failed", __func__));
2011                 goto done;
2012         }
2013
2014         pthread_mutex_init(&sc->mtx, NULL);
2015         sem_init(&sc->iosemlock, 0, sc->ioslots);
2016
2017         pci_nvme_reset(sc);
2018         pci_nvme_init_ctrldata(sc);
2019         pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
2020         pci_nvme_init_logpages(sc);
2021
2022         pci_lintr_request(pi);
2023
2024 done:
2025         return (error);
2026 }
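
/*
 * BAR0 sizing example (a sketch): with the default max_queues of 16,
 * the raw size is sizeof(struct nvme_registers) plus 2 * 4 * 17
 * doorbell bytes, well under the 16 KiB NVME_MMIO_SPACE_MIN floor, so
 * the BAR is rounded up to 16 KiB to satisfy the spec (and Windows).
 */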
2027
2028
2029 struct pci_devemu pci_de_nvme = {
2030         .pe_emu =       "nvme",
2031         .pe_init =      pci_nvme_init,
2032         .pe_barwrite =  pci_nvme_write,
2033         .pe_barread =   pci_nvme_read
2034 };
2035 PCI_EMUL_SET(pci_de_nvme);
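
/*
 * PCI_EMUL_SET adds pci_de_nvme to the linker set of device emulations
 * that bhyve scans at startup, so a "-s <slot>,nvme,<opts>" argument
 * resolves to pci_nvme_init() via the .pe_emu name, with guest BAR
 * accesses dispatched to pci_nvme_write()/pci_nvme_read().
 */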