/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *
 */
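
/*
 * Hypothetical invocations, for illustration only (slot number, device
 * path, and option values are examples, not defaults):
 *
 *   bhyve ... -s 4,nvme,/dev/zvol/pool/vm-disk,maxq=4,ioslots=16,ser=BHYVE123
 *   bhyve ... -s 4,nvme,ram=1024		(1 GiB RAM-backed namespace)
 */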

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "pci_emul.h"


static int nvme_debug = 0;
/* Wrap in do/while so DPRINTF() is safe inside unbraced if/else bodies */
#define DPRINTF(params) do { if (nvme_debug) printf params; } while (0)
#define WPRINTF(params) printf params

/* defaults; can be overridden */
#define NVME_MSIX_BAR           4

#define NVME_IOSLOTS            8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN     (1 << 14)

#define NVME_QUEUES             16
#define NVME_MAX_QENTRIES       2048

#define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
#define NVME_MAX_BLOCKIOVS      512

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)         ((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)         ((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
        ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
         ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))

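/*
 * Doorbell registers start at offset 0x1000 in BAR0; each queue pair has
 * a 4-byte SQ tail doorbell followed by a 4-byte CQ head doorbell.
 */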
#define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
        NVME_CR_CAP_LOW = 0x00,
        NVME_CR_CAP_HI  = 0x04,
        NVME_CR_VS      = 0x08,
        NVME_CR_INTMS   = 0x0c,
        NVME_CR_INTMC   = 0x10,
        NVME_CR_CC      = 0x14,
        NVME_CR_CSTS    = 0x1c,
        NVME_CR_NSSR    = 0x20,
        NVME_CR_AQA     = 0x24,
        NVME_CR_ASQ_LOW = 0x28,
        NVME_CR_ASQ_HI  = 0x2c,
        NVME_CR_ACQ_LOW = 0x30,
        NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
        NVME_CMD_CDW11_PC  = 0x0001,
        NVME_CMD_CDW11_IEN = 0x0002,
        NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define NVME_CQ_INTEN   0x01
#define NVME_CQ_INTCOAL 0x02

struct nvme_completion_queue {
        struct nvme_completion *qbase;
        uint32_t        size;
        uint16_t        tail; /* nvme progress */
        uint16_t        head; /* guest progress */
        uint16_t        intr_vec;
        uint32_t        intr_en;
        pthread_mutex_t mtx;
};

struct nvme_submission_queue {
        struct nvme_command *qbase;
        uint32_t        size;
        uint16_t        head; /* nvme progress */
        uint16_t        tail; /* guest progress */
        uint16_t        cqid; /* completion queue id */
        int             busy; /* queue is being processed */
        int             qpriority;
};

enum nvme_storage_type {
        NVME_STOR_BLOCKIF = 0,
        NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
        enum nvme_storage_type type;
        void            *ctx;
        uint64_t        size;
        uint32_t        sectsz;
        uint32_t        sectsz_bits;
};

struct pci_nvme_ioreq {
        struct pci_nvme_softc *sc;
        struct pci_nvme_ioreq *next;
        struct nvme_submission_queue *nvme_sq;
        uint16_t        sqid;

        /* command information */
        uint16_t        opc;
        uint16_t        cid;
        uint32_t        nsid;

        uint64_t        prev_gpaddr;
        size_t          prev_size;

        /*
         * lock if all iovs consumed (big IO);
         * complete transaction before continuing
         */
        pthread_mutex_t mtx;
        pthread_cond_t  cv;

        struct blockif_req io_req;

        /* pad to fit up to 512 page descriptors from guest IO request */
        struct iovec    iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};

struct pci_nvme_softc {
        struct pci_devinst *nsc_pi;

        pthread_mutex_t mtx;

        struct nvme_registers regs;

        struct nvme_namespace_data  nsdata;
        struct nvme_controller_data ctrldata;

        struct pci_nvme_blockstore nvstore;

        uint16_t        max_qentries;   /* max entries per queue */
        uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
        uint32_t        num_cqueues;
        uint32_t        num_squeues;

        struct pci_nvme_ioreq *ioreqs;
        struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
        uint32_t        pending_ios;
        uint32_t        ioslots;
        sem_t           iosemlock;

        /*
         * Memory mapped Submission and Completion queues
         * Each array includes both Admin and IO queues
         */
        struct nvme_completion_queue *compl_queues;
        struct nvme_submission_queue *submit_queues;

        /* controller features */
        uint32_t        intr_coales_aggr_time;   /* 0x08: uS to delay intr */
        uint32_t        intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
        uint32_t        async_ev_config;         /* 0x0B: async event config */
};


static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
        ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
        ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
        ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
        ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
        ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
        ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
         (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
         (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
        ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
         (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
         (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
        ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
        ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
         (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

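/*
 * Copy src into dst, padding the remainder with 'pad'. Identify string
 * fields (model, firmware rev, serial) are space-padded ASCII with no
 * NUL terminator, so plain strncpy() is not suitable here.
 */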
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
        size_t len;

        len = strnlen(src, dst_size);
        memset(dst, pad, dst_size);
        memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

        *status &= ~NVME_STATUS_MASK;
        *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
                (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

        pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

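/*
 * Flip the Phase Tag bit in a completion status word. The guest detects
 * new completion entries by watching for the phase to change from the
 * value it last observed.
 */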
static __inline void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

        if (prev)
                *status &= ~NVME_STATUS_P;
        else
                *status |= NVME_STATUS_P;
}

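/* Populate the Identify Controller data with fixed values reported to the guest */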
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
        struct nvme_controller_data *cd = &sc->ctrldata;

        cd->vid = 0xFB5D;
        cd->ssvid = 0x0000;

        cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
        cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

        /* Num of submission commands that we can handle at a time (2^rab) */
        cd->rab   = 4;

        /* FreeBSD OUI */
        cd->ieee[0] = 0x58;
        cd->ieee[1] = 0x9c;
        cd->ieee[2] = 0xfc;

        cd->mic = 0;

        cd->mdts = 9;   /* max data transfer size (2^mdts * CAP.MPSMIN) */

        cd->ver = 0x00010300;

        cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
        cd->acl = 2;
        cd->aerl = 4;

        cd->lpa = 0;    /* TODO: support some simple things like SMART */
        cd->elpe = 0;   /* max error log page entries */
        cd->npss = 1;   /* number of power states supported */

        /* Warning Composite Temperature Threshold */
        cd->wctemp = 0x0157;

        cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
            (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
        cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
            (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
        cd->nn = 1;     /* number of namespaces */

        cd->fna = 0x03;

        cd->power_state[0].mp = 10;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
{
        struct nvme_namespace_data *nd;

        nd = &sc->nsdata;

        nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
        nd->ncap = nd->nsze;
        nd->nuse = nd->nsze;

        /* Get LBA and backstore information from backing store */
        nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
        /* LBA data-sz = 2^lbads */
        nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;

        nd->flbas = 0;
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
        DPRINTF(("%s\r\n", __func__));

        sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
            (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
            (60 << NVME_CAP_LO_REG_TO_SHIFT);

        sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

        sc->regs.vs = 0x00010300;       /* NVMe v1.3 */

        sc->regs.cc = 0;
        sc->regs.csts = 0;

        sc->num_cqueues = sc->num_squeues = sc->max_queues;
        if (sc->submit_queues != NULL) {
                for (int i = 0; i < sc->num_squeues + 1; i++) {
                        /*
                         * The Admin Submission Queue is at index 0.
                         * It must not be changed at reset otherwise the
                         * emulation will be out of sync with the guest.
                         */
                        if (i != 0) {
                                sc->submit_queues[i].qbase = NULL;
                                sc->submit_queues[i].size = 0;
                                sc->submit_queues[i].cqid = 0;
                        }
                        sc->submit_queues[i].tail = 0;
                        sc->submit_queues[i].head = 0;
                        sc->submit_queues[i].busy = 0;
                }
        } else
                sc->submit_queues = calloc(sc->num_squeues + 1,
                                        sizeof(struct nvme_submission_queue));

        if (sc->compl_queues != NULL) {
                for (int i = 0; i < sc->num_cqueues + 1; i++) {
                        /* See Admin Submission Queue note above */
                        if (i != 0) {
                                sc->compl_queues[i].qbase = NULL;
                                sc->compl_queues[i].size = 0;
                        }

                        sc->compl_queues[i].tail = 0;
                        sc->compl_queues[i].head = 0;
                }
        } else {
                sc->compl_queues = calloc(sc->num_cqueues + 1,
                                        sizeof(struct nvme_completion_queue));

                for (int i = 0; i < sc->num_cqueues + 1; i++)
                        pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
        }
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
        pthread_mutex_lock(&sc->mtx);
        pci_nvme_reset_locked(sc);
        pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
        uint16_t acqs, asqs;

        DPRINTF(("%s\r\n", __func__));

        asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
        sc->submit_queues[0].size = asqs;
        sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
                    sizeof(struct nvme_command) * asqs);

        DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.asq, sc->submit_queues[0].qbase));

        acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
            NVME_AQA_REG_ACQS_MASK) + 1;
        sc->compl_queues[0].size = acqs;
        sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
                 sizeof(struct nvme_completion) * acqs);
        DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
                __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_squeues) {
                WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
                        __func__, qid, sc->num_squeues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->submit_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_submission_queue *nsq;

                if ((qid == 0) || (qid > sc->num_squeues)) {
                        WPRINTF(("%s queue index %u > num_squeues %u\r\n",
                                __func__, qid, sc->num_squeues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                nsq = &sc->submit_queues[qid];
                nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                              sizeof(struct nvme_command) * (size_t)nsq->size);
                nsq->cqid = (command->cdw11 >> 16) & 0xffff;
                nsq->qpriority = (command->cdw11 >> 1) & 0x03;

                DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
                        qid, nsq->size, nsq->qbase, nsq->cqid));

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

                DPRINTF(("%s completed creating IOSQ qid %u\r\n",
                         __func__, qid));
        } else {
                /*
                 * Guest sent non-cont submission queue request.
                 * This setting is unsupported by this emulation.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o submission queue\r\n", __func__));

                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
        }
        return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t qid = command->cdw10 & 0xffff;

        DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
        if (qid == 0 || qid > sc->num_cqueues) {
                WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
                        __func__, qid, sc->num_cqueues));
                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_QUEUE_IDENTIFIER);
                return (1);
        }

        sc->compl_queues[qid].qbase = NULL;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        if (command->cdw11 & NVME_CMD_CDW11_PC) {
                uint16_t qid = command->cdw10 & 0xffff;
                struct nvme_completion_queue *ncq;

                if ((qid == 0) || (qid > sc->num_cqueues)) {
                        WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
                                __func__, qid, sc->num_cqueues));
                        pci_nvme_status_tc(&compl->status,
                            NVME_SCT_COMMAND_SPECIFIC,
                            NVME_SC_INVALID_QUEUE_IDENTIFIER);
                        return (1);
                }

                ncq = &sc->compl_queues[qid];
                ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
                ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
                ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

                /* CQ entries are nvme_completion sized, not nvme_command */
                ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
                             command->prp1,
                             sizeof(struct nvme_completion) * (size_t)ncq->size);

                pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        } else {
                /*
                 * Non-contig completion queue unsupported.
                 */
                WPRINTF(("%s unsupported non-contig (list-based) "
                         "create i/o completion queue\r\n",
                         __func__));

                /* 0x12 = Invalid Use of Controller Memory Buffer */
                pci_nvme_status_genc(&compl->status, 0x12);
        }

        return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        /* NUMD (cdw10 bits 27:16) is a 0's based count of dwords */
        uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4;
        uint8_t logpage = command->cdw10 & 0xFF;
        void *data = NULL;

        DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));

        if (logpage >= 1 && logpage <= 3)
                data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  PAGE_SIZE);

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

        switch (logpage) {
        case 0x01: /* Error information */
                memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
                break;
        case 0x02: /* SMART/Health information */
                /* TODO: present some smart info */
                memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
                break;
        case 0x03: /* Firmware slot information */
                memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
                break;
        default:
                WPRINTF(("%s get log page %x command not supported\r\n",
                        __func__, logpage));

                pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
                    NVME_SC_INVALID_LOG_PAGE);
        }

        return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        void *dest;

        DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
                command->cdw10 & 0xFF, command->nsid));

        switch (command->cdw10 & 0xFF) {
        case 0x00: /* return Identify Namespace data structure */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(sc->nsdata));
                memcpy(dest, &sc->nsdata, sizeof(sc->nsdata));
                break;
        case 0x01: /* return Identify Controller data structure */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(sc->ctrldata));
                memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata));
                break;
        case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
                dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
                                  sizeof(uint32_t) * 1024);
                ((uint32_t *)dest)[0] = 1;
                ((uint32_t *)dest)[1] = 0;
                break;
        case 0x11:
                pci_nvme_status_genc(&compl->status,
                    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
                return (1);
        case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
        case 0x10:
        case 0x12:
        case 0x13:
        case 0x14:
        case 0x15:
        default:
                DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
                         __func__, command->cdw10 & 0xFF));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        uint16_t nqr;   /* Number of Queues Requested */

        nqr = command->cdw11 & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_squeues = ONE_BASED(nqr);
        if (sc->num_squeues > sc->max_queues) {
                DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
                                        sc->max_queues));
                sc->num_squeues = sc->max_queues;
        }

        nqr = (command->cdw11 >> 16) & 0xFFFF;
        if (nqr == 0xffff) {
                WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (-1);
        }

        sc->num_cqueues = ONE_BASED(nqr);
        if (sc->num_cqueues > sc->max_queues) {
                DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
                                        sc->max_queues));
                sc->num_cqueues = sc->max_queues;
        }

        compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

        return (0);
}

static int
nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;
        uint32_t iv;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                nvme_set_feature_queues(sc, command, compl);
                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));

                /* in uS */
                sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;

                sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                iv = command->cdw11 & 0xFFFF;

                DPRINTF(("  interrupt vector configuration 0x%x\r\n",
                        command->cdw11));

                for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
                        if (sc->compl_queues[i].intr_vec == iv) {
                                if (command->cdw11 & (1 << 16))
                                        sc->compl_queues[i].intr_en |=
                                                              NVME_CQ_INTCOAL;
                                else
                                        sc->compl_queues[i].intr_en &=
                                                             ~NVME_CQ_INTCOAL;
                        }
                }
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration 0x%x\r\n",
                        command->cdw11));
                sc->async_ev_config = command->cdw11;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker 0x%x\r\n",
                        command->cdw11));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition 0x%x\r\n",
                        command->cdw11));
                break;
        default:
                WPRINTF(("%s invalid feature\r\n", __func__));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        int feature = command->cdw10 & 0xFF;

        DPRINTF(("%s feature 0x%x\r\n", __func__, feature));

        compl->cdw0 = 0;

        switch (feature) {
        case NVME_FEAT_ARBITRATION:
                DPRINTF(("  arbitration\r\n"));
                break;
        case NVME_FEAT_POWER_MANAGEMENT:
                DPRINTF(("  power management\r\n"));
                break;
        case NVME_FEAT_LBA_RANGE_TYPE:
                DPRINTF(("  lba range\r\n"));
                break;
        case NVME_FEAT_TEMPERATURE_THRESHOLD:
                DPRINTF(("  temperature threshold\r\n"));
                switch ((command->cdw11 >> 20) & 0x3) {
                case 0:
                        /* Over temp threshold */
                        compl->cdw0 = 0xFFFF;
                        break;
                case 1:
                        /* Under temp threshold */
                        compl->cdw0 = 0;
                        break;
                default:
                        WPRINTF(("  invalid threshold type select\r\n"));
                        pci_nvme_status_genc(&compl->status,
                            NVME_SC_INVALID_FIELD);
                        return (1);
                }
                break;
        case NVME_FEAT_ERROR_RECOVERY:
                DPRINTF(("  error recovery\r\n"));
                break;
        case NVME_FEAT_VOLATILE_WRITE_CACHE:
                DPRINTF(("  volatile write cache\r\n"));
                break;
        case NVME_FEAT_NUMBER_OF_QUEUES:
                compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

                DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
                        compl->cdw0 & 0xFFFF,
                        (compl->cdw0 >> 16) & 0xFFFF));

                break;
        case NVME_FEAT_INTERRUPT_COALESCING:
                DPRINTF(("  interrupt coalescing\r\n"));
                break;
        case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
                DPRINTF(("  interrupt vector configuration\r\n"));
                break;
        case NVME_FEAT_WRITE_ATOMICITY:
                DPRINTF(("  write atomicity\r\n"));
                break;
        case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
                DPRINTF(("  async event configuration\r\n"));
                /* Get returns the configured value; it must not modify it */
                compl->cdw0 = sc->async_ev_config;
                break;
        case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
                DPRINTF(("  software progress marker\r\n"));
                break;
        case 0x0C:
                DPRINTF(("  autonomous power state transition\r\n"));
                break;
        default:
                WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
                pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
                return (1);
        }

        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
        struct nvme_completion* compl)
{
        DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
                command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));

        /* TODO: search for the command ID and abort it */

        compl->cdw0 = 1;
        pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
        return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
        struct nvme_command* command, struct nvme_completion* compl)
{
        DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));

        /*
         * TODO: raise events when they happen based on the Set Features cmd.
         * These events happen async, so only set completion successful if
         * there is an event reflective of the request to get event.
         */
        pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
            NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
        return (0);
}

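/*
 * Process new entries in the Admin Submission Queue (queue 0) from the
 * current head up to the doorbell-supplied tail, posting a completion
 * (and an MSI-X interrupt, when warranted) for each command handled.
 */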
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
        struct nvme_completion compl;
        struct nvme_command *cmd;
        struct nvme_submission_queue *sq;
        struct nvme_completion_queue *cq;
        int do_intr = 0;
        uint16_t sqhead;

        DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));

        sq = &sc->submit_queues[0];

        sqhead = atomic_load_acq_short(&sq->head);

        if (atomic_testandset_int(&sq->busy, 1)) {
                DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
                        __func__, sqhead, sq->tail));
                return;
        }

        DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                cmd = &(sq->qbase)[sqhead];
                compl.status = 0;

                switch (cmd->opc) {
                case NVME_OPC_DELETE_IO_SQ:
                        DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_SQ:
                        DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
                        break;
                case NVME_OPC_DELETE_IO_CQ:
                        DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_CREATE_IO_CQ:
                        DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
                        do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_LOG_PAGE:
                        DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
                        do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
                        break;
                case NVME_OPC_IDENTIFY:
                        DPRINTF(("%s command IDENTIFY\r\n", __func__));
                        do_intr |= nvme_opc_identify(sc, cmd, &compl);
                        break;
                case NVME_OPC_ABORT:
                        DPRINTF(("%s command ABORT\r\n", __func__));
                        do_intr |= nvme_opc_abort(sc, cmd, &compl);
                        break;
                case NVME_OPC_SET_FEATURES:
                        DPRINTF(("%s command SET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_set_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_GET_FEATURES:
                        DPRINTF(("%s command GET_FEATURES\r\n", __func__));
                        do_intr |= nvme_opc_get_features(sc, cmd, &compl);
                        break;
                case NVME_OPC_ASYNC_EVENT_REQUEST:
                        DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
                        /* XXX don't care, unhandled for now
                        do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
                        */
                        break;
                default:
                        WPRINTF(("0x%x command is not implemented\r\n",
                            cmd->opc));
                }

                /* for now skip async event generation */
                if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
                        struct nvme_completion *cp;
                        int phase;

                        cq = &sc->compl_queues[0];

                        cp = &(cq->qbase)[cq->tail];
                        cp->cdw0 = compl.cdw0;
                        cp->sqid = 0;
                        cp->sqhd = sqhead;
                        cp->cid = cmd->cid;

                        phase = NVME_STATUS_GET_P(cp->status);
                        cp->status = compl.status;
                        pci_nvme_toggle_phase(&cp->status, phase);

                        cq->tail = (cq->tail + 1) % cq->size;
                }
                sqhead = (sqhead + 1) % sq->size;
        }

        DPRINTF(("setting sqhead %u\r\n", sqhead));
        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);

        if (do_intr)
                pci_generate_msix(sc->nsc_pi, 0);

}

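/*
 * Add a guest physical page to the request's scatter/gather list, merging
 * with the previous entry when the pages are contiguous. If the iov list
 * fills up on a large transfer, the accumulated I/O is issued and waited
 * on (via pci_nvme_io_partial) before continuing. For RAM-backed storage
 * (req == NULL) the data is copied directly and no iov is built.
 */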
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
        uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
        int iovidx;

        if (req != NULL) {
                /* concatenate contig block-iovs to minimize number of iovs */
                if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
                        iovidx = req->io_req.br_iovcnt - 1;

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             req->prev_gpaddr, size);

                        req->prev_size += size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iov[iovidx].iov_len = req->prev_size;
                } else {
                        pthread_mutex_lock(&req->mtx);

                        iovidx = req->io_req.br_iovcnt;
                        if (iovidx == NVME_MAX_BLOCKIOVS) {
                                int err = 0;

                                DPRINTF(("large I/O, doing partial req\r\n"));

                                iovidx = 0;
                                req->io_req.br_iovcnt = 0;

                                req->io_req.br_callback = pci_nvme_io_partial;

                                if (!do_write)
                                        err = blockif_read(sc->nvstore.ctx,
                                                           &req->io_req);
                                else
                                        err = blockif_write(sc->nvstore.ctx,
                                                            &req->io_req);

                                /* wait until req completes before cont */
                                if (err == 0)
                                        pthread_cond_wait(&req->cv, &req->mtx);
                        }
                        if (iovidx == 0) {
                                req->io_req.br_offset = lba;
                                req->io_req.br_resid = 0;
                                req->io_req.br_param = req;
                        }

                        req->io_req.br_iov[iovidx].iov_base =
                            paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
                                             gpaddr, size);

                        req->io_req.br_iov[iovidx].iov_len = size;

                        req->prev_gpaddr = gpaddr;
                        req->prev_size = size;
                        req->io_req.br_resid += size;

                        req->io_req.br_iovcnt++;

                        pthread_mutex_unlock(&req->mtx);
                }
        } else {
                /* RAM buffer: read/write directly */
                void *p = sc->nvstore.ctx;
                void *gptr;

                if ((lba + size) > sc->nvstore.size) {
                        WPRINTF(("%s write would overflow RAM\r\n", __func__));
                        return (-1);
                }

                p = (void *)((uintptr_t)p + (uintptr_t)lba);
                gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
                if (do_write)
                        memcpy(p, gptr, size);
                else
                        memcpy(gptr, p, size);
        }
        return (0);
}

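/*
 * Post a completion entry to the CQ paired with the given SQ: fill in the
 * entry, toggle its phase bit, advance the CQ tail, and raise an MSI-X
 * interrupt if the queue has interrupts enabled.
 */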
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
        struct nvme_submission_queue *sq, int sqid, uint16_t cid,
        uint32_t cdw0, uint16_t status, int ignore_busy)
{
        struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
        struct nvme_completion *compl;
        int do_intr = 0;
        int phase;

        DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
                 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
                 NVME_STATUS_GET_SC(status)));

        pthread_mutex_lock(&cq->mtx);

        assert(cq->qbase != NULL);

        compl = &cq->qbase[cq->tail];

        compl->cdw0 = cdw0;
        compl->sqhd = atomic_load_acq_short(&sq->head);
        compl->sqid = sqid;
        compl->cid = cid;

        /* toggle phase */
        phase = NVME_STATUS_GET_P(compl->status);
        compl->status = status;
        pci_nvme_toggle_phase(&compl->status, phase);

        cq->tail = (cq->tail + 1) % cq->size;

        if (cq->intr_en & NVME_CQ_INTEN)
                do_intr = 1;

        pthread_mutex_unlock(&cq->mtx);

        if (ignore_busy || !atomic_load_acq_int(&sq->busy))
                if (do_intr)
                        pci_generate_msix(sc->nsc_pi, cq->intr_vec);
}

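/* Return an ioreq to the free list and release its I/O slot */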
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
        req->sc = NULL;
        req->nvme_sq = NULL;
        req->sqid = 0;

        pthread_mutex_lock(&sc->mtx);

        req->next = sc->ioreqs_free;
        sc->ioreqs_free = req;
        sc->pending_ios--;

        /* when no more IO pending, can set to ready if device reset/enabled */
        if (sc->pending_ios == 0 &&
            NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
                sc->regs.csts |= NVME_CSTS_RDY;

        pthread_mutex_unlock(&sc->mtx);

        sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
        struct pci_nvme_ioreq *req = NULL;

        sem_wait(&sc->iosemlock);
        pthread_mutex_lock(&sc->mtx);

        req = sc->ioreqs_free;
        assert(req != NULL);

        sc->ioreqs_free = req->next;

        req->next = NULL;
        req->sc = sc;

        sc->pending_ios++;

        pthread_mutex_unlock(&sc->mtx);

        req->io_req.br_iovcnt = 0;
        req->io_req.br_offset = 0;
        req->io_req.br_resid = 0;
        req->io_req.br_param = req;
        req->prev_gpaddr = 0;
        req->prev_size = 0;

        return req;
}

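/*
 * blockif completion callback: translate the result into an NVMe status,
 * post the completion entry, and recycle the ioreq.
 */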
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
        struct pci_nvme_ioreq *req = br->br_param;
        struct nvme_submission_queue *sq = req->nvme_sq;
        uint16_t code, status;

        DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

        /* TODO return correct error */
        code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
        pci_nvme_status_genc(&status, code);

        pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
        pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
        struct pci_nvme_ioreq *req = br->br_param;

        DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

        pthread_cond_signal(&req->cv);
}


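/*
 * Process new entries in an I/O Submission Queue: walk the queue from the
 * head to the doorbell-supplied tail, translate each command's PRP entries
 * (PRP1, PRP2, or a PRP list) into iovs, and hand the request to blockif
 * (or copy directly for RAM-backed storage).
 */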
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
        struct nvme_submission_queue *sq;
        uint16_t status;
        uint16_t sqhead;
        int err;

        /* handle all submissions up to sq->tail index */
        sq = &sc->submit_queues[idx];

        if (atomic_testandset_int(&sq->busy, 1)) {
                DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
                return;
        }

        sqhead = atomic_load_acq_short(&sq->head);

        DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
                 idx, sqhead, sq->tail, sq->qbase));

        while (sqhead != atomic_load_acq_short(&sq->tail)) {
                struct nvme_command *cmd;
                struct pci_nvme_ioreq *req = NULL;
                uint64_t lba;
                uint64_t nblocks, bytes, size, cpsz;

                /* TODO: support scatter gather list handling */

                cmd = &sq->qbase[sqhead];
                sqhead = (sqhead + 1) % sq->size;

                lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;

                if (cmd->opc == NVME_OPC_FLUSH) {
                        pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);

                        continue;
                } else if (cmd->opc == 0x08) {
                        /* TODO: write zeroes */
                        WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
                                __func__, lba, cmd->cdw12 & 0xFFFF));
                        pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);

                        continue;
                }

                nblocks = (cmd->cdw12 & 0xFFFF) + 1;

                bytes = nblocks * sc->nvstore.sectsz;

                if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
                        req = pci_nvme_get_ioreq(sc);
                        req->nvme_sq = sq;
                        req->sqid = idx;
                }

                /*
                 * If data starts mid-page and flows into the next page, then
                 * increase page count
                 */

                DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
                         "(%lu-bytes)\r\n",
                         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
                         cmd->opc == NVME_OPC_WRITE ?
                             "WRITE" : "READ",
                         lba, nblocks, bytes));

                cmd->prp1 &= ~(0x03UL);
                cmd->prp2 &= ~(0x03UL);

                DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));

                size = bytes;
                lba *= sc->nvstore.sectsz;

                cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);

                if (cpsz > bytes)
                        cpsz = bytes;

                if (req != NULL) {
                        req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
                                                cmd->cdw10;
                        req->opc = cmd->opc;
                        req->cid = cmd->cid;
                        req->nsid = cmd->nsid;
                }

                err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
                    cmd->opc == NVME_OPC_WRITE, lba);
                lba += cpsz;
                size -= cpsz;

                if (size == 0)
                        goto iodone;

                if (size <= PAGE_SIZE) {
                        /* prp2 is second (and final) page in transfer */

                        err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
                            size,
                            cmd->opc == NVME_OPC_WRITE,
                            lba);
                } else {
                        uint64_t *prp_list;
                        int i;

                        /* prp2 is pointer to a physical region page list */
                        prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
                                                    cmd->prp2, PAGE_SIZE);

                        i = 0;
                        while (size != 0) {
                                cpsz = MIN(size, PAGE_SIZE);

                                /*
                                 * Move to linked physical region page list
                                 * in last item.
                                 */
                                if (i == (NVME_PRP2_ITEMS-1) &&
                                    size > PAGE_SIZE) {
                                        assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
                                        prp_list = paddr_guest2host(
                                                      sc->nsc_pi->pi_vmctx,
                                                      prp_list[i], PAGE_SIZE);
                                        i = 0;
                                }
                                if (prp_list[i] == 0) {
                                        WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
                                        err = 1;
                                        break;
                                }

                                err = pci_nvme_append_iov_req(sc, req,
                                    prp_list[i], cpsz,
                                    cmd->opc == NVME_OPC_WRITE, lba);
                                if (err)
                                        break;

                                lba += cpsz;
                                size -= cpsz;
                                i++;
                        }
                }

iodone:
                if (sc->nvstore.type == NVME_STOR_RAM) {
                        uint16_t code, status;

                        code = err ? NVME_SC_LBA_OUT_OF_RANGE :
                            NVME_SC_SUCCESS;
                        pci_nvme_status_genc(&status, code);

                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);

                        continue;
                }


                if (err)
                        goto do_error;

                req->io_req.br_callback = pci_nvme_io_done;

                err = 0;
                switch (cmd->opc) {
                case NVME_OPC_READ:
                        err = blockif_read(sc->nvstore.ctx, &req->io_req);
                        break;
                case NVME_OPC_WRITE:
                        err = blockif_write(sc->nvstore.ctx, &req->io_req);
                        break;
                default:
                        WPRINTF(("%s unhandled io command 0x%x\r\n",
                                 __func__, cmd->opc));
                        err = 1;
                }

do_error:
                if (err) {
                        uint16_t status;

                        pci_nvme_status_genc(&status,
                            NVME_SC_DATA_TRANSFER_ERROR);

                        pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
                                                status, 1);
                        pci_nvme_release_ioreq(sc, req);
                }
        }

        atomic_store_short(&sq->head, sqhead);
        atomic_store_int(&sq->busy, 0);
}

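/*
 * Route a doorbell write: SQ tail doorbells trigger command processing
 * (admin queue for index 0, I/O queues otherwise); CQ head doorbells
 * record the guest's consumer index.
 */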
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
        uint64_t idx, int is_sq, uint64_t value)
{
        DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
                idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));

        if (is_sq) {
                /*
                 * Validate the guest-supplied queue index before touching
                 * submit_queues[idx]; checking it only after the store
                 * would allow an out-of-bounds write.
                 */
                if (idx > sc->num_squeues) {
                        WPRINTF(("%s SQ index %lu overflow from "
                                 "guest (max %u)\r\n",
                                 __func__, idx, sc->num_squeues));
                        return;
                }

                atomic_store_short(&sc->submit_queues[idx].tail,
                                   (uint16_t)value);

                if (idx == 0) {
                        pci_nvme_handle_admin_cmd(sc, value);
                } else {
                        /* I/O submission queue; handle new entries in SQ */
                        pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
                }
        } else {
                if (idx > sc->num_cqueues) {
                        WPRINTF(("%s queue index %lu overflow from "
                                 "guest (max %u)\r\n",
                                 __func__, idx, sc->num_cqueues));
                        return;
                }

                sc->compl_queues[idx].head = (uint16_t)value;
        }
}

static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
        const char *s = iswrite ? "WRITE" : "READ";

        switch (offset) {
        case NVME_CR_CAP_LOW:
                DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
                break;
        case NVME_CR_CAP_HI:
                DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
                break;
        case NVME_CR_VS:
                DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
                break;
        case NVME_CR_INTMS:
                DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
                break;
        case NVME_CR_INTMC:
                DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
                break;
        case NVME_CR_CC:
                DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
                break;
        case NVME_CR_CSTS:
                DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
                break;
        case NVME_CR_NSSR:
                DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
                break;
        case NVME_CR_AQA:
                DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
                break;
        case NVME_CR_ASQ_LOW:
                DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
                break;
        case NVME_CR_ASQ_HI:
                DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
                break;
        case NVME_CR_ACQ_LOW:
                DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
                break;
        case NVME_CR_ACQ_HI:
                DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
                break;
        default:
                DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
        }
}

static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
        uint64_t offset, int size, uint64_t value)
{
        uint32_t ccreg;

        if (offset >= NVME_DOORBELL_OFFSET) {
                uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
                uint64_t idx = belloffset / 8; /* 8 bytes per queue pair */
                int is_sq = (belloffset % 8) < 4;
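
                /*
                 * Each queue pair owns one 8-byte doorbell slot: the SQ
                 * tail doorbell at NVME_DOORBELL_OFFSET + 8 * qid, and
                 * the CQ head doorbell 4 bytes beyond it.  E.g. a write
                 * at doorbell offset 0xc lands on the head of CQ 1.
                 */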

                if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
                        WPRINTF(("guest attempted an overflow write offset "
                                 "0x%lx, val 0x%lx in %s\r\n",
                                 offset, value, __func__));
                        return;
                }

                pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
                return;
        }

        DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
                offset, size, value));

        if (size != 4) {
                WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
                         "val 0x%lx) to bar0 in %s\r\n",
                         size, offset, value, __func__));
                /* TODO: shutdown device */
                return;
        }

        pci_nvme_bar0_reg_dumps(__func__, offset, 1);

        pthread_mutex_lock(&sc->mtx);

        switch (offset) {
        case NVME_CR_CAP_LOW:
        case NVME_CR_CAP_HI:
                /* readonly */
                break;
        case NVME_CR_VS:
                /* readonly */
                break;
        case NVME_CR_INTMS:
                /* MSI-X, so ignore */
                break;
        case NVME_CR_INTMC:
                /* MSI-X, so ignore */
                break;
        case NVME_CR_CC:
                ccreg = (uint32_t)value;

                DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
                         "iocqes %u\r\n",
                         __func__,
                         NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
                         NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
                         NVME_CC_GET_IOCQES(ccreg)));

                if (NVME_CC_GET_SHN(ccreg)) {
                        /* perform shutdown - flush out data to backend */
                        sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
                            NVME_CSTS_REG_SHST_SHIFT);
                        sc->regs.csts |= NVME_SHST_COMPLETE <<
                            NVME_CSTS_REG_SHST_SHIFT;
                }
                if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
                        if (NVME_CC_GET_EN(ccreg) == 0)
                                /* a 1->0 transition resets the controller */
                                pci_nvme_reset_locked(sc);
                        else
                                pci_nvme_init_controller(ctx, sc);
                }

                /* Insert the iocqes, iosqes and en bits from the write */
                sc->regs.cc &= ~NVME_CC_WRITE_MASK;
                sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
                if (NVME_CC_GET_EN(ccreg) == 0) {
                        /* Insert the ams, mps and css bit fields */
                        sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
                        sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
                        sc->regs.csts &= ~NVME_CSTS_RDY;
                } else if (sc->pending_ios == 0) {
                        sc->regs.csts |= NVME_CSTS_RDY;
                }
                break;
        case NVME_CR_CSTS:
                break;
        case NVME_CR_NSSR:
                /* ignore writes; don't support subsystem reset */
                break;
        case NVME_CR_AQA:
                sc->regs.aqa = (uint32_t)value;
                break;
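        /*
         * The admin queue base addresses must be 4 KiB aligned, so the
         * low-dword cases below mask off bits 11:0.
         */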
        case NVME_CR_ASQ_LOW:
                sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
                               (0xFFFFF000 & value);
                break;
        case NVME_CR_ASQ_HI:
                sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
                               (value << 32);
                break;
        case NVME_CR_ACQ_LOW:
                sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
                               (0xFFFFF000 & value);
                break;
        case NVME_CR_ACQ_HI:
                sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
                               (value << 32);
                break;
        default:
                DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
                         __func__, offset, value, size));
        }
        pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
                int baridx, uint64_t offset, int size, uint64_t value)
{
        struct pci_nvme_softc* sc = pi->pi_arg;

        if (baridx == pci_msix_table_bar(pi) ||
            baridx == pci_msix_pba_bar(pi)) {
                DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
                         "value 0x%lx\r\n", baridx, offset, size, value));

                pci_emul_msix_twrite(pi, offset, size, value);
                return;
        }

        switch (baridx) {
        case 0:
                pci_nvme_write_bar_0(ctx, sc, offset, size, value);
                break;

        default:
                DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
                         __func__, baridx, value));
        }
}

static uint64_t
pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
{
        uint64_t value;

        pci_nvme_bar0_reg_dumps(__func__, offset, 0);

        if (offset < NVME_DOORBELL_OFFSET) {
                void *p = &(sc->regs);
                pthread_mutex_lock(&sc->mtx);
                memcpy(&value, (void *)((uintptr_t)p + offset), size);
                pthread_mutex_unlock(&sc->mtx);
        } else {
                value = 0;
                WPRINTF(("pci_nvme: read invalid offset 0x%lx\r\n", offset));
        }

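        /* Trim the 64-bit register copy down to the access width. */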
        switch (size) {
        case 1:
                value &= 0xFF;
                break;
        case 2:
                value &= 0xFFFF;
                break;
        case 4:
                value &= 0xFFFFFFFF;
                break;
        }

        DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
                 offset, size, (uint32_t)value));

        return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
        struct pci_nvme_softc* sc = pi->pi_arg;

        if (baridx == pci_msix_table_bar(pi) ||
            baridx == pci_msix_pba_bar(pi)) {
                DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
                        baridx, offset, size));

                return pci_emul_msix_tread(pi, offset, size);
        }

        switch (baridx) {
        case 0:
                return pci_nvme_read_bar_0(sc, offset, size);

        default:
                DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
        }

        return (0);
}

static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
        char bident[sizeof("XX:X:X")];
        char *uopt, *xopts, *config;
        uint32_t sectsz;
        int optidx;

        sc->max_queues = NVME_QUEUES;
        sc->max_qentries = NVME_MAX_QENTRIES;
        sc->ioslots = NVME_IOSLOTS;
        sc->num_squeues = sc->max_queues;
        sc->num_cqueues = sc->max_queues;
        sectsz = 0;

        /* strdup(NULL) is undefined; fail cleanly when no opts are given */
        if (opts == NULL) {
                fprintf(stderr, "backing store not specified\n");
                return (-1);
        }

        uopt = strdup(opts);
        optidx = 0;
        snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
                 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
        for (xopts = strtok(uopt, ",");
             xopts != NULL;
             xopts = strtok(NULL, ",")) {

                if ((config = strchr(xopts, '=')) != NULL)
                        *config++ = '\0';

                if (!strcmp("maxq", xopts)) {
                        sc->max_queues = atoi(config);
                } else if (!strcmp("qsz", xopts)) {
                        sc->max_qentries = atoi(config);
                } else if (!strcmp("ioslots", xopts)) {
                        sc->ioslots = atoi(config);
                } else if (!strcmp("sectsz", xopts)) {
                        sectsz = atoi(config);
                } else if (!strcmp("ser", xopts)) {
                        /*
                         * This field indicates the Product Serial Number in
                         * 7-bit ASCII, unused bytes should be space characters.
                         * Ref: NVMe v1.3c.
                         */
                        cpywithpad((char *)sc->ctrldata.sn,
                                   sizeof(sc->ctrldata.sn), config, ' ');
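                        /* e.g. ser=S123 is stored as "S123" padded to 20 bytes with spaces */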
                } else if (!strcmp("ram", xopts)) {
                        uint64_t sz;

                        /* use the value after '='; "ram" alone is invalid */
                        if (config == NULL) {
                                fprintf(stderr, "ram option requires a size\n");
                                free(uopt);
                                return (-1);
                        }
                        sz = strtoull(config, NULL, 10);

                        sc->nvstore.type = NVME_STOR_RAM;
                        sc->nvstore.size = sz * 1024 * 1024;
                        sc->nvstore.ctx = calloc(1, sc->nvstore.size);
                        sc->nvstore.sectsz = 4096;
                        sc->nvstore.sectsz_bits = 12;
                        if (sc->nvstore.ctx == NULL) {
                                perror("Unable to allocate RAM");
                                free(uopt);
                                return (-1);
                        }
                } else if (optidx == 0) {
                        snprintf(bident, sizeof(bident), "%d:%d",
                                 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
                        sc->nvstore.ctx = blockif_open(xopts, bident);
                        if (sc->nvstore.ctx == NULL) {
                                perror("Could not open backing file");
                                free(uopt);
                                return (-1);
                        }
                        sc->nvstore.type = NVME_STOR_BLOCKIF;
                        sc->nvstore.size = blockif_size(sc->nvstore.ctx);
                } else {
                        fprintf(stderr, "Invalid option %s\n", xopts);
                        free(uopt);
                        return (-1);
                }

                optidx++;
        }
        free(uopt);

        if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
                fprintf(stderr, "backing store not specified\n");
                return (-1);
        }
        if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
                sc->nvstore.sectsz = sectsz;
        else if (sc->nvstore.type != NVME_STOR_RAM)
                sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
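        /* Derive log2(sectsz); the smallest sector size is 512 (2^9). */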
        for (sc->nvstore.sectsz_bits = 9;
             (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
             sc->nvstore.sectsz_bits++);

        if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
                sc->max_queues = NVME_QUEUES;

        if (sc->max_qentries <= 0) {
                fprintf(stderr, "Invalid qsz option\n");
                return (-1);
        }
        if (sc->ioslots <= 0) {
                fprintf(stderr, "Invalid ioslots option\n");
                return (-1);
        }

        return (0);
}
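
/*
 * Example option string: "ram=1024,maxq=8,ioslots=16" requests a 1 GiB
 * volatile backing store with 4 KiB sectors, up to 8 queues, and 16
 * concurrent I/O slots.
 */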

static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
        struct pci_nvme_softc *sc;
        uint32_t pci_membar_sz;
        int     error;

        error = 0;

        sc = calloc(1, sizeof(struct pci_nvme_softc));
        pi->pi_arg = sc;
        sc->nsc_pi = pi;

        error = pci_nvme_parse_opts(sc, opts);
        if (error < 0)
                goto done;
        else
                error = 0;

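        /*
         * Pre-allocate the I/O request slots and chain them into a free
         * list; calloc() leaves the final slot's next pointer NULL.
         */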
        sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
        if (sc->ioreqs == NULL) {
                error = -1;
                goto done;
        }
        for (int i = 0; i < sc->ioslots; i++) {
                if (i < (sc->ioslots-1))
                        sc->ioreqs[i].next = &sc->ioreqs[i+1];
                pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
                pthread_cond_init(&sc->ioreqs[i].cv, NULL);
        }
        sc->ioreqs_free = sc->ioreqs;
        sc->intr_coales_aggr_thresh = 1;

        pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
        pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
        pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
        pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
        pci_set_cfgdata8(pi, PCIR_PROGIF,
                         PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

        /*
         * Allocate size of NVMe registers + doorbell space for all queues.
         *
         * The specification requires a minimum memory I/O window size of 16K.
         * The Windows driver will refuse to start a device with a smaller
         * window.
         */
        pci_membar_sz = sizeof(struct nvme_registers) +
            2 * sizeof(uint32_t) * (sc->max_queues + 1);
        pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
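        /*
         * With the default of 16 queues the computed size (register file
         * plus 17 doorbell pairs of 8 bytes each) is well below 16 KiB,
         * so NVME_MMIO_SPACE_MIN determines the BAR size.
         */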

        DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));

        error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
        if (error) {
                WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
                goto done;
        }

        error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
        if (error) {
                WPRINTF(("%s pci add msixcap failed\r\n", __func__));
                goto done;
        }

        pthread_mutex_init(&sc->mtx, NULL);
        sem_init(&sc->iosemlock, 0, sc->ioslots);

        pci_nvme_reset(sc);
        pci_nvme_init_ctrldata(sc);
        pci_nvme_init_nsdata(sc);

        pci_lintr_request(pi);

done:
        return (error);
}

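/*
 * Register this emulation with bhyve's PCI framework; the "-s" slot
 * option selects it by the pe_emu name.
 */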
struct pci_devemu pci_de_nvme = {
        .pe_emu =       "nvme",
        .pe_init =      pci_nvme_init,
        .pe_barwrite =  pci_nvme_write,
        .pe_barread =   pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);