2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (C) 2012-2016 Intel Corporation
6 * Copyright (C) 2018-2020 Alexander Motin <mav@FreeBSD.org>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
33 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/malloc.h>
37 #include <sys/module.h>
38 #include <sys/queue.h>
39 #include <sys/sysctl.h>
40 #include <sys/systm.h>
41 #include <sys/taskqueue.h>
42 #include <machine/atomic.h>
44 #include <geom/geom.h>
45 #include <geom/geom_disk.h>
47 #include <dev/nvme/nvme.h>
48 #include <dev/nvme/nvme_private.h>
50 #include <dev/pci/pcivar.h>
/*
 * Forward declarations.  The disk_*_t / dumper_t entries are the GEOM disk
 * methods that nvd_new_disk() installs on each struct disk.  The rest are
 * the I/O completion callback, the disk teardown helper, the callbacks
 * handed to nvme_register_consumer(), and the module load/unload routines.
 */
55 struct nvd_controller;
57 static disk_ioctl_t nvd_ioctl;
58 static disk_strategy_t nvd_strategy;
59 static dumper_t nvd_dump;
60 static disk_getattr_t nvd_getattr;
62 static void nvd_done(void *arg, const struct nvme_completion *cpl);
63 static void nvd_gone(struct nvd_disk *ndisk);
65 static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr);
67 static void *nvd_new_controller(struct nvme_controller *ctrlr);
68 static void nvd_controller_fail(void *ctrlr);
70 static int nvd_load(void);
71 static void nvd_unload(void);
/* malloc(9) type used for the M_NVD allocations made in this file. */
73 MALLOC_DEFINE(M_NVD, "nvd", "nvd(4) allocations");
/*
 * Handle returned by nvme_register_consumer() in the load path and passed
 * back to nvme_unregister_consumer() in the unload path.
 * NOTE(review): declared without static, so it has external linkage; confirm
 * whether anything outside this file relies on that.
 */
75 struct nvme_consumer *consumer_handle;
/*
 * Members of struct nvd_disk, the per-namespace disk state.  The opening
 * line of the definition and several members used elsewhere in this file
 * (for example disk, tq, bioqtask, bioqlock, unit, cur_depth) are not
 * present in this view.
 */
/* Owning controller record; assigned in nvd_new_disk(). */
78 struct nvd_controller *ctrlr;
/* Bios deferred to the task queue while ordering has to be enforced. */
80 struct bio_queue_head bioq;
/* Namespace that bios, ioctls and dump requests for this disk are passed to. */
86 struct nvme_namespace *ns;
/*
 * Amount added to cur_depth for a BIO_ORDERED bio (other bios add 1), so
 * cur_depth >= NVD_ODEPTH means an ordered bio is currently submitted.
 */
89 #define NVD_ODEPTH (1 << 30)
/* Count of BIO_ORDERED bios accepted by nvd_strategy() and not yet finished. */
90 uint32_t ordered_in_flight;
/* Linkage on the global disk_head list and on the owning controller list. */
93 TAILQ_ENTRY(nvd_disk) global_tailq;
94 TAILQ_ENTRY(nvd_disk) ctrlr_tailq;
/*
 * Per-controller record.  One is allocated by nvd_new_controller() and
 * linked on the global ctrlr_head list.  (The closing line of the
 * definition is not present in this view.)
 */
97 struct nvd_controller {
/* The nvme(4) controller this record refers to. */
98 struct nvme_controller *ctrlr;
/* Linkage on the global ctrlr_head list. */
99 TAILQ_ENTRY(nvd_controller) tailq;
/* Disks that nvd_new_disk() created for this controller. */
100 TAILQ_HEAD(, nvd_disk) disk_head;
/*
 * Global driver state.  The visible updates of ctrlr_head and disk_head are
 * followed by mtx_unlock(&nvd_lock), and the teardown paths msleep() on
 * nvd_lock, so both lists appear to be protected by this mutex (the matching
 * mtx_lock() calls are on lines not present in this view).
 */
103 static struct mtx nvd_lock;
104 static TAILQ_HEAD(, nvd_controller) ctrlr_head;
105 static TAILQ_HEAD(disk_list, nvd_disk) disk_head;
/* Parent node for the hw.nvd sysctl variables. */
107 static SYSCTL_NODE(_hw, OID_AUTO, nvd, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
108 "nvd driver parameters");
110 * The NVMe specification does not define a maximum or optimal delete size, so
111 * technically max delete size is min(full size of the namespace, 2^32 - 1
112 * LBAs). A single delete for a multi-TB NVMe namespace though may take much
113 * longer to complete than the nvme(4) I/O timeout period. So choose a sensible
114 * default here that is still suitably large to minimize the number of overall
 * delete operations.
/*
 * Upper bound, in bytes, that nvd_new_disk() applies to each disk's
 * d_delmaxsize.  Defaults to 1 GiB and is published under the hw.nvd sysctl
 * node as delete_max with CTLFLAG_RDTUN.
 */
117 static uint64_t nvd_delete_max = (1024 * 1024 * 1024); /* 1GB */
118 SYSCTL_UQUAD(_hw_nvd, OID_AUTO, delete_max, CTLFLAG_RDTUN, &nvd_delete_max, 0,
119 "nvd maximum BIO_DELETE size in bytes");
/*
 * Module event handler; nvd_mod names it as the handler for this module.
 * Only the signature line is present in this view, so how "type" is
 * dispatched is not documented here.
 */
121 static int nvd_modevent(module_t mod, int type, void *arg)
/*
 * Module glue.  nvd_mod carries nvd_modevent() as the event handler (other
 * initializer lines are not present in this view).  The macros register the
 * module at SI_SUB_DRIVERS / SI_ORDER_ANY, declare it as version 1, and
 * record a dependency on the nvme module with version arguments 1, 1, 1.
 */
139 moduledata_t nvd_mod = {
141 (modeventhand_t)nvd_modevent,
145 DECLARE_MODULE(nvd, nvd_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
146 MODULE_VERSION(nvd, 1);
147 MODULE_DEPEND(nvd, nvme, 1, 1, 1);
/*
 * Load path (the function header line is not present in this view; this is
 * presumably the body of nvd_load()).  Initializes the global lock and the
 * two global lists, then registers with nvme(4) as a consumer.
 */
155 mtx_init(&nvd_lock, "nvd_lock", NULL, MTX_DEF);
156 TAILQ_INIT(&ctrlr_head);
157 TAILQ_INIT(&disk_head);
/*
 * Callbacks, in argument order: new namespace, new controller, a NULL third
 * callback, and controller failure.
 */
159 consumer_handle = nvme_register_consumer(nvd_new_disk,
160 nvd_new_controller, NULL, nvd_controller_fail);
/*
 * Returns 0 when registration produced a handle, -1 otherwise.
 * NOTE(review): -1 is not an errno value; confirm what the module event
 * path expects on failure.
 */
162 return (consumer_handle != NULL ? 0 : -1);
/*
 * Unload path (the function header line is not present in this view; the
 * "nvd_unload" wait message suggests this is the body of nvd_unload()).
 * Removes every controller from ctrlr_head, waits for its disks to go away,
 * then unregisters from nvme(4) and destroys the global lock.
 * NOTE(review): the nvd_lock acquisition, the per-disk action inside the
 * TAILQ_FOREACH and any release of the controller record are on lines not
 * present in this view.
 */
168 struct nvd_controller *ctrlr;
169 struct nvd_disk *ndisk;
/* Take controllers off the global list one at a time. */
175 while ((ctrlr = TAILQ_FIRST(&ctrlr_head)) != NULL) {
176 TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
177 TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
/*
 * Sleep, dropping nvd_lock while asleep, until the controller disk list is
 * empty; nvd_gonecb() issues the matching wakeup() on &ctrlr->disk_head when
 * it removes the last disk.
 */
179 while (!TAILQ_EMPTY(&ctrlr->disk_head))
180 msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_unload",0);
183 mtx_unlock(&nvd_lock);
185 nvme_unregister_consumer(consumer_handle);
187 mtx_destroy(&nvd_lock);
/*
 * Submit one bio to the disk namespace and account for it in cur_depth.
 *
 * A BIO_ORDERED bio is counted as NVD_ODEPTH, any other bio as 1.
 * NOTE(review): the "else" between the two increments, the test of "err"
 * after submission and the final lines of the error path are not present in
 * this view; the comments below assume the usual if/else and if (err)
 * structure.
 */
191 nvd_bio_submit(struct nvd_disk *ndisk, struct bio *bp)
195 bp->bio_driver1 = NULL;
/* Account for the bio before handing it to nvme(4). */
196 if (__predict_false(bp->bio_flags & BIO_ORDERED))
197 atomic_add_int(&ndisk->cur_depth, NVD_ODEPTH);
199 atomic_add_int(&ndisk->cur_depth, 1);
/* nvd_done() is passed as the completion callback for this request. */
200 err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done);
/*
 * Failure handling: undo the accounting in the same way nvd_done() does and
 * wake any thread sleeping on cur_depth.
 */
202 if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
203 atomic_add_int(&ndisk->cur_depth, -NVD_ODEPTH);
204 atomic_add_int(&ndisk->ordered_in_flight, -1);
205 wakeup(&ndisk->cur_depth);
/*
 * Unordered bio: a wakeup is only needed when this was the last bio in
 * flight (old value 1) and an ordered bio is pending.
 */
207 if (atomic_fetchadd_int(&ndisk->cur_depth, -1) == 1 &&
208 __predict_false(ndisk->ordered_in_flight != 0))
209 wakeup(&ndisk->cur_depth);
/* Mark the bio as failed with nothing transferred. */
212 bp->bio_flags |= BIO_ERROR;
213 bp->bio_resid = bp->bio_bcount;
/*
 * GEOM strategy method: submit the bio directly when no ordering constraint
 * applies, otherwise append it to the per-disk queue and schedule the task
 * that drains it.
 * NOTE(review): closing braces and the early-return lines after the direct
 * submissions are not present in this view.
 */
219 nvd_strategy(struct bio *bp)
221 struct nvd_disk *ndisk = (struct nvd_disk *)bp->bio_disk->d_drv1;
224 * bio with BIO_ORDERED flag must be executed after all previous
225 * bios in the queue, and before any successive bios.
227 if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
/*
 * Count this ordered bio.  Direct submission is only taken when it is the
 * sole ordered bio, nothing is in flight and nothing is queued.
 */
228 if (atomic_fetchadd_int(&ndisk->ordered_in_flight, 1) == 0 &&
229 ndisk->cur_depth == 0 && bioq_first(&ndisk->bioq) == NULL) {
230 nvd_bio_submit(ndisk, bp);
/* Unordered bio and no ordered bio pending: submit directly. */
233 } else if (__predict_true(ndisk->ordered_in_flight == 0)) {
234 nvd_bio_submit(ndisk, bp);
239 * There are ordered bios in flight, so we need to submit
240 * bios through the task queue to enforce ordering.
/* Queue under bioqlock, then kick the task that runs nvd_bioq_process(). */
242 mtx_lock(&ndisk->bioqlock);
243 bioq_insert_tail(&ndisk->bioq, bp);
244 mtx_unlock(&ndisk->bioqlock);
245 taskqueue_enqueue(ndisk->tq, &ndisk->bioqtask);
/*
 * Begin teardown of a disk: log the detach, call disk_gone() and fail every
 * bio still sitting in the deferred queue with ENXIO.
 * NOTE(review): the local declaration of "bp" and the line that completes
 * each failed bio are not present in this view.
 */
249 nvd_gone(struct nvd_disk *ndisk)
253 printf(NVD_STR"%u: detached\n", ndisk->unit);
/* bioqlock is held across disk_gone() and the drain of the queue. */
254 mtx_lock(&ndisk->bioqlock);
255 disk_gone(ndisk->disk);
256 while ((bp = bioq_takefirst(&ndisk->bioq)) != NULL) {
/* A queued ordered bio was counted by nvd_strategy(); drop that count. */
257 if (__predict_false(bp->bio_flags & BIO_ORDERED))
258 atomic_add_int(&ndisk->ordered_in_flight, -1);
259 bp->bio_error = ENXIO;
260 bp->bio_flags |= BIO_ERROR;
261 bp->bio_resid = bp->bio_bcount;
264 mtx_unlock(&ndisk->bioqlock);
/*
 * d_gone callback that nvd_new_disk() installs on each disk.  Destroys the
 * struct disk, unlinks the nvd_disk from the global and per-controller
 * lists, wakes a thread waiting for the controller disk list to drain, and
 * releases the per-disk task queue and bioq mutex.
 * NOTE(review): the nvd_lock acquisition and any final release of ndisk are
 * presumably on lines not present in this view.
 */
268 nvd_gonecb(struct disk *dp)
270 struct nvd_disk *ndisk = (struct nvd_disk *)dp->d_drv1;
272 disk_destroy(ndisk->disk);
274 TAILQ_REMOVE(&disk_head, ndisk, global_tailq);
275 TAILQ_REMOVE(&ndisk->ctrlr->disk_head, ndisk, ctrlr_tailq);
/* The unload and controller-fail paths msleep() on this address. */
276 if (TAILQ_EMPTY(&ndisk->ctrlr->disk_head))
277 wakeup(&ndisk->ctrlr->disk_head);
278 mtx_unlock(&nvd_lock);
279 taskqueue_free(ndisk->tq);
280 mtx_destroy(&ndisk->bioqlock);
/*
 * GEOM ioctl method: forwards cmd, data, fflag and td unchanged to
 * nvme_ns_ioctl_process() for the disk namespace and returns its result.
 * (The rest of the parameter list, which declares "td", is on a line not
 * present in this view.)
 */
285 nvd_ioctl(struct disk *dp, u_long cmd, void *data, int fflag,
288 struct nvd_disk *ndisk = dp->d_drv1;
290 return (nvme_ns_ioctl_process(ndisk->ns, cmd, data, fflag, td));
/*
 * Dump method installed as d_dump.  "arg" is the struct disk; virt, offset
 * and len are passed to nvme_ns_dump() for the disk namespace and its
 * result is returned.  "phys" is not used by the visible code.
 */
294 nvd_dump(void *arg, void *virt, vm_offset_t phys, off_t offset, size_t len)
296 struct disk *dp = arg;
297 struct nvd_disk *ndisk = dp->d_drv1;
299 return (nvme_ns_dump(ndisk->ns, virt, offset, len));
/*
 * GEOM getattr method.  For the "GEOM::lunid" attribute it formats the
 * namespace NGUID, or failing that the EUI64, as two lowercase hex digits
 * per byte into bio_data and sets bio_completed to bio_length.
 * NOTE(review): the break/return lines, the last sprintf argument and the
 * declaration of "i" are on lines not present in this view; presumably an
 * all-zero identifier is skipped and a too-small buffer is rejected with an
 * error.
 */
303 nvd_getattr(struct bio *bp)
305 struct nvd_disk *ndisk = (struct nvd_disk *)bp->bio_disk->d_drv1;
306 const struct nvme_namespace_data *nsdata;
309 if (!strcmp("GEOM::lunid", bp->bio_attribute)) {
310 nsdata = nvme_ns_get_data(ndisk->ns);
312 /* Try to return NGUID as lunid. */
/* Look for a non-zero byte; i stays below the size when one is found. */
313 for (i = 0; i < sizeof(nsdata->nguid); i++) {
314 if (nsdata->nguid[i] != 0)
317 if (i < sizeof(nsdata->nguid)) {
/* Two hex digits per byte plus the terminating NUL have to fit. */
318 if (bp->bio_length < sizeof(nsdata->nguid) * 2 + 1)
320 for (i = 0; i < sizeof(nsdata->nguid); i++) {
321 sprintf(&bp->bio_data[i * 2], "%02x",
324 bp->bio_completed = bp->bio_length;
328 /* Try to return EUI64 as lunid. */
/* Same scan, size check and formatting as above, applied to eui64. */
329 for (i = 0; i < sizeof(nsdata->eui64); i++) {
330 if (nsdata->eui64[i] != 0)
333 if (i < sizeof(nsdata->eui64)) {
334 if (bp->bio_length < sizeof(nsdata->eui64) * 2 + 1)
336 for (i = 0; i < sizeof(nsdata->eui64); i++) {
337 sprintf(&bp->bio_data[i * 2], "%02x",
340 bp->bio_completed = bp->bio_length;
/*
 * Completion callback passed to nvme_ns_bio_process() by nvd_bio_submit().
 * "arg" is the bio.  Reverses the cur_depth accounting done at submit time
 * and wakes any thread sleeping on cur_depth when that may unblock it.
 * NOTE(review): the "else" between the two branches and the line(s) that
 * finish the bio are not present in this view; "cpl" is not referenced by
 * the visible lines.
 */
348 nvd_done(void *arg, const struct nvme_completion *cpl)
350 struct bio *bp = (struct bio *)arg;
351 struct nvd_disk *ndisk = bp->bio_disk->d_drv1;
/* Ordered bio finished: drop its NVD_ODEPTH weight and its pending count. */
353 if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
354 atomic_add_int(&ndisk->cur_depth, -NVD_ODEPTH);
355 atomic_add_int(&ndisk->ordered_in_flight, -1);
356 wakeup(&ndisk->cur_depth);
/*
 * Unordered bio finished: wake the waiter only when the depth just dropped
 * to zero (old value 1) and an ordered bio is pending.
 */
358 if (atomic_fetchadd_int(&ndisk->cur_depth, -1) == 1 &&
359 __predict_false(ndisk->ordered_in_flight != 0))
360 wakeup(&ndisk->cur_depth);
/*
 * Task handler set up by TASK_INIT in nvd_new_disk(); "arg" is the nvd_disk.
 * Takes bios off the deferred queue and submits them while enforcing
 * BIO_ORDERED semantics.
 * NOTE(review): the enclosing loop, the NULL check on "bp" and the condition
 * guarding the second wait are on lines not present in this view.
 */
367 nvd_bioq_process(void *arg, int pending)
369 struct nvd_disk *ndisk = arg;
/* The queue is only touched with bioqlock held. */
373 mtx_lock(&ndisk->bioqlock);
374 bp = bioq_takefirst(&ndisk->bioq);
375 mtx_unlock(&ndisk->bioqlock);
379 if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
381 * bio with BIO_ORDERED flag set must be executed
382 * after all previous bios.
/*
 * Wait until nothing is in flight.  The tsleep() timeout argument of 1 makes
 * the condition get rechecked even if no wakeup() arrives.
 */
384 while (ndisk->cur_depth > 0)
385 tsleep(&ndisk->cur_depth, 0, "nvdorb", 1);
388 * bio with BIO_ORDERED flag set must be completed
389 * before proceeding with additional bios.
/* Wait while an ordered bio (counted as NVD_ODEPTH) is still outstanding. */
391 while (ndisk->cur_depth >= NVD_ODEPTH)
392 tsleep(&ndisk->cur_depth, 0, "nvdora", 1);
395 nvd_bio_submit(ndisk, bp);
/*
 * Consumer callback registered with nvme_register_consumer() for a new
 * controller.  Allocates an nvd_controller, records the nvme controller
 * pointer, initializes its disk list and appends it to ctrlr_head.
 * NOTE(review): the malloc flags, the nvd_lock acquisition and the return
 * statement are on lines not present in this view.
 */
400 nvd_new_controller(struct nvme_controller *ctrlr)
402 struct nvd_controller *nvd_ctrlr;
404 nvd_ctrlr = malloc(sizeof(struct nvd_controller), M_NVD,
407 nvd_ctrlr->ctrlr = ctrlr;
408 TAILQ_INIT(&nvd_ctrlr->disk_head);
410 TAILQ_INSERT_TAIL(&ctrlr_head, nvd_ctrlr, tailq);
411 mtx_unlock(&nvd_lock);
/*
 * Consumer callback registered with nvme_register_consumer() for a new
 * namespace.  Allocates and initializes an nvd_disk, picks a unit number,
 * links the disk on the global and per-controller lists, starts its task
 * queue, and fills in and creates the GEOM disk.
 *
 * ns        - namespace to expose as a disk.
 * ctrlr_arg - the nvd_controller of the owning controller (presumably the
 *             value produced by nvd_new_controller(); confirm in nvme(4)).
 *
 * NOTE(review): several short lines are not present in this view, including
 * the declarations of "disk" and "unit", the assignments of ndisk->ns and
 * ndisk->unit, the nvd_lock acquisition, the break / if / else lines of the
 * unit search, the last printf argument and the return statement.
 */
417 nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg)
419 uint8_t descr[NVME_MODEL_NUMBER_LENGTH+1];
420 struct nvd_disk *ndisk, *tnd;
422 struct nvd_controller *ctrlr = ctrlr_arg;
423 device_t dev = ctrlr->ctrlr->dev;
/* M_WAITOK | M_ZERO: the allocation is zeroed before the fields are set. */
426 ndisk = malloc(sizeof(struct nvd_disk), M_NVD, M_ZERO | M_WAITOK);
427 ndisk->ctrlr = ctrlr;
429 ndisk->cur_depth = 0;
430 ndisk->ordered_in_flight = 0;
431 mtx_init(&ndisk->bioqlock, "nvd bioq lock", NULL, MTX_DEF);
432 bioq_init(&ndisk->bioq);
433 TASK_INIT(&ndisk->bioqtask, 0, nvd_bioq_process, ndisk);
/*
 * Unit selection: walk the global list looking for the first entry whose
 * unit is larger than the candidate, bumping the candidate past each entry
 * otherwise.  The new disk is inserted before that entry, or at the tail, so
 * the list appears to stay ordered by unit.
 */
437 TAILQ_FOREACH(tnd, &disk_head, global_tailq) {
438 if (tnd->unit > unit)
440 unit = tnd->unit + 1;
444 TAILQ_INSERT_BEFORE(tnd, ndisk, global_tailq);
446 TAILQ_INSERT_TAIL(&disk_head, ndisk, global_tailq);
447 TAILQ_INSERT_TAIL(&ctrlr->disk_head, ndisk, ctrlr_tailq);
448 mtx_unlock(&nvd_lock);
/* Per-disk task queue with one thread; it runs nvd_bioq_process(). */
450 ndisk->tq = taskqueue_create("nvd_taskq", M_WAITOK,
451 taskqueue_thread_enqueue, &ndisk->tq);
452 taskqueue_start_threads(&ndisk->tq, 1, PI_DISK, "nvd taskq");
/* Allocate the GEOM disk and install this driver's methods. */
454 disk = ndisk->disk = disk_alloc();
455 disk->d_strategy = nvd_strategy;
456 disk->d_ioctl = nvd_ioctl;
457 disk->d_dump = nvd_dump;
458 disk->d_getattr = nvd_getattr;
459 disk->d_gone = nvd_gonecb;
460 disk->d_name = NVD_STR;
461 disk->d_unit = ndisk->unit;
462 disk->d_drv1 = ndisk;
/* Geometry and limits are taken from the namespace. */
464 disk->d_sectorsize = nvme_ns_get_sector_size(ns);
465 disk->d_mediasize = (off_t)nvme_ns_get_size(ns);
466 disk->d_maxsize = nvme_ns_get_max_io_xfer_size(ns);
/* Delete size: the whole namespace, capped by nvd_delete_max. */
467 disk->d_delmaxsize = (off_t)nvme_ns_get_size(ns);
468 if (disk->d_delmaxsize > nvd_delete_max)
469 disk->d_delmaxsize = nvd_delete_max;
470 disk->d_stripesize = nvme_ns_get_stripesize(ns);
/* Delete and flush are advertised only when the namespace flags allow. */
471 disk->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
472 if (nvme_ns_get_flags(ns) & NVME_NS_DEALLOCATE_SUPPORTED)
473 disk->d_flags |= DISKFLAG_CANDELETE;
474 if (nvme_ns_get_flags(ns) & NVME_NS_FLUSH_SUPPORTED)
475 disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
478 * d_ident and d_descr are both far bigger than the length of either
479 * the serial or model number strings.
481 nvme_strvis(disk->d_ident, nvme_ns_get_serial_number(ns),
482 sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH);
483 nvme_strvis(descr, nvme_ns_get_model_number(ns), sizeof(descr),
484 NVME_MODEL_NUMBER_LENGTH);
/*
 * NOTE(review): the size passed to strlcpy() is sizeof(descr), the source
 * buffer, not sizeof(disk->d_descr).  Per the remark above d_descr is the
 * larger buffer, so this does not overflow, but the other copies in this
 * function use the destination size.
 */
485 strlcpy(disk->d_descr, descr, sizeof(descr));
/* PCI identity of the controller device, reported as the HBA. */
487 disk->d_hba_vendor = pci_get_vendor(dev);
488 disk->d_hba_device = pci_get_device(dev);
489 disk->d_hba_subvendor = pci_get_subvendor(dev);
490 disk->d_hba_subdevice = pci_get_subdevice(dev);
491 disk->d_rotation_rate = DISK_RR_NON_ROTATING;
492 strlcpy(disk->d_attachment, device_get_nameunit(dev),
493 sizeof(disk->d_attachment));
495 disk_create(disk, DISK_VERSION);
/* Announce the new disk: model string, size in MB and sector count. */
497 printf(NVD_STR"%u: <%s> NVMe namespace\n", disk->d_unit, descr);
498 printf(NVD_STR"%u: %juMB (%ju %u byte sectors)\n", disk->d_unit,
499 (uintmax_t)disk->d_mediasize / (1024*1024),
500 (uintmax_t)disk->d_mediasize / disk->d_sectorsize,
/*
 * Consumer callback registered with nvme_register_consumer() for a failed
 * controller.  Removes the controller record from ctrlr_head and waits until
 * all of its disks have been removed from its disk list.
 * NOTE(review): the nvd_lock acquisition, the per-disk action inside the
 * TAILQ_FOREACH and any release of the controller record are on lines not
 * present in this view.
 */
507 nvd_controller_fail(void *ctrlr_arg)
509 struct nvd_controller *ctrlr = ctrlr_arg;
510 struct nvd_disk *ndisk;
513 TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
514 TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
/*
 * Sleep, dropping nvd_lock while asleep, until nvd_gonecb() has removed the
 * last disk and issued wakeup() on &ctrlr->disk_head.
 */
516 while (!TAILQ_EMPTY(&ctrlr->disk_head))
517 msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_fail", 0);
518 mtx_unlock(&nvd_lock);