/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2012-2016 Intel Corporation
 * All rights reserved.
 * Copyright (C) 2018 Alexander Motin <mav@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <machine/atomic.h>

#include <geom/geom.h>
#include <geom/geom_disk.h>

#include <dev/nvme/nvme.h>

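/*
 * nvd(4) is the nvme(4) consumer that exposes each NVMe namespace as a
 * GEOM disk(9) device.  It registers nvd_new_controller()/nvd_new_disk()
 * callbacks with the nvme(4) driver below and creates one "nvd%u" disk
 * per attached namespace.
 */
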
#define NVD_STR		"nvd"

struct nvd_disk;
struct nvd_controller;

static disk_ioctl_t nvd_ioctl;
static disk_strategy_t nvd_strategy;
static dumper_t nvd_dump;

static void nvd_done(void *arg, const struct nvme_completion *cpl);
static void nvd_gone(struct nvd_disk *ndisk);

static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr);

static void *nvd_new_controller(struct nvme_controller *ctrlr);
static void nvd_controller_fail(void *ctrlr);

static int nvd_load(void);
static void nvd_unload(void);

MALLOC_DEFINE(M_NVD, "nvd", "nvd(4) allocations");

struct nvme_consumer *consumer_handle;

struct nvd_disk {
	struct nvd_controller	*ctrlr;

	struct bio_queue_head	bioq;
	struct task		bioqtask;
	struct mtx		bioqlock;

	struct disk		*disk;
	struct taskqueue	*tq;
	struct nvme_namespace	*ns;

	uint32_t		cur_depth;
#define NVD_ODEPTH	(1 << 30)
	uint32_t		ordered_in_flight;
	u_int			unit;

	TAILQ_ENTRY(nvd_disk)	global_tailq;
	TAILQ_ENTRY(nvd_disk)	ctrlr_tailq;
};

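/*
 * Queue-depth accounting: cur_depth counts plain in-flight bios in its low
 * bits, while each in-flight BIO_ORDERED bio adds NVD_ODEPTH (1 << 30).
 * Thus "cur_depth == 0" means the queue is fully drained, and "cur_depth >=
 * NVD_ODEPTH" means an ordered bio is still outstanding.  For example, three
 * normal bios plus one ordered bio yield cur_depth == NVD_ODEPTH + 3.
 */
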
struct nvd_controller {
	TAILQ_ENTRY(nvd_controller)	tailq;
	TAILQ_HEAD(, nvd_disk)		disk_head;
};

static struct mtx			nvd_lock;
static TAILQ_HEAD(, nvd_controller)	ctrlr_head;
static TAILQ_HEAD(disk_list, nvd_disk)	disk_head;

static SYSCTL_NODE(_hw, OID_AUTO, nvd, CTLFLAG_RD, 0, "nvd driver parameters");
/*
 * The NVMe specification does not define a maximum or optimal delete size, so
 * technically max delete size is min(full size of the namespace, 2^32 - 1
 * LBAs).  A single delete for a multi-TB NVMe namespace though may take much
 * longer to complete than the nvme(4) I/O timeout period.  So choose a
 * sensible default here that is still suitably large to minimize the number
 * of overall delete operations.
 */
static uint64_t nvd_delete_max = (1024 * 1024 * 1024);  /* 1GB */
SYSCTL_UQUAD(_hw_nvd, OID_AUTO, delete_max, CTLFLAG_RDTUN, &nvd_delete_max, 0,
    "nvd maximum BIO_DELETE size in bytes");

static int nvd_modevent(module_t mod, int type, void *arg)
{
	int error = 0;

	switch (type) {
	case MOD_LOAD:
		error = nvd_load();
		break;
	case MOD_UNLOAD:
		nvd_unload();
		break;
	default:
		break;
	}

	return (error);
}

moduledata_t nvd_mod = {
	NVD_STR,
	(modeventhand_t)nvd_modevent,
	0
};

DECLARE_MODULE(nvd, nvd_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);
MODULE_VERSION(nvd, 1);
MODULE_DEPEND(nvd, nvme, 1, 1, 1);

static int
nvd_load(void)
{
	if (!nvme_use_nvd)
		return 0;

	mtx_init(&nvd_lock, "nvd_lock", NULL, MTX_DEF);
	TAILQ_INIT(&ctrlr_head);
	TAILQ_INIT(&disk_head);

	consumer_handle = nvme_register_consumer(nvd_new_disk,
	    nvd_new_controller, NULL, nvd_controller_fail);

	return (consumer_handle != NULL ? 0 : -1);
}

static void
nvd_unload(void)
{
	struct nvd_controller	*ctrlr;
	struct nvd_disk		*ndisk;

	if (consumer_handle == NULL)
		return;

	mtx_lock(&nvd_lock);
	while ((ctrlr = TAILQ_FIRST(&ctrlr_head)) != NULL) {
		TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
		TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
			nvd_gone(ndisk);
		while (!TAILQ_EMPTY(&ctrlr->disk_head))
			msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_unload", 0);
		free(ctrlr, M_NVD);
	}
	mtx_unlock(&nvd_lock);

	nvme_unregister_consumer(consumer_handle);

	mtx_destroy(&nvd_lock);
}

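/*
 * Teardown handshake: nvd_gone() marks each disk as gone and fails queued
 * bios; the final disk_gone() callback (nvd_gonecb below) removes the disk
 * from its controller's list and wakeup()s &ctrlr->disk_head, releasing the
 * msleep() loops above and in nvd_controller_fail().
 */
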
static void
nvd_bio_submit(struct nvd_disk *ndisk, struct bio *bp)
{
	int err;

	bp->bio_driver1 = NULL;
	if (__predict_false(bp->bio_flags & BIO_ORDERED))
		atomic_add_int(&ndisk->cur_depth, NVD_ODEPTH);
	else
		atomic_add_int(&ndisk->cur_depth, 1);
	err = nvme_ns_bio_process(ndisk->ns, bp, nvd_done);
	if (err) {
		if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
			atomic_add_int(&ndisk->cur_depth, -NVD_ODEPTH);
			atomic_add_int(&ndisk->ordered_in_flight, -1);
			wakeup(&ndisk->cur_depth);
		} else {
			if (atomic_fetchadd_int(&ndisk->cur_depth, -1) == 1 &&
			    __predict_false(ndisk->ordered_in_flight != 0))
				wakeup(&ndisk->cur_depth);
		}

		bp->bio_error = err;
		bp->bio_flags |= BIO_ERROR;
		bp->bio_resid = bp->bio_bcount;
		biodone(bp);
	}
}

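/*
 * Note that on a submission error the depth/ordered accounting added just
 * above is rolled back before the bio is failed with biodone(), so waiters
 * sleeping on &ndisk->cur_depth never stall on a bio that was never issued.
 */
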
static void
nvd_strategy(struct bio *bp)
{
	struct nvd_disk *ndisk = (struct nvd_disk *)bp->bio_disk->d_drv1;

	/*
	 * A bio with the BIO_ORDERED flag must be executed after all previous
	 * bios in the queue, and before any successive bios.
	 */
	if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
		if (atomic_fetchadd_int(&ndisk->ordered_in_flight, 1) == 0 &&
		    ndisk->cur_depth == 0 && bioq_first(&ndisk->bioq) == NULL) {
			nvd_bio_submit(ndisk, bp);
			return;
		}
	} else if (__predict_true(ndisk->ordered_in_flight == 0)) {
		nvd_bio_submit(ndisk, bp);
		return;
	}

	/*
	 * There are ordered bios in flight, so we need to submit
	 * bios through the task queue to enforce ordering.
	 */
	mtx_lock(&ndisk->bioqlock);
	bioq_insert_tail(&ndisk->bioq, bp);
	mtx_unlock(&ndisk->bioqlock);
	taskqueue_enqueue(ndisk->tq, &ndisk->bioqtask);
}

static void
nvd_gone(struct nvd_disk *ndisk)
{
	struct bio	*bp;

	printf(NVD_STR"%u: detached\n", ndisk->unit);
	mtx_lock(&ndisk->bioqlock);
	disk_gone(ndisk->disk);
	while ((bp = bioq_takefirst(&ndisk->bioq)) != NULL) {
		if (__predict_false(bp->bio_flags & BIO_ORDERED))
			atomic_add_int(&ndisk->ordered_in_flight, -1);
		bp->bio_error = ENXIO;
		bp->bio_flags |= BIO_ERROR;
		bp->bio_resid = bp->bio_bcount;
		biodone(bp);
	}
	mtx_unlock(&ndisk->bioqlock);
}

static void
nvd_gonecb(struct disk *dp)
{
	struct nvd_disk *ndisk = (struct nvd_disk *)dp->d_drv1;

	disk_destroy(ndisk->disk);
	mtx_lock(&nvd_lock);
	TAILQ_REMOVE(&disk_head, ndisk, global_tailq);
	TAILQ_REMOVE(&ndisk->ctrlr->disk_head, ndisk, ctrlr_tailq);
	if (TAILQ_EMPTY(&ndisk->ctrlr->disk_head))
		wakeup(&ndisk->ctrlr->disk_head);
	mtx_unlock(&nvd_lock);
	taskqueue_free(ndisk->tq);
	mtx_destroy(&ndisk->bioqlock);
	free(ndisk, M_NVD);
}

static int
nvd_ioctl(struct disk *dp, u_long cmd, void *data, int fflag,
    struct thread *td)
{
	struct nvd_disk		*ndisk = dp->d_drv1;

	return (nvme_ns_ioctl_process(ndisk->ns, cmd, data, fflag, td));
}

static int
nvd_dump(void *arg, void *virt, vm_offset_t phys, off_t offset, size_t len)
{
	struct disk		*dp = arg;
	struct nvd_disk		*ndisk = dp->d_drv1;

	return (nvme_ns_dump(ndisk->ns, virt, offset, len));
}

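/*
 * nvd_dump() is the disk's dumper_t hook: it is invoked while the kernel is
 * writing a crash dump, typically after a panic, so nvme_ns_dump() must make
 * progress without sleeping or relying on interrupt-driven completions.
 */
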
static void
nvd_done(void *arg, const struct nvme_completion *cpl)
{
	struct bio *bp = (struct bio *)arg;
	struct nvd_disk *ndisk = bp->bio_disk->d_drv1;

	if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
		atomic_add_int(&ndisk->cur_depth, -NVD_ODEPTH);
		atomic_add_int(&ndisk->ordered_in_flight, -1);
		wakeup(&ndisk->cur_depth);
	} else {
		if (atomic_fetchadd_int(&ndisk->cur_depth, -1) == 1 &&
		    __predict_false(ndisk->ordered_in_flight != 0))
			wakeup(&ndisk->cur_depth);
	}

	biodone(bp);
}

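/*
 * The wakeup()s above pair with the tsleep() loops in nvd_bioq_process():
 * completion of the last plain bio (cur_depth reaching 0) releases a waiting
 * ordered bio, and completion of an ordered bio (cur_depth dropping below
 * NVD_ODEPTH) releases the bios queued behind it.
 */
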
static void
nvd_bioq_process(void *arg, int pending)
{
	struct nvd_disk *ndisk = arg;
	struct bio *bp;

	for (;;) {
		mtx_lock(&ndisk->bioqlock);
		bp = bioq_takefirst(&ndisk->bioq);
		mtx_unlock(&ndisk->bioqlock);
		if (bp == NULL)
			break;

		if (__predict_false(bp->bio_flags & BIO_ORDERED)) {
			/*
			 * A bio with the BIO_ORDERED flag set must be executed
			 * after all previous bios have completed.
			 */
			while (ndisk->cur_depth > 0)
				tsleep(&ndisk->cur_depth, 0, "nvdorb", 1);
		} else {
			/*
			 * A bio with the BIO_ORDERED flag set must be completed
			 * before proceeding with additional bios.
			 */
			while (ndisk->cur_depth >= NVD_ODEPTH)
				tsleep(&ndisk->cur_depth, 0, "nvdora", 1);
		}

		nvd_bio_submit(ndisk, bp);
	}
}

static void *
nvd_new_controller(struct nvme_controller *ctrlr)
{
	struct nvd_controller	*nvd_ctrlr;

	nvd_ctrlr = malloc(sizeof(struct nvd_controller), M_NVD,
	    M_ZERO | M_WAITOK);

	TAILQ_INIT(&nvd_ctrlr->disk_head);
	mtx_lock(&nvd_lock);
	TAILQ_INSERT_TAIL(&ctrlr_head, nvd_ctrlr, tailq);
	mtx_unlock(&nvd_lock);

	return (nvd_ctrlr);
}

static void *
nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg)
{
	uint8_t			descr[NVME_MODEL_NUMBER_LENGTH+1];
	struct nvd_disk		*ndisk, *tnd;
	struct disk		*disk;
	struct nvd_controller	*ctrlr = ctrlr_arg;
	int unit;

	ndisk = malloc(sizeof(struct nvd_disk), M_NVD, M_ZERO | M_WAITOK);
	ndisk->ctrlr = ctrlr;
	ndisk->ns = ns;
	ndisk->cur_depth = 0;
	ndisk->ordered_in_flight = 0;
	mtx_init(&ndisk->bioqlock, "nvd bioq lock", NULL, MTX_DEF);
	bioq_init(&ndisk->bioq);
	TASK_INIT(&ndisk->bioqtask, 0, nvd_bioq_process, ndisk);

	mtx_lock(&nvd_lock);
	/* Allocate the lowest free unit number, keeping the list sorted. */
	unit = 0;
	TAILQ_FOREACH(tnd, &disk_head, global_tailq) {
		if (tnd->unit > unit)
			break;
		unit = tnd->unit + 1;
	}
	ndisk->unit = unit;
	if (tnd != NULL)
		TAILQ_INSERT_BEFORE(tnd, ndisk, global_tailq);
	else
		TAILQ_INSERT_TAIL(&disk_head, ndisk, global_tailq);
	TAILQ_INSERT_TAIL(&ctrlr->disk_head, ndisk, ctrlr_tailq);
	mtx_unlock(&nvd_lock);

	ndisk->tq = taskqueue_create("nvd_taskq", M_WAITOK,
	    taskqueue_thread_enqueue, &ndisk->tq);
	taskqueue_start_threads(&ndisk->tq, 1, PI_DISK, "nvd taskq");

	disk = ndisk->disk = disk_alloc();
	disk->d_strategy = nvd_strategy;
	disk->d_ioctl = nvd_ioctl;
	disk->d_dump = nvd_dump;
	disk->d_gone = nvd_gonecb;
	disk->d_name = NVD_STR;
	disk->d_unit = ndisk->unit;
	disk->d_drv1 = ndisk;

	disk->d_sectorsize = nvme_ns_get_sector_size(ns);
	disk->d_mediasize = (off_t)nvme_ns_get_size(ns);
	disk->d_maxsize = nvme_ns_get_max_io_xfer_size(ns);
	disk->d_delmaxsize = (off_t)nvme_ns_get_size(ns);
	if (disk->d_delmaxsize > nvd_delete_max)
		disk->d_delmaxsize = nvd_delete_max;
	disk->d_stripesize = nvme_ns_get_stripesize(ns);
	disk->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
	if (nvme_ns_get_flags(ns) & NVME_NS_DEALLOCATE_SUPPORTED)
		disk->d_flags |= DISKFLAG_CANDELETE;
	if (nvme_ns_get_flags(ns) & NVME_NS_FLUSH_SUPPORTED)
		disk->d_flags |= DISKFLAG_CANFLUSHCACHE;

	/*
	 * d_ident and d_descr are both far bigger than the length of either
	 * the serial or model number strings.
	 */
	nvme_strvis(disk->d_ident, nvme_ns_get_serial_number(ns),
	    sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH);
	nvme_strvis(descr, nvme_ns_get_model_number(ns), sizeof(descr),
	    NVME_MODEL_NUMBER_LENGTH);
	strlcpy(disk->d_descr, descr, sizeof(descr));

	disk->d_rotation_rate = DISK_RR_NON_ROTATING;

	disk_create(disk, DISK_VERSION);

	printf(NVD_STR"%u: <%s> NVMe namespace\n", disk->d_unit, descr);
	printf(NVD_STR"%u: %juMB (%ju %u byte sectors)\n", disk->d_unit,
	    (uintmax_t)disk->d_mediasize / (1024*1024),
	    (uintmax_t)disk->d_mediasize / disk->d_sectorsize,
	    disk->d_sectorsize);

	return (ndisk);
}

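/*
 * Example console output at namespace attach (format from the printfs
 * above; device name and sizes are illustrative):
 *
 *   nvd0: <Samsung SSD 970 EVO 500GB> NVMe namespace
 *   nvd0: 476940MB (976773168 512 byte sectors)
 */
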
static void
nvd_controller_fail(void *ctrlr_arg)
{
	struct nvd_controller	*ctrlr = ctrlr_arg;
	struct nvd_disk		*ndisk;

	mtx_lock(&nvd_lock);
	TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
	TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
		nvd_gone(ndisk);
	while (!TAILQ_EMPTY(&ctrlr->disk_head))
		msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_fail", 0);
	mtx_unlock(&nvd_lock);
	free(ctrlr, M_NVD);
}