2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2022 Marshall Kirk McKusick <mckusick@mckusick.com>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/param.h>
31 #include <sys/ctype.h>
32 #include <sys/kernel.h>
34 #include <sys/malloc.h>
35 #include <sys/module.h>
36 #include <sys/reboot.h>
37 #include <sys/rwlock.h>
39 #include <sys/sysctl.h>
41 #include <geom/geom.h>
42 #include <geom/geom_dbg.h>
43 #include <geom/union/g_union.h>
/*
 * Sysctl tree kern.geom.union with a "debug" knob that controls the
 * verbosity of the G_UNION_DEBUG()/G_UNION_LOGREQ() macros.
 * NOTE(review): the in-line numbering shows source lines are elided
 * throughout this file (here, the sysctl description strings); all
 * visible tokens are preserved verbatim.
 */
45 SYSCTL_DECL(_kern_geom);
46 static SYSCTL_NODE(_kern_geom, OID_AUTO, union, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
48 static u_int g_union_debug = 0;
49 SYSCTL_UINT(_kern_geom_union, OID_AUTO, debug, CTLFLAG_RW, &g_union_debug, 0,
/* Forward declarations for the GEOM class method table below. */
52 static void g_union_config(struct gctl_req *req, struct g_class *mp,
54 static g_access_t g_union_access;
55 static g_start_t g_union_start;
56 static g_dumpconf_t g_union_dumpconf;
57 static g_orphan_t g_union_orphan;
58 static int g_union_destroy_geom(struct gctl_req *req, struct g_class *mp,
60 static g_provgone_t g_union_providergone;
61 static g_resize_t g_union_resize;
/*
 * GEOM class descriptor: wires the g_union_* handlers into the GEOM
 * framework (control requests, access counting, I/O start, dumpconf,
 * orphaning, destruction, provider teardown and resize).
 * NOTE(review): the closing brace of this initializer is among the
 * elided lines.
 */
63 struct g_class g_union_class = {
64 .name = G_UNION_CLASS_NAME,
66 .ctlreq = g_union_config,
67 .access = g_union_access,
68 .start = g_union_start,
69 .dumpconf = g_union_dumpconf,
70 .orphan = g_union_orphan,
71 .destroy_geom = g_union_destroy_geom,
72 .providergone = g_union_providergone,
73 .resize = g_union_resize,
/* Forward declarations for the file-local helpers defined below. */
76 static void g_union_ctl_create(struct gctl_req *req, struct g_class *mp, bool);
77 static intmax_t g_union_fetcharg(struct gctl_req *req, const char *name);
78 static bool g_union_verify_nprefix(const char *name);
79 static void g_union_ctl_destroy(struct gctl_req *req, struct g_class *mp, bool);
80 static struct g_geom *g_union_find_geom(struct g_class *mp, const char *name);
81 static void g_union_ctl_reset(struct gctl_req *req, struct g_class *mp, bool);
82 static void g_union_ctl_revert(struct gctl_req *req, struct g_class *mp, bool);
83 static void g_union_revert(struct g_union_softc *sc);
84 static void g_union_doio(struct g_union_wip *wip);
85 static void g_union_ctl_commit(struct gctl_req *req, struct g_class *mp, bool);
86 static void g_union_setmap(struct bio *bp, struct g_union_softc *sc);
87 static bool g_union_getmap(struct bio *bp, struct g_union_softc *sc,
89 static void g_union_done(struct bio *bp);
90 static void g_union_kerneldump(struct bio *bp, struct g_union_softc *sc);
91 static int g_union_dumper(void *, void *, vm_offset_t, off_t, size_t);
92 static int g_union_destroy(struct gctl_req *req, struct g_geom *gp, bool force);
95 * Operate on union-specific configuration commands.
/*
 * .ctlreq entry point: validate the common "version" and "verbose"
 * parameters, then dispatch on the verb string to the matching
 * g_union_ctl_*() handler.  All failures are reported to userland
 * via gctl_error(); the elided lines presumably contain the early
 * returns after each error — TODO confirm against full source.
 */
98 g_union_config(struct gctl_req *req, struct g_class *mp, const char *verb)
100 uint32_t *version, *verbose;
/* Reject requests from a gunion(8) binary built for another ABI. */
104 version = gctl_get_paraml(req, "version", sizeof(*version));
105 if (version == NULL) {
106 gctl_error(req, "No '%s' argument.", "version");
109 if (*version != G_UNION_VERSION) {
110 gctl_error(req, "Userland and kernel parts are out of sync.");
113 verbose = gctl_get_paraml(req, "verbose", sizeof(*verbose));
114 if (verbose == NULL) {
115 gctl_error(req, "No '%s' argument.", "verbose");
/* Verb dispatch: create / destroy / reset / revert / commit. */
118 if (strcmp(verb, "create") == 0) {
119 g_union_ctl_create(req, mp, *verbose);
121 } else if (strcmp(verb, "destroy") == 0) {
122 g_union_ctl_destroy(req, mp, *verbose);
124 } else if (strcmp(verb, "reset") == 0) {
125 g_union_ctl_reset(req, mp, *verbose);
127 } else if (strcmp(verb, "revert") == 0) {
128 g_union_ctl_revert(req, mp, *verbose);
130 } else if (strcmp(verb, "commit") == 0) {
131 g_union_ctl_commit(req, mp, *verbose);
135 gctl_error(req, "Unknown verb.");
139 * Create a union device.
/*
 * Handler for the "create" verb.  Takes two provider arguments
 * (arg0 = upper/writable layer, arg1 = lower/read-only layer) plus
 * optional offset/size/secsize/gunionname parameters, validates the
 * geometry, creates the geom + provider, attaches consumers to both
 * layers, and allocates the 2-level write-tracking bitmap.
 */
142 g_union_ctl_create(struct gctl_req *req, struct g_class *mp, bool verbose)
144 struct g_provider *upperpp, *lowerpp, *newpp;
145 struct g_consumer *uppercp, *lowercp;
146 struct g_union_softc *sc;
147 struct g_geom_alias *gap;
149 intmax_t offset, secsize, size, needed;
150 const char *gunionname;
151 int *nargs, error, i, n;
/* Exactly two provider arguments are expected (upper then lower). */
156 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
158 gctl_error(req, "No '%s' argument.", "nargs");
162 gctl_error(req, "Missing device(s).");
166 gctl_error(req, "Extra device(s).");
/* Optional numeric parameters; g_union_fetcharg() rejects negatives. */
170 offset = g_union_fetcharg(req, "offset");
171 size = g_union_fetcharg(req, "size");
172 secsize = g_union_fetcharg(req, "secsize");
173 gunionname = gctl_get_asciiparam(req, "gunionname");
175 upperpp = gctl_get_provider(req, "arg0");
176 lowerpp = gctl_get_provider(req, "arg1");
177 if (upperpp == NULL || lowerpp == NULL)
178 /* error message provided by gctl_get_provider() */
180 /* Create the union */
/*
 * Sector size defaults to the lower provider's and must be a
 * multiple of both layers' sector sizes and no larger than maxphys.
 */
182 secsize = lowerpp->sectorsize;
183 else if ((secsize % lowerpp->sectorsize) != 0) {
184 gctl_error(req, "Sector size %jd is not a multiple of lower "
185 "provider %s's %jd sector size.", (intmax_t)secsize,
186 lowerpp->name, (intmax_t)lowerpp->sectorsize);
189 if (secsize > maxphys) {
190 gctl_error(req, "Too big secsize %jd for lower provider %s.",
191 (intmax_t)secsize, lowerpp->name);
194 if (secsize % upperpp->sectorsize != 0) {
195 gctl_error(req, "Sector size %jd is not a multiple of upper "
196 "provider %s's %jd sector size.", (intmax_t)secsize,
197 upperpp->name, (intmax_t)upperpp->sectorsize);
200 if ((offset % secsize) != 0) {
201 gctl_error(req, "Offset %jd is not a multiple of lower "
202 "provider %s's %jd sector size.", (intmax_t)offset,
203 lowerpp->name, (intmax_t)lowerpp->sectorsize);
/* Size defaults to the remainder of the lower provider past offset. */
207 size = lowerpp->mediasize - offset;
210 if ((size % secsize) != 0) {
211 gctl_error(req, "Size %jd is not a multiple of sector size "
212 "%jd.", (intmax_t)size, (intmax_t)secsize);
215 if (offset + size < lowerpp->mediasize) {
216 gctl_error(req, "Size %jd is too small for lower provider %s, "
217 "needs %jd.", (intmax_t)(offset + size), lowerpp->name,
221 if (size > upperpp->mediasize) {
222 gctl_error(req, "Upper provider %s size (%jd) is too small, "
223 "needs %jd.", upperpp->name, (intmax_t)upperpp->mediasize,
227 if (gunionname != NULL && !g_union_verify_nprefix(gunionname)) {
228 gctl_error(req, "Gunion name %s must be alphanumeric.",
/*
 * Provider name is either <gunionname><suffix> or the default
 * <upper>-<lower><suffix>; reject truncation and duplicates.
 */
232 if (gunionname != NULL) {
233 n = snprintf(name, sizeof(name), "%s%s", gunionname,
236 n = snprintf(name, sizeof(name), "%s-%s%s", upperpp->name,
237 lowerpp->name, G_UNION_SUFFIX);
239 if (n <= 0 || n >= sizeof(name)) {
240 gctl_error(req, "Invalid provider name.");
243 LIST_FOREACH(gp, &mp->geom, geom) {
244 if (strcmp(gp->name, name) == 0) {
245 gctl_error(req, "Provider %s already exists.", name);
/* Instantiate the geom and its softc. */
249 gp = g_new_geomf(mp, "%s", name);
250 sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
251 rw_init(&sc->sc_rwlock, "gunion");
252 TAILQ_INIT(&sc->sc_wiplist);
253 sc->sc_offset = offset;
255 sc->sc_sectorsize = secsize;
265 sc->sc_readbytes = 0;
266 sc->sc_wrotebytes = 0;
267 sc->sc_writemap_memory = 0;
/* Export the new provider, inheriting both layers' aliases. */
270 newpp = g_new_providerf(gp, "%s", gp->name);
271 newpp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
272 newpp->mediasize = size;
273 newpp->sectorsize = secsize;
274 LIST_FOREACH(gap, &upperpp->aliases, ga_next)
275 g_provider_add_alias(newpp, "%s%s", gap->ga_alias,
277 LIST_FOREACH(gap, &lowerpp->aliases, ga_next)
278 g_provider_add_alias(newpp, "%s%s", gap->ga_alias,
280 lowercp = g_new_consumer(gp);
281 lowercp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
282 if ((error = g_attach(lowercp, lowerpp)) != 0) {
283 gctl_error(req, "Error %d: cannot attach to provider %s.",
284 error, lowerpp->name);
287 /* request read and exclusive access for lower */
288 if ((error = g_access(lowercp, 1, 0, 1)) != 0) {
289 gctl_error(req, "Error %d: cannot obtain exclusive access to "
290 "%s.\n\tMust be unmounted or mounted read-only.", error,
294 uppercp = g_new_consumer(gp);
295 uppercp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
296 if ((error = g_attach(uppercp, upperpp)) != 0) {
297 gctl_error(req, "Error %d: cannot attach to provider %s.",
298 error, upperpp->name);
301 /* request read, write, and exclusive access for upper */
302 if ((error = g_access(uppercp, 1, 1, 1)) != 0) {
303 gctl_error(req, "Error %d: cannot obtain write access to %s.",
304 error, upperpp->name);
307 sc->sc_uppercp = uppercp;
308 sc->sc_lowercp = lowercp;
/* Unmapped I/O is allowed only if both layers accept it. */
310 newpp->flags |= (upperpp->flags & G_PF_ACCEPT_UNMAPPED) &
311 (lowerpp->flags & G_PF_ACCEPT_UNMAPPED);
312 g_error_provider(newpp, 0);
314 * Allocate the map that tracks the sectors that have been written
315 * to the top layer. We use a 2-level hierarchy as that lets us
316 * map up to 1 petabyte using allocations of less than 33 Mb
317 * when using 4K byte sectors (or 268 Mb with 512 byte sectors).
319 * We totally populate the leaf nodes rather than allocating them
320 * as they are first used because their usage occurs in the
321 * g_union_start() routine that may be running in the g_down
322 * thread which cannot sleep.
324 sc->sc_map_size = roundup(size / secsize, BITS_PER_ENTRY);
325 needed = sc->sc_map_size / BITS_PER_ENTRY;
/* Root grows until root*root covers the needed bitmap words. */
326 for (sc->sc_root_size = 1;
327 sc->sc_root_size * sc->sc_root_size < needed;
330 sc->sc_writemap_root = g_malloc(sc->sc_root_size * sizeof(uint64_t *),
332 sc->sc_leaf_size = sc->sc_root_size;
333 sc->sc_bits_per_leaf = sc->sc_leaf_size * BITS_PER_ENTRY;
334 sc->sc_leafused = g_malloc(roundup(sc->sc_root_size, BITS_PER_ENTRY),
336 for (i = 0; i < sc->sc_root_size; i++)
337 sc->sc_writemap_root[i] =
338 g_malloc(sc->sc_leaf_size * sizeof(uint64_t),
340 sc->sc_writemap_memory =
341 (sc->sc_root_size + sc->sc_root_size * sc->sc_leaf_size) *
342 sizeof(uint64_t) + roundup(sc->sc_root_size, BITS_PER_ENTRY);
/*
 * NOTE(review): gctl_error() is used here for a success message;
 * presumably guarded by "verbose" on an elided line — confirm
 * against the full source before changing.
 */
344 gctl_error(req, "Device %s created with memory map size %jd.",
345 gp->name, (intmax_t)sc->sc_writemap_memory);
346 G_UNION_DEBUG(1, "Device %s created with memory map size %jd.",
347 gp->name, (intmax_t)sc->sc_writemap_memory);
/* Error unwind: tear down consumers/provider in reverse order. */
353 g_destroy_consumer(uppercp);
354 g_access(lowercp, -1, 0, -1);
358 g_destroy_consumer(lowercp);
359 g_destroy_provider(newpp);
364 * Fetch named option and verify that it is positive.
/*
 * Returns the optional numeric parameter "name", reporting an error
 * for negative values.  The elided lines presumably return 0 when the
 * parameter is absent or invalid — TODO confirm.
 */
367 g_union_fetcharg(struct gctl_req *req, const char *name)
371 val = gctl_get_paraml_opt(req, name, sizeof(*val));
376 gctl_error(req, "Invalid '%s': negative value, using default.", name);
381 * Verify that a name is alphanumeric.
/* Returns false as soon as a non-alphanumeric character is found. */
384 g_union_verify_nprefix(const char *name)
388 for (i = 0; i < strlen(name); i++) {
389 if (isalpha(name[i]) == 0 && isdigit(name[i]) == 0) {
397 * Destroy a union device.
/*
 * Handler for the "destroy" verb: for each named device, strip any
 * /dev/ prefix, look up the geom, and tear it down via
 * g_union_destroy() (honoring the "force" flag).  Per-device failures
 * are accumulated with gctl_msg() and flushed at the end.
 */
400 g_union_ctl_destroy(struct gctl_req *req, struct g_class *mp, bool verbose)
402 int *nargs, *force, error, i;
409 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
411 gctl_error(req, "No '%s' argument.", "nargs");
415 gctl_error(req, "Missing device(s).");
418 force = gctl_get_paraml(req, "force", sizeof(*force));
420 gctl_error(req, "No 'force' argument.");
424 for (i = 0; i < *nargs; i++) {
425 snprintf(param, sizeof(param), "arg%d", i);
426 name = gctl_get_asciiparam(req, param);
428 gctl_msg(req, "No '%s' argument.", param);
/* Accept both "foo.union" and "/dev/foo.union" spellings. */
431 if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0)
432 name += strlen(_PATH_DEV);
433 gp = g_union_find_geom(mp, name);
435 gctl_msg(req, "Device %s is invalid.", name);
438 error = g_union_destroy(verbose ? req : NULL, gp, *force);
440 gctl_msg(req, "Error %d: cannot destroy device %s.",
443 gctl_post_messages(req);
/* Look up a geom of this class by name; the elided tail presumably
 * returns the match or NULL — TODO confirm. */
449 static struct g_geom *
450 g_union_find_geom(struct g_class *mp, const char *name)
454 LIST_FOREACH(gp, &mp->geom, geom) {
455 if (strcmp(gp->name, name) == 0)
462 * Zero out all the statistics associated with a union device.
/*
 * Handler for the "reset" verb: for each named provider belonging to
 * this class, clear the softc statistic counters (only the byte
 * counters are visible here; the remaining counter resets are on
 * elided lines).
 */
465 g_union_ctl_reset(struct gctl_req *req, struct g_class *mp, bool verbose)
467 struct g_union_softc *sc;
468 struct g_provider *pp;
475 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
477 gctl_error(req, "No '%s' argument.", "nargs");
481 gctl_error(req, "Missing device(s).");
485 for (i = 0; i < *nargs; i++) {
486 snprintf(param, sizeof(param), "arg%d", i);
487 pp = gctl_get_provider(req, param);
489 gctl_msg(req, "No '%s' argument.", param);
/* Guard against providers that belong to a different class. */
493 if (gp->class != mp) {
494 gctl_msg(req, "Provider %s is invalid.",
508 sc->sc_readbytes = 0;
509 sc->sc_wrotebytes = 0;
511 gctl_msg(req, "Device %s has been reset.", pp->name);
512 G_UNION_DEBUG(1, "Device %s has been reset.", pp->name);
514 gctl_post_messages(req);
518 * Revert all write requests made to the top layer of the union.
/*
 * Handler for the "revert" verb: take the softc write lock to exclude
 * a concurrent commit/revert, require that the union provider is
 * completely closed, then discard the upper layer's writes by zeroing
 * the write map (g_union_revert(), called on an elided line).
 */
521 g_union_ctl_revert(struct gctl_req *req, struct g_class *mp, bool verbose)
523 struct g_union_softc *sc;
524 struct g_provider *pp;
531 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
533 gctl_error(req, "No '%s' argument.", "nargs");
537 gctl_error(req, "Missing device(s).");
541 for (i = 0; i < *nargs; i++) {
542 snprintf(param, sizeof(param), "arg%d", i);
543 pp = gctl_get_provider(req, param);
545 gctl_msg(req, "No '%s' argument.", param);
549 if (gp->class != mp) {
550 gctl_msg(req, "Provider %s is invalid.", pp->name);
/* Only one revert/commit may run at a time per device. */
554 if (g_union_get_writelock(sc) != 0) {
555 gctl_msg(req, "Revert already in progress for "
556 "provider %s.", pp->name);
560 * No mount or other use of union is allowed.
562 if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0) {
563 gctl_msg(req, "Unable to get exclusive access for "
564 "reverting of %s;\n\t%s cannot be mounted or "
565 "otherwise open during a revert.",
567 g_union_rel_writelock(sc);
571 g_union_rel_writelock(sc);
573 gctl_msg(req, "Device %s has been reverted.", pp->name);
574 G_UNION_DEBUG(1, "Device %s has been reverted.", pp->name);
576 gctl_post_messages(req);
580 * Revert union writes by zero'ing out the writemap.
/*
 * Clear every leaf bitmap and the leaf-used summary bitmap so all
 * subsequent reads fall through to the lower layer again.  Caller is
 * expected to hold the softc write lock (taken in g_union_ctl_revert()).
 */
583 g_union_revert(struct g_union_softc *sc)
588 for (i = 0; i < sc->sc_root_size; i++)
589 memset(sc->sc_writemap_root[i], 0,
590 sc->sc_leaf_size * sizeof(uint64_t));
591 memset(sc->sc_leafused, 0, roundup(sc->sc_root_size, BITS_PER_ENTRY));
596 * Commit all the writes made in the top layer to the lower layer.
/*
 * Handler for the "commit" verb.  For each named union device:
 * exclude concurrent commit/revert via the write lock, require the
 * union (and the lower media) be unused unless -f permits read-only
 * use, temporarily upgrade the lower consumer to write access, then
 * walk the write map copying every written extent from the upper
 * layer down to the lower layer in MAXBSIZE chunks using a single
 * synchronous bio.  Optionally reboots afterwards (-r, elided flag
 * handling).
 */
599 g_union_ctl_commit(struct gctl_req *req, struct g_class *mp, bool verbose)
601 struct g_union_softc *sc;
602 struct g_provider *pp, *lowerpp;
603 struct g_consumer *lowercp;
607 off_t len2rd, len2wt, savelen;
608 int i, error, error1, *nargs, *force, *reboot;
612 nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
614 gctl_error(req, "No '%s' argument.", "nargs");
618 gctl_error(req, "Missing device(s).");
621 force = gctl_get_paraml(req, "force", sizeof(*force));
623 gctl_error(req, "No 'force' argument.");
626 reboot = gctl_get_paraml(req, "reboot", sizeof(*reboot));
627 if (reboot == NULL) {
628 gctl_error(req, "No 'reboot' argument.");
632 /* Get a bio buffer to do our I/O */
634 bp->bio_data = g_malloc(MAXBSIZE, M_WAITOK);
635 bp->bio_done = biodone;
636 for (i = 0; i < *nargs; i++) {
637 snprintf(param, sizeof(param), "arg%d", i);
638 pp = gctl_get_provider(req, param);
640 gctl_msg(req, "No '%s' argument.", param);
644 if (gp->class != mp) {
645 gctl_msg(req, "Provider %s is invalid.", pp->name);
649 if (g_union_get_writelock(sc) != 0) {
650 gctl_msg(req, "Commit already in progress for "
651 "provider %s.", pp->name);
655 /* upgrade to write access for lower */
656 lowercp = sc->sc_lowercp;
657 lowerpp = lowercp->provider;
659 * No mount or other use of union is allowed, unless the
660 * -f flag is given which allows read-only mount or usage.
662 if ((*force == false && pp->acr > 0) || pp->acw > 0 ||
664 gctl_msg(req, "Unable to get exclusive access for "
665 "writing of %s.\n\tNote that %s cannot be mounted "
666 "or otherwise\n\topen during a commit unless the "
667 "-f flag is used.", pp->name, pp->name);
668 g_union_rel_writelock(sc);
672 * No mount or other use of lower media is allowed, unless the
673 * -f flag is given which allows read-only mount or usage.
675 if ((*force == false && lowerpp->acr > lowercp->acr) ||
676 lowerpp->acw > lowercp->acw ||
677 lowerpp->ace > lowercp->ace) {
678 gctl_msg(req, "provider %s is unable to get "
679 "exclusive access to %s\n\tfor writing. Note that "
680 "%s cannot be mounted or otherwise open\n\tduring "
681 "a commit unless the -f flag is used.", pp->name,
682 lowerpp->name, lowerpp->name);
683 g_union_rel_writelock(sc);
686 if ((error = g_access(lowercp, 0, 1, 0)) != 0) {
687 gctl_msg(req, "Error %d: provider %s is unable to "
688 "access %s for writing.", error, pp->name,
690 g_union_rel_writelock(sc);
694 /* Loop over write map copying across written blocks */
696 bp->bio_length = sc->sc_map_size * sc->sc_sectorsize;
699 while (bp->bio_length > 0) {
700 if (!g_union_getmap(bp, sc, &len2rd)) {
701 /* not written, so skip */
702 bp->bio_offset += len2rd;
703 bp->bio_length -= len2rd;
707 /* need to read then write len2rd sectors */
708 for ( ; len2rd > 0; len2rd -= len2wt) {
709 /* limit ourselves to MAXBSIZE size I/Os */
711 if (len2wt > MAXBSIZE)
713 savelen = bp->bio_length;
714 bp->bio_length = len2wt;
715 bp->bio_cmd = BIO_READ;
/* Synchronous copy: read from the upper layer... */
716 g_io_request(bp, sc->sc_uppercp);
717 if ((error = biowait(bp, "rdunion")) != 0) {
718 gctl_msg(req, "Commit read error %d "
719 "in provider %s, commit aborted.",
/* BIO_DONE must be cleared before the bio is reused. */
723 bp->bio_flags &= ~BIO_DONE;
724 bp->bio_cmd = BIO_WRITE;
/* ...then write the same extent to the lower layer. */
725 g_io_request(bp, lowercp);
726 if ((error = biowait(bp, "wtunion")) != 0) {
727 gctl_msg(req, "Commit write error %d "
728 "in provider %s, commit aborted.",
732 bp->bio_flags &= ~BIO_DONE;
733 bp->bio_offset += len2wt;
734 bp->bio_length = savelen - len2wt;
739 /* clear the write map */
743 /* return lower to previous access */
744 if ((error1 = g_access(lowercp, 0, -1, 0)) != 0) {
745 G_UNION_DEBUG(2, "Error %d: device %s could not reset "
746 "access to %s (r=0 w=-1 e=0).", error1, pp->name,
749 g_union_rel_writelock(sc);
750 if (error == 0 && verbose)
751 gctl_msg(req, "Device %s has been committed.",
753 G_UNION_DEBUG(1, "Device %s has been committed.", pp->name);
755 gctl_post_messages(req);
756 g_free(bp->bio_data);
/* -r: reboot once the commit has completed. */
759 kern_reboot(RB_AUTOBOOT);
763 * Generally allow access unless a commit is in progress.
/*
 * .access method: closes (net non-positive deltas) are always allowed;
 * while a commit/revert holds the write lock, new opens are refused
 * (only a transition of an already-open provider toward fully closed
 * is permitted).
 */
766 g_union_access(struct g_provider *pp, int r, int w, int e)
768 struct g_union_softc *sc;
770 sc = pp->geom->softc;
772 if (r <= 0 && w <= 0 && e <= 0)
779 if (g_union_get_writelock(sc) != 0) {
780 if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0)
784 g_union_rel_writelock(sc);
789 * Initiate an I/O operation on the union device.
/*
 * .start method.  Reads and writes are wrapped in a work-in-progress
 * (wip) record and handed to g_union_doio(), which serializes
 * overlapping requests.  All other commands are cloned and forwarded
 * to the upper-level consumer (except GEOM::kerneldump GETATTR, which
 * is answered locally by g_union_kerneldump()).
 */
792 g_union_start(struct bio *bp)
794 struct g_union_softc *sc;
795 struct g_union_wip *wip;
798 sc = bp->bio_to->geom->softc;
799 if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
/* M_NOWAIT: may be called from g_down, which cannot sleep. */
800 wip = g_malloc(sizeof(*wip), M_NOWAIT);
802 g_io_deliver(bp, ENOMEM);
805 TAILQ_INIT(&wip->wip_waiting);
/* wip range is in device byte offsets, shifted by sc_offset. */
808 wip->wip_start = bp->bio_offset + sc->sc_offset;
809 wip->wip_end = wip->wip_start + bp->bio_length - 1;
817 * All commands other than read and write are passed through to
818 * the upper-level device since it is writable and thus able to
819 * respond to delete, flush, and speedup requests.
821 cbp = g_clone_bio(bp);
823 g_io_deliver(bp, ENOMEM);
826 cbp->bio_offset = bp->bio_offset + sc->sc_offset;
827 cbp->bio_done = g_std_done;
829 switch (cbp->bio_cmd) {
831 G_UNION_LOGREQ(cbp, "Delete request received.");
832 atomic_add_long(&sc->sc_deletes, 1);
835 G_UNION_LOGREQ(cbp, "Getattr request received.");
836 atomic_add_long(&sc->sc_getattrs, 1);
837 if (strcmp(cbp->bio_attribute, "GEOM::kerneldump") != 0)
838 /* forward the GETATTR to the lower-level device */
840 g_union_kerneldump(bp, sc);
843 G_UNION_LOGREQ(cbp, "Flush request received.");
844 atomic_add_long(&sc->sc_flushes, 1);
847 G_UNION_LOGREQ(cbp, "Speedup request received.");
848 atomic_add_long(&sc->sc_speedups, 1);
851 G_UNION_LOGREQ(cbp, "Cmd0 request received.");
852 atomic_add_long(&sc->sc_cmd0s, 1);
855 G_UNION_LOGREQ(cbp, "Cmd1 request received.");
856 atomic_add_long(&sc->sc_cmd1s, 1);
859 G_UNION_LOGREQ(cbp, "Cmd2 request received.");
860 atomic_add_long(&sc->sc_cmd2s, 1);
863 G_UNION_LOGREQ(cbp, "Unknown (%d) request received.",
867 g_io_request(cbp, sc->sc_uppercp);
871 * Initiate a read or write operation on the union device.
/*
 * Core I/O path.  Serializes overlapping requests via the softc
 * wip list, sends writes to the upper layer, and splits reads so
 * written sectors come from the upper layer and unwritten sectors
 * from the lower layer (consulting g_union_getmap()).
 * NOTE(review): many statements here (locking, wip initialization,
 * the split-read bookkeeping) are on elided lines; comments below
 * describe only what is visible.
 */
874 g_union_doio(struct g_union_wip *wip)
876 struct g_union_softc *sc;
877 struct g_consumer *cp, *firstcp;
878 struct g_union_wip *activewip;
879 struct bio *cbp, *firstbp;
880 off_t rdlen, len2rd, offset;
881 int iocnt, needstoblock;
885 * To maintain consistency, we cannot allow concurrent reads
886 * or writes to the same block.
888 * A work-in-progress (wip) structure is allocated for each
889 * read or write request. All active requests are kept on the
890 * softc sc_wiplist. As each request arrives, it is checked to
891 * see if it overlaps any of the active entries. If it does not
892 * overlap, then it is added to the active list and initiated.
893 * If it does overlap an active entry, it is added to the
894 * wip_waiting list for the active entry that it overlaps.
895 * When an active entry completes, it restarts all the requests
896 * on its wip_waiting list.
900 TAILQ_FOREACH(activewip, &sc->sc_wiplist, wip_next) {
901 if (wip->wip_end < activewip->wip_start ||
902 wip->wip_start > activewip->wip_end)
/* Overlap found: classify for statistics, then decide blocking. */
905 if (wip->wip_bp->bio_cmd == BIO_WRITE)
906 if (activewip->wip_bp->bio_cmd == BIO_WRITE)
907 sc->sc_writeblockwrite += 1;
909 sc->sc_readblockwrite += 1;
911 if (activewip->wip_bp->bio_cmd == BIO_WRITE)
912 sc->sc_writeblockread += 1;
914 sc->sc_readcurrentread += 1;
917 /* Put request on a waiting list if necessary */
919 TAILQ_INSERT_TAIL(&activewip->wip_waiting, wip,
925 /* Put request on the active list */
926 TAILQ_INSERT_TAIL(&sc->sc_wiplist, wip, wip_next);
929 * Process I/O requests that have been cleared to go.
931 cbp = g_clone_bio(wip->wip_bp);
933 TAILQ_REMOVE(&sc->sc_wiplist, wip, wip_next);
935 KASSERT(TAILQ_FIRST(&wip->wip_waiting) == NULL,
936 ("g_union_doio: non-empty work-in-progress waiting queue"));
937 g_io_deliver(wip->wip_bp, ENOMEM);
942 cbp->bio_caller1 = wip;
943 cbp->bio_done = g_union_done;
944 cbp->bio_offset = wip->wip_start;
947 * Writes are always done to the top level. The blocks that
948 * are written are recorded in the bitmap when the I/O completes.
950 if (cbp->bio_cmd == BIO_WRITE) {
951 G_UNION_LOGREQ(cbp, "Sending %jd byte write request to upper "
952 "level.", cbp->bio_length);
953 atomic_add_long(&sc->sc_writes, 1);
954 atomic_add_long(&sc->sc_wrotebytes, cbp->bio_length);
955 g_io_request(cbp, sc->sc_uppercp);
959 * The usual read case is that we either read the top layer
960 * if the block has been previously written or the bottom layer
961 * if it has not been written. However, it is possible that
962 * only part of the block has been written, For example we may
963 * have written a UFS/FFS file fragment comprising several
964 * sectors out of an 8-sector block. Here, if the entire
965 * 8-sector block is read for example by a snapshot needing
966 * to copy the full block, then we need to read the written
967 * sectors from the upper level and the unwritten sectors from
968 * the lower level. We do this by alternately reading from the
969 * top and bottom layers until we complete the read. We
970 * simplify for the common case to just do the I/O and return.
972 atomic_add_long(&sc->sc_reads, 1);
973 atomic_add_long(&sc->sc_readbytes, cbp->bio_length);
974 rdlen = cbp->bio_length;
976 for (iocnt = 0; ; iocnt++) {
977 if (g_union_getmap(cbp, sc, &len2rd)) {
986 /* Check if only a single read is required */
987 if (iocnt == 0 && rdlen == len2rd) {
988 G_UNION_LOGREQLVL((cp == sc->sc_uppercp) ?
989 3 : 4, cbp, "Sending %jd byte read "
990 "request to %s level.", len2rd, level);
991 g_io_request(cbp, cp);
/* Partial read: trim this clone to len2rd and advance the buffer. */
994 cbp->bio_length = len2rd;
995 if ((cbp->bio_flags & BIO_UNMAPPED) != 0)
996 cbp->bio_ma_offset += offset;
998 cbp->bio_data += offset;
1001 G_UNION_LOGREQLVL(3, cbp, "Sending %jd byte read "
1002 "request to %s level.", len2rd, level);
1004 * To avoid prematurely notifying our consumer
1005 * that their I/O has completed, we have to delay
1006 * issuing our first I/O request until we have
1007 * issued all the additional I/O requests.
1010 atomic_add_long(&wip->wip_numios, 1);
1011 g_io_request(cbp, cp);
1018 /* set up for next read */
1019 cbp = g_clone_bio(wip->wip_bp);
1021 wip->wip_error = ENOMEM;
1022 atomic_add_long(&wip->wip_numios, -1);
1025 cbp->bio_caller1 = wip;
1026 cbp->bio_done = g_union_done;
1027 cbp->bio_offset += offset;
1028 cbp->bio_length = rdlen;
1029 atomic_add_long(&sc->sc_reads, 1);
1031 /* We have issued all our I/O, so start the first one */
1032 g_io_request(firstbp, firstcp);
1037 * Used when completing a union I/O operation.
/*
 * bio completion callback.  Propagates any wip-level error, and when
 * the last outstanding sub-I/O finishes (wip_numios drops to zero):
 * records completed writes in the write map, removes the wip from the
 * active list, and restarts every request that was queued waiting on
 * this range.
 */
1040 g_union_done(struct bio *bp)
1042 struct g_union_wip *wip, *waitingwip;
1043 struct g_union_softc *sc;
1045 wip = bp->bio_caller1;
1046 if (wip->wip_error != 0 && bp->bio_error == 0)
1047 bp->bio_error = wip->wip_error;
/* fetchadd returns the old value: 1 means we were the last I/O. */
1049 if (atomic_fetchadd_long(&wip->wip_numios, -1) == 1) {
1052 if (bp->bio_cmd == BIO_WRITE)
1053 g_union_setmap(bp, sc);
1054 TAILQ_REMOVE(&sc->sc_wiplist, wip, wip_next);
1056 while ((waitingwip = TAILQ_FIRST(&wip->wip_waiting)) != NULL) {
1057 TAILQ_REMOVE(&wip->wip_waiting, waitingwip, wip_next);
1058 g_union_doio(waitingwip);
1066 * Record blocks that have been written in the map.
/*
 * Set one bit per written sector in the 2-level write map, and mark
 * the covering leaf as in-use in the sc_leafused summary bitmap so
 * g_union_getmap() can skip wholly-unwritten leaves quickly.
 */
1069 g_union_setmap(struct bio *bp, struct g_union_softc *sc)
1074 off_t start, numsec;
1077 KASSERT(bp->bio_offset % sc->sc_sectorsize == 0,
1078 ("g_union_setmap: offset not on sector boundry"));
1079 KASSERT(bp->bio_length % sc->sc_sectorsize == 0,
1080 ("g_union_setmap: length not a multiple of sectors"));
1081 start = bp->bio_offset / sc->sc_sectorsize;
1082 numsec = bp->bio_length / sc->sc_sectorsize;
1083 KASSERT(start + numsec <= sc->sc_map_size,
1084 ("g_union_setmap: block %jd is out of range", start + numsec));
1085 for ( ; numsec > 0; numsec--, start++) {
1086 root_idx = start / sc->sc_bits_per_leaf;
1087 leaf = &sc->sc_writemap_root[root_idx];
1089 [(start % sc->sc_bits_per_leaf) / BITS_PER_ENTRY];
1090 *wordp |= 1ULL << (start % BITS_PER_ENTRY);
1091 sc->sc_leafused[root_idx / BITS_PER_ENTRY] |=
1092 1ULL << (root_idx % BITS_PER_ENTRY);
1097 * Check map to determine whether blocks have been written.
1099 * Return true if they have been written so should be read from the top
1100 * layer. Return false if they have not been written so should be read
1101 * from the bottom layer. Return in len2read the bytes to be read. See
1102 * the comment above the BIO_READ implementation in g_union_start() for
1103 * an explantion of why len2read may be shorter than the buffer length.
/*
 * Scans the write map starting at bp->bio_offset and finds the longest
 * run of sectors that share the same written/unwritten state, up to
 * bp->bio_length.  Whole unused leaves and all-zero/all-one words are
 * skipped in bulk before falling back to bit-by-bit scanning.
 * NOTE(review): the run-termination and return paths are on elided
 * lines; description above reflects only the visible scan structure.
 */
1106 g_union_getmap(struct bio *bp, struct g_union_softc *sc, off_t *len2read)
1108 off_t start, numsec, leafresid, bitloc;
1109 bool first, maptype, retval;
1110 uint64_t *leaf, word;
1113 KASSERT(bp->bio_offset % sc->sc_sectorsize == 0,
1114 ("g_union_getmap: offset not on sector boundry"));
1115 KASSERT(bp->bio_length % sc->sc_sectorsize == 0,
1116 ("g_union_getmap: length not a multiple of sectors"));
1117 start = bp->bio_offset / sc->sc_sectorsize;
1118 numsec = bp->bio_length / sc->sc_sectorsize;
1119 G_UNION_DEBUG(4, "g_union_getmap: check %jd sectors starting at %jd\n",
1121 KASSERT(start + numsec <= sc->sc_map_size,
1122 ("g_union_getmap: block %jd is out of range", start + numsec));
1123 root_idx = start / sc->sc_bits_per_leaf;
1126 while (numsec > 0) {
1127 /* Check first if the leaf records any written sectors */
1128 root_idx = start / sc->sc_bits_per_leaf;
1129 leafresid = sc->sc_bits_per_leaf -
1130 (start % sc->sc_bits_per_leaf);
1131 if (((sc->sc_leafused[root_idx / BITS_PER_ENTRY]) &
1132 (1ULL << (root_idx % BITS_PER_ENTRY))) == 0) {
/* Leaf never written: the whole leaf residual is unwritten. */
1139 numsec -= leafresid;
1143 /* Check up to a word boundry, then check word by word */
1144 leaf = sc->sc_writemap_root[root_idx];
1145 word = leaf[(start % sc->sc_bits_per_leaf) / BITS_PER_ENTRY];
1146 bitloc = start % BITS_PER_ENTRY;
1147 if (bitloc == 0 && (word == 0 || word == ~0)) {
/* Word-aligned and homogeneous: consume a full word at once. */
1155 if ((word == 0 && maptype) ||
1156 (word == ~0 && !maptype))
1158 numsec -= BITS_PER_ENTRY;
1159 start += BITS_PER_ENTRY;
/* Mixed word: fall back to scanning individual bits. */
1162 for ( ; bitloc < BITS_PER_ENTRY; bitloc ++) {
1163 retval = (word & (1ULL << bitloc)) != 0;
1168 if (maptype == retval) {
1181 *len2read = bp->bio_length - (numsec * sc->sc_sectorsize);
1182 G_UNION_DEBUG(maptype ? 3 : 4,
1183 "g_union_getmap: return maptype %swritten for %jd "
1184 "sectors ending at %jd\n", maptype ? "" : "NOT ",
1185 *len2read / sc->sc_sectorsize, start - 1);
1190 * Fill in details for a BIO_GETATTR request.
/*
 * Answers the GEOM::kerneldump GETATTR locally: fills in the dumper
 * callback, block size, and media offset/size (clamped to the union's
 * range), then completes the bio.
 */
1193 g_union_kerneldump(struct bio *bp, struct g_union_softc *sc)
1195 struct g_kerneldump *gkd;
1197 struct g_provider *pp;
1199 gkd = (struct g_kerneldump *)bp->bio_data;
1200 gp = bp->bio_to->geom;
1201 g_trace(G_T_TOPOLOGY, "%s(%s, %jd, %jd)", __func__, gp->name,
1202 (intmax_t)gkd->offset, (intmax_t)gkd->length);
1204 pp = LIST_FIRST(&gp->provider);
1206 gkd->di.dumper = g_union_dumper;
1208 gkd->di.blocksize = pp->sectorsize;
1209 gkd->di.maxiosize = DFLTPHYS;
/* Dump addresses are relative to the union; shift by sc_offset. */
1210 gkd->di.mediaoffset = sc->sc_offset + gkd->offset;
1211 if (gkd->offset > sc->sc_size) {
1212 g_io_deliver(bp, ENODEV);
1215 if (gkd->offset + gkd->length > sc->sc_size)
1216 gkd->length = sc->sc_size - gkd->offset;
1217 gkd->di.mediasize = gkd->length;
1218 g_io_deliver(bp, 0);
1222 * Handler for g_union_kerneldump().
/* NOTE(review): the body of this dumper callback is entirely elided
 * from this chunk; only the signature is visible. */
1225 g_union_dumper(void *priv, void *virtual, vm_offset_t physical, off_t offset,
1233 * List union statistics.
/*
 * .dumpconf method: emit the softc counters as XML for `geom union
 * list` / sysctl kern.geom.confxml.  Only geom-level output is
 * produced (provider/consumer-level calls return early).
 */
1236 g_union_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1237 struct g_consumer *cp, struct g_provider *pp)
1239 struct g_union_softc *sc;
1241 if (pp != NULL || cp != NULL || gp->softc == NULL)
1244 sbuf_printf(sb, "%s<Reads>%ju</Reads>\n", indent,
1245 (uintmax_t)sc->sc_reads);
1246 sbuf_printf(sb, "%s<Writes>%ju</Writes>\n", indent,
1247 (uintmax_t)sc->sc_writes);
1248 sbuf_printf(sb, "%s<Deletes>%ju</Deletes>\n", indent,
1249 (uintmax_t)sc->sc_deletes);
1250 sbuf_printf(sb, "%s<Getattrs>%ju</Getattrs>\n", indent,
1251 (uintmax_t)sc->sc_getattrs);
1252 sbuf_printf(sb, "%s<Flushes>%ju</Flushes>\n", indent,
1253 (uintmax_t)sc->sc_flushes);
1254 sbuf_printf(sb, "%s<Speedups>%ju</Speedups>\n", indent,
1255 (uintmax_t)sc->sc_speedups);
1256 sbuf_printf(sb, "%s<Cmd0s>%ju</Cmd0s>\n", indent,
1257 (uintmax_t)sc->sc_cmd0s);
1258 sbuf_printf(sb, "%s<Cmd1s>%ju</Cmd1s>\n", indent,
1259 (uintmax_t)sc->sc_cmd1s);
1260 sbuf_printf(sb, "%s<Cmd2s>%ju</Cmd2s>\n", indent,
1261 (uintmax_t)sc->sc_cmd2s);
1262 sbuf_printf(sb, "%s<ReadCurrentRead>%ju</ReadCurrentRead>\n", indent,
1263 (uintmax_t)sc->sc_readcurrentread);
1264 sbuf_printf(sb, "%s<ReadBlockWrite>%ju</ReadBlockWrite>\n", indent,
1265 (uintmax_t)sc->sc_readblockwrite);
1266 sbuf_printf(sb, "%s<WriteBlockRead>%ju</WriteBlockRead>\n", indent,
1267 (uintmax_t)sc->sc_writeblockread);
1268 sbuf_printf(sb, "%s<WriteBlockWrite>%ju</WriteBlockWrite>\n", indent,
1269 (uintmax_t)sc->sc_writeblockwrite);
1270 sbuf_printf(sb, "%s<ReadBytes>%ju</ReadBytes>\n", indent,
1271 (uintmax_t)sc->sc_readbytes);
1272 sbuf_printf(sb, "%s<WroteBytes>%ju</WroteBytes>\n", indent,
1273 (uintmax_t)sc->sc_wrotebytes);
1274 sbuf_printf(sb, "%s<Offset>%jd</Offset>\n", indent,
1275 (intmax_t)sc->sc_offset);
1279 * Clean up an orphaned geom.
/* .orphan method: a consumer lost its provider, so force-destroy. */
1282 g_union_orphan(struct g_consumer *cp)
1285 g_topology_assert();
1286 g_union_destroy(NULL, cp->geom, true);
1290 * Clean up a union geom.
/* .destroy_geom method: non-forced destroy, no message reporting. */
1293 g_union_destroy_geom(struct gctl_req *req, struct g_class *mp,
1297 return (g_union_destroy(NULL, gp, false));
1301 * Clean up a union device.
/*
 * Common teardown path.  Refuses to destroy a busy device (open
 * provider or commit in progress) unless force is set, drops the
 * access this geom holds on both consumers, and withers the geom.
 * When req is non-NULL, progress/errors are reported via gctl_msg().
 */
1304 g_union_destroy(struct gctl_req *req, struct g_geom *gp, bool force)
1306 struct g_union_softc *sc;
1307 struct g_provider *pp;
1310 g_topology_assert();
1314 pp = LIST_FIRST(&gp->provider);
1315 if ((sc->sc_flags & DOING_COMMIT) != 0 ||
1316 (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0))) {
1319 gctl_msg(req, "Device %s is still in use, "
1320 "so is being forcibly removed.", gp->name);
1321 G_UNION_DEBUG(1, "Device %s is still in use, so "
1322 "is being forcibly removed.", gp->name);
/* Non-forced case: report the open counts and refuse. */
1325 gctl_msg(req, "Device %s is still open "
1326 "(r=%d w=%d e=%d).", gp->name, pp->acr,
1328 G_UNION_DEBUG(1, "Device %s is still open "
1329 "(r=%d w=%d e=%d).", gp->name, pp->acr,
1335 gctl_msg(req, "Device %s removed.", gp->name);
1336 G_UNION_DEBUG(1, "Device %s removed.", gp->name);
1338 /* Close consumers */
1339 if ((error = g_access(sc->sc_lowercp, -1, 0, -1)) != 0)
1340 G_UNION_DEBUG(2, "Error %d: device %s could not reset access "
1341 "to %s.", error, gp->name, sc->sc_lowercp->provider->name);
1342 if ((error = g_access(sc->sc_uppercp, -1, -1, -1)) != 0)
1343 G_UNION_DEBUG(2, "Error %d: device %s could not reset access "
1344 "to %s.", error, gp->name, sc->sc_uppercp->provider->name);
1346 g_wither_geom(gp, ENXIO);
1352 * Clean up a union provider.
/*
 * .providergone method: the last reference to our provider is gone,
 * so free the write map (leaves, root array, summary bitmap), the
 * lock, and (on elided lines, presumably) the softc itself.
 */
1355 g_union_providergone(struct g_provider *pp)
1358 struct g_union_softc *sc;
1364 for (i = 0; i < sc->sc_root_size; i++)
1365 g_free(sc->sc_writemap_root[i]);
1366 g_free(sc->sc_writemap_root);
1367 g_free(sc->sc_leafused);
1368 rw_destroy(&sc->sc_rwlock);
1373 * Respond to a resized provider.
/*
 * .resize method: growth of an underlying provider is ignored (we
 * keep the size we were created with); shrinkage below our configured
 * range forces destruction of the union device.
 */
1376 g_union_resize(struct g_consumer *cp)
1378 struct g_union_softc *sc;
1381 g_topology_assert();
1387 * If size has gotten bigger, ignore it and just keep using
1388 * the space we already had. Otherwise we are done.
1390 if (sc->sc_size < cp->provider->mediasize - sc->sc_offset)
1392 g_union_destroy(NULL, gp, true);
/* Register the class with GEOM and declare the module version. */
1395 DECLARE_GEOM_CLASS(g_union_class, g_union);
1396 MODULE_VERSION(geom_union, 0);