2 * SPDX-License-Identifier: BSD-4-Clause
4 * Copyright (c) 2003 Poul-Henning Kamp.
5 * Copyright (c) 1995 Jason R. Thorpe.
6 * Copyright (c) 1990, 1993
7 * The Regents of the University of California. All rights reserved.
9 * Copyright (c) 1988 University of Utah.
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 * must display the following acknowledgement:
25 * This product includes software developed for the NetBSD Project
27 * 4. The names of the authors may not be used to endorse or promote products
28 * derived from this software without specific prior written permission.
30 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
31 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
33 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
34 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
36 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
37 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
38 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
42 * Dynamic configuration and disklabel support by:
43 * Jason R. Thorpe <thorpej@nas.nasa.gov>
44 * Numerical Aerodynamic Simulation Facility
46 * NASA Ames Research Center
47 * Moffett Field, CA 94035
49 * from: Utah $Hdr: cd.c 1.6 90/11/28$
50 * @(#)cd.c 8.2 (Berkeley) 11/16/93
51 * $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $
54 #include <sys/cdefs.h>
55 __FBSDID("$FreeBSD$");
57 #include <sys/param.h>
58 #include <sys/systm.h>
59 #include <sys/kernel.h>
60 #include <sys/module.h>
62 #include <sys/malloc.h>
64 #include <geom/geom.h>
67 * Number of blocks to leave untouched in front of a component partition.
68 * This is to avoid violating its disklabel area when it starts at the
69 * beginning of the slice.
/*
 * Configuration flag bits stored in the sc_flags field of the softc.
 * NOTE(review): the default definition guarded by the CCD_OFFSET check
 * below is not visible in this excerpt.
 */
71 #if !defined(CCD_OFFSET)
76 #define CCDF_UNIFORM 0x02 /* use LCCD of sizes for uniform interleave */
77 #define CCDF_MIRROR 0x04 /* use mirroring */
78 #define CCDF_NO_OFFSET 0x08 /* do not leave space in front */
79 #define CCDF_LINUX 0x10 /* use Linux compatibility mode */
81 /* Mask of user-settable ccd flags. */
82 #define CCDF_USERMASK (CCDF_UNIFORM|CCDF_MIRROR)
85 * Interleave description table.
86 * Computed at boot time to speed irregular-interleave lookups.
87 * The idea is that we interleave in "groups". First we interleave
88 * evenly over all component disks up to the size of the smallest
89 * component (the first group), then we interleave evenly over all
90 * remaining disks up to the size of the next-smallest (second group),
93 * Each table entry describes the interleave characteristics of one
94 * of these groups. For example if a concatenated disk consisted of
95 * three components of 5, 3, and 7 DEV_BSIZE blocks interleaved at
96 * DEV_BSIZE (1), the table would have three entries:
98 * ndisk startblk startoff dev
104 * which says that the first nine blocks (0-8) are interleaved over
105 * 3 disks (0, 1, 2) starting at block offset 0 on any component disk,
106 * the next 4 blocks (9-12) are interleaved over 2 disks (0, 2) starting
107 * at component block 3, and the remaining blocks (13-14) are on disk
108 * 2 starting at offset 5.
/*
 * Fields of one interleave table entry. A table is terminated by an
 * entry whose ii_ndisk is zero (the lookup and free loops stop there).
 * NOTE(review): the struct declaration line is not visible in this
 * excerpt; the type is referenced elsewhere as struct ccdiinfo.
 */
111 int ii_ndisk; /* # of disks range is interleaved over */
112 daddr_t ii_startblk; /* starting scaled block # for range */
113 daddr_t ii_startoff; /* starting component offset (block #) */
114 int *ii_index; /* ordered list of components in range */
118 * Component info table.
119 * Describes a single component of a concatenated disk.
/*
 * Fields describing one component: its usable size (compared against
 * DEV_BSIZE block counts by the mapping code) and the GEOM provider
 * and consumer through which it is reached.
 * NOTE(review): the struct declaration line is not visible in this
 * excerpt; the type is referenced elsewhere as struct ccdcinfo.
 */
122 daddr_t ci_size; /* size */
123 struct g_provider *ci_provider; /* provider */
124 struct g_consumer *ci_consumer; /* consumer */
128 * A concatenated disk is described by this structure.
/*
 * Per-device state (softc) of one ccd. It is stored in the softc
 * pointer of the geom and read back by the start routine.
 * sc_pick and sc_blk are only used by the mirrored read path.
 * NOTE(review): the struct declaration line is not visible in this
 * excerpt; the type is referenced elsewhere as struct ccd_s.
 */
132 LIST_ENTRY(ccd_s) list;
134 int sc_unit; /* logical unit number */
135 int sc_flags; /* flags */
136 daddr_t sc_size; /* size of ccd */
137 int sc_ileave; /* interleave */
138 u_int sc_ndisks; /* number of components */
139 struct ccdcinfo *sc_cinfo; /* component info */
140 struct ccdiinfo *sc_itable; /* interleave table */
141 u_int32_t sc_secsize; /* # bytes per sector */
142 int sc_pick; /* side of mirror picked */
143 daddr_t sc_blk[2]; /* mirror localization */
144 u_int32_t sc_offset; /* actual offset used */
/*
 * Forward declarations for the file-local I/O and setup routines:
 * the GEOM start method, the mirror completion handler, the interleave
 * table builder, the geometry initializer and the component bio builder.
 */
147 static g_start_t g_ccd_start;
148 static void ccdiodone(struct bio *bp);
149 static void ccdinterleave(struct ccd_s *);
150 static int ccdinit(struct gctl_req *req, struct ccd_s *);
151 static int ccdbuffer(struct bio **ret, struct ccd_s *,
152 struct bio *, daddr_t, caddr_t, long);
/*
 * Orphan callback, installed as .orphan in g_ccd_class.
 * cp is the consumer being orphaned.
 * NOTE(review): the function body is not visible in this excerpt; the
 * surviving comment text states that nothing is done here.
 */
155 g_ccd_orphan(struct g_consumer *cp)
158 * XXX: We don't do anything here. It is not obvious
159 * XXX: what DTRT would be, so we do what the previous
160 * XXX: code did: ignore it and let the user cope.
/*
 * Access method, installed as .access in g_ccd_class.
 * pp is the ccd provider; dr, dw and de are the access count deltas.
 * The deltas are applied to each consumer of the geom with g_access().
 * A nested loop calls g_access() with the negated deltas, presumably to
 * undo the consumers already updated when one of them fails.
 * NOTE(review): the conditions around the nested loop and the return
 * statements are not visible in this excerpt -- confirm.
 */
165 g_ccd_access(struct g_provider *pp, int dr, int dw, int de)
168 struct g_consumer *cp1, *cp2;
176 LIST_FOREACH(cp1, &gp->consumer, consumer) {
177 error = g_access(cp1, dr, dw, de);
179 LIST_FOREACH(cp2, &gp->consumer, consumer) {
182 g_access(cp2, -dr, -dw, -de);
191 * Free the softc and its substructures.
/*
 * Release the memory owned by softc sc.
 * Frees the component info array. When an interleave table exists,
 * walks its entries until one with ii_ndisk <= 0, frees each non-NULL
 * ii_index array, then frees the table itself.
 * NOTE(review): any further lines of the body (for example freeing sc
 * itself) are not visible in this excerpt.
 */
194 g_ccd_freesc(struct ccd_s *sc)
198 g_free(sc->sc_cinfo);
199 if (sc->sc_itable != NULL) {
200 for (ii = sc->sc_itable; ii->ii_ndisk > 0; ii++)
201 if (ii->ii_index != NULL)
202 g_free(ii->ii_index);
203 g_free(sc->sc_itable);
/*
 * Validate the components of cs and compute its geometry.
 * Problems are reported to the caller through gctl_error(req, ...).
 *
 * Steps visible in this excerpt:
 *  - choose sc_offset; CCDF_LINUX and CCDF_NO_OFFSET are tested first
 *    and CCD_OFFSET is the remaining case;
 *  - for each component, take mediasize / DEV_BSIZE minus sc_offset,
 *    truncate it to a multiple of the interleave when sc_ileave > 1,
 *    and track the largest sector size and the smallest size;
 *  - reject an interleave smaller than the largest sector size
 *    expressed in DEV_BSIZE units;
 *  - CCDF_UNIFORM: every ci_size becomes minsize and sc_size becomes
 *    sc_ndisks * minsize;
 *  - CCDF_MIRROR: requires an even disk count and a non-zero
 *    interleave; sc_size becomes (sc_ndisks / 2) * minsize;
 *  - sc_secsize is set to the largest component sector size.
 * NOTE(review): return statements, local declarations and the call
 * that builds the interleave table are not visible in this excerpt.
 */
210 ccdinit(struct gctl_req *req, struct ccd_s *cs)
225 if (cs->sc_flags & CCDF_LINUX) {
228 if (cs->sc_flags & CCDF_MIRROR && cs->sc_ndisks != 2)
229 gctl_error(req, "Mirror mode for Linux raids is "
230 "only supported with 2 devices");
232 if (cs->sc_flags & CCDF_NO_OFFSET)
235 cs->sc_offset = CCD_OFFSET;
238 for (ix = 0; ix < cs->sc_ndisks; ix++) {
239 ci = &cs->sc_cinfo[ix];
241 mediasize = ci->ci_provider->mediasize;
242 sectorsize = ci->ci_provider->sectorsize;
243 if (sectorsize > maxsecsize)
244 maxsecsize = sectorsize;
245 size = mediasize / DEV_BSIZE - cs->sc_offset;
247 /* Truncate to interleave boundary */
249 if (cs->sc_ileave > 1)
250 size -= size % cs->sc_ileave;
253 gctl_error(req, "Component %s has effective size zero",
254 ci->ci_provider->name);
258 if (minsize == 0 || size < minsize)
265 * Don't allow the interleave to be smaller than
266 * the biggest component sector.
268 if ((cs->sc_ileave > 0) &&
269 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
270 gctl_error(req, "Interleave to small for sector size");
275 * If uniform interleave is desired set all sizes to that of
276 * the smallest component. This will guarantee that a single
277 * interleave table is generated.
279 * Lost space must be taken into account when calculating the
280 * overall size. Half the space is lost when CCDF_MIRROR is
283 if (cs->sc_flags & CCDF_UNIFORM) {
284 for (ix = 0; ix < cs->sc_ndisks; ix++) {
285 ci = &cs->sc_cinfo[ix];
286 ci->ci_size = minsize;
288 cs->sc_size = cs->sc_ndisks * minsize;
291 if (cs->sc_flags & CCDF_MIRROR) {
293 * Check to see if an even number of components
294 * have been specified. The interleave must also
295 * be non-zero in order for us to be able to
296 * guarantee the topology.
298 if (cs->sc_ndisks % 2) {
300 "Mirroring requires an even number of disks");
303 if (cs->sc_ileave == 0) {
305 "An interleave must be specified when mirroring");
308 cs->sc_size = (cs->sc_ndisks/2) * minsize;
312 * Construct the interleave table.
317 * Create pseudo-geometry based on 1MB cylinders. It's
320 cs->sc_secsize = maxsecsize;
/*
 * Build the interleave table cs->sc_itable.
 * The table is allocated zeroed with sc_ndisks + 1 entries, so an
 * entry with ii_ndisk == 0 always follows the entries that are used.
 *
 * sc_ileave == 0: one entry per component, each with a one-element
 * ii_index naming that component, and ii_startblk advancing by the
 * component size.
 *
 * Otherwise entries are produced group by group. Each pass looks for
 * the smallest component whose size exceeds the size handled so far,
 * stops (freeing the unused ii_index) when none is left, and otherwise
 * records the start block in sc_ileave units, the component start
 * offset, and the indices of every component at least as large as the
 * one found.
 * NOTE(review): several assignments (for example ii_ndisk and the
 * initial values of bn, lbn and size) are not visible in this excerpt.
 */
326 ccdinterleave(struct ccd_s *cs)
328 struct ccdcinfo *ci, *smallci;
336 * Allocate an interleave table. The worst case occurs when each
337 * of N disks is of a different size, resulting in N interleave
340 * Chances are this is too big, but we don't care.
342 size = (cs->sc_ndisks + 1) * sizeof(struct ccdiinfo);
343 cs->sc_itable = g_malloc(size, M_WAITOK | M_ZERO);
346 * Trivial case: no interleave (actually interleave of disk size).
347 * Each table entry represents a single component in its entirety.
349 * An interleave of 0 may not be used with a mirror setup.
351 if (cs->sc_ileave == 0) {
355 for (ix = 0; ix < cs->sc_ndisks; ix++) {
356 /* Allocate space for ii_index. */
357 ii->ii_index = g_malloc(sizeof(int), M_WAITOK);
359 ii->ii_startblk = bn;
361 ii->ii_index[0] = ix;
362 bn += cs->sc_cinfo[ix].ci_size;
370 * The following isn't fast or pretty; it doesn't have to be.
374 for (ii = cs->sc_itable; ; ii++) {
376 * Allocate space for ii_index. We might allocate more then
379 ii->ii_index = g_malloc((sizeof(int) * cs->sc_ndisks),
383 * Locate the smallest of the remaining components
386 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_ndisks];
388 if (ci->ci_size > size &&
390 ci->ci_size < smallci->ci_size)) {
396 * Nobody left, all done
398 if (smallci == NULL) {
400 g_free(ii->ii_index);
406 * Record starting logical block using an sc_ileave blocksize.
408 ii->ii_startblk = bn / cs->sc_ileave;
411 * Record starting component block using an sc_ileave
412 * blocksize. This value is relative to the beginning of
415 ii->ii_startoff = lbn;
418 * Determine how many disks take part in this interleave
419 * and record their indices.
422 for (ci = cs->sc_cinfo;
423 ci < &cs->sc_cinfo[cs->sc_ndisks]; ci++) {
424 if (ci->ci_size >= smallci->ci_size) {
425 ii->ii_index[ix++] = ci - cs->sc_cinfo;
429 bn += ix * (smallci->ci_size - size);
430 lbn = smallci->ci_size / cs->sc_ileave;
431 size = smallci->ci_size;
/*
 * Start method, installed as .start in g_ccd_class.
 * bp is the request aimed at the ccd provider; the softc is taken from
 * the geom of bp->bio_to.
 *
 * BIO_GETATTR requests are completed immediately with EINVAL.
 * For other requests the byte offset is converted to a block number
 * using sc_secsize, and the request is cut into pieces: each loop pass
 * calls ccdbuffer() to build the component bio(s) for the next piece
 * and advances by the length of cbp[0].
 *
 * With CCDF_MIRROR, requests other than BIO_READ are issued on both
 * component bios. Reads are issued on one side only, selected by
 * sc_pick; the side is switched when bn lies outside
 * sc_blk[pick] +/- sc_size / 16, and sc_blk[pick] is then updated to
 * the block just past this piece. Without mirroring only cbp[0] is
 * issued.
 * NOTE(review): the error branch after ccdbuffer() is only partly
 * visible in this excerpt; its exact conditions should be confirmed.
 */
436 g_ccd_start(struct bio *bp)
445 cs = bp->bio_to->geom->softc;
448 * Block all GETATTR requests, we wouldn't know which of our
449 * subdevices we should ship it off to.
450 * XXX: this may not be the right policy.
452 if(bp->bio_cmd == BIO_GETATTR) {
453 g_io_deliver(bp, EINVAL);
458 * Translate the partition-relative block number to an absolute.
460 bn = bp->bio_offset / cs->sc_secsize;
463 * Allocate component buffers and fire off the requests
466 for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
467 err = ccdbuffer(cbp, cs, bp, bn, addr, bcount);
469 bp->bio_completed += bcount;
470 if (bp->bio_error == 0)
472 if (bp->bio_completed == bp->bio_length)
473 g_io_deliver(bp, bp->bio_error);
476 rcount = cbp[0]->bio_length;
478 if (cs->sc_flags & CCDF_MIRROR) {
480 * Mirroring. Writes go to both disks, reads are
481 * taken from whichever disk seems most appropriate.
483 * We attempt to localize reads to the disk whos arm
484 * is nearest the read request. We ignore seeks due
485 * to writes when making this determination and we
486 * also try to avoid hogging.
488 if (cbp[0]->bio_cmd != BIO_READ) {
489 g_io_request(cbp[0], cbp[0]->bio_from);
490 g_io_request(cbp[1], cbp[1]->bio_from);
492 int pick = cs->sc_pick;
493 daddr_t range = cs->sc_size / 16;
495 if (bn < cs->sc_blk[pick] - range ||
496 bn > cs->sc_blk[pick] + range
498 cs->sc_pick = pick = 1 - pick;
500 cs->sc_blk[pick] = bn + btodb(rcount);
501 g_io_request(cbp[pick], cbp[pick]->bio_from);
507 g_io_request(cbp[0], cbp[0]->bio_from);
515 * Build a component buffer header.
/*
 * Build the component bio(s) for the piece of bp that begins at block bn.
 *
 * cb     output array for the component bios (cb[0] is read back below
 *        when the mirror clone is set up)
 * cs     softc of the ccd
 * bp     parent request being split
 * bn     starting block of this piece within the ccd
 * addr   data pointer for this piece
 * bcount bytes remaining in the parent request
 *
 * Mapping: with sc_ileave == 0 the components are scanned in order
 * until the one containing the block is found. Otherwise the block is
 * split into a superblock number and an offset inside the interleave
 * chunk, the interleave table entry is located, and the component and
 * component-relative superblock are derived from it. For CCDF_MIRROR
 * only the first half of the entry is indexed and ci2 is set to the
 * component ndisk2 positions further on.
 *
 * The clone of bp gets g_std_done as completion, a byte offset that
 * includes sc_offset, and a length limited to the rest of the component
 * (no interleave) or of the interleave chunk, and to bcount. For
 * CCDF_MIRROR a second clone aimed at ci2 copies offset, data and
 * length from cb[0]; both clones get ccdiodone as completion and point
 * at each other through bio_caller1.
 * NOTE(review): the stores into cb[], the checks on the g_clone_bio()
 * results and the return statements are not visible in this excerpt.
 */
518 ccdbuffer(struct bio **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
520 struct ccdcinfo *ci, *ci2 = NULL;
526 * Determine which component bn falls in.
531 if (cs->sc_ileave == 0) {
533 * Serially concatenated and neither a mirror nor a parity
534 * config. This is a special case.
539 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
547 * Calculate cbn, the logical superblock (sc_ileave chunks),
548 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
551 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
552 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
555 * Figure out which interleave table to use.
557 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
558 if (ii->ii_startblk > cbn)
564 * off is the logical superblock relative to the beginning
565 * of this interleave block.
567 off = cbn - ii->ii_startblk;
570 * We must calculate which disk component to use (ccdisk),
571 * and recalculate cbn to be the superblock relative to
572 * the beginning of the component. This is typically done by
573 * adding 'off' and ii->ii_startoff together. However, 'off'
574 * must typically be divided by the number of components in
575 * this interleave array to be properly convert it from a
576 * CCD-relative logical superblock number to a
577 * component-relative superblock number.
579 if (ii->ii_ndisk == 1) {
581 * When we have just one disk, it can't be a mirror
582 * or a parity config.
584 ccdisk = ii->ii_index[0];
585 cbn = ii->ii_startoff + off;
587 if (cs->sc_flags & CCDF_MIRROR) {
589 * We have forced a uniform mapping, resulting
590 * in a single interleave array. We double
591 * up on the first half of the available
592 * components and our mirror is in the second
593 * half. This only works with a single
594 * interleave array because doubling up
595 * doubles the number of sectors, so there
596 * cannot be another interleave array because
597 * the next interleave array's calculations
600 int ndisk2 = ii->ii_ndisk / 2;
601 ccdisk = ii->ii_index[off % ndisk2];
602 cbn = ii->ii_startoff + off / ndisk2;
603 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
605 ccdisk = ii->ii_index[off % ii->ii_ndisk];
606 cbn = ii->ii_startoff + off / ii->ii_ndisk;
610 ci = &cs->sc_cinfo[ccdisk];
613 * Convert cbn from a superblock to a normal block so it
614 * can be used to calculate (along with cboff) the normal
615 * block index into this particular disk.
617 cbn *= cs->sc_ileave;
621 * Fill in the component buf structure.
623 cbp = g_clone_bio(bp);
626 cbp->bio_done = g_std_done;
627 cbp->bio_offset = dbtob(cbn + cboff + cs->sc_offset);
628 cbp->bio_data = addr;
629 if (cs->sc_ileave == 0)
630 cbc = dbtob((off_t)(ci->ci_size - cbn));
632 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
633 cbp->bio_length = (cbc < bcount) ? cbc : bcount;
635 cbp->bio_from = ci->ci_consumer;
638 if (cs->sc_flags & CCDF_MIRROR) {
639 cbp = g_clone_bio(bp);
642 cbp->bio_done = cb[0]->bio_done = ccdiodone;
643 cbp->bio_offset = cb[0]->bio_offset;
644 cbp->bio_data = cb[0]->bio_data;
645 cbp->bio_length = cb[0]->bio_length;
646 cbp->bio_from = ci2->ci_consumer;
647 cbp->bio_caller1 = cb[0];
648 cb[0]->bio_caller1 = cbp;
655 * Called only for mirrored operations.
/*
 * Completion handler installed on both bios of a mirrored pair.
 * cbp is the component bio that finished, mbp its partner (taken from
 * bio_caller1) and pbp the parent request.
 *
 * For a parent BIO_READ: a component that finished without error means
 * the partner bio is not needed; on error the partner link is cleared
 * and the partner is issued with g_io_request() instead.
 * In the remaining visible path the partner link is cleared and the
 * first non-zero component error is copied into the parent when the
 * parent has no error recorded yet.
 * NOTE(review): the lines that destroy bios, deliver the parent and
 * return are not visible in this excerpt, so the exact branch
 * structure should be confirmed against the full file.
 */
658 ccdiodone(struct bio *cbp)
660 struct bio *mbp, *pbp;
662 mbp = cbp->bio_caller1;
663 pbp = cbp->bio_parent;
665 if (pbp->bio_cmd == BIO_READ) {
666 if (cbp->bio_error == 0) {
667 /* We will not be needing the partner bio */
676 /* Try partner the bio instead */
677 mbp->bio_caller1 = NULL;
680 g_io_request(mbp, mbp->bio_from);
682 * XXX: If this comes back OK, we should actually
683 * try to write the good data on the failed mirror
691 mbp->bio_caller1 = NULL;
693 if (cbp->bio_error != 0 && pbp->bio_error == 0)
694 pbp->bio_error = cbp->bio_error;
/*
 * Handle the create request for class mp (called from g_ccd_config).
 *
 * Required request parameters: unit, ileave and nprovider, plus one
 * provider<i> parameter for each i below nprovider. Optional
 * parameters no_offset, linux, uniform and mirror set the matching
 * CCDF_* flags.
 *
 * Visible sequence: reject a unit that an existing geom of the class
 * already uses and a non-positive nprovider; look up every provider;
 * create geom ccd<unit> and a zeroed softc; create and attach one
 * consumer per provider and record both in sc_cinfo; set the flags
 * (mirror is dropped when the interleave is 0, and mirror forces
 * uniform); run ccdinit(); on the failure path wither the geom with
 * ENXIO; otherwise create the provider with mediasize
 * sc_size * sc_secsize and return a description string in the output
 * parameter of the request.
 * NOTE(review): the early returns, the access handling around
 * ccdinit() and the sbuf finishing and deletion are not visible in this
 * excerpt.
 */
702 g_ccd_create(struct gctl_req *req, struct g_class *mp)
704 int *unit, *ileave, *nprovider;
706 struct g_consumer *cp;
707 struct g_provider *pp;
714 unit = gctl_get_paraml(req, "unit", sizeof (*unit));
716 gctl_error(req, "unit parameter not given");
719 ileave = gctl_get_paraml(req, "ileave", sizeof (*ileave));
720 if (ileave == NULL) {
721 gctl_error(req, "ileave parameter not given");
724 nprovider = gctl_get_paraml(req, "nprovider", sizeof (*nprovider));
725 if (nprovider == NULL) {
726 gctl_error(req, "nprovider parameter not given");
730 /* Check for duplicate unit */
731 LIST_FOREACH(gp, &mp->geom, geom) {
733 if (sc != NULL && sc->sc_unit == *unit) {
734 gctl_error(req, "Unit %d already configured", *unit);
739 if (*nprovider <= 0) {
740 gctl_error(req, "Bogus nprovider argument (= %d)", *nprovider);
744 /* Check all providers are valid */
745 for (i = 0; i < *nprovider; i++) {
746 sprintf(buf, "provider%d", i);
747 pp = gctl_get_provider(req, buf);
752 gp = g_new_geomf(mp, "ccd%d", *unit);
753 sc = g_malloc(sizeof *sc, M_WAITOK | M_ZERO);
755 sc->sc_ndisks = *nprovider;
757 /* Allocate space for the component info. */
758 sc->sc_cinfo = g_malloc(sc->sc_ndisks * sizeof(struct ccdcinfo),
761 /* Create consumers and attach to all providers */
762 for (i = 0; i < *nprovider; i++) {
763 sprintf(buf, "provider%d", i);
764 pp = gctl_get_provider(req, buf);
765 cp = g_new_consumer(gp);
766 error = g_attach(cp, pp);
767 KASSERT(error == 0, ("attach to %s failed", pp->name));
768 sc->sc_cinfo[i].ci_consumer = cp;
769 sc->sc_cinfo[i].ci_provider = pp;
773 sc->sc_ileave = *ileave;
775 if (gctl_get_param(req, "no_offset", NULL))
776 sc->sc_flags |= CCDF_NO_OFFSET;
777 if (gctl_get_param(req, "linux", NULL))
778 sc->sc_flags |= CCDF_LINUX;
780 if (gctl_get_param(req, "uniform", NULL))
781 sc->sc_flags |= CCDF_UNIFORM;
782 if (gctl_get_param(req, "mirror", NULL))
783 sc->sc_flags |= CCDF_MIRROR;
785 if (sc->sc_ileave == 0 && (sc->sc_flags & CCDF_MIRROR)) {
786 printf("%s: disabling mirror, interleave is 0\n", gp->name);
787 sc->sc_flags &= ~(CCDF_MIRROR);
790 if ((sc->sc_flags & CCDF_MIRROR) && !(sc->sc_flags & CCDF_UNIFORM)) {
791 printf("%s: mirror/parity forces uniform flag\n", gp->name);
792 sc->sc_flags |= CCDF_UNIFORM;
795 error = ccdinit(req, sc);
799 g_wither_geom(gp, ENXIO);
803 pp = g_new_providerf(gp, "%s", gp->name);
804 pp->mediasize = sc->sc_size * (off_t)sc->sc_secsize;
805 pp->sectorsize = sc->sc_secsize;
806 g_error_provider(pp, 0);
808 sb = sbuf_new_auto();
809 sbuf_printf(sb, "ccd%d: %d components ", sc->sc_unit, *nprovider);
810 for (i = 0; i < *nprovider; i++) {
811 sbuf_printf(sb, "%s%s",
813 sc->sc_cinfo[i].ci_provider->name);
815 sbuf_printf(sb, "), %jd blocks ", (off_t)pp->mediasize / DEV_BSIZE);
816 if (sc->sc_ileave != 0)
817 sbuf_printf(sb, "interleaved at %d blocks\n",
820 sbuf_printf(sb, "concatenated\n");
822 gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
/*
 * Destroy method, installed as .destroy_geom in g_ccd_class and also
 * called from g_ccd_config for the destroy verb.
 * Looks at the first provider of gp. When any of its access counts
 * (acr, acw, ace) is non-zero an error naming the counts is reported
 * through gctl_error(); the visible tail withers the geom with ENXIO.
 * NOTE(review): the return values, the result of the NULL check on sc
 * and pp, and any softc cleanup are not visible in this excerpt.
 */
827 g_ccd_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
829 struct g_provider *pp;
834 pp = LIST_FIRST(&gp->provider);
835 if (sc == NULL || pp == NULL)
837 if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
838 gctl_error(req, "%s is open(r%dw%de%d)", gp->name,
839 pp->acr, pp->acw, pp->ace);
844 g_wither_geom(gp, ENXIO);
/*
 * Handle the list request for class mp.
 * Reads the unit parameter, then walks the geoms of the class. Geoms
 * without a softc are skipped, as are geoms whose unit differs from a
 * non-negative requested unit. Each remaining geom contributes one
 * line: ccd<unit>, the interleave, the flags masked with
 * CCDF_USERMASK, and the /dev/ names of all components. The text is
 * returned in the output parameter of the request.
 * NOTE(review): the assignment of the local unit from the parameter
 * and the sbuf finishing and deletion are not visible in this excerpt.
 */
849 g_ccd_list(struct gctl_req *req, struct g_class *mp)
856 up = gctl_get_paraml(req, "unit", sizeof (*up));
858 gctl_error(req, "unit parameter not given");
862 sb = sbuf_new_auto();
863 LIST_FOREACH(gp, &mp->geom, geom) {
865 if (cs == NULL || (unit >= 0 && unit != cs->sc_unit))
867 sbuf_printf(sb, "ccd%d\t\t%d\t%d\t",
868 cs->sc_unit, cs->sc_ileave, cs->sc_flags & CCDF_USERMASK);
870 for (i = 0; i < cs->sc_ndisks; ++i) {
871 sbuf_printf(sb, "%s/dev/%s", i == 0 ? "" : " ",
872 cs->sc_cinfo[i].ci_provider->name);
874 sbuf_printf(sb, "\n");
877 gctl_set_param_err(req, "output", sbuf_data(sb), sbuf_len(sb) + 1);
/*
 * Control request dispatcher, installed as .ctlreq in g_ccd_class.
 * Compares verb against the three supported strings and routes the
 * request to g_ccd_create(), to g_ccd_destroy_geom() (after resolving
 * the geom named by the geom parameter), or to the list handler; any
 * other verb is reported as an error through gctl_error().
 * NOTE(review): the check on the gctl_get_geom() result and the call
 * in the list branch are not visible in this excerpt.
 */
882 g_ccd_config(struct gctl_req *req, struct g_class *mp, char const *verb)
887 if (!strcmp(verb, "create geom")) {
888 g_ccd_create(req, mp);
889 } else if (!strcmp(verb, "destroy geom")) {
890 gp = gctl_get_geom(req, mp, "geom");
892 g_ccd_destroy_geom(req, mp, gp);
893 } else if (!strcmp(verb, "list")) {
896 gctl_error(req, "unknown verb");
/*
 * Class descriptor tying the routines of this file to their GEOM
 * method slots (control requests, destroy, start, orphan, access),
 * followed by the DECLARE_GEOM_CLASS invocation for it.
 * NOTE(review): the .name initializer and the closing brace are not
 * visible in this excerpt.
 */
900 static struct g_class g_ccd_class = {
902 .version = G_VERSION,
903 .ctlreq = g_ccd_config,
904 .destroy_geom = g_ccd_destroy_geom,
905 .start = g_ccd_start,
906 .orphan = g_ccd_orphan,
907 .access = g_ccd_access,
910 DECLARE_GEOM_CLASS(g_ccd_class, g_ccd);