3 /* $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */
6 * Copyright (c) 1995 Jason R. Thorpe.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project
21 * 4. The name of the author may not be used to endorse or promote products
22 * derived from this software without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * Copyright (c) 1988 University of Utah.
39 * Copyright (c) 1990, 1993
40 * The Regents of the University of California. All rights reserved.
42 * This code is derived from software contributed to Berkeley by
43 * the Systems Programming Group of the University of Utah Computer
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. All advertising materials mentioning features or use of this software
55 * must display the following acknowledgement:
56 * This product includes software developed by the University of
57 * California, Berkeley and its contributors.
58 * 4. Neither the name of the University nor the names of its contributors
59 * may be used to endorse or promote products derived from this software
60 * without specific prior written permission.
62 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 * @(#)cd.c 8.2 (Berkeley) 11/16/93
80 * "Concatenated" disk driver.
82 * Dynamic configuration and disklabel support by:
83 * Jason R. Thorpe <thorpej@nas.nasa.gov>
84 * Numerical Aerodynamic Simulation Facility
86 * NASA Ames Research Center
87 * Moffett Field, CA 94035
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/kernel.h>
96 #include <sys/module.h>
99 #include <sys/malloc.h>
100 #include <sys/namei.h>
101 #include <sys/conf.h>
102 #include <sys/stat.h>
103 #include <sys/sysctl.h>
104 #include <sys/disklabel.h>
105 #include <ufs/ffs/fs.h>
106 #include <sys/devicestat.h>
107 #include <sys/fcntl.h>
108 #include <sys/vnode.h>
110 #include <sys/ccdvar.h>
112 #include <vm/vm_zone.h>
/*
 * Debug message classes. Each CCDB_* bit is tested against ccddebug
 * before a diagnostic printf elsewhere in this driver.
 */
114 #if defined(CCDDEBUG) && !defined(DEBUG)
119 #define CCDB_FOLLOW 0x01
120 #define CCDB_INIT 0x02
122 #define CCDB_LABEL 0x08
123 #define CCDB_VNODE 0x10
124 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
/* Publish ccddebug as a read-write integer sysctl in the _debug tree. */
126 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/* Unit and partition of a device number, obtained via dkunit() and dkpart(). */
130 #define ccdunit(x) dkunit(x)
131 #define ccdpart(x) dkpart(x)
134 This is how mirroring works (only writes are special):
136 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
137 linked together by the cb_mirror field. "cb_pflags &
138 CCDPF_MIRROR_DONE" is set to 0 on both of them.
140 When a component returns to ccdiodone(), it checks if "cb_pflags &
141 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
142 flag and returns. If it is, it means its partner has already
143 returned, so it will go to the regular cleanup.
/*
 * Fields of the per-component I/O descriptor: the buf issued to one
 * component plus the bookkeeping ccdiodone() reads back (original
 * request, owning unit and component, and the mirror partner buffer).
 */
148 struct buf cb_buf; /* new I/O buf */
149 struct buf *cb_obp; /* ptr. to original I/O buf */
150 struct ccdbuf *cb_freenext; /* free list link */
151 int cb_unit; /* target unit */
152 int cb_comp; /* target component */
153 int cb_pflags; /* mirror/parity status flag */
154 struct ccdbuf *cb_mirror; /* mirror counterpart */
157 /* bits in cb_pflags */
158 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
/*
 * Device number with the same major number and ccd unit as dev but
 * always the RAW_PART partition; passed to writedisklabel() in ccdioctl().
 */
160 #define CCDLABELDEV(dev) \
161 (makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
/* Device entry points of this driver; several are wired into ccd_cdevsw below. */
163 static d_open_t ccdopen;
164 static d_close_t ccdclose;
165 static d_strategy_t ccdstrategy;
166 static d_ioctl_t ccdioctl;
167 static d_dump_t ccddump;
168 static d_psize_t ccdsize;
/* Limit checked by putccdbuf() before it caches a ccdbuf on the free list. */
170 #define NCCDFREEHIWAT 16
/* Major numbers placed in the maj and bmaj slots of ccd_cdevsw. */
172 #define CDEV_MAJOR 74
173 #define BDEV_MAJOR 21
/* Device switch table for the ccd driver; registered with cdevsw_add(). */
175 static struct cdevsw ccd_cdevsw = {
177 /* close */ ccdclose,
179 /* write */ physwrite,
180 /* ioctl */ ccdioctl,
183 /* strategy */ ccdstrategy,
185 /* maj */ CDEV_MAJOR,
189 /* bmaj */ BDEV_MAJOR
/* Forward declarations for the file-local routines of this driver. */
192 /* called during module initialization */
193 static void ccdattach __P((void));
194 static int ccd_modevent __P((module_t, int, void *));
196 /* called by biodone() at interrupt time */
197 static void ccdiodone __P((struct ccdbuf *cbp));
/* I/O path, configuration, disklabel and locking helpers. */
199 static void ccdstart __P((struct ccd_softc *, struct buf *));
200 static void ccdinterleave __P((struct ccd_softc *, int));
201 static void ccdintr __P((struct ccd_softc *, struct buf *));
202 static int ccdinit __P((struct ccddevice *, char **, struct proc *));
203 static int ccdlookup __P((char *, struct proc *p, struct vnode **));
204 static void ccdbuffer __P((struct ccdbuf **ret, struct ccd_softc *,
205 struct buf *, daddr_t, caddr_t, long));
206 static void ccdgetdisklabel __P((dev_t));
207 static void ccdmakedisklabel __P((struct ccd_softc *));
208 static int ccdlock __P((struct ccd_softc *));
209 static void ccdunlock __P((struct ccd_softc *));
/* Dumps an interleave table; called from ccdinterleave() when CCDB_INIT is set. */
212 static void printiinfo __P((struct ccdiinfo *));
215 /* Non-private for the benefit of libkvm. */
/* Per-unit driver state; allocated as a single array by the attach code. */
216 struct ccd_softc *ccd_softc;
/* Per-unit configuration records; allocated alongside ccd_softc. */
217 struct ccddevice *ccddevs;
/* Head of the list of idle ccdbufs, chained through cb_freenext. */
218 struct ccdbuf *ccdfreebufs;
/* Compared against NCCDFREEHIWAT by putccdbuf() to limit the free list. */
219 static int numccdfreebufs;
/* Loop bound used by the attach code when initialising ccddevs[]. */
220 static int numccd = 0;
/*
 * Hands out a ccdbuf: the head of ccdfreebufs when that list is not
 * empty, otherwise a fresh M_DEVBUF allocation. The buffer is filled
 * either with a byte copy of *cpy or with zeroes (presumably chosen by
 * whether cpy is NULL; callers pass NULL or an existing ccdbuf), after
 * which the embedded struct buf is initialised and locked.
 */
223 * getccdbuf() - Allocate and zero a ccd buffer.
225 * This routine is called at splbio().
230 getccdbuf(struct ccdbuf *cpy)
235 * Allocate from freelist or malloc as necessary
237 if ((cbp = ccdfreebufs) != NULL) {
238 ccdfreebufs = cbp->cb_freenext;
241 cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
245 * Used by mirroring code
248 bcopy(cpy, cbp, sizeof(struct ccdbuf));
250 bzero(cbp, sizeof(struct ccdbuf));
253 * independant struct buf initialization
/*
 * These run after the copy or zero fill above, so the dependency list
 * and lock of the embedded buf are always set up afresh. The exclusive
 * lock taken here is dropped again by putccdbuf().
 */
255 LIST_INIT(&cbp->cb_buf.b_dep);
256 BUF_LOCKINIT(&cbp->cb_buf);
257 BUF_LOCK(&cbp->cb_buf, LK_EXCLUSIVE);
258 BUF_KERNPROC(&cbp->cb_buf);
/*
 * Counterpart of getccdbuf(): unlocks and tears down the lock of the
 * embedded buf, then either caches the ccdbuf on the ccdfreebufs list
 * or returns it to the allocator.
 */
264 * putccdbuf() - Free a ccd buffer.
266 * This routine is called at splbio().
271 putccdbuf(struct ccdbuf *cbp)
273 BUF_UNLOCK(&cbp->cb_buf);
274 BUF_LOCKFREE(&cbp->cb_buf);
/* Cache the buffer only while fewer than NCCDFREEHIWAT are already held. */
276 if (numccdfreebufs < NCCDFREEHIWAT) {
277 cbp->cb_freenext = ccdfreebufs;
281 free((caddr_t)cbp, M_DEVBUF);
287 * Number of blocks to leave untouched in front of a component partition.
288 * This is to avoid violating its disklabel area when it starts at the
289 * beginning of the slice.
/*
 * Defaults to 16 blocks unless CCD_OFFSET has already been defined.
 * ccdinit() subtracts it from each component size and ccdbuffer() adds
 * it to every component block number.
 */
291 #if !defined(CCD_OFFSET)
292 #define CCD_OFFSET 16
296 * Called by main() during pseudo-device attachment. All we need
297 * to do is allocate enough space for devices to be configured later, and
/* Announce the unit range; the wording depends on how many units exist. */
307 printf("ccd0-%d: Concatenated disk drivers\n", num-1);
309 printf("ccd0: Concatenated disk driver\n");
/* Allocate the two per-unit tables, num entries each. */
311 ccd_softc = (struct ccd_softc *)malloc(num * sizeof(struct ccd_softc),
313 ccddevs = (struct ccddevice *)malloc(num * sizeof(struct ccddevice),
/* If either allocation failed, warn and release whichever one succeeded. */
315 if ((ccd_softc == NULL) || (ccddevs == NULL)) {
316 printf("WARNING: no memory for concatenated disks\n");
317 if (ccd_softc != NULL)
318 free(ccd_softc, M_DEVBUF);
320 free(ccddevs, M_DEVBUF);
/* Both tables start out zeroed. */
324 bzero(ccd_softc, num * sizeof(struct ccd_softc));
325 bzero(ccddevs, num * sizeof(struct ccddevice));
/* Register the ccd device switch table. */
327 cdevsw_add(&ccd_cdevsw);
328 /* XXX: is this necessary? */
/* Set ccd_dk to -1 for every unit counted by numccd. */
329 for (i = 0; i < numccd; ++i)
330 ccddevs[i].ccd_dk = -1;
/*
 * Module event handler, hooked up through DEV_MODULE below. One of its
 * cases prints that unloading is not supported; a default case covers
 * the remaining events such as MOD_SHUTDOWN.
 */
334 ccd_modevent(mod, type, data)
347 printf("ccd0: Unload not supported!\n");
351 default: /* MOD_SHUTDOWN etc */
/* Declares the ccd module with ccd_modevent as its event handler. */
357 DEV_MODULE(ccd, ccd_modevent, NULL);
/*
 * Fill in the softc of unit ccd->ccd_unit from the configuration in *ccd.
 * For each component it records the path name, device number and
 * partition data, then derives the total size, builds the interleave
 * table, sets up a pseudo-geometry and registers a devstat entry.
 *
 * ccd    - configuration (unit, interleave, flags, component vnodes).
 * cpaths - component path names; each is read with copyinstr(), so the
 *          strings are expected to live in user space.
 * p      - process whose credentials are used for the component ioctl.
 */
360 ccdinit(ccd, cpaths, p)
361 struct ccddevice *ccd;
365 struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
366 struct ccdcinfo *ci = NULL; /* XXX */
372 struct partinfo dpart;
373 struct ccdgeom *ccg = &cs->sc_geom;
374 char tmppath[MAXPATHLEN];
378 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
379 printf("ccdinit: unit %d\n", ccd->ccd_unit);
/* Copy the requested interleave and component count into the softc. */
383 cs->sc_ileave = ccd->ccd_interleave;
384 cs->sc_nccdisks = ccd->ccd_ndev;
386 /* Allocate space for the component info. */
387 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
391 * Verify that each component piece exists and record
392 * relevant information about it.
396 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
397 vp = ccd->ccd_vpp[ix];
398 ci = &cs->sc_cinfo[ix];
402 * Copy in the pathname of the component.
404 bzero(tmppath, sizeof(tmppath)); /* sanity */
405 if ((error = copyinstr(cpaths[ix], tmppath,
406 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
408 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
409 printf("ccd%d: can't copy path, error = %d\n",
410 ccd->ccd_unit, error);
/* Keep a private copy of the path, sized by the length copyinstr() reported. */
414 ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
415 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
417 ci->ci_dev = vn_todev(vp);
420 * Get partition information for the component.
422 if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
423 FREAD, p->p_ucred, p)) != 0) {
425 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
426 printf("ccd%d: %s: ioctl failed, error = %d\n",
427 ccd->ccd_unit, ci->ci_path, error);
/*
 * Only partitions typed FS_BSDFFS are accepted. The largest sector size
 * seen is tracked in maxsecsize, and the first CCD_OFFSET blocks of the
 * partition are excluded from its usable size.
 * NOTE(review): no visible check that p_size is at least CCD_OFFSET
 * before the subtraction - confirm how a smaller partition is handled.
 */
431 if (dpart.part->p_fstype == FS_BSDFFS) {
433 ((dpart.disklab->d_secsize > maxsecsize) ?
434 dpart.disklab->d_secsize : maxsecsize);
435 size = dpart.part->p_size - CCD_OFFSET;
438 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
439 printf("ccd%d: %s: incorrect partition type\n",
440 ccd->ccd_unit, ci->ci_path);
447 * Calculate the size, truncating to an interleave
448 * boundary if necessary.
451 if (cs->sc_ileave > 1)
452 size -= size % cs->sc_ileave;
456 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
457 printf("ccd%d: %s: size == 0\n",
458 ccd->ccd_unit, ci->ci_path);
/* Compare against the smallest component size recorded so far. */
464 if (minsize == 0 || size < minsize)
471 * Don't allow the interleave to be smaller than
472 * the biggest component sector.
474 if ((cs->sc_ileave > 0) &&
475 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
477 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
478 printf("ccd%d: interleave must be at least %d\n",
479 ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
486 * If uniform interleave is desired set all sizes to that of
487 * the smallest component. This will guarentee that a single
488 * interleave table is generated.
490 * Lost space must be taken into account when calculating the
491 * overall size. Half the space is lost when CCDF_MIRROR is
492 * specified. One disk is lost when CCDF_PARITY is specified.
494 if (ccd->ccd_flags & CCDF_UNIFORM) {
495 for (ci = cs->sc_cinfo;
496 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
497 ci->ci_size = minsize;
499 if (ccd->ccd_flags & CCDF_MIRROR) {
501 * Check to see if an even number of components
502 * have been specified. The interleave must also
503 * be non-zero in order for us to be able to
504 * guarentee the topology.
506 if (cs->sc_nccdisks % 2) {
507 printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
511 if (cs->sc_ileave == 0) {
512 printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
/* Mirrored: half of the components carry distinct data. */
516 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
517 } else if (ccd->ccd_flags & CCDF_PARITY) {
/* Parity: one component fewer than configured contributes to the size. */
518 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
520 if (cs->sc_ileave == 0) {
521 printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
/* Every component contributes minsize blocks. */
525 cs->sc_size = cs->sc_nccdisks * minsize;
530 * Construct the interleave table.
532 ccdinterleave(cs, ccd->ccd_unit);
535 * Create pseudo-geometry based on 1MB cylinders. It's
/* One track per cylinder, and 1024 * 1024 / secsize sectors per track. */
538 ccg->ccg_secsize = maxsecsize;
539 ccg->ccg_ntracks = 1;
540 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
541 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
544 * Add an devstat entry for this device.
546 devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
547 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
548 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
549 DEVSTAT_PRIORITY_ARRAY);
/* Mark the unit configured and keep the configuration flags and unit number. */
551 cs->sc_flags |= CCDF_INITED;
552 cs->sc_cflags = ccd->ccd_flags; /* So we can find out later... */
553 cs->sc_unit = ccd->ccd_unit;
/*
 * Cleanup: free component path strings while ci is above the start of
 * sc_cinfo, then free the component array itself.
 */
556 while (ci > cs->sc_cinfo) {
558 free(ci->ci_path, M_DEVBUF);
560 free(cs->sc_cinfo, M_DEVBUF);
/*
 * Build cs->sc_itable, the table ccdbuffer() walks to map a ccd block
 * onto a component. The table gets one entry more than there are
 * components and is zero filled; both ccdbuffer() and the teardown loop
 * in ccdioctl() stop at the first entry whose ii_ndisk is 0.
 */
565 ccdinterleave(cs, unit)
566 struct ccd_softc *cs;
569 struct ccdcinfo *ci, *smallci;
576 if (ccddebug & CCDB_INIT)
577 printf("ccdinterleave(%x): ileave %d\n", cs, cs->sc_ileave);
581 * Allocate an interleave table. The worst case occurs when each
582 * of N disks is of a different size, resulting in N interleave
585 * Chances are this is too big, but we don't care.
587 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
588 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF, M_WAITOK);
589 bzero((caddr_t)cs->sc_itable, size);
592 * Trivial case: no interleave (actually interleave of disk size).
593 * Each table entry represents a single component in its entirety.
595 * An interleave of 0 may not be used with a mirror or parity setup.
597 if (cs->sc_ileave == 0) {
/* bn accumulates the starting block of each component in turn. */
601 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
602 /* Allocate space for ii_index. */
603 ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
605 ii->ii_startblk = bn;
607 ii->ii_index[0] = ix;
608 bn += cs->sc_cinfo[ix].ci_size;
613 if (ccddebug & CCDB_INIT)
614 printiinfo(cs->sc_itable);
620 * The following isn't fast or pretty; it doesn't have to be.
624 for (ii = cs->sc_itable; ; ii++) {
626 * Allocate space for ii_index. We might allocate more then
629 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
633 * Locate the smallest of the remaining components
/* Only components larger than size (the amount already consumed) are candidates. */
636 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
638 if (ci->ci_size > size &&
640 ci->ci_size < smallci->ci_size)) {
646 * Nobody left, all done
648 if (smallci == NULL) {
654 * Record starting logical block using an sc_ileave blocksize.
656 ii->ii_startblk = bn / cs->sc_ileave;
659 * Record starting comopnent block using an sc_ileave
660 * blocksize. This value is relative to the beginning of
663 ii->ii_startoff = lbn;
666 * Determine how many disks take part in this interleave
667 * and record their indices.
670 for (ci = cs->sc_cinfo;
671 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
672 if (ci->ci_size >= smallci->ci_size) {
673 ii->ii_index[ix++] = ci - cs->sc_cinfo;
/*
 * Advance the running totals: bn grows by the blocks this entry covers
 * (ix disks times the size increment), while lbn and size move up to
 * the component size just consumed.
 */
677 bn += ix * (smallci->ci_size - size);
678 lbn = smallci->ci_size / cs->sc_ileave;
679 size = smallci->ci_size;
682 if (ccddebug & CCDB_INIT)
683 printiinfo(cs->sc_itable);
/*
 * Open entry point. Takes the unit lock, refreshes the in-core
 * disklabel when a configured unit has no other opens, validates the
 * requested partition and records the open in the character or block
 * open mask.
 */
689 ccdopen(dev, flags, fmt, p)
694 int unit = ccdunit(dev);
695 struct ccd_softc *cs;
696 struct disklabel *lp;
697 int error = 0, part, pmask;
700 if (ccddebug & CCDB_FOLLOW)
701 printf("ccdopen(%x, %x)\n", dev, flags);
705 cs = &ccd_softc[unit];
707 if ((error = ccdlock(cs)) != 0)
716 * If we're initialized, check to see if there are any other
717 * open partitions. If not, then it's safe to update
718 * the in-core disklabel.
720 if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
721 ccdgetdisklabel(dev);
723 /* Check that the partition exists. */
/* RAW_PART always passes; others must lie within d_npartitions and not be FS_UNUSED. */
724 if (part != RAW_PART && ((part >= lp->d_npartitions) ||
725 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
730 /* Prevent our unit from being unconfigured while open. */
733 cs->sc_copenmask |= pmask;
737 cs->sc_bopenmask |= pmask;
/* Union of both masks; presumably stored in sc_openmask, which is tested above. */
741 cs->sc_copenmask | cs->sc_bopenmask;
/*
 * Close entry point. Under the unit lock, clears this partition's bit
 * in the character or block open mask and recombines the two masks.
 */
750 ccdclose(dev, flags, fmt, p)
755 int unit = ccdunit(dev);
756 struct ccd_softc *cs;
760 if (ccddebug & CCDB_FOLLOW)
761 printf("ccdclose(%x, %x)\n", dev, flags);
766 cs = &ccd_softc[unit];
768 if ((error = ccdlock(cs)) != 0)
773 /* ...that much closer to allowing unconfiguration... */
776 cs->sc_copenmask &= ~(1 << part);
780 cs->sc_bopenmask &= ~(1 << part);
/* Union of both masks, mirroring the bookkeeping done in ccdopen(). */
784 cs->sc_copenmask | cs->sc_bopenmask;
/*
 * Strategy routine (ccdstrategy, per the debug printf below). Flags an
 * error for I/O to an unconfigured unit, bounds-checks the request and
 * primes b_resid with the full byte count before the transfer starts.
 */
794 int unit = ccdunit(bp->b_dev);
795 struct ccd_softc *cs = &ccd_softc[unit];
798 struct disklabel *lp;
801 if (ccddebug & CCDB_FOLLOW)
802 printf("ccdstrategy(%x): unit %d\n", bp, unit);
804 if ((cs->sc_flags & CCDF_INITED) == 0) {
806 bp->b_flags |= B_ERROR;
810 /* If it's a nil transfer, wake up the top half now. */
811 if (bp->b_bcount == 0)
817 * Do bounds checking and adjust transfer. If there's an
818 * error, the bounds check will flag that for us.
/* wlabel is non-zero when CCDF_WLABEL or CCDF_LABELLING is set; it is passed to the label check. */
820 wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
821 if (ccdpart(bp->b_dev) != RAW_PART) {
822 if (bounds_check_with_label(bp, lp, wlabel) <= 0)
/* The checks below compare directly against sc_size rather than the label. */
825 int pbn; /* in sc_secsize chunks */
826 long sz; /* in sc_secsize chunks */
828 pbn = bp->b_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
829 sz = howmany(bp->b_bcount, cs->sc_geom.ccg_secsize);
832 * If out of bounds return an error. If at the EOF point,
833 * simply read or write less.
836 if (pbn < 0 || pbn >= cs->sc_size) {
837 bp->b_resid = bp->b_bcount;
/* A request starting exactly at sc_size is not flagged; any other start in this range gets EINVAL. */
838 if (pbn != cs->sc_size) {
839 bp->b_error = EINVAL;
840 bp->b_flags |= B_ERROR | B_INVAL;
846 * If the request crosses EOF, truncate the request.
848 if (pbn + sz > cs->sc_size) {
849 bp->b_bcount = (cs->sc_size - pbn) *
850 cs->sc_geom.ccg_secsize;
/* b_resid starts at the full count; ccdiodone() subtracts from it as components finish. */
854 bp->b_resid = bp->b_bcount;
/*
 * Start a request (ccdstart, per the debug printf below): opens a
 * devstat transaction, makes the block number absolute for non-raw
 * partitions, then repeatedly asks ccdbuffer() for component buffers
 * and hands them to VOP_STRATEGY() until the byte count is used up.
 */
869 struct ccd_softc *cs;
873 struct ccdbuf *cbp[4];
874 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
877 struct partition *pp;
880 if (ccddebug & CCDB_FOLLOW)
881 printf("ccdstart(%x, %x)\n", cs, bp);
884 /* Record the transaction start */
885 devstat_start_transaction(&cs->device_stats);
888 * Translate the partition-relative block number to an absolute.
891 if (ccdpart(bp->b_dev) != RAW_PART) {
892 pp = &cs->sc_label.d_partitions[ccdpart(bp->b_dev)];
897 * Allocate component buffers and fire off the requests
900 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
901 ccdbuffer(cbp, cs, bp, bn, addr, bcount);
/* rcount is the byte count of this component buffer; the loop advances by it. */
902 rcount = cbp[0]->cb_buf.b_bcount;
904 if (cs->sc_cflags & CCDF_MIRROR) {
906 * Mirroring. Writes go to both disks, reads are
907 * taken from whichever disk seems most appropriate.
909 * We attempt to localize reads to the disk whos arm
910 * is nearest the read request. We ignore seeks due
911 * to writes when making this determination and we
912 * also try to avoid hogging.
914 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0) {
915 cbp[0]->cb_buf.b_vp->v_numoutput++;
916 cbp[1]->cb_buf.b_vp->v_numoutput++;
917 VOP_STRATEGY(cbp[0]->cb_buf.b_vp,
919 VOP_STRATEGY(cbp[1]->cb_buf.b_vp,
/*
 * Read: keep using the currently picked side unless bn lies more than
 * one sixteenth of the ccd size away from the block recorded for that
 * side, in which case the other side is picked.
 */
922 int pick = cs->sc_pick;
923 daddr_t range = cs->sc_size / 16;
925 if (bn < cs->sc_blk[pick] - range ||
926 bn > cs->sc_blk[pick] + range
928 cs->sc_pick = pick = 1 - pick;
/* Record the block just past this transfer for the side that was used. */
930 cs->sc_blk[pick] = bn + btodb(rcount);
931 VOP_STRATEGY(cbp[pick]->cb_buf.b_vp,
/* No mirroring: a single component buffer, counted as pending output when it is a write. */
938 if ((cbp[0]->cb_buf.b_flags & B_READ) == 0)
939 cbp[0]->cb_buf.b_vp->v_numoutput++;
940 VOP_STRATEGY(cbp[0]->cb_buf.b_vp, &cbp[0]->cb_buf);
/*
 * Translate the part of request bp that starts at ccd block bn into a
 * component buffer, clipped to what fits on one component (no
 * interleave) or in one interleave chunk. For mirrored configurations
 * a second buffer aimed at the partner component is built as well and
 * the two are cross linked through cb_mirror. ccdstart() reads
 * cb[0]->cb_buf.b_bcount to learn how many bytes were covered.
 */
948 * Build a component buffer header.
951 ccdbuffer(cb, cs, bp, bn, addr, bcount)
953 struct ccd_softc *cs;
959 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
965 if (ccddebug & CCDB_IO)
966 printf("ccdbuffer(%x, %x, %d, %x, %d)\n",
967 cs, bp, bn, addr, bcount);
970 * Determine which component bn falls in.
975 if (cs->sc_ileave == 0) {
977 * Serially concatenated and neither a mirror nor a parity
978 * config. This is a special case.
/* Walk the components until the one whose block range contains cbn. */
983 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
991 * Calculate cbn, the logical superblock (sc_ileave chunks),
992 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
995 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
996 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
999 * Figure out which interleave table to use.
1001 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
1002 if (ii->ii_startblk > cbn)
1008 * off is the logical superblock relative to the beginning
1009 * of this interleave block.
1011 off = cbn - ii->ii_startblk;
1014 * We must calculate which disk component to use (ccdisk),
1015 * and recalculate cbn to be the superblock relative to
1016 * the beginning of the component. This is typically done by
1017 * adding 'off' and ii->ii_startoff together. However, 'off'
1018 * must typically be divided by the number of components in
1019 * this interleave array to be properly convert it from a
1020 * CCD-relative logical superblock number to a
1021 * component-relative superblock number.
1023 if (ii->ii_ndisk == 1) {
1025 * When we have just one disk, it can't be a mirror
1026 * or a parity config.
1028 ccdisk = ii->ii_index[0];
1029 cbn = ii->ii_startoff + off;
1031 if (cs->sc_cflags & CCDF_MIRROR) {
1033 * We have forced a uniform mapping, resulting
1034 * in a single interleave array. We double
1035 * up on the first half of the available
1036 * components and our mirror is in the second
1037 * half. This only works with a single
1038 * interleave array because doubling up
1039 * doubles the number of sectors, so there
1040 * cannot be another interleave array because
1041 * the next interleave array's calculations
1044 int ndisk2 = ii->ii_ndisk / 2;
1045 ccdisk = ii->ii_index[off % ndisk2];
1046 cbn = ii->ii_startoff + off / ndisk2;
/* The mirror partner sits ndisk2 slots further on in sc_cinfo. */
1047 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1048 } else if (cs->sc_cflags & CCDF_PARITY) {
1050 * XXX not implemented yet
1052 int ndisk2 = ii->ii_ndisk - 1;
1053 ccdisk = ii->ii_index[off % ndisk2];
1054 cbn = ii->ii_startoff + off / ndisk2;
1055 if (cbn % ii->ii_ndisk <= ccdisk)
/* Plain striping: consecutive superblocks rotate over all ii_ndisk components. */
1058 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1059 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1063 ci = &cs->sc_cinfo[ccdisk];
1066 * Convert cbn from a superblock to a normal block so it
1067 * can be used to calculate (along with cboff) the normal
1068 * block index into this particular disk.
1070 cbn *= cs->sc_ileave;
1074 * Fill in the component buf structure.
1076 cbp = getccdbuf(NULL);
/* Completion of the component buf is routed to ccdiodone() through b_iodone. */
1077 cbp->cb_buf.b_flags = bp->b_flags | B_CALL;
1078 cbp->cb_buf.b_iodone = (void (*)(struct buf *))ccdiodone;
1079 cbp->cb_buf.b_dev = ci->ci_dev; /* XXX */
/* Component block = cbn + cboff, shifted by the CCD_OFFSET blocks reserved in front. */
1080 cbp->cb_buf.b_blkno = cbn + cboff + CCD_OFFSET;
1081 cbp->cb_buf.b_offset = dbtob(cbn + cboff + CCD_OFFSET);
1082 cbp->cb_buf.b_data = addr;
1083 cbp->cb_buf.b_vp = ci->ci_vp;
/* cbc is the byte count left on this component or in this interleave chunk; the transfer is clipped to it. */
1084 if (cs->sc_ileave == 0)
1085 cbc = dbtob((off_t)(ci->ci_size - cbn));
1087 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1088 cbp->cb_buf.b_bcount = (cbc < bcount) ? cbc : bcount;
1089 cbp->cb_buf.b_bufsize = cbp->cb_buf.b_bcount;
1092 * context for ccdiodone
/* Unit and component are stored as array indices derived by pointer subtraction. */
1095 cbp->cb_unit = cs - ccd_softc;
1096 cbp->cb_comp = ci - cs->sc_cinfo;
1099 if (ccddebug & CCDB_IO)
1100 printf(" dev %x(u%d): cbp %x bn %d addr %x bcnt %d\n",
1101 ci->ci_dev, ci-cs->sc_cinfo, cbp, cbp->cb_buf.b_blkno,
1102 cbp->cb_buf.b_data, cbp->cb_buf.b_bcount);
1107 * Note: both I/O's setup when reading from mirror, but only one
1110 if (cs->sc_cflags & CCDF_MIRROR) {
1111 /* mirror, setup second I/O */
/* The second buffer starts as a byte copy of cb[0]; device, vnode and component index are then replaced. */
1112 cbp = getccdbuf(cb[0]);
1113 cbp->cb_buf.b_dev = ci2->ci_dev;
1114 cbp->cb_buf.b_vp = ci2->ci_vp;
1115 cbp->cb_comp = ci2 - cs->sc_cinfo;
1117 /* link together the ccdbuf's and clear "mirror done" flag */
1118 cb[0]->cb_mirror = cb[1];
1119 cb[1]->cb_mirror = cb[0];
1120 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1121 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
/*
 * Completion of a whole ccd request (ccdintr, per the debug printf
 * below): on error the entire byte count is reported as residual, and
 * the devstat transaction opened in ccdstart() is closed.
 */
1127 struct ccd_softc *cs;
1131 if (ccddebug & CCDB_FOLLOW)
1132 printf("ccdintr(%x, %x)\n", cs, bp);
1135 * Request is done for better or worse, wakeup the top half.
1137 if (bp->b_flags & B_ERROR)
1138 bp->b_resid = bp->b_bcount;
1139 devstat_end_transaction_buf(&cs->device_stats, bp);
/*
 * Completion handler for one component buffer (installed as b_iodone
 * by ccdbuffer()). Reports component errors, runs the mirror handshake
 * (write completion on both sides, or read retry on the partner), and
 * credits the finished bytes against the original request; when the
 * residual reaches zero ccdintr() is called.
 */
1144 * Called at interrupt time.
1145 * Mark the component as done and if all components are done,
1146 * take a ccd interrupt.
1152 struct buf *bp = cbp->cb_obp;
1153 int unit = cbp->cb_unit;
1158 if (ccddebug & CCDB_FOLLOW)
1159 printf("ccdiodone(%x)\n", cbp);
1160 if (ccddebug & CCDB_IO) {
1161 printf("ccdiodone: bp %x bcount %d resid %d\n",
1162 bp, bp->b_bcount, bp->b_resid);
1163 printf(" dev %x(u%d), cbp %x bn %d addr %x bcnt %d\n",
1164 cbp->cb_buf.b_dev, cbp->cb_comp, cbp,
1165 cbp->cb_buf.b_blkno, cbp->cb_buf.b_data,
1166 cbp->cb_buf.b_bcount);
1170 * If an error occured, report it. If this is a mirrored
1171 * configuration and the first of two possible reads, do not
1172 * set the error in the bp yet because the second read may
1176 if (cbp->cb_buf.b_flags & B_ERROR) {
1177 const char *msg = "";
1179 if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1180 (cbp->cb_buf.b_flags & B_READ) &&
1181 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1183 * We will try our read on the other disk down
1184 * below, also reverse the default pick so if we
1185 * are doing a scan we do not keep hitting the
1188 struct ccd_softc *cs = &ccd_softc[unit];
1190 msg = ", trying other disk";
1191 cs->sc_pick = 1 - cs->sc_pick;
1192 cs->sc_blk[cs->sc_pick] = bp->b_blkno;
/* Propagate the failure to the original request, using EIO when the component set no error code. */
1194 bp->b_flags |= B_ERROR;
1195 bp->b_error = cbp->cb_buf.b_error ?
1196 cbp->cb_buf.b_error : EIO;
/* NOTE(review): bp->b_blkno is passed to %d without the (int) cast given to the component block number - confirm the types match. */
1198 printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1199 unit, bp->b_error, cbp->cb_comp,
1200 (int)cbp->cb_buf.b_blkno, bp->b_blkno, msg);
1204 * Process mirror. If we are writing, I/O has been initiated on both
1205 * buffers and we fall through only after both are finished.
1207 * If we are reading only one I/O is initiated at a time. If an
1208 * error occurs we initiate the second I/O and return, otherwise
1209 * we free the second I/O without initiating it.
1212 if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1213 if ((cbp->cb_buf.b_flags & B_READ) == 0) {
1215 * When writing, handshake with the second buffer
1216 * to determine when both are done. If both are not
1217 * done, return here.
/* First finisher: mark the partner so that its completion proceeds to the cleanup. */
1219 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1220 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1227 * When reading, either dispose of the second buffer
1228 * or initiate I/O on the second buffer if an error
1229 * occured with this one.
1231 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1232 if (cbp->cb_buf.b_flags & B_ERROR) {
1233 cbp->cb_mirror->cb_pflags |=
1236 cbp->cb_mirror->cb_buf.b_vp,
1237 &cbp->cb_mirror->cb_buf
1243 putccdbuf(cbp->cb_mirror);
1251 * use b_bufsize to determine how big the original request was rather
1252 * then b_bcount, because b_bcount may have been truncated for EOF.
1254 * XXX We check for an error, but we do not test the resid for an
1255 * aligned EOF condition. This may result in character & block
1256 * device access not recognizing EOF properly when read or written
1257 * sequentially, but will not effect filesystems.
1259 count = cbp->cb_buf.b_bufsize;
1263 * If all done, "interrupt".
/* Credit the bytes of this component against the original request. */
1265 bp->b_resid -= count;
1266 if (bp->b_resid < 0)
1267 panic("ccdiodone: count");
1268 if (bp->b_resid == 0)
1269 ccdintr(&ccd_softc[unit], bp);
/*
 * ioctl entry point. The visible code covers: configuring a unit from
 * a struct ccd_ioctl (component lookup, ccdinit(), disklabel read);
 * unconfiguring it (closing components, freeing tables, removing the
 * devstat entry, zeroing the softc); returning the in-core disklabel
 * or partition info; setting or writing the disklabel; and toggling
 * CCDF_WLABEL. The groupings noted below are inferred from the guard
 * conditions that open each section.
 */
1274 ccdioctl(dev, cmd, data, flag, p)
1281 int unit = ccdunit(dev);
1282 int i, j, lookedup = 0, error = 0;
1284 struct ccd_softc *cs;
1285 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1286 struct ccddevice ccd;
1292 cs = &ccd_softc[unit];
/* ccd starts zeroed; the configure path fills it in, and the unconfigure path copies it over the ccddevs slot. */
1294 bzero(&ccd, sizeof(ccd));
/* Configure: tests whether the unit is already initialised, requires write access and takes the unit lock. */
1298 if (cs->sc_flags & CCDF_INITED)
1301 if ((flag & FWRITE) == 0)
1304 if ((error = ccdlock(cs)) != 0)
1307 /* Fill in some important bits. */
1308 ccd.ccd_unit = unit;
1309 ccd.ccd_interleave = ccio->ccio_ileave;
/*
 * Sanitise the requested flags: mirror and parity need a non-zero
 * interleave, are mutually exclusive (mirror wins), and imply
 * CCDF_UNIFORM.
 */
1310 if (ccd.ccd_interleave == 0 &&
1311 ((ccio->ccio_flags & CCDF_MIRROR) ||
1312 (ccio->ccio_flags & CCDF_PARITY))) {
1313 printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1314 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1316 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1317 (ccio->ccio_flags & CCDF_PARITY)) {
1318 printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1319 ccio->ccio_flags &= ~CCDF_PARITY;
1321 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1322 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1323 printf("ccd%d: mirror/parity forces uniform flag\n",
1325 ccio->ccio_flags |= CCDF_UNIFORM;
/* Only the bits in CCDF_USERMASK are taken over from the caller. */
1327 ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1330 * Allocate space for and copy in the array of
1331 * componet pathnames and device numbers.
/*
 * NOTE(review): ccio_ndisks comes straight from the ioctl argument and
 * scales both allocations and the copyin below; no range check is
 * visible here - confirm it is bounded before this point.
 */
1333 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1334 M_DEVBUF, M_WAITOK);
1335 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1336 M_DEVBUF, M_WAITOK);
/* NOTE(review): the byte count uses sizeof(char **) while cpp was sized with sizeof(char *); both are pointer sizes. */
1338 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1339 ccio->ccio_ndisks * sizeof(char **));
1341 free(vpp, M_DEVBUF);
1342 free(cpp, M_DEVBUF);
1348 if (ccddebug & CCDB_INIT)
1349 for (i = 0; i < ccio->ccio_ndisks; ++i)
1350 printf("ccdioctl: component %d: 0x%x\n",
/* Resolve every component path to a vnode; lookedup bounds the cleanup loops below. */
1354 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1356 if (ccddebug & CCDB_INIT)
1357 printf("ccdioctl: lookedup = %d\n", lookedup);
1359 if ((error = ccdlookup(cpp[i], p, &vpp[i])) != 0) {
/* Lookup failed: close the first lookedup vnodes and release both arrays. */
1360 for (j = 0; j < lookedup; ++j)
1361 (void)vn_close(vpp[j], FREAD|FWRITE,
1363 free(vpp, M_DEVBUF);
1364 free(cpp, M_DEVBUF);
1372 ccd.ccd_ndev = ccio->ccio_ndisks;
1375 * Initialize the ccd. Fills in the softc for us.
1377 if ((error = ccdinit(&ccd, cpp, p)) != 0) {
/* ccdinit() failed: close the components, wipe the softc and release both arrays. */
1378 for (j = 0; j < lookedup; ++j)
1379 (void)vn_close(vpp[j], FREAD|FWRITE,
1381 bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1382 free(vpp, M_DEVBUF);
1383 free(cpp, M_DEVBUF);
1389 * The ccd has been successfully initialized, so
1390 * we can place it into the array and read the disklabel.
1392 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
/* Report the unit number and resulting size back through the ioctl argument. */
1393 ccio->ccio_unit = unit;
1394 ccio->ccio_size = cs->sc_size;
1395 ccdgetdisklabel(dev);
/* Unconfigure: the unit must be initialised and opened for writing, and the lock obtainable. */
1402 if ((cs->sc_flags & CCDF_INITED) == 0)
1405 if ((flag & FWRITE) == 0)
1408 if ((error = ccdlock(cs)) != 0)
1412 * Don't unconfigure if any other partitions are open
1413 * or if both the character and block flavors of this
1414 * partition are open.
1416 part = ccdpart(dev);
1417 pmask = (1 << part);
1418 if ((cs->sc_openmask & ~pmask) ||
1419 ((cs->sc_bopenmask & pmask) &&
1420 (cs->sc_copenmask & pmask))) {
1426 * Free ccd_softc information and clear entry.
1429 /* Close the components and free their pathnames. */
1430 for (i = 0; i < cs->sc_nccdisks; ++i) {
1432 * XXX: this close could potentially fail and
1433 * cause Bad Things. Maybe we need to force
1434 * the close to happen?
1437 if (ccddebug & CCDB_VNODE)
1438 vprint("CCDIOCCLR: vnode info",
1439 cs->sc_cinfo[i].ci_vp);
1441 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1443 free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1446 /* Free interleave index. */
/* The loop ends at the first table entry whose ii_ndisk is 0. */
1447 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1448 free(cs->sc_itable[i].ii_index, M_DEVBUF);
1450 /* Free component info and interleave table. */
1451 free(cs->sc_cinfo, M_DEVBUF);
1452 free(cs->sc_itable, M_DEVBUF);
1453 cs->sc_flags &= ~CCDF_INITED;
1456 * Free ccddevice information and clear entry.
1458 free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1459 free(ccddevs[unit].ccd_vpp, M_DEVBUF);
/* ccd was zeroed on entry; assuming nothing on this path filled it in, this copy clears the slot. */
1461 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1464 * And remove the devstat entry.
1466 devstat_remove_entry(&cs->device_stats);
1468 /* This must be atomic. */
1471 bzero(cs, sizeof(struct ccd_softc));
/* Return a copy of the in-core disklabel. */
1477 if ((cs->sc_flags & CCDF_INITED) == 0)
1480 *(struct disklabel *)data = cs->sc_label;
/* Return pointers to the in-core label and to the entry of this partition. */
1484 if ((cs->sc_flags & CCDF_INITED) == 0)
1487 ((struct partinfo *)data)->disklab = &cs->sc_label;
1488 ((struct partinfo *)data)->part =
1489 &cs->sc_label.d_partitions[ccdpart(dev)];
/* Set the label, and for DIOCWDINFO also write it out; CCDF_LABELLING is raised for the duration. */
1494 if ((cs->sc_flags & CCDF_INITED) == 0)
1497 if ((flag & FWRITE) == 0)
1500 if ((error = ccdlock(cs)) != 0)
1503 cs->sc_flags |= CCDF_LABELLING;
1505 error = setdisklabel(&cs->sc_label,
1506 (struct disklabel *)data, 0);
1508 if (cmd == DIOCWDINFO)
1509 error = writedisklabel(CCDLABELDEV(dev),
1513 cs->sc_flags &= ~CCDF_LABELLING;
/* Set or clear CCDF_WLABEL according to the int argument. */
1522 if ((cs->sc_flags & CCDF_INITED) == 0)
1525 if ((flag & FWRITE) == 0)
1527 if (*(int *)data != 0)
1528 cs->sc_flags |= CCDF_WLABEL;
1530 cs->sc_flags &= ~CCDF_WLABEL;
1544 struct ccd_softc *cs;
1547 if (ccdopen(dev, 0, S_IFBLK, curproc))
1550 cs = &ccd_softc[ccdunit(dev)];
1551 part = ccdpart(dev);
1553 if ((cs->sc_flags & CCDF_INITED) == 0)
1556 if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1559 size = cs->sc_label.d_partitions[part].p_size;
1561 if (ccdclose(dev, 0, S_IFBLK, curproc))
1572 /* Not implemented. */
1577 * Lookup the provided name in the filesystem. If the file exists,
1578 * is a valid block device, and isn't being used by anyone else,
1579 * set *vpp to the file's vnode.
/*
 * ccdlookup: open the component named by `path' for reading and writing
 * on behalf of process `p'; `vpp' is the result slot for its vnode.
 *
 * NOTE(review): this listing omits several short lines of the function
 * (some declarations, returns, gotos and labels).  The comments below
 * describe only the statements that are visible here.
 */
1582 ccdlookup(path, p, vpp)
1585 struct vnode **vpp; /* result */
1587 struct nameidata nd;
/* `path' is a user-space string (UIO_USERSPACE); FOLLOW is requested. */
1591 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
1592 if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
/*
 * The mask must be parenthesized: `&' binds tighter than `|', so the
 * unparenthesized form evaluated (ccddebug & CCDB_FOLLOW) | CCDB_INIT
 * rather than testing whether either debug bit is set in ccddebug.
 */
1594 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1595 printf("ccdlookup: vn_open error = %d\n", error);
/* A use count above one is treated specially (presumably "busy" -- confirm). */
1601 if (vp->v_usecount > 1) {
/* vn_isdisk() is handed &error, so it supplies the error code itself. */
1606 if (!vn_isdisk(vp, &error))
1610 if (ccddebug & CCDB_VNODE)
1611 vprint("ccdlookup: vnode info", vp);
/*
 * Two unlock/free sequences follow.  Only the second also closes the
 * vnode, so it is presumably the failure path -- the label separating
 * them is not visible in this listing.
 */
1614 VOP_UNLOCK(vp, 0, p);
1615 NDFREE(&nd, NDF_ONLY_PNBUF);
1619 VOP_UNLOCK(vp, 0, p);
1620 NDFREE(&nd, NDF_ONLY_PNBUF);
1621 /* vn_close does vrele() for vp */
1622 (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
1627 * Read the disklabel from the ccd. If one is not present, fake one
/*
 * ccdgetdisklabel: fill in the disklabel kept in the softc of the ccd
 * addressed by `dev'.  A default label is built from the softc's size
 * and geometry first; readdisklabel() is then called, and if it returns
 * an error string ccdmakedisklabel() is applied to the label.
 *
 * NOTE(review): this listing omits some short lines of the function;
 * the comments describe only the statements that are visible here.
 */
1631 ccdgetdisklabel(dev)
1634 int unit = ccdunit(dev);
1635 struct ccd_softc *cs = &ccd_softc[unit];
1637 struct disklabel *lp = &cs->sc_label;
1638 struct ccdgeom *ccg = &cs->sc_geom;
/* Start from an all-zero label. */
1640 bzero(lp, sizeof(*lp));
/* Total size comes from the softc, geometry from its ccdgeom. */
1642 lp->d_secperunit = cs->sc_size;
1643 lp->d_secsize = ccg->ccg_secsize;
1644 lp->d_nsectors = ccg->ccg_nsectors;
1645 lp->d_ntracks = ccg->ccg_ntracks;
1646 lp->d_ncylinders = ccg->ccg_ncylinders;
/* Derived value: tracks per cylinder times sectors per track. */
1647 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
/* Both name copies are bounded by the size of the destination field. */
1649 strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1650 lp->d_type = DTYPE_CCD;
1651 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1653 lp->d_interleave = 1;
/*
 * Only the raw partition is described: it starts at offset 0, covers
 * sc_size sectors and is typed FS_UNUSED.  d_npartitions is set so
 * that RAW_PART is the last partition.
 */
1656 lp->d_partitions[RAW_PART].p_offset = 0;
1657 lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1658 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1659 lp->d_npartitions = RAW_PART + 1;
1661 lp->d_bbsize = BBSIZE; /* XXX */
1662 lp->d_sbsize = SBSIZE; /* XXX */
/* Magic numbers go in, then the checksum is taken over the filled label. */
1664 lp->d_magic = DISKMAGIC;
1665 lp->d_magic2 = DISKMAGIC;
1666 lp->d_checksum = dkcksum(&cs->sc_label);
1669 * Call the generic disklabel extraction routine.
/* A non-NULL return is an error string; the label is then patched up. */
1671 errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1672 if (errstring != NULL)
1673 ccdmakedisklabel(cs);
1676 /* It's actually extremely common to have unlabeled ccds. */
/* The error string is printed only when CCDB_LABEL debugging is on. */
1677 if (ccddebug & CCDB_LABEL)
1678 if (errstring != NULL)
1679 printf("ccd%d: %s\n", unit, errstring);
1684 * Take care of things one might want to take care of in the event
1685 * that a disklabel isn't present.
/*
 * ccdmakedisklabel: adjust the label stored in softc `cs' for a ccd
 * without a disklabel.  The visible statements retype the raw partition
 * as FS_BSDFFS and set the pack name to "default label".
 * ccdgetdisklabel() calls this when readdisklabel() returns an error
 * string.
 */
1688 ccdmakedisklabel(cs)
1689 struct ccd_softc *cs;
1691 struct disklabel *lp = &cs->sc_label;
1694 * For historical reasons, if there's no disklabel present
1695 * the raw partition must be marked FS_BSDFFS.
1697 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
/* The copy is bounded by the size of the pack-name field. */
1699 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1703 * Wait interruptibly for an exclusive lock.
1706 * Several drivers do this; it should be abstracted and made MP-safe.
1710 struct ccd_softc *cs;
1714 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1715 cs->sc_flags |= CCDF_WANTED;
1716 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1719 cs->sc_flags |= CCDF_LOCKED;
1724 * Unlock and wake up any waiters.
1728 struct ccd_softc *cs;
1731 cs->sc_flags &= ~CCDF_LOCKED;
1732 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1733 cs->sc_flags &= ~CCDF_WANTED;
1741 struct ccdiinfo *ii;
1745 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1746 printf(" itab[%d]: #dk %d sblk %d soff %d",
1747 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1748 for (i = 0; i < ii->ii_ndisk; i++)
1749 printf(" %d", ii->ii_index[i]);
1755 #endif /* NCCD > 0 */
1757 /* Local Variables: */
1758 /* c-argdecl-indent: 8 */
1759 /* c-continued-statement-offset: 8 */
1760 /* c-indent-level: 8 */