3 /* $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */
6 * Copyright (c) 1995 Jason R. Thorpe.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project
21 * 4. The name of the author may not be used to endorse or promote products
22 * derived from this software without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * Copyright (c) 1988 University of Utah.
39 * Copyright (c) 1990, 1993
40 * The Regents of the University of California. All rights reserved.
42 * This code is derived from software contributed to Berkeley by
43 * the Systems Programming Group of the University of Utah Computer
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. All advertising materials mentioning features or use of this software
55 * must display the following acknowledgement:
56 * This product includes software developed by the University of
57 * California, Berkeley and its contributors.
58 * 4. Neither the name of the University nor the names of its contributors
59 * may be used to endorse or promote products derived from this software
60 * without specific prior written permission.
62 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 * @(#)cd.c 8.2 (Berkeley) 11/16/93
80 * "Concatenated" disk driver.
82 * Dynamic configuration and disklabel support by:
83 * Jason R. Thorpe <thorpej@nas.nasa.gov>
84 * Numerical Aerodynamic Simulation Facility
86 * NASA Ames Research Center
87 * Moffett Field, CA 94035
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
100 #include <sys/sysctl.h>
101 #include <sys/disklabel.h>
102 #include <ufs/ffs/fs.h>
103 #include <sys/devicestat.h>
104 #include <sys/fcntl.h>
105 #include <sys/vnode.h>
107 #include <sys/ccdvar.h>
109 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
111 #if defined(CCDDEBUG) && !defined(DEBUG)
116 #define CCDB_FOLLOW 0x01
117 #define CCDB_INIT 0x02
119 #define CCDB_LABEL 0x08
120 #define CCDB_VNODE 0x10
121 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
123 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
126 #define ccdunit(x) dkunit(x)
127 #define ccdpart(x) dkpart(x)
130 This is how mirroring works (only writes are special):
132 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
133 linked together by the cb_mirror field. "cb_pflags &
134 CCDPF_MIRROR_DONE" is set to 0 on both of them.
136 When a component returns to ccdiodone(), it checks if "cb_pflags &
137 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
138 flag and returns. If it is, it means its partner has already
139 returned, so it will go to the regular cleanup.
144 struct bio cb_buf; /* new I/O buf */
145 struct bio *cb_obp; /* ptr. to original I/O buf */
146 struct ccdbuf *cb_freenext; /* free list link */
147 int cb_unit; /* target unit */
148 int cb_comp; /* target component */
149 int cb_pflags; /* mirror/parity status flag */
150 struct ccdbuf *cb_mirror; /* mirror counterpart */
153 /* bits in cb_pflags */
154 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
156 #define CCDLABELDEV(dev) \
157 (makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
159 /* convenient macros for often-used statements */
160 #define IS_ALLOCATED(unit) (ccdfind(unit) != NULL)
161 #define IS_INITED(cs) (((cs)->sc_flags & CCDF_INITED) != 0)
163 static d_open_t ccdopen;
164 static d_close_t ccdclose;
165 static d_strategy_t ccdstrategy;
166 static d_ioctl_t ccdioctl;
167 static d_psize_t ccdsize;
169 #define NCCDFREEHIWAT 16
171 #define CDEV_MAJOR 74
173 static struct cdevsw ccd_cdevsw = {
175 /* close */ ccdclose,
177 /* write */ physwrite,
178 /* ioctl */ ccdioctl,
181 /* strategy */ ccdstrategy,
183 /* maj */ CDEV_MAJOR,
188 static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
190 static struct ccd_s *ccdfind(int);
191 static struct ccd_s *ccdnew(int);
192 static int ccddestroy(struct ccd_s *, struct proc *);
194 /* called during module initialization */
195 static void ccdattach(void);
196 static int ccd_modevent(module_t, int, void *);
198 /* called by biodone() at interrupt time */
199 static void ccdiodone(struct bio *bp);
201 static void ccdstart(struct ccd_s *, struct bio *);
202 static void ccdinterleave(struct ccd_s *, int);
203 static void ccdintr(struct ccd_s *, struct bio *);
204 static int ccdinit(struct ccd_s *, char **, struct thread *);
205 static int ccdlookup(char *, struct thread *p, struct vnode **);
206 static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
207 struct bio *, daddr_t, caddr_t, long);
208 static void ccdgetdisklabel(dev_t);
209 static void ccdmakedisklabel(struct ccd_s *);
210 static int ccdlock(struct ccd_s *);
211 static void ccdunlock(struct ccd_s *);
214 static void printiinfo(struct ccdiinfo *);
217 /* Non-private for the benefit of libkvm. */
218 struct ccdbuf *ccdfreebufs;
219 static int numccdfreebufs;
222 * getccdbuf() - Allocate and zero a ccd buffer.
224 * This routine is called at splbio().
229 getccdbuf(struct ccdbuf *cpy)
234 * Allocate from freelist or malloc as necessary
236 if ((cbp = ccdfreebufs) != NULL) {
237 ccdfreebufs = cbp->cb_freenext;
240 cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
244 * Used by mirroring code
247 bcopy(cpy, cbp, sizeof(struct ccdbuf));
249 bzero(cbp, sizeof(struct ccdbuf));
252 * independent struct bio initialization
259 * putccdbuf() - Free a ccd buffer.
261 * This routine is called at splbio().
266 putccdbuf(struct ccdbuf *cbp)
269 if (numccdfreebufs < NCCDFREEHIWAT) {
270 cbp->cb_freenext = ccdfreebufs;
274 free((caddr_t)cbp, M_DEVBUF);
280 * Number of blocks to leave untouched in front of a component partition.
281 * This is to avoid violating its disklabel area when it starts at the
282 * beginning of the slice.
284 #if !defined(CCD_OFFSET)
285 #define CCD_OFFSET 16
288 static struct ccd_s *
291 struct ccd_s *sc = NULL;
293 /* XXX: LOCK(unique unit numbers) */
294 LIST_FOREACH(sc, &ccd_softc_list, list) {
295 if (sc->sc_unit == unit)
298 /* XXX: UNLOCK(unique unit numbers) */
299 return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
302 static struct ccd_s *
307 /* XXX: LOCK(unique unit numbers) */
308 if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
311 MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
313 LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
314 /* XXX: UNLOCK(unique unit numbers) */
319 ccddestroy(struct ccd_s *sc, struct proc *p)
322 /* XXX: LOCK(unique unit numbers) */
323 LIST_REMOVE(sc, list);
324 /* XXX: UNLOCK(unique unit numbers) */
330 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
337 i = dev_stdclone(name, &s, "ccd", &u);
340 if (*s < 'a' || *s > 'h')
344 *dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
345 UID_ROOT, GID_OPERATOR, 0640, name);
349 * Called by main() during pseudo-device attachment. All we need
350 * to do is to add devsw entries.
356 EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
360 ccd_modevent(module_t mod, int type, void *data)
370 printf("ccd0: Unload not supported!\n");
383 DEV_MODULE(ccd, ccd_modevent, NULL);
386 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
388 struct ccdcinfo *ci = NULL; /* XXX */
394 struct ccdgeom *ccg = &cs->sc_geom;
395 char *tmppath = NULL;
401 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
402 printf("ccdinit: unit %d\n", cs->sc_unit);
407 /* Allocate space for the component info. */
408 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
412 * Verify that each component piece exists and record
413 * relevant information about it.
417 tmppath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
418 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
420 ci = &cs->sc_cinfo[ix];
424 * Copy in the pathname of the component.
426 if ((error = copyinstr(cpaths[ix], tmppath,
427 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
429 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
430 printf("ccd%d: can't copy path, error = %d\n",
435 ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
436 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
438 ci->ci_dev = vn_todev(vp);
441 * Get partition information for the component.
443 error = VOP_IOCTL(vp, DIOCGMEDIASIZE, (caddr_t)&mediasize,
444 FREAD, td->td_ucred, td);
447 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
448 printf("ccd%d: %s: ioctl failed, error = %d\n",
449 cs->sc_unit, ci->ci_path, error);
454 * Get sector size information for the component.
456 error = VOP_IOCTL(vp, DIOCGSECTORSIZE, (caddr_t)§orsize,
457 FREAD, td->td_ucred, td);
460 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
461 printf("ccd%d: %s: ioctl failed, error = %d\n",
462 cs->sc_unit, ci->ci_path, error);
466 if (sectorsize > maxsecsize)
467 maxsecsize = sectorsize;
468 size = mediasize / DEV_BSIZE - CCD_OFFSET;
471 * Calculate the size, truncating to an interleave
472 * boundary if necessary.
475 if (cs->sc_ileave > 1)
476 size -= size % cs->sc_ileave;
480 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
481 printf("ccd%d: %s: size == 0\n",
482 cs->sc_unit, ci->ci_path);
488 if (minsize == 0 || size < minsize)
494 free(tmppath, M_DEVBUF);
498 * Don't allow the interleave to be smaller than
499 * the biggest component sector.
501 if ((cs->sc_ileave > 0) &&
502 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
504 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
505 printf("ccd%d: interleave must be at least %d\n",
506 cs->sc_unit, (maxsecsize / DEV_BSIZE));
513 * If uniform interleave is desired set all sizes to that of
514 * the smallest component. This will guarantee that a single
515 * interleave table is generated.
517 * Lost space must be taken into account when calculating the
518 * overall size. Half the space is lost when CCDF_MIRROR is
519 * specified. One disk is lost when CCDF_PARITY is specified.
521 if (cs->sc_flags & CCDF_UNIFORM) {
522 for (ci = cs->sc_cinfo;
523 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
524 ci->ci_size = minsize;
526 if (cs->sc_flags & CCDF_MIRROR) {
528 * Check to see if an even number of components
529 * have been specified. The interleave must also
530 * be non-zero in order for us to be able to
531 * guarantee the topology.
533 if (cs->sc_nccdisks % 2) {
534 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
538 if (cs->sc_ileave == 0) {
539 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
543 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
544 } else if (cs->sc_flags & CCDF_PARITY) {
545 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
547 if (cs->sc_ileave == 0) {
548 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
552 cs->sc_size = cs->sc_nccdisks * minsize;
557 * Construct the interleave table.
559 ccdinterleave(cs, cs->sc_unit);
562 * Create pseudo-geometry based on 1MB cylinders. It's
565 ccg->ccg_secsize = maxsecsize;
566 ccg->ccg_ntracks = 1;
567 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
568 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
571 * Add a devstat entry for this device.
573 devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
574 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
575 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
576 DEVSTAT_PRIORITY_ARRAY);
578 cs->sc_flags |= CCDF_INITED;
579 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */
582 while (ci > cs->sc_cinfo) {
584 free(ci->ci_path, M_DEVBUF);
587 free(tmppath, M_DEVBUF);
588 free(cs->sc_cinfo, M_DEVBUF);
593 ccdinterleave(struct ccd_s *cs, int unit)
595 struct ccdcinfo *ci, *smallci;
602 if (ccddebug & CCDB_INIT)
603 printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
607 * Allocate an interleave table. The worst case occurs when each
608 * of N disks is of a different size, resulting in N interleave
611 * Chances are this is too big, but we don't care.
613 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
614 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
618 * Trivial case: no interleave (actually interleave of disk size).
619 * Each table entry represents a single component in its entirety.
621 * An interleave of 0 may not be used with a mirror or parity setup.
623 if (cs->sc_ileave == 0) {
627 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
628 /* Allocate space for ii_index. */
629 ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
631 ii->ii_startblk = bn;
633 ii->ii_index[0] = ix;
634 bn += cs->sc_cinfo[ix].ci_size;
639 if (ccddebug & CCDB_INIT)
640 printiinfo(cs->sc_itable);
646 * The following isn't fast or pretty; it doesn't have to be.
650 for (ii = cs->sc_itable; ; ii++) {
652 * Allocate space for ii_index. We might allocate more than
655 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
659 * Locate the smallest of the remaining components
662 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
664 if (ci->ci_size > size &&
666 ci->ci_size < smallci->ci_size)) {
672 * Nobody left, all done
674 if (smallci == NULL) {
680 * Record starting logical block using an sc_ileave blocksize.
682 ii->ii_startblk = bn / cs->sc_ileave;
685 * Record starting component block using an sc_ileave
686 * blocksize. This value is relative to the beginning of
689 ii->ii_startoff = lbn;
692 * Determine how many disks take part in this interleave
693 * and record their indices.
696 for (ci = cs->sc_cinfo;
697 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
698 if (ci->ci_size >= smallci->ci_size) {
699 ii->ii_index[ix++] = ci - cs->sc_cinfo;
703 bn += ix * (smallci->ci_size - size);
704 lbn = smallci->ci_size / cs->sc_ileave;
705 size = smallci->ci_size;
708 if (ccddebug & CCDB_INIT)
709 printiinfo(cs->sc_itable);
715 ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
717 int unit = ccdunit(dev);
719 struct disklabel *lp;
720 int error = 0, part, pmask;
723 if (ccddebug & CCDB_FOLLOW)
724 printf("ccdopen(%p, %x)\n", dev, flags);
727 cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
729 if ((error = ccdlock(cs)) != 0)
738 * If we're initialized, check to see if there are any other
739 * open partitions. If not, then it's safe to update
740 * the in-core disklabel.
742 if (IS_INITED(cs) && (cs->sc_openmask == 0))
743 ccdgetdisklabel(dev);
745 /* Check that the partition exists. */
746 if (part != RAW_PART && ((part >= lp->d_npartitions) ||
747 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
752 cs->sc_openmask |= pmask;
760 ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
762 int unit = ccdunit(dev);
767 if (ccddebug & CCDB_FOLLOW)
768 printf("ccdclose(%p, %x)\n", dev, flags);
771 if (!IS_ALLOCATED(unit))
775 if ((error = ccdlock(cs)) != 0)
780 /* ...that much closer to allowing unconfiguration... */
781 cs->sc_openmask &= ~(1 << part);
782 /* collect "garbage" if possible */
783 if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
784 ccddestroy(cs, td->td_proc);
791 ccdstrategy(struct bio *bp)
793 int unit = ccdunit(bp->bio_dev);
794 struct ccd_s *cs = ccdfind(unit);
797 struct disklabel *lp;
800 if (ccddebug & CCDB_FOLLOW)
801 printf("ccdstrategy(%p): unit %d\n", bp, unit);
803 if (!IS_INITED(cs)) {
804 biofinish(bp, NULL, ENXIO);
808 /* If it's a nil transfer, wake up the top half now. */
809 if (bp->bio_bcount == 0) {
817 * Do bounds checking and adjust transfer. If there's an
818 * error, the bounds check will flag that for us.
820 wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
821 if (ccdpart(bp->bio_dev) != RAW_PART) {
822 if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
827 int pbn; /* in sc_secsize chunks */
828 long sz; /* in sc_secsize chunks */
830 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
831 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
834 * If out of bounds return an error. If at the EOF point,
835 * simply read or write less.
838 if (pbn < 0 || pbn >= cs->sc_size) {
839 bp->bio_resid = bp->bio_bcount;
840 if (pbn != cs->sc_size)
841 biofinish(bp, NULL, EINVAL);
848 * If the request crosses EOF, truncate the request.
850 if (pbn + sz > cs->sc_size) {
851 bp->bio_bcount = (cs->sc_size - pbn) *
852 cs->sc_geom.ccg_secsize;
856 bp->bio_resid = bp->bio_bcount;
868 ccdstart(struct ccd_s *cs, struct bio *bp)
871 struct ccdbuf *cbp[4];
872 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
875 struct partition *pp;
878 if (ccddebug & CCDB_FOLLOW)
879 printf("ccdstart(%p, %p)\n", cs, bp);
882 /* Record the transaction start */
883 devstat_start_transaction(&cs->device_stats);
886 * Translate the partition-relative block number to an absolute.
889 if (ccdpart(bp->bio_dev) != RAW_PART) {
890 pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
895 * Allocate component buffers and fire off the requests
898 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
899 ccdbuffer(cbp, cs, bp, bn, addr, bcount);
900 rcount = cbp[0]->cb_buf.bio_bcount;
902 if (cs->sc_cflags & CCDF_MIRROR) {
904 * Mirroring. Writes go to both disks, reads are
905 * taken from whichever disk seems most appropriate.
907 * We attempt to localize reads to the disk whose arm
908 * is nearest the read request. We ignore seeks due
909 * to writes when making this determination and we
910 * also try to avoid hogging.
912 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
913 BIO_STRATEGY(&cbp[0]->cb_buf, 0);
914 BIO_STRATEGY(&cbp[1]->cb_buf, 0);
916 int pick = cs->sc_pick;
917 daddr_t range = cs->sc_size / 16;
919 if (bn < cs->sc_blk[pick] - range ||
920 bn > cs->sc_blk[pick] + range
922 cs->sc_pick = pick = 1 - pick;
924 cs->sc_blk[pick] = bn + btodb(rcount);
925 BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
931 BIO_STRATEGY(&cbp[0]->cb_buf, 0);
939 * Build a component buffer header.
942 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
944 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
950 if (ccddebug & CCDB_IO)
951 printf("ccdbuffer(%p, %p, %d, %p, %ld)\n",
952 cs, bp, bn, addr, bcount);
955 * Determine which component bn falls in.
960 if (cs->sc_ileave == 0) {
962 * Serially concatenated and neither a mirror nor a parity
963 * config. This is a special case.
968 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
976 * Calculate cbn, the logical superblock (sc_ileave chunks),
977 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
980 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
981 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
984 * Figure out which interleave table to use.
986 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
987 if (ii->ii_startblk > cbn)
993 * off is the logical superblock relative to the beginning
994 * of this interleave block.
996 off = cbn - ii->ii_startblk;
999 * We must calculate which disk component to use (ccdisk),
1000 * and recalculate cbn to be the superblock relative to
1001 * the beginning of the component. This is typically done by
1002 * adding 'off' and ii->ii_startoff together. However, 'off'
1003 * must typically be divided by the number of components in
1004 * this interleave array to properly convert it from a
1005 * CCD-relative logical superblock number to a
1006 * component-relative superblock number.
1008 if (ii->ii_ndisk == 1) {
1010 * When we have just one disk, it can't be a mirror
1011 * or a parity config.
1013 ccdisk = ii->ii_index[0];
1014 cbn = ii->ii_startoff + off;
1016 if (cs->sc_cflags & CCDF_MIRROR) {
1018 * We have forced a uniform mapping, resulting
1019 * in a single interleave array. We double
1020 * up on the first half of the available
1021 * components and our mirror is in the second
1022 * half. This only works with a single
1023 * interleave array because doubling up
1024 * doubles the number of sectors, so there
1025 * cannot be another interleave array because
1026 * the next interleave array's calculations
1029 int ndisk2 = ii->ii_ndisk / 2;
1030 ccdisk = ii->ii_index[off % ndisk2];
1031 cbn = ii->ii_startoff + off / ndisk2;
1032 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1033 } else if (cs->sc_cflags & CCDF_PARITY) {
1035 * XXX not implemented yet
1037 int ndisk2 = ii->ii_ndisk - 1;
1038 ccdisk = ii->ii_index[off % ndisk2];
1039 cbn = ii->ii_startoff + off / ndisk2;
1040 if (cbn % ii->ii_ndisk <= ccdisk)
1043 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1044 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1048 ci = &cs->sc_cinfo[ccdisk];
1051 * Convert cbn from a superblock to a normal block so it
1052 * can be used to calculate (along with cboff) the normal
1053 * block index into this particular disk.
1055 cbn *= cs->sc_ileave;
1059 * Fill in the component buf structure.
1061 cbp = getccdbuf(NULL);
1062 cbp->cb_buf.bio_cmd = bp->bio_cmd;
1063 cbp->cb_buf.bio_done = ccdiodone;
1064 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */
1065 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
1066 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1067 cbp->cb_buf.bio_data = addr;
1068 if (cs->sc_ileave == 0)
1069 cbc = dbtob((off_t)(ci->ci_size - cbn));
1071 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1072 cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
1073 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
1076 * context for ccdiodone
1079 cbp->cb_unit = cs->sc_unit;
1080 cbp->cb_comp = ci - cs->sc_cinfo;
1083 if (ccddebug & CCDB_IO)
1084 printf(" dev %p(u%ld): cbp %p bn %lld addr %p bcnt %ld\n",
1085 ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
1086 (long long)cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1087 cbp->cb_buf.bio_bcount);
1092 * Note: both I/O's setup when reading from mirror, but only one
1095 if (cs->sc_cflags & CCDF_MIRROR) {
1096 /* mirror, setup second I/O */
1097 cbp = getccdbuf(cb[0]);
1098 cbp->cb_buf.bio_dev = ci2->ci_dev;
1099 cbp->cb_comp = ci2 - cs->sc_cinfo;
1101 /* link together the ccdbuf's and clear "mirror done" flag */
1102 cb[0]->cb_mirror = cb[1];
1103 cb[1]->cb_mirror = cb[0];
1104 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1105 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1110 ccdintr(struct ccd_s *cs, struct bio *bp)
1113 if (ccddebug & CCDB_FOLLOW)
1114 printf("ccdintr(%p, %p)\n", cs, bp);
1117 * Request is done for better or worse, wakeup the top half.
1119 if (bp->bio_flags & BIO_ERROR)
1120 bp->bio_resid = bp->bio_bcount;
1121 biofinish(bp, &cs->device_stats, 0);
1125 * Called at interrupt time.
1126 * Mark the component as done and if all components are done,
1127 * take a ccd interrupt.
1130 ccdiodone(struct bio *ibp)
1132 struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1133 struct bio *bp = cbp->cb_obp;
1134 int unit = cbp->cb_unit;
1139 if (ccddebug & CCDB_FOLLOW)
1140 printf("ccdiodone(%p)\n", cbp);
1141 if (ccddebug & CCDB_IO) {
1142 printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1143 bp, bp->bio_bcount, bp->bio_resid);
1144 printf(" dev %p(u%d), cbp %p bn %lld addr %p bcnt %ld\n",
1145 cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1146 (long long)cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1147 cbp->cb_buf.bio_bcount);
1151 * If an error occurred, report it. If this is a mirrored
1152 * configuration and the first of two possible reads, do not
1153 * set the error in the bp yet because the second read may
1157 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1158 const char *msg = "";
1160 if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1161 (cbp->cb_buf.bio_cmd == BIO_READ) &&
1162 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1164 * We will try our read on the other disk down
1165 * below, also reverse the default pick so if we
1166 * are doing a scan we do not keep hitting the
1169 struct ccd_s *cs = ccdfind(unit);
1171 msg = ", trying other disk";
1172 cs->sc_pick = 1 - cs->sc_pick;
1173 cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1175 bp->bio_flags |= BIO_ERROR;
1176 bp->bio_error = cbp->cb_buf.bio_error ?
1177 cbp->cb_buf.bio_error : EIO;
1179 printf("ccd%d: error %d on component %d block %d (ccd block %lld)%s\n",
1180 unit, bp->bio_error, cbp->cb_comp,
1181 (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1185 * Process mirror. If we are writing, I/O has been initiated on both
1186 * buffers and we fall through only after both are finished.
1188 * If we are reading only one I/O is initiated at a time. If an
1189 * error occurs we initiate the second I/O and return, otherwise
1190 * we free the second I/O without initiating it.
1193 if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1194 if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1196 * When writing, handshake with the second buffer
1197 * to determine when both are done. If both are not
1198 * done, return here.
1200 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1201 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1208 * When reading, either dispose of the second buffer
1209 * or initiate I/O on the second buffer if an error
1210 * occurred with this one.
1212 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1213 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1214 cbp->cb_mirror->cb_pflags |=
1216 BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1221 putccdbuf(cbp->cb_mirror);
1229 * use bio_caller1 to determine how big the original request was rather
1230 * than bio_bcount, because bio_bcount may have been truncated for EOF.
1232 * XXX We check for an error, but we do not test the resid for an
1233 * aligned EOF condition. This may result in character & block
1234 * device access not recognizing EOF properly when read or written
1235 * sequentially, but will not affect filesystems.
1237 count = (long)cbp->cb_buf.bio_caller1;
1241 * If all done, "interrupt".
1243 bp->bio_resid -= count;
1244 if (bp->bio_resid < 0)
1245 panic("ccdiodone: count");
1246 if (bp->bio_resid == 0)
1247 ccdintr(ccdfind(unit), bp);
1252 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1254 int unit = ccdunit(dev);
1255 int i, j, lookedup = 0, error = 0;
1258 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1262 if (!IS_ALLOCATED(unit))
1271 if ((flag & FWRITE) == 0)
1274 if ((error = ccdlock(cs)) != 0)
1277 if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1280 /* Fill in some important bits. */
1281 cs->sc_ileave = ccio->ccio_ileave;
1282 if (cs->sc_ileave == 0 &&
1283 ((ccio->ccio_flags & CCDF_MIRROR) ||
1284 (ccio->ccio_flags & CCDF_PARITY))) {
1285 printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1286 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1288 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1289 (ccio->ccio_flags & CCDF_PARITY)) {
1290 printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1291 ccio->ccio_flags &= ~CCDF_PARITY;
1293 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1294 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1295 printf("ccd%d: mirror/parity forces uniform flag\n",
1297 ccio->ccio_flags |= CCDF_UNIFORM;
1299 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1302 * Allocate space for and copy in the array of
1303 * component pathnames and device numbers.
1305 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1306 M_DEVBUF, M_WAITOK);
1307 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1308 M_DEVBUF, M_WAITOK);
1310 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1311 ccio->ccio_ndisks * sizeof(char **));
1313 free(vpp, M_DEVBUF);
1314 free(cpp, M_DEVBUF);
1320 if (ccddebug & CCDB_INIT)
1321 for (i = 0; i < ccio->ccio_ndisks; ++i)
1322 printf("ccdioctl: component %d: %p\n",
1326 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1328 if (ccddebug & CCDB_INIT)
1329 printf("ccdioctl: lookedup = %d\n", lookedup);
1331 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1332 for (j = 0; j < lookedup; ++j)
1333 (void)vn_close(vpp[j], FREAD|FWRITE,
1335 free(vpp, M_DEVBUF);
1336 free(cpp, M_DEVBUF);
1343 cs->sc_nccdisks = ccio->ccio_ndisks;
1346 * Initialize the ccd. Fills in the softc for us.
1348 if ((error = ccdinit(cs, cpp, td)) != 0) {
1349 for (j = 0; j < lookedup; ++j)
1350 (void)vn_close(vpp[j], FREAD|FWRITE,
1353 * We can't ccddestroy() cs just yet, because nothing
1354 * prevents user-level app to do another ioctl()
1355 * without closing the device first, therefore
1356 * declare unit null and void and let ccdclose()
1357 * destroy it when it is safe to do so.
1359 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1360 free(vpp, M_DEVBUF);
1361 free(cpp, M_DEVBUF);
1367 * The ccd has been successfully initialized, so
1368 * we can place it into the array and read the disklabel.
1370 ccio->ccio_unit = unit;
1371 ccio->ccio_size = cs->sc_size;
1372 ccdgetdisklabel(dev);
1382 if ((flag & FWRITE) == 0)
1385 if ((error = ccdlock(cs)) != 0)
1388 /* Don't unconfigure if any other partitions are open */
1389 part = ccdpart(dev);
1390 pmask = (1 << part);
1391 if ((cs->sc_openmask & ~pmask)) {
1396 /* Declare unit null and void (reset all flags) */
1397 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1399 /* Close the components and free their pathnames. */
1400 for (i = 0; i < cs->sc_nccdisks; ++i) {
1402 * XXX: this close could potentially fail and
1403 * cause Bad Things. Maybe we need to force
1404 * the close to happen?
1407 if (ccddebug & CCDB_VNODE)
1408 vprint("CCDIOCCLR: vnode info",
1409 cs->sc_cinfo[i].ci_vp);
1411 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1413 free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1416 /* Free interleave index. */
1417 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1418 free(cs->sc_itable[i].ii_index, M_DEVBUF);
1420 /* Free component info and interleave table. */
1421 free(cs->sc_cinfo, M_DEVBUF);
1422 free(cs->sc_itable, M_DEVBUF);
1423 free(cs->sc_vpp, M_DEVBUF);
1425 /* And remove the devstat entry. */
1426 devstat_remove_entry(&cs->device_stats);
1428 /* This must be atomic. */
1438 struct ccdconf *conf = (struct ccdconf *)data;
1439 struct ccd_s *tmpcs;
1440 struct ccd_s *ubuf = conf->buffer;
1442 /* XXX: LOCK(unique unit numbers) */
1443 LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1444 if (IS_INITED(tmpcs))
1447 if (conf->size == 0) {
1448 conf->size = sizeof(struct ccd_s) * ninit;
1450 } else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1451 (conf->size % sizeof(struct ccd_s) != 0)) {
1452 /* XXX: UNLOCK(unique unit numbers) */
1457 LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1458 if (!IS_INITED(tmpcs))
1460 error = copyout(tmpcs, --ubuf,
1461 sizeof(struct ccd_s));
1463 /* XXX: UNLOCK(unique unit numbers) */
1466 /* XXX: UNLOCK(unique unit numbers) */
1476 struct ccdcpps *cpps = (struct ccdcpps *)data;
1477 char *ubuf = cpps->buffer;
1480 for (i = 0; i < cs->sc_nccdisks; ++i)
1481 len += cs->sc_cinfo[i].ci_pathlen;
1483 if (cpps->size == 0) {
1486 } else if (cpps->size != len) {
1490 for (i = 0; i < cs->sc_nccdisks; ++i) {
1491 len = cs->sc_cinfo[i].ci_pathlen;
1492 error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1505 *(struct disklabel *)data = cs->sc_label;
1513 if ((flag & FWRITE) == 0)
1516 if ((error = ccdlock(cs)) != 0)
1519 cs->sc_flags |= CCDF_LABELLING;
1521 error = setdisklabel(&cs->sc_label,
1522 (struct disklabel *)data, 0);
1524 if (cmd == DIOCWDINFO)
1525 error = writedisklabel(CCDLABELDEV(dev),
1529 cs->sc_flags &= ~CCDF_LABELLING;
1541 if ((flag & FWRITE) == 0)
1543 if (*(int *)data != 0)
1544 cs->sc_flags |= CCDF_WLABEL;
1546 cs->sc_flags &= ~CCDF_WLABEL;
1562 if (ccdopen(dev, 0, S_IFCHR, curthread))
1565 cs = ccdfind(ccdunit(dev));
1566 part = ccdpart(dev);
1571 if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1574 size = cs->sc_label.d_partitions[part].p_size;
1576 if (ccdclose(dev, 0, S_IFCHR, curthread))
1583 * Lookup the provided name in the filesystem. If the file exists,
1584 * is a valid block device, and isn't being used by anyone else,
1585 * set *vpp to the file's vnode.
/*
 * ccdlookup: open the file named by path for reading and writing with
 * vn_open() and vet the resulting vnode (use count, vn_isdisk()).
 *
 * Parameters: path is the pathname handed to NDINIT(); td is the calling
 * thread, also used for unlocking and for the credentials passed to
 * vn_close(); vpp is where the vnode is returned on success, according
 * to the description that precedes this function.
 *
 * NOTE(review): several lines of this function (local declarations, the
 * assignment of vp, labels and return statements) are not visible in
 * this listing, so the notes below cover only the statements that are.
 */
1588 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1590 struct nameidata nd;
/*
 * UIO_USERSPACE marks path as a user-space address for the lookup.
 * FOLLOW presumably asks for symbolic links to be followed -- confirm
 * against namei(9).
 */
1594 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
/* Request both read and write access from vn_open(). */
1595 flags = FREAD | FWRITE;
1596 if ((error = vn_open(&nd, &flags, 0)) != 0) {
/* Diagnostic output only, gated on the ccddebug flag bits. */
1598 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1599 printf("ccdlookup: vn_open error = %d\n", error);
/*
 * A use count above one presumably means the vnode is already in use
 * elsewhere, which the function description rules out.  The body of
 * this branch is not visible here -- TODO confirm the error returned.
 */
1605 if (vp->v_usecount > 1) {
/*
 * vn_isdisk() is handed &error, presumably so it can store the reason
 * when the vnode is not a disk -- TODO confirm.
 */
1610 if (!vn_isdisk(vp, &error))
1614 if (ccddebug & CCDB_VNODE)
1615 vprint("ccdlookup: vnode info", vp);
/*
 * NOTE(review): the unlock/NDFREE pair appears twice.  The first pair
 * looks like the success path and the second pair, which is followed
 * by vn_close(), looks like the failure path; the labels and returns
 * separating them are not visible in this listing.
 */
1618 VOP_UNLOCK(vp, 0, td);
1619 NDFREE(&nd, NDF_ONLY_PNBUF);
1623 VOP_UNLOCK(vp, 0, td);
1624 NDFREE(&nd, NDF_ONLY_PNBUF);
1625 /* vn_close does vrele() for vp */
/* The result of vn_close() is deliberately discarded. */
1626 (void)vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
1631 * Read the disklabel from the ccd. If one is not present, fake one
/*
 * ccdgetdisklabel: build a default in-core label in cs->sc_label from
 * the size and geometry held in the softc, then call readdisklabel()
 * on the label device.  If readdisklabel() returns a non-NULL string,
 * ccdmakedisklabel() is called to adjust the default label.
 *
 * Parameter: dev identifies the ccd; its unit number is taken with
 * ccdunit() and the softc is found with ccdfind().
 *
 * NOTE(review): the result of ccdfind() is dereferenced without a NULL
 * check in the visible lines -- confirm callers only pass configured
 * units.  Some lines of this function are not visible in this listing.
 */
1635 ccdgetdisklabel(dev_t dev)
1637 int unit = ccdunit(dev);
1638 struct ccd_s *cs = ccdfind(unit);
1640 struct disklabel *lp = &cs->sc_label;
1641 struct ccdgeom *ccg = &cs->sc_geom;
/* Start from an all-zero label. */
1643 bzero(lp, sizeof(*lp));
/* Size comes from sc_size; geometry comes from sc_geom. */
1645 lp->d_secperunit = cs->sc_size;
1646 lp->d_secsize = ccg->ccg_secsize;
1647 lp->d_nsectors = ccg->ccg_nsectors;
1648 lp->d_ntracks = ccg->ccg_ntracks;
1649 lp->d_ncylinders = ccg->ccg_ncylinders;
/* Derived from the two fields just set rather than copied from ccg. */
1650 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
/*
 * strncpy() does not add a terminating NUL when the source fills the
 * destination.  Assumes both name fields are longer than the literals
 * copied here -- TODO confirm against struct disklabel.
 */
1652 strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1653 lp->d_type = DTYPE_CCD;
1654 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1656 lp->d_interleave = 1;
/*
 * Of the partition entries, only the raw partition is filled in by the
 * visible lines: it starts at offset 0 and spans the whole ccd.
 */
1659 lp->d_partitions[RAW_PART].p_offset = 0;
1660 lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1661 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1662 lp->d_npartitions = RAW_PART + 1;
1664 lp->d_bbsize = BBSIZE; /* XXX */
1665 lp->d_sbsize = SBSIZE; /* XXX */
/* The checksum is computed last, after the fields above are set. */
1667 lp->d_magic = DISKMAGIC;
1668 lp->d_magic2 = DISKMAGIC;
1669 lp->d_checksum = dkcksum(&cs->sc_label);
1672 * Call the generic disklabel extraction routine.
1674 errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
/* A non-NULL result is used as an error message and is printed below. */
1675 if (errstring != NULL)
1676 ccdmakedisklabel(cs);
1679 /* It's actually extremely common to have unlabeled ccds. */
1680 if (ccddebug & CCDB_LABEL)
1681 if (errstring != NULL)
1682 printf("ccd%d: %s\n", unit, errstring);
1687 * Take care of things one might want to take care of in the event
1688 * that a disklabel isn't present.
/*
 * ccdmakedisklabel: adjust the in-core label cs->sc_label in place.
 * The raw partition's p_fstype is set to FS_BSDFFS and the pack name
 * is overwritten with "default label".  No other label field is
 * touched by the visible lines.
 *
 * Parameter: cs is the softc whose sc_label is modified.
 *
 * NOTE(review): strncpy() leaves d_packname without a terminating NUL
 * if the literal fills the field; assumes the field is longer than the
 * literal -- TODO confirm against struct disklabel.
 */
1691 ccdmakedisklabel(struct ccd_s *cs)
1693 struct disklabel *lp = &cs->sc_label;
1696 * For historical reasons, if there's no disklabel present
1697 * the raw partition must be marked FS_BSDFFS.
1699 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1701 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1705 * Wait interruptibly for an exclusive lock.
1708 * Several drivers do this; it should be abstracted and made MP-safe.
/*
 * ccdlock: acquire the flag-based lock kept in cs->sc_flags.  While
 * CCDF_LOCKED is set, CCDF_WANTED is raised and the caller sleeps on
 * the softc address; once the loop exits, CCDF_LOCKED is set.
 *
 * Callers in this file compare the result against 0 and treat any
 * other value as an error.  The return statements themselves are not
 * visible in this listing.
 */
1711 ccdlock(struct ccd_s *cs)
1715 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
/* Record that somebody is waiting so the unlock side can see it. */
1716 cs->sc_flags |= CCDF_WANTED;
/*
 * Sleep channel is cs, wait message is "ccdlck", timeout argument is 0.
 * PCATCH presumably lets a signal interrupt the sleep, matching the
 * "interruptibly" wording of the comment preceding this function --
 * confirm against the tsleep documentation.  A non-zero result is
 * captured in error; its handling is not visible here.
 */
1717 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1720 cs->sc_flags |= CCDF_LOCKED;
1725 * Unlock and wake up any waiters.
/*
 * ccdunlock: release the flag-based lock kept in cs->sc_flags.
 * CCDF_LOCKED is cleared unconditionally; if CCDF_WANTED is set it is
 * cleared as well.
 *
 * NOTE(review): the comment preceding this function says waiters are
 * woken, but the wakeup call is not visible in this listing --
 * presumably it follows the CCDF_WANTED reset; confirm in the full file.
 */
1728 ccdunlock(struct ccd_s *cs)
1731 cs->sc_flags &= ~CCDF_LOCKED;
1732 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1733 cs->sc_flags &= ~CCDF_WANTED;
1740 printiinfo(struct ccdiinfo *ii)
1744 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1745 printf(" itab[%d]: #dk %d sblk %d soff %d",
1746 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1747 for (i = 0; i < ii->ii_ndisk; i++)
1748 printf(" %d", ii->ii_index[i]);