3 /* $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */
6 * Copyright (c) 1995 Jason R. Thorpe.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project
21 * 4. The name of the author may not be used to endorse or promote products
22 * derived from this software without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * Copyright (c) 1988 University of Utah.
39 * Copyright (c) 1990, 1993
40 * The Regents of the University of California. All rights reserved.
42 * This code is derived from software contributed to Berkeley by
43 * the Systems Programming Group of the University of Utah Computer
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. All advertising materials mentioning features or use of this software
55 * must display the following acknowledgement:
56 * This product includes software developed by the University of
57 * California, Berkeley and its contributors.
58 * 4. Neither the name of the University nor the names of its contributors
59 * may be used to endorse or promote products derived from this software
60 * without specific prior written permission.
62 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 * @(#)cd.c 8.2 (Berkeley) 11/16/93
80 * "Concatenated" disk driver.
82 * Dynamic configuration and disklabel support by:
83 * Jason R. Thorpe <thorpej@nas.nasa.gov>
84 * Numerical Aerodynamic Simulation Facility
86 * NASA Ames Research Center
87 * Moffett Field, CA 94035
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
100 #include <sys/sysctl.h>
101 #include <sys/disklabel.h>
102 #include <ufs/ffs/fs.h>
103 #include <sys/devicestat.h>
104 #include <sys/fcntl.h>
105 #include <sys/vnode.h>
107 #include <sys/ccdvar.h>
/* Malloc type of this driver; ccdnew() allocates each softc from it. */
109 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
111 #if defined(CCDDEBUG) && !defined(DEBUG)
/* Category bits that are tested against ccddebug before debug output is printed. */
116 #define CCDB_FOLLOW 0x01
117 #define CCDB_INIT 0x02
119 #define CCDB_LABEL 0x08
120 #define CCDB_VNODE 0x10
/*
 * Debug mask (its initialiser continues past the line shown here) followed
 * by the read-write integer sysctl that exposes it below the _debug node.
 */
121 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
123 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
/* Unit and partition numbers of a dev_t, obtained through dkunit() and dkpart(). */
126 #define ccdunit(x) dkunit(x)
127 #define ccdpart(x) dkpart(x)
130 This is how mirroring works (only writes are special):
132 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
133 linked together by the cb_mirror field. "cb_pflags &
134 CCDPF_MIRROR_DONE" is set to 0 on both of them.
136 When a component returns to ccdiodone(), it checks if "cb_pflags &
137 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
138 flag and returns. If it is, it means its partner has already
139 returned, so it will go to the regular cleanup.
/*
 * Members of the per-component request descriptor (struct ccdbuf).
 * cb_buf is the first member visible here and has to stay first:
 * ccdiodone() casts the struct bio pointer it receives directly back to a
 * struct ccdbuf pointer.
 */
144 struct bio cb_buf; /* new I/O buf */
145 struct bio *cb_obp; /* ptr. to original I/O buf */
146 struct ccdbuf *cb_freenext; /* free list link */
147 int cb_unit; /* target unit */
148 int cb_comp; /* target component */
149 int cb_pflags; /* mirror/parity status flag */
150 struct ccdbuf *cb_mirror; /* mirror counterpart */
153 /* bits in cb_pflags */
154 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
/*
 * Device number of the RAW_PART partition (slice argument 0) that belongs
 * to the same major number and ccd unit as dev.
 */
156 #define CCDLABELDEV(dev) \
157 (makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
159 /* convenient macros for often-used statements */
/* IS_ALLOCATED: a softc for the unit exists on the softc list (see ccdfind()). */
160 #define IS_ALLOCATED(unit) (ccdfind(unit) != NULL)
/* IS_INITED: the softc has CCDF_INITED set, which ccdinit() does on success. */
161 #define IS_INITED(cs) (((cs)->sc_flags & CCDF_INITED) != 0)
/* Device switch entry points implemented by this driver. */
163 static d_open_t ccdopen;
164 static d_close_t ccdclose;
165 static d_strategy_t ccdstrategy;
166 static d_ioctl_t ccdioctl;
167 static d_dump_t ccddump;
168 static d_psize_t ccdsize;
/* putccdbuf() only caches a released ccdbuf while fewer than this many are cached. */
170 #define NCCDFREEHIWAT 16
/* Major number stored in the maj slot of ccd_cdevsw. */
172 #define CDEV_MAJOR 74
/* Character device switch; only some of its slots are visible in this listing. */
174 static struct cdevsw ccd_cdevsw = {
176 /* close */ ccdclose,
178 /* write */ physwrite,
179 /* ioctl */ ccdioctl,
182 /* strategy */ ccdstrategy,
184 /* maj */ CDEV_MAJOR,
/* Every allocated softc is linked on this list; ccdfind() walks it to match sc_unit. */
189 static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
/* Softc lookup, allocation and removal. */
191 static struct ccd_s *ccdfind(int);
192 static struct ccd_s *ccdnew(int);
193 static int ccddestroy(struct ccd_s *, struct proc *);
195 /* called during module initialization */
196 static void ccdattach(void);
197 static int ccd_modevent(module_t, int, void *);
199 /* called by biodone() at interrupt time */
200 static void ccdiodone(struct bio *bp);
/* Internal helpers for request splitting, configuration, labels and locking. */
202 static void ccdstart(struct ccd_s *, struct bio *);
203 static void ccdinterleave(struct ccd_s *, int);
204 static void ccdintr(struct ccd_s *, struct bio *);
205 static int ccdinit(struct ccd_s *, char **, struct thread *);
206 static int ccdlookup(char *, struct thread *p, struct vnode **);
207 static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
208 struct bio *, daddr_t, caddr_t, long);
209 static void ccdgetdisklabel(dev_t);
210 static void ccdmakedisklabel(struct ccd_s *);
211 static int ccdlock(struct ccd_s *);
212 static void ccdunlock(struct ccd_s *);
215 static void printiinfo(struct ccdiinfo *);
218 /* Non-private for the benefit of libkvm. */
/*
 * Head of the list of cached idle ccdbufs, and the counter that putccdbuf()
 * compares against NCCDFREEHIWAT.
 */
219 struct ccdbuf *ccdfreebufs;
220 static int numccdfreebufs;
223 * getccdbuf() - Allocate and zero a ccd buffer.
225 * This routine is called at splbio().
/*
 * Parameter cpy: optional template buffer.  The visible lines either
 * bcopy() the template into the new buffer or bzero() the new buffer; the
 * test that selects between the two is not part of this listing
 * (presumably cpy != NULL - confirm against the full file).
 *
 * The buffer is taken from the ccdfreebufs list when one is cached there,
 * otherwise it comes from malloc() with M_WAITOK.
 */
230 getccdbuf(struct ccdbuf *cpy)
235 * Allocate from freelist or malloc as necessary
237 if ((cbp = ccdfreebufs) != NULL) {
238 ccdfreebufs = cbp->cb_freenext;
241 cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
245 * Used by mirroring code
248 bcopy(cpy, cbp, sizeof(struct ccdbuf));
250 bzero(cbp, sizeof(struct ccdbuf));
253 * independant struct bio initialization
260 * putccdbuf() - Free a ccd buffer.
262 * This routine is called at splbio().
/*
 * While fewer than NCCDFREEHIWAT buffers are cached, cbp is linked in front
 * of the ccdfreebufs list for reuse by getccdbuf(); the other visible path
 * releases it with free() using the M_DEVBUF type it was allocated with.
 * The list-head and counter updates are not visible in this listing.
 */
267 putccdbuf(struct ccdbuf *cbp)
270 if (numccdfreebufs < NCCDFREEHIWAT) {
271 cbp->cb_freenext = ccdfreebufs;
275 free((caddr_t)cbp, M_DEVBUF);
281 * Number of blocks to leave untouched in front of a component partition.
282 * This is to avoid violating its disklabel area when it starts at the
283 * beginning of the slice.
/*
 * Default of 16 blocks, used only when CCD_OFFSET has not been defined
 * already.  ccdinit() subtracts it from each component size and ccdbuffer()
 * adds it to every component block number.
 */
285 #if !defined(CCD_OFFSET)
286 #define CCD_OFFSET 16
/*
 * Softc lookup by unit number.  The signature line is missing from this
 * listing; the prototype earlier in the file declares ccdfind(int) with
 * this return type.  Walks ccd_softc_list and returns the entry whose
 * sc_unit equals unit, or NULL when no entry matches.
 */
289 static struct ccd_s *
292 struct ccd_s *sc = NULL;
294 /* XXX: LOCK(unique unit numbers) */
295 LIST_FOREACH(sc, &ccd_softc_list, list) {
296 if (sc->sc_unit == unit)
299 /* XXX: UNLOCK(unique unit numbers) */
/* The unit is tested again so that a cursor left on a non-matching entry still yields NULL. */
300 return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
/*
 * Softc allocation for a unit number.  The signature line is missing from
 * this listing; presumably this is ccdnew(int unit) as declared by the
 * prototype earlier in the file.
 *
 * A unit that already has a softc, or that is larger than DKMAXUNIT, is
 * refused (the value returned in that case is not visible here).
 * Otherwise a zero-filled softc is allocated from M_CCD with M_WAITOK and
 * inserted at the head of ccd_softc_list.
 */
303 static struct ccd_s *
308 /* XXX: LOCK(unique unit numbers) */
309 if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
312 MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
314 LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
315 /* XXX: UNLOCK(unique unit numbers) */
/*
 * Unlink sc from ccd_softc_list.  Anything done after the unlink (such as
 * releasing the softc) and the return value are not visible in this
 * listing; p is not referenced by the visible lines.
 */
320 ccddestroy(struct ccd_s *sc, struct proc *p)
323 /* XXX: LOCK(unique unit numbers) */
324 LIST_REMOVE(sc, list);
325 /* XXX: UNLOCK(unique unit numbers) */
/*
 * dev_clone event handler (registered by the EVENTHANDLER_REGISTER call
 * further down).  name is parsed by dev_stdclone() against the ccd prefix,
 * which yields a unit number u and the remaining suffix s.  Only a suffix
 * letter in the range a..h is accepted; the device node is then created
 * with minor number u * 8 + (letter - a), owner root, group operator and
 * mode 0640, and stored through dev.
 */
331 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
338 i = dev_stdclone(name, &s, "ccd", &u);
341 if (*s < 'a' || *s > 'h')
345 *dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
346 UID_ROOT, GID_OPERATOR, 0640, name);
350 * Called by main() during pseudo-device attachment. All we need
351 * to do is to add devsw entries.
/*
 * The only visible statement hooks ccd_clone() into the dev_clone event,
 * passing 0 and 1000 as the remaining arguments.  The signature line is
 * missing from this listing (presumably ccdattach(void), per the prototype
 * earlier in the file).
 */
357 EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
/*
 * Module event handler, registered for the ccd module by the DEV_MODULE
 * line below.  The dispatch on type is not visible in this listing; the one
 * visible action reports that unloading is unsupported (presumably for the
 * unload event - confirm against the full file).
 */
361 ccd_modevent(module_t mod, int type, void *data)
371 printf("ccd0: Unload not supported!\n");
384 DEV_MODULE(ccd, ccd_modevent, NULL);
/*
 * Fill in the softc of a ccd that is being configured.
 *
 * cs     - softc; the caller (see ccdioctl()) has already stored
 *          sc_nccdisks, sc_ileave and sc_flags in it.
 * cpaths - array of sc_nccdisks user-space path pointers; each one is
 *          fetched with copyinstr().
 * td     - calling thread; its process credentials are used for the
 *          DIOCGPART request sent to each component.
 *
 * For every component the path is copied in and saved, the device is
 * recorded, and the partition is queried.  Only FS_BSDFFS partitions take
 * the size-computing path; the visible alternative prints an incorrect
 * partition type message.  The usable size is the partition size minus
 * CCD_OFFSET, rounded down to a multiple of the interleave when the
 * interleave is larger than 1.  The smallest component size and the
 * largest sector size seen are tracked on the way.
 *
 * With CCDF_UNIFORM every component is clamped to the smallest size and the
 * total is adjusted for mirroring (half the disks) or parity (one disk
 * less).  After that the interleave table and a pseudo-geometry are built,
 * a devstat entry is added, CCDF_INITED is set and the flags are mirrored
 * into sc_cflags.
 *
 * The tail of the block is the failure path: it walks back over the
 * components handled so far freeing their saved paths, then frees
 * sc_cinfo.  The returned value is not visible in this listing
 * (presumably 0 on success and an errno value otherwise).
 *
 * NOTE(review): if sc_nccdisks were 0 the component loop would not run and
 * maxsecsize would presumably keep its initial value; the geometry code
 * divides by ccg_secsize.  Confirm that an empty component list is
 * rejected before this point.
 */
387 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
389 struct ccdcinfo *ci = NULL; /* XXX */
395 struct partinfo dpart;
396 struct ccdgeom *ccg = &cs->sc_geom;
397 char tmppath[MAXPATHLEN];
401 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
402 printf("ccdinit: unit %d\n", cs->sc_unit);
407 /* Allocate space for the component info. */
408 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
412 * Verify that each component piece exists and record
413 * relevant information about it.
417 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
419 ci = &cs->sc_cinfo[ix];
423 * Copy in the pathname of the component.
425 bzero(tmppath, sizeof(tmppath)); /* sanity */
426 if ((error = copyinstr(cpaths[ix], tmppath,
427 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
429 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
430 printf("ccd%d: can't copy path, error = %d\n",
/* Keep a private copy of the path, sized by the length copyinstr() reported. */
435 ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
436 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
438 ci->ci_dev = vn_todev(vp);
441 * Get partition information for the component.
443 if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
444 FREAD, td->td_proc->p_ucred, td)) != 0) {
446 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
447 printf("ccd%d: %s: ioctl failed, error = %d\n",
448 cs->sc_unit, ci->ci_path, error);
452 if (dpart.part->p_fstype == FS_BSDFFS) {
454 ((dpart.disklab->d_secsize > maxsecsize) ?
455 dpart.disklab->d_secsize : maxsecsize);
/* The first CCD_OFFSET blocks of the partition are not counted as usable space. */
456 size = dpart.part->p_size - CCD_OFFSET;
459 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
460 printf("ccd%d: %s: incorrect partition type\n",
461 cs->sc_unit, ci->ci_path);
468 * Calculate the size, truncating to an interleave
469 * boundary if necessary.
472 if (cs->sc_ileave > 1)
473 size -= size % cs->sc_ileave;
477 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
478 printf("ccd%d: %s: size == 0\n",
479 cs->sc_unit, ci->ci_path);
485 if (minsize == 0 || size < minsize)
492 * Don't allow the interleave to be smaller than
493 * the biggest component sector.
495 if ((cs->sc_ileave > 0) &&
496 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
498 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
499 printf("ccd%d: interleave must be at least %d\n",
500 cs->sc_unit, (maxsecsize / DEV_BSIZE));
507 * If uniform interleave is desired set all sizes to that of
508 * the smallest component. This will guarentee that a single
509 * interleave table is generated.
511 * Lost space must be taken into account when calculating the
512 * overall size. Half the space is lost when CCDF_MIRROR is
513 * specified. One disk is lost when CCDF_PARITY is specified.
515 if (cs->sc_flags & CCDF_UNIFORM) {
516 for (ci = cs->sc_cinfo;
517 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
518 ci->ci_size = minsize;
520 if (cs->sc_flags & CCDF_MIRROR) {
522 * Check to see if an even number of components
523 * have been specified. The interleave must also
524 * be non-zero in order for us to be able to
525 * guarentee the topology.
527 if (cs->sc_nccdisks % 2) {
528 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
532 if (cs->sc_ileave == 0) {
533 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
537 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
538 } else if (cs->sc_flags & CCDF_PARITY) {
539 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
541 if (cs->sc_ileave == 0) {
542 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
546 cs->sc_size = cs->sc_nccdisks * minsize;
551 * Construct the interleave table.
553 ccdinterleave(cs, cs->sc_unit);
556 * Create pseudo-geometry based on 1MB cylinders. It's
559 ccg->ccg_secsize = maxsecsize;
560 ccg->ccg_ntracks = 1;
561 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
562 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
565 * Add an devstat entry for this device.
567 devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
568 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
569 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
570 DEVSTAT_PRIORITY_ARRAY);
572 cs->sc_flags |= CCDF_INITED;
573 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */
576 while (ci > cs->sc_cinfo) {
578 free(ci->ci_path, M_DEVBUF);
580 free(cs->sc_cinfo, M_DEVBUF);
/*
 * Build cs->sc_itable, the table that ccdbuffer() uses to map a ccd block
 * to a component.
 *
 * cs   - softc whose sc_cinfo sizes, sc_nccdisks and sc_ileave are set.
 * unit - not referenced by the lines visible in this listing.
 *
 * Room for sc_nccdisks + 1 entries is allocated.
 *
 * Interleave 0: one entry per component, each with a single index (the
 * component number) and a start block equal to the running sum of the
 * preceding component sizes.
 *
 * Non-zero interleave: each pass finds the smallest component that is
 * larger than the size handled so far.  The entry records the starting
 * logical block in interleave units (bn / sc_ileave), the starting offset
 * inside the components (lbn) and the indices of all components that are
 * at least as large as that smallest one.  bn then advances by the number
 * of participating disks times the newly covered size, and the loop ends
 * when no larger component is left.
 *
 * Stores of ii_ndisk are not visible in this listing.
 */
585 ccdinterleave(struct ccd_s *cs, int unit)
587 struct ccdcinfo *ci, *smallci;
594 if (ccddebug & CCDB_INIT)
595 printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
599 * Allocate an interleave table. The worst case occurs when each
600 * of N disks is of a different size, resulting in N interleave
603 * Chances are this is too big, but we don't care.
605 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
606 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
610 * Trivial case: no interleave (actually interleave of disk size).
611 * Each table entry represents a single component in its entirety.
613 * An interleave of 0 may not be used with a mirror or parity setup.
615 if (cs->sc_ileave == 0) {
619 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
620 /* Allocate space for ii_index. */
621 ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
623 ii->ii_startblk = bn;
625 ii->ii_index[0] = ix;
626 bn += cs->sc_cinfo[ix].ci_size;
631 if (ccddebug & CCDB_INIT)
632 printiinfo(cs->sc_itable);
638 * The following isn't fast or pretty; it doesn't have to be.
642 for (ii = cs->sc_itable; ; ii++) {
644 * Allocate space for ii_index. We might allocate more then
647 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
651 * Locate the smallest of the remaining components
654 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
656 if (ci->ci_size > size &&
658 ci->ci_size < smallci->ci_size)) {
664 * Nobody left, all done
666 if (smallci == NULL) {
672 * Record starting logical block using an sc_ileave blocksize.
674 ii->ii_startblk = bn / cs->sc_ileave;
677 * Record starting comopnent block using an sc_ileave
678 * blocksize. This value is relative to the beginning of
681 ii->ii_startoff = lbn;
684 * Determine how many disks take part in this interleave
685 * and record their indices.
688 for (ci = cs->sc_cinfo;
689 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
690 if (ci->ci_size >= smallci->ci_size) {
691 ii->ii_index[ix++] = ci - cs->sc_cinfo;
695 bn += ix * (smallci->ci_size - size);
696 lbn = smallci->ci_size / cs->sc_ileave;
697 size = smallci->ci_size;
700 if (ccddebug & CCDB_INIT)
701 printiinfo(cs->sc_itable);
/*
 * Open entry point.  Finds the softc of the unit or creates one, takes the
 * softc lock, refreshes the in-core disklabel when the ccd is initialized
 * and no partition is open yet, checks that a non-raw partition exists and
 * is not FS_UNUSED, and finally records the open in sc_openmask.
 *
 * The values of part, pmask and lp are assigned on lines that are missing
 * from this listing, as are the unlock and return statements.
 */
707 ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
709 int unit = ccdunit(dev);
711 struct disklabel *lp;
712 int error = 0, part, pmask;
715 if (ccddebug & CCDB_FOLLOW)
716 printf("ccdopen(%p, %x)\n", dev, flags);
/*
 * NOTE(review): ccdnew() refuses units above DKMAXUNIT; if it returns NULL
 * in that case, cs is handed to ccdlock() unchecked - confirm.
 */
719 cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
721 if ((error = ccdlock(cs)) != 0)
730 * If we're initialized, check to see if there are any other
731 * open partitions. If not, then it's safe to update
732 * the in-core disklabel.
734 if (IS_INITED(cs) && (cs->sc_openmask == 0))
735 ccdgetdisklabel(dev);
737 /* Check that the partition exists. */
738 if (part != RAW_PART && ((part >= lp->d_npartitions) ||
739 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
744 cs->sc_openmask |= pmask;
/*
 * Close entry point.  Requires the unit to be allocated, takes the softc
 * lock, clears the bit of this partition in sc_openmask and, when the ccd
 * is not initialized and nobody is waiting on it (CCDF_WANTED clear),
 * destroys the softc.  The assignments of cs and part, the unlock and the
 * return statements are missing from this listing.
 */
752 ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
754 int unit = ccdunit(dev);
759 if (ccddebug & CCDB_FOLLOW)
760 printf("ccdclose(%p, %x)\n", dev, flags);
763 if (!IS_ALLOCATED(unit))
767 if ((error = ccdlock(cs)) != 0)
772 /* ...that much closer to allowing unconfiguration... */
773 cs->sc_openmask &= ~(1 << part);
774 /* collect "garbage" if possible */
775 if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
776 ccddestroy(cs, td->td_proc);
/*
 * Strategy entry point: validates a request before it is split up.
 *
 * A request for a ccd that is not initialized is finished with ENXIO.  A
 * zero-length transfer is handled separately (its body is not visible).
 * For a non-raw partition the bounds are checked against the disklabel by
 * bounds_check_with_label(); writing the label area is permitted when
 * CCDF_WLABEL or CCDF_LABELLING is set.
 *
 * In the visible alternative branch (presumably the raw partition) the
 * request is checked by hand in ccg_secsize units: a start block that is
 * negative or not below sc_size fails with EINVAL unless it equals sc_size
 * exactly, and a request that crosses the end is truncated.  Finally
 * bio_resid is primed with bio_bcount; ccdiodone() counts it down.
 */
783 ccdstrategy(struct bio *bp)
785 int unit = ccdunit(bp->bio_dev);
786 struct ccd_s *cs = ccdfind(unit);
789 struct disklabel *lp;
792 if (ccddebug & CCDB_FOLLOW)
793 printf("ccdstrategy(%p): unit %d\n", bp, unit);
/*
 * NOTE(review): ccdfind() can return NULL and IS_INITED() dereferences cs
 * without a check - confirm that the unit always exists on this path.
 */
795 if (!IS_INITED(cs)) {
796 biofinish(bp, NULL, ENXIO);
800 /* If it's a nil transfer, wake up the top half now. */
801 if (bp->bio_bcount == 0) {
809 * Do bounds checking and adjust transfer. If there's an
810 * error, the bounds check will flag that for us.
812 wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
813 if (ccdpart(bp->bio_dev) != RAW_PART) {
814 if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
819 int pbn; /* in sc_secsize chunks */
820 long sz; /* in sc_secsize chunks */
822 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
823 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
826 * If out of bounds return an error. If at the EOF point,
827 * simply read or write less.
830 if (pbn < 0 || pbn >= cs->sc_size) {
831 bp->bio_resid = bp->bio_bcount;
832 if (pbn != cs->sc_size)
833 biofinish(bp, NULL, EINVAL);
840 * If the request crosses EOF, truncate the request.
842 if (pbn + sz > cs->sc_size) {
843 bp->bio_bcount = (cs->sc_size - pbn) *
844 cs->sc_geom.ccg_secsize;
848 bp->bio_resid = bp->bio_bcount;
/*
 * Split the request bp into per-component requests and issue them.
 *
 * cs - initialized softc.
 * bp - validated request (see ccdstrategy()).
 *
 * A devstat transaction is started first.  For a non-raw partition the
 * partition entry is looked up so that the block number can be made
 * absolute (the addition itself is not visible in this listing).  The loop
 * then calls ccdbuffer() for the remaining byte count and advances by the
 * length of the component request that was built (cbp[0]).
 *
 * cbp has four slots, but the visible lines only use the first two.
 */
860 ccdstart(struct ccd_s *cs, struct bio *bp)
863 struct ccdbuf *cbp[4];
864 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
867 struct partition *pp;
870 if (ccddebug & CCDB_FOLLOW)
871 printf("ccdstart(%p, %p)\n", cs, bp);
874 /* Record the transaction start */
875 devstat_start_transaction(&cs->device_stats);
878 * Translate the partition-relative block number to an absolute.
881 if (ccdpart(bp->bio_dev) != RAW_PART) {
882 pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
887 * Allocate component buffers and fire off the requests
890 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
891 ccdbuffer(cbp, cs, bp, bn, addr, bcount);
/* rcount is the number of bytes covered by the component request just built. */
892 rcount = cbp[0]->cb_buf.bio_bcount;
894 if (cs->sc_cflags & CCDF_MIRROR) {
896 * Mirroring. Writes go to both disks, reads are
897 * taken from whichever disk seems most appropriate.
899 * We attempt to localize reads to the disk whos arm
900 * is nearest the read request. We ignore seeks due
901 * to writes when making this determination and we
902 * also try to avoid hogging.
904 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
905 BIO_STRATEGY(&cbp[0]->cb_buf, 0);
906 BIO_STRATEGY(&cbp[1]->cb_buf, 0);
/*
 * Read: stay on the current pick while bn lies within sc_size / 16 blocks of
 * the position remembered for it, otherwise switch to the other half.
 */
908 int pick = cs->sc_pick;
909 daddr_t range = cs->sc_size / 16;
911 if (bn < cs->sc_blk[pick] - range ||
912 bn > cs->sc_blk[pick] + range
914 cs->sc_pick = pick = 1 - pick;
/* Remember where this read ends as the position of the chosen half. */
916 cs->sc_blk[pick] = bn + btodb(rcount);
917 BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
923 BIO_STRATEGY(&cbp[0]->cb_buf, 0);
/*
 * ccdbuffer() maps ccd block bn to a component and block offset and fills
 * in the ccdbuf(s) describing the component request.
 *
 * cb     - output array: cb[0] always, plus cb[1] for a mirrored config.
 * cs     - initialized softc (sc_itable, sc_cinfo, sc_ileave, sc_cflags).
 * bp     - original request; its bio_cmd is copied to the component bio.
 * bn     - block number inside the ccd.
 * addr   - data address for this piece.
 * bcount - bytes of the original request still to be covered.
 *
 * Interleave 0: the components are scanned in order until the one that
 * contains the block is found.  Otherwise the block is split into a
 * superblock number and an offset inside the interleave, the matching
 * interleave table entry is located, and the component is chosen from
 * ii_index.  For a mirror the first half of the entry is used and the
 * partner lies ndisk / 2 component slots further on.
 *
 * The length of the component bio is the smaller of bcount and the bytes
 * left in the component (interleave 0) or in the interleave stripe.  The
 * same value is stored in bio_caller1, which ccdiodone() later reads for
 * its accounting.  CCD_OFFSET is added to every component block number.
 *
 * For a mirror the second ccdbuf is created as a copy of the first and is
 * redirected to the partner component; the two are cross-linked through
 * cb_mirror and CCDPF_MIRROR_DONE is cleared on both.
 */
931 * Build a component buffer header.
934 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
936 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
942 if (ccddebug & CCDB_IO)
943 printf("ccdbuffer(%p, %p, %d, %p, %ld)\n",
944 cs, bp, bn, addr, bcount);
947 * Determine which component bn falls in.
952 if (cs->sc_ileave == 0) {
954 * Serially concatenated and neither a mirror nor a parity
955 * config. This is a special case.
960 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
968 * Calculate cbn, the logical superblock (sc_ileave chunks),
969 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
972 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
973 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
976 * Figure out which interleave table to use.
978 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
979 if (ii->ii_startblk > cbn)
985 * off is the logical superblock relative to the beginning
986 * of this interleave block.
988 off = cbn - ii->ii_startblk;
991 * We must calculate which disk component to use (ccdisk),
992 * and recalculate cbn to be the superblock relative to
993 * the beginning of the component. This is typically done by
994 * adding 'off' and ii->ii_startoff together. However, 'off'
995 * must typically be divided by the number of components in
996 * this interleave array to be properly convert it from a
997 * CCD-relative logical superblock number to a
998 * component-relative superblock number.
1000 if (ii->ii_ndisk == 1) {
1002 * When we have just one disk, it can't be a mirror
1003 * or a parity config.
1005 ccdisk = ii->ii_index[0];
1006 cbn = ii->ii_startoff + off;
1008 if (cs->sc_cflags & CCDF_MIRROR) {
1010 * We have forced a uniform mapping, resulting
1011 * in a single interleave array. We double
1012 * up on the first half of the available
1013 * components and our mirror is in the second
1014 * half. This only works with a single
1015 * interleave array because doubling up
1016 * doubles the number of sectors, so there
1017 * cannot be another interleave array because
1018 * the next interleave array's calculations
1021 int ndisk2 = ii->ii_ndisk / 2;
1022 ccdisk = ii->ii_index[off % ndisk2];
1023 cbn = ii->ii_startoff + off / ndisk2;
1024 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1025 } else if (cs->sc_cflags & CCDF_PARITY) {
1027 * XXX not implemented yet
1029 int ndisk2 = ii->ii_ndisk - 1;
1030 ccdisk = ii->ii_index[off % ndisk2];
1031 cbn = ii->ii_startoff + off / ndisk2;
1032 if (cbn % ii->ii_ndisk <= ccdisk)
1035 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1036 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1040 ci = &cs->sc_cinfo[ccdisk];
1043 * Convert cbn from a superblock to a normal block so it
1044 * can be used to calculate (along with cboff) the normal
1045 * block index into this particular disk.
1047 cbn *= cs->sc_ileave;
1051 * Fill in the component buf structure.
1053 cbp = getccdbuf(NULL);
1054 cbp->cb_buf.bio_cmd = bp->bio_cmd;
1055 cbp->cb_buf.bio_done = ccdiodone;
1056 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */
1057 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
1058 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1059 cbp->cb_buf.bio_data = addr;
1060 if (cs->sc_ileave == 0)
1061 cbc = dbtob((off_t)(ci->ci_size - cbn));
1063 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1064 cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
1065 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
1068 * context for ccdiodone
1071 cbp->cb_unit = cs->sc_unit;
1072 cbp->cb_comp = ci - cs->sc_cinfo;
1075 if (ccddebug & CCDB_IO)
1076 printf(" dev %p(u%ld): cbp %p bn %d addr %p bcnt %ld\n",
1077 ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
1078 cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1079 cbp->cb_buf.bio_bcount);
1084 * Note: both I/O's setup when reading from mirror, but only one
1087 if (cs->sc_cflags & CCDF_MIRROR) {
1088 /* mirror, setup second I/O */
1089 cbp = getccdbuf(cb[0]);
1090 cbp->cb_buf.bio_dev = ci2->ci_dev;
1091 cbp->cb_comp = ci2 - cs->sc_cinfo;
1093 /* link together the ccdbuf's and clear "mirror done" flag */
1094 cb[0]->cb_mirror = cb[1];
1095 cb[1]->cb_mirror = cb[0];
1096 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1097 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
/*
 * Finish the original request bp once all of its component requests are
 * accounted for (called from ccdiodone()).  On error the whole byte count
 * is reported as residual.  biofinish() is given the devstat record of the
 * softc and an error argument of 0.
 */
1102 ccdintr(struct ccd_s *cs, struct bio *bp)
1105 if (ccddebug & CCDB_FOLLOW)
1106 printf("ccdintr(%p, %p)\n", cs, bp);
1109 * Request is done for better or worse, wakeup the top half.
1111 if (bp->bio_flags & BIO_ERROR)
1112 bp->bio_resid = bp->bio_bcount;
1113 biofinish(bp, &cs->device_stats, 0);
1117 * Called at interrupt time.
1118 * Mark the component as done and if all components are done,
1119 * take a ccd interrupt.
/*
 * Completion callback installed as bio_done on every component bio by
 * ccdbuffer().
 *
 * ibp - the finished component bio.  It is the cb_buf member of a struct
 *       ccdbuf, so it is cast back to the enclosing ccdbuf.
 *
 * Errors are copied to the original request, except for the first read of
 * a mirrored pair: there the pick is flipped and the partner is tried
 * instead.  For mirrored writes the two halves handshake through
 * CCDPF_MIRROR_DONE and only the second completion continues.  For
 * mirrored reads the partner buffer is either issued (after an error) or
 * returned with putccdbuf().
 *
 * The byte count saved in bio_caller1 is then subtracted from the
 * bio_resid of the original request, and ccdintr() is called when it
 * reaches zero.  A negative residual panics.
 *
 * NOTE(review): ccdfind(unit) is dereferenced several times without a NULL
 * check - confirm that the softc cannot disappear while I/O is in flight.
 */
1122 ccdiodone(struct bio *ibp)
1124 struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1125 struct bio *bp = cbp->cb_obp;
1126 int unit = cbp->cb_unit;
1131 if (ccddebug & CCDB_FOLLOW)
1132 printf("ccdiodone(%p)\n", cbp);
1133 if (ccddebug & CCDB_IO) {
1134 printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1135 bp, bp->bio_bcount, bp->bio_resid);
1136 printf(" dev %p(u%d), cbp %p bn %d addr %p bcnt %ld\n",
1137 cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1138 cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1139 cbp->cb_buf.bio_bcount);
1143 * If an error occured, report it. If this is a mirrored
1144 * configuration and the first of two possible reads, do not
1145 * set the error in the bp yet because the second read may
1149 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1150 const char *msg = "";
1152 if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1153 (cbp->cb_buf.bio_cmd == BIO_READ) &&
1154 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1156 * We will try our read on the other disk down
1157 * below, also reverse the default pick so if we
1158 * are doing a scan we do not keep hitting the
1161 struct ccd_s *cs = ccdfind(unit);
1163 msg = ", trying other disk";
1164 cs->sc_pick = 1 - cs->sc_pick;
1165 cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
/* Propagate the component error, falling back to EIO when none was recorded. */
1167 bp->bio_flags |= BIO_ERROR;
1168 bp->bio_error = cbp->cb_buf.bio_error ?
1169 cbp->cb_buf.bio_error : EIO;
1171 printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1172 unit, bp->bio_error, cbp->cb_comp,
1173 (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1177 * Process mirror. If we are writing, I/O has been initiated on both
1178 * buffers and we fall through only after both are finished.
1180 * If we are reading only one I/O is initiated at a time. If an
1181 * error occurs we initiate the second I/O and return, otherwise
1182 * we free the second I/O without initiating it.
1185 if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1186 if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1188 * When writing, handshake with the second buffer
1189 * to determine when both are done. If both are not
1190 * done, return here.
1192 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1193 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1200 * When reading, either dispose of the second buffer
1201 * or initiate I/O on the second buffer if an error
1202 * occured with this one.
1204 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1205 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1206 cbp->cb_mirror->cb_pflags |=
1208 BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1213 putccdbuf(cbp->cb_mirror);
1221 * use bio_caller1 to determine how big the original request was rather
1222 * then bio_bcount, because bio_bcount may have been truncated for EOF.
1224 * XXX We check for an error, but we do not test the resid for an
1225 * aligned EOF condition. This may result in character & block
1226 * device access not recognizing EOF properly when read or written
1227 * sequentially, but will not effect filesystems.
1229 count = (long)cbp->cb_buf.bio_caller1;
1233 * If all done, "interrupt".
1235 bp->bio_resid -= count;
1236 if (bp->bio_resid < 0)
1237 panic("ccdiodone: count");
1238 if (bp->bio_resid == 0)
1239 ccdintr(ccdfind(unit), bp);
/*
 * Ioctl entry point.  The case labels of the command switch are missing
 * from this listing; the sections below are identified from their bodies.
 *
 * dev  - device being operated on; its unit selects the softc.
 * cmd  - command code.
 * data - kernel copy of the argument (struct ccd_ioctl, struct ccdconf,
 *        struct ccdcpps, struct disklabel, struct partinfo or int,
 *        depending on the section).
 * flag - open flags; the modifying sections require FWRITE.
 * td   - calling thread.
 *
 * Visible sections, in order:
 *  - configure: sanitises the flags (mirror and parity need a non-zero
 *    interleave, mirror wins over parity, both force CCDF_UNIFORM), copies
 *    in the component path pointers, looks up each component, runs
 *    ccdinit() and reads the disklabel.
 *  - unconfigure: refused while other partitions are open; closes the
 *    components, frees the tables and removes the devstat entry.
 *  - configuration export: copies every initialized softc to the buffer in
 *    struct ccdconf, or reports the size needed when the given size is 0.
 *  - component path export: the same scheme for the saved path strings.
 *  - disklabel get, partition info, label set or write, and the toggle for
 *    write access to the label area.
 *
 * NOTE(review): the pointer array is allocated with sizeof(char *) per
 * entry but copied in with sizeof(char **) per entry.  Both are pointer
 * sizes, so the lengths agree, but the expressions should match.
 * NOTE(review): the only visible bound on ccio_ndisks is the
 * CCD_MAXNDISKS upper limit - confirm how a count of 0 is handled.
 */
1244 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1246 int unit = ccdunit(dev);
1247 int i, j, lookedup = 0, error = 0;
1250 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1254 if (!IS_ALLOCATED(unit))
1263 if ((flag & FWRITE) == 0)
1266 if ((error = ccdlock(cs)) != 0)
1269 if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1272 /* Fill in some important bits. */
1273 cs->sc_ileave = ccio->ccio_ileave;
1274 if (cs->sc_ileave == 0 &&
1275 ((ccio->ccio_flags & CCDF_MIRROR) ||
1276 (ccio->ccio_flags & CCDF_PARITY))) {
1277 printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1278 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1280 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1281 (ccio->ccio_flags & CCDF_PARITY)) {
1282 printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1283 ccio->ccio_flags &= ~CCDF_PARITY;
1285 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1286 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1287 printf("ccd%d: mirror/parity forces uniform flag\n",
1289 ccio->ccio_flags |= CCDF_UNIFORM;
/* Only the bits in CCDF_USERMASK are taken over from the caller. */
1291 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1294 * Allocate space for and copy in the array of
1295 * componet pathnames and device numbers.
1297 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1298 M_DEVBUF, M_WAITOK);
1299 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1300 M_DEVBUF, M_WAITOK);
1302 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1303 ccio->ccio_ndisks * sizeof(char **));
1305 free(vpp, M_DEVBUF);
1306 free(cpp, M_DEVBUF);
1312 if (ccddebug & CCDB_INIT)
1313 for (i = 0; i < ccio->ccio_ndisks; ++i)
1314 printf("ccdioctl: component %d: %p\n",
1318 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1320 if (ccddebug & CCDB_INIT)
1321 printf("ccdioctl: lookedup = %d\n", lookedup);
/* On a failed lookup, close the vnodes obtained so far before bailing out. */
1323 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1324 for (j = 0; j < lookedup; ++j)
1325 (void)vn_close(vpp[j], FREAD|FWRITE,
1326 td->td_proc->p_ucred, td);
1327 free(vpp, M_DEVBUF);
1328 free(cpp, M_DEVBUF);
1335 cs->sc_nccdisks = ccio->ccio_ndisks;
1338 * Initialize the ccd. Fills in the softc for us.
1340 if ((error = ccdinit(cs, cpp, td)) != 0) {
1341 for (j = 0; j < lookedup; ++j)
1342 (void)vn_close(vpp[j], FREAD|FWRITE,
1343 td->td_proc->p_ucred, td);
1345 * We can't ccddestroy() cs just yet, because nothing
1346 * prevents user-level app to do another ioctl()
1347 * without closing the device first, therefore
1348 * declare unit null and void and let ccdclose()
1349 * destroy it when it is safe to do so.
1351 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1352 free(vpp, M_DEVBUF);
1353 free(cpp, M_DEVBUF);
1359 * The ccd has been successfully initialized, so
1360 * we can place it into the array and read the disklabel.
1362 ccio->ccio_unit = unit;
1363 ccio->ccio_size = cs->sc_size;
1364 ccdgetdisklabel(dev);
1374 if ((flag & FWRITE) == 0)
1377 if ((error = ccdlock(cs)) != 0)
1380 /* Don't unconfigure if any other partitions are open */
1381 part = ccdpart(dev);
1382 pmask = (1 << part);
1383 if ((cs->sc_openmask & ~pmask)) {
1388 /* Declare unit null and void (reset all flags) */
1389 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1391 /* Close the components and free their pathnames. */
1392 for (i = 0; i < cs->sc_nccdisks; ++i) {
1394 * XXX: this close could potentially fail and
1395 * cause Bad Things. Maybe we need to force
1396 * the close to happen?
1399 if (ccddebug & CCDB_VNODE)
1400 vprint("CCDIOCCLR: vnode info",
1401 cs->sc_cinfo[i].ci_vp);
1403 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1404 td->td_proc->p_ucred, td);
1405 free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1408 /* Free interleave index. */
1409 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1410 free(cs->sc_itable[i].ii_index, M_DEVBUF);
1412 /* Free component info and interleave table. */
1413 free(cs->sc_cinfo, M_DEVBUF);
1414 free(cs->sc_itable, M_DEVBUF);
1415 free(cs->sc_vpp, M_DEVBUF);
1417 /* And remove the devstat entry. */
1418 devstat_remove_entry(&cs->device_stats);
1420 /* This must be atomic. */
1430 struct ccdconf *conf = (struct ccdconf *)data;
1431 struct ccd_s *tmpcs;
1432 struct ccd_s *ubuf = conf->buffer;
1434 /* XXX: LOCK(unique unit numbers) */
1435 LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1436 if (IS_INITED(tmpcs))
1439 if (conf->size == 0) {
1440 conf->size = sizeof(struct ccd_s) * ninit;
1442 } else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1443 (conf->size % sizeof(struct ccd_s) != 0)) {
1444 /* XXX: UNLOCK(unique unit numbers) */
1449 LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1450 if (!IS_INITED(tmpcs))
1452 error = copyout(tmpcs, --ubuf,
1453 sizeof(struct ccd_s));
1455 /* XXX: UNLOCK(unique unit numbers) */
1458 /* XXX: UNLOCK(unique unit numbers) */
1468 struct ccdcpps *cpps = (struct ccdcpps *)data;
1469 char *ubuf = cpps->buffer;
1472 for (i = 0; i < cs->sc_nccdisks; ++i)
1473 len += cs->sc_cinfo[i].ci_pathlen;
1475 if (cpps->size == 0) {
1478 } else if (cpps->size != len) {
1482 for (i = 0; i < cs->sc_nccdisks; ++i) {
1483 len = cs->sc_cinfo[i].ci_pathlen;
1484 error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1497 *(struct disklabel *)data = cs->sc_label;
1504 ((struct partinfo *)data)->disklab = &cs->sc_label;
1505 ((struct partinfo *)data)->part =
1506 &cs->sc_label.d_partitions[ccdpart(dev)];
1514 if ((flag & FWRITE) == 0)
1517 if ((error = ccdlock(cs)) != 0)
1520 cs->sc_flags |= CCDF_LABELLING;
1522 error = setdisklabel(&cs->sc_label,
1523 (struct disklabel *)data, 0);
1525 if (cmd == DIOCWDINFO)
1526 error = writedisklabel(CCDLABELDEV(dev),
1530 cs->sc_flags &= ~CCDF_LABELLING;
1542 if ((flag & FWRITE) == 0)
1544 if (*(int *)data != 0)
1545 cs->sc_flags |= CCDF_WLABEL;
1547 cs->sc_flags &= ~CCDF_WLABEL;
1563 if (ccdopen(dev, 0, S_IFCHR, curthread))
1566 cs = ccdfind(ccdunit(dev));
1567 part = ccdpart(dev);
1572 if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1575 size = cs->sc_label.d_partitions[part].p_size;
1577 if (ccdclose(dev, 0, S_IFCHR, curthread))
1587 /* Not implemented. */
1592 * Lookup the provided name in the filesystem. If the file exists,
1593 * is a valid block device, and isn't being used by anyone else,
1594 * set *vpp to the file's vnode.
1597 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1599 struct nameidata nd;
1603 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
1604 flags = FREAD | FWRITE;
1605 if ((error = vn_open(&nd, &flags, 0)) != 0) {
1607 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1608 printf("ccdlookup: vn_open error = %d\n", error);
1614 if (vp->v_usecount > 1) {
1619 if (!vn_isdisk(vp, &error))
1623 if (ccddebug & CCDB_VNODE)
1624 vprint("ccdlookup: vnode info", vp);
1627 VOP_UNLOCK(vp, 0, td);
1628 NDFREE(&nd, NDF_ONLY_PNBUF);
1632 VOP_UNLOCK(vp, 0, td);
1633 NDFREE(&nd, NDF_ONLY_PNBUF);
1634 /* vn_close does vrele() for vp */
1635 (void)vn_close(vp, FREAD|FWRITE, td->td_proc->p_ucred, td);
1640 * Read the disklabel from the ccd. If one is not present, fake one
1644 ccdgetdisklabel(dev_t dev)
1646 int unit = ccdunit(dev);
1647 struct ccd_s *cs = ccdfind(unit);
1649 struct disklabel *lp = &cs->sc_label;
1650 struct ccdgeom *ccg = &cs->sc_geom;
1652 bzero(lp, sizeof(*lp));
1654 lp->d_secperunit = cs->sc_size;
1655 lp->d_secsize = ccg->ccg_secsize;
1656 lp->d_nsectors = ccg->ccg_nsectors;
1657 lp->d_ntracks = ccg->ccg_ntracks;
1658 lp->d_ncylinders = ccg->ccg_ncylinders;
1659 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
1661 strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1662 lp->d_type = DTYPE_CCD;
1663 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1665 lp->d_interleave = 1;
1668 lp->d_partitions[RAW_PART].p_offset = 0;
1669 lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1670 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1671 lp->d_npartitions = RAW_PART + 1;
1673 lp->d_bbsize = BBSIZE; /* XXX */
1674 lp->d_sbsize = SBSIZE; /* XXX */
1676 lp->d_magic = DISKMAGIC;
1677 lp->d_magic2 = DISKMAGIC;
1678 lp->d_checksum = dkcksum(&cs->sc_label);
1681 * Call the generic disklabel extraction routine.
1683 errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1684 if (errstring != NULL)
1685 ccdmakedisklabel(cs);
1688 /* It's actually extremely common to have unlabeled ccds. */
1689 if (ccddebug & CCDB_LABEL)
1690 if (errstring != NULL)
1691 printf("ccd%d: %s\n", unit, errstring);
1696 * Take care of things one might want to take care of in the event
1697 * that a disklabel isn't present.
1700 ccdmakedisklabel(struct ccd_s *cs)
1702 struct disklabel *lp = &cs->sc_label;
1705 * For historical reasons, if there's no disklabel present
1706 * the raw partition must be marked FS_BSDFFS.
1708 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1710 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1714 * Wait interruptibly for an exclusive lock.
1717 * Several drivers do this; it should be abstracted and made MP-safe.
1720 ccdlock(struct ccd_s *cs)
1724 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1725 cs->sc_flags |= CCDF_WANTED;
1726 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1729 cs->sc_flags |= CCDF_LOCKED;
1734 * Unlock and wake up any waiters.
1737 ccdunlock(struct ccd_s *cs)
1740 cs->sc_flags &= ~CCDF_LOCKED;
1741 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1742 cs->sc_flags &= ~CCDF_WANTED;
1749 printiinfo(struct ccdiinfo *ii)
1753 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1754 printf(" itab[%d]: #dk %d sblk %d soff %d",
1755 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1756 for (i = 0; i < ii->ii_ndisk; i++)
1757 printf(" %d", ii->ii_index[i]);