3 /* $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */
6 * Copyright (c) 1995 Jason R. Thorpe.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project
21 * 4. The name of the author may not be used to endorse or promote products
22 * derived from this software without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * Copyright (c) 1988 University of Utah.
39 * Copyright (c) 1990, 1993
40 * The Regents of the University of California. All rights reserved.
42 * This code is derived from software contributed to Berkeley by
43 * the Systems Programming Group of the University of Utah Computer
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. All advertising materials mentioning features or use of this software
55 * must display the following acknowledgement:
56 * This product includes software developed by the University of
57 * California, Berkeley and its contributors.
58 * 4. Neither the name of the University nor the names of its contributors
59 * may be used to endorse or promote products derived from this software
60 * without specific prior written permission.
62 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 * @(#)cd.c 8.2 (Berkeley) 11/16/93
80 * "Concatenated" disk driver.
82 * Dynamic configuration and disklabel support by:
83 * Jason R. Thorpe <thorpej@nas.nasa.gov>
84 * Numerical Aerodynamic Simulation Facility
86 * NASA Ames Research Center
87 * Moffett Field, CA 94035
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/kernel.h>
95 #include <sys/module.h>
98 #include <sys/malloc.h>
99 #include <sys/namei.h>
100 #include <sys/conf.h>
101 #include <sys/stat.h>
102 #include <sys/sysctl.h>
103 #include <sys/disklabel.h>
104 #include <ufs/ffs/fs.h>
105 #include <sys/devicestat.h>
106 #include <sys/fcntl.h>
107 #include <sys/vnode.h>
109 #include <sys/ccdvar.h>
111 #if defined(CCDDEBUG) && !defined(DEBUG)
116 #define CCDB_FOLLOW 0x01
117 #define CCDB_INIT 0x02
119 #define CCDB_LABEL 0x08
120 #define CCDB_VNODE 0x10
121 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
123 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
127 #define ccdunit(x) dkunit(x)
128 #define ccdpart(x) dkpart(x)
131 This is how mirroring works (only writes are special):
133 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
134 linked together by the cb_mirror field. "cb_pflags &
135 CCDPF_MIRROR_DONE" is set to 0 on both of them.
137 When a component returns to ccdiodone(), it checks if "cb_pflags &
138 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
139 flag and returns. If it is, it means its partner has already
140 returned, so it will go to the regular cleanup.
145 struct bio cb_buf; /* new I/O buf */
146 struct bio *cb_obp; /* ptr. to original I/O buf */
147 struct ccdbuf *cb_freenext; /* free list link */
148 int cb_unit; /* target unit */
149 int cb_comp; /* target component */
150 int cb_pflags; /* mirror/parity status flag */
151 struct ccdbuf *cb_mirror; /* mirror counterpart */
154 /* bits in cb_pflags */
155 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
157 #define CCDLABELDEV(dev) \
158 (makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
160 static d_open_t ccdopen;
161 static d_close_t ccdclose;
162 static d_strategy_t ccdstrategy;
163 static d_ioctl_t ccdioctl;
164 static d_dump_t ccddump;
165 static d_psize_t ccdsize;
167 #define NCCDFREEHIWAT 16
169 #define CDEV_MAJOR 74
171 static struct cdevsw ccd_cdevsw = {
173 /* close */ ccdclose,
175 /* write */ physwrite,
176 /* ioctl */ ccdioctl,
179 /* strategy */ ccdstrategy,
181 /* maj */ CDEV_MAJOR,
187 /* called during module initialization */
188 static void ccdattach __P((void));
189 static int ccd_modevent __P((module_t, int, void *));
191 /* called by biodone() at interrupt time */
192 static void ccdiodone __P((struct bio *bp));
194 static void ccdstart __P((struct ccd_softc *, struct bio *));
195 static void ccdinterleave __P((struct ccd_softc *, int));
196 static void ccdintr __P((struct ccd_softc *, struct bio *));
197 static int ccdinit __P((struct ccddevice *, char **, struct proc *));
198 static int ccdlookup __P((char *, struct proc *p, struct vnode **));
199 static void ccdbuffer __P((struct ccdbuf **ret, struct ccd_softc *,
200 struct bio *, daddr_t, caddr_t, long));
201 static void ccdgetdisklabel __P((dev_t));
202 static void ccdmakedisklabel __P((struct ccd_softc *));
203 static int ccdlock __P((struct ccd_softc *));
204 static void ccdunlock __P((struct ccd_softc *));
207 static void printiinfo __P((struct ccdiinfo *));
210 /* Non-private for the benefit of libkvm. */
211 struct ccd_softc *ccd_softc;
212 struct ccddevice *ccddevs;
213 struct ccdbuf *ccdfreebufs;
214 static int numccdfreebufs;
215 static int numccd = 0;
218 * getccdbuf() - Allocate and zero a ccd buffer.
220 * This routine is called at splbio().
/*
 * Hand out a struct ccdbuf, taking the head of the ccdfreebufs list when
 * that list is non-empty and calling malloc() otherwise.
 *
 * cpy: existing ccdbuf to duplicate.  Within this file ccdbuffer() passes
 *      NULL for the first buffer of a request and cb[0] when it builds
 *      the mirror twin.
 */
225 getccdbuf(struct ccdbuf *cpy)
230 * Allocate from freelist or malloc as necessary
232 if ((cbp = ccdfreebufs) != NULL) {
/* Unlink the buffer just taken from the head of the free list. */
233 ccdfreebufs = cbp->cb_freenext;
236 cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
240 * Used by mirroring code
/* NOTE(review): presumably a non-NULL cpy selects the copy and NULL the zero fill -- confirm. */
243 bcopy(cpy, cbp, sizeof(struct ccdbuf));
245 bzero(cbp, sizeof(struct ccdbuf));
248 * independent struct bio initialization
255 * putccdbuf() - Free a ccd buffer.
257 * This routine is called at splbio().
/*
 * Give back a ccdbuf.  While fewer than NCCDFREEHIWAT buffers are cached
 * (numccdfreebufs), the buffer is linked in front of the ccdfreebufs list;
 * the free() call covers the remaining case.
 */
262 putccdbuf(struct ccdbuf *cbp)
265 if (numccdfreebufs < NCCDFREEHIWAT) {
266 cbp->cb_freenext = ccdfreebufs;
/* NOTE(review): presumably the else arm of the high-water test -- confirm. */
270 free((caddr_t)cbp, M_DEVBUF);
276 * Number of blocks to leave untouched in front of a component partition.
277 * This is to avoid violating its disklabel area when it starts at the
278 * beginning of the slice.
280 #if !defined(CCD_OFFSET)
281 #define CCD_OFFSET 16
/*
 * dev_clone event handler (registered through EVENTHANDLER_REGISTER in the
 * attach code).  Creates the device node for the requested name.
 * NOTE(review): presumably dev_stdclone() leaves the unit number in u and
 * points s at the text that follows it -- confirm against its definition.
 */
285 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
292 i = dev_stdclone(name, &s, "ccd", &u);
/* The character at s is tested against the partition letters 'a'..'h'. */
297 if (*s < 'a' || *s > 'h')
/* Minor number: eight slots per unit, with letter 'a' mapping to slot 0. */
301 *dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
302 UID_ROOT, GID_OPERATOR, 0640, name);
306 * Called by main() during pseudo-device attachment. All we need
307 * to do is allocate enough space for devices to be configured later, and
/* Announce the driver; the plural form names the unit range 0..num-1. */
317 printf("ccd0-%d: Concatenated disk drivers\n", num-1);
319 printf("ccd0: Concatenated disk driver\n");
/* One ccd_softc and one ccddevice entry are allocated per unit (num of each). */
321 ccd_softc = (struct ccd_softc *)malloc(num * sizeof(struct ccd_softc),
323 ccddevs = (struct ccddevice *)malloc(num * sizeof(struct ccddevice),
/* If either allocation failed, warn and release the one that succeeded. */
325 if ((ccd_softc == NULL) || (ccddevs == NULL)) {
326 printf("WARNING: no memory for concatenated disks\n");
327 if (ccd_softc != NULL)
328 free(ccd_softc, M_DEVBUF);
330 free(ccddevs, M_DEVBUF);
/* Both tables start out zero-filled. */
334 bzero(ccd_softc, num * sizeof(struct ccd_softc));
335 bzero(ccddevs, num * sizeof(struct ccddevice));
/* Register the character device switch for the driver. */
337 cdevsw_add(&ccd_cdevsw);
338 /* XXX: is this necessary? */
339 for (i = 0; i < numccd; ++i)
340 ccddevs[i].ccd_dk = -1;
/* Hook ccd_clone() into the dev_clone event. */
341 EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
/* Module event handler; hooked up by the DEV_MODULE() declaration below. */
345 ccd_modevent(mod, type, data)
/* The driver reports that it cannot be unloaded. */
358 printf("ccd0: Unload not supported!\n");
362 default: /* MOD_SHUTDOWN etc */
/* Declare the "ccd" module with ccd_modevent as its event handler. */
368 DEV_MODULE(ccd, ccd_modevent, NULL);
371 ccdinit(ccd, cpaths, p)
372 struct ccddevice *ccd;
376 struct ccd_softc *cs = &ccd_softc[ccd->ccd_unit];
377 struct ccdcinfo *ci = NULL; /* XXX */
383 struct partinfo dpart;
384 struct ccdgeom *ccg = &cs->sc_geom;
385 char tmppath[MAXPATHLEN];
389 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
390 printf("ccdinit: unit %d\n", ccd->ccd_unit);
394 cs->sc_ileave = ccd->ccd_interleave;
395 cs->sc_nccdisks = ccd->ccd_ndev;
397 /* Allocate space for the component info. */
398 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
402 * Verify that each component piece exists and record
403 * relevant information about it.
407 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
408 vp = ccd->ccd_vpp[ix];
409 ci = &cs->sc_cinfo[ix];
413 * Copy in the pathname of the component.
415 bzero(tmppath, sizeof(tmppath)); /* sanity */
416 if ((error = copyinstr(cpaths[ix], tmppath,
417 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
419 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
420 printf("ccd%d: can't copy path, error = %d\n",
421 ccd->ccd_unit, error);
425 ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
426 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
428 ci->ci_dev = vn_todev(vp);
431 * Get partition information for the component.
433 if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
434 FREAD, p->p_ucred, p)) != 0) {
436 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
437 printf("ccd%d: %s: ioctl failed, error = %d\n",
438 ccd->ccd_unit, ci->ci_path, error);
442 if (dpart.part->p_fstype == FS_BSDFFS) {
444 ((dpart.disklab->d_secsize > maxsecsize) ?
445 dpart.disklab->d_secsize : maxsecsize);
446 size = dpart.part->p_size - CCD_OFFSET;
449 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
450 printf("ccd%d: %s: incorrect partition type\n",
451 ccd->ccd_unit, ci->ci_path);
458 * Calculate the size, truncating to an interleave
459 * boundary if necessary.
462 if (cs->sc_ileave > 1)
463 size -= size % cs->sc_ileave;
467 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
468 printf("ccd%d: %s: size == 0\n",
469 ccd->ccd_unit, ci->ci_path);
475 if (minsize == 0 || size < minsize)
482 * Don't allow the interleave to be smaller than
483 * the biggest component sector.
485 if ((cs->sc_ileave > 0) &&
486 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
488 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
489 printf("ccd%d: interleave must be at least %d\n",
490 ccd->ccd_unit, (maxsecsize / DEV_BSIZE));
497 * If uniform interleave is desired set all sizes to that of
498 * the smallest component. This will guarantee that a single
499 * interleave table is generated.
501 * Lost space must be taken into account when calculating the
502 * overall size. Half the space is lost when CCDF_MIRROR is
503 * specified. One disk is lost when CCDF_PARITY is specified.
505 if (ccd->ccd_flags & CCDF_UNIFORM) {
506 for (ci = cs->sc_cinfo;
507 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
508 ci->ci_size = minsize;
510 if (ccd->ccd_flags & CCDF_MIRROR) {
512 * Check to see if an even number of components
513 * have been specified. The interleave must also
514 * be non-zero in order for us to be able to
515 * guarantee the topology.
517 if (cs->sc_nccdisks % 2) {
518 printf("ccd%d: mirroring requires an even number of disks\n", ccd->ccd_unit );
522 if (cs->sc_ileave == 0) {
523 printf("ccd%d: an interleave must be specified when mirroring\n", ccd->ccd_unit);
527 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
528 } else if (ccd->ccd_flags & CCDF_PARITY) {
529 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
531 if (cs->sc_ileave == 0) {
532 printf("ccd%d: an interleave must be specified when using parity\n", ccd->ccd_unit);
536 cs->sc_size = cs->sc_nccdisks * minsize;
541 * Construct the interleave table.
543 ccdinterleave(cs, ccd->ccd_unit);
546 * Create pseudo-geometry based on 1MB cylinders. It's
549 ccg->ccg_secsize = maxsecsize;
550 ccg->ccg_ntracks = 1;
551 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
552 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
555 * Add a devstat entry for this device.
557 devstat_add_entry(&cs->device_stats, "ccd", ccd->ccd_unit,
558 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
559 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
560 DEVSTAT_PRIORITY_ARRAY);
562 cs->sc_flags |= CCDF_INITED;
563 cs->sc_cflags = ccd->ccd_flags; /* So we can find out later... */
564 cs->sc_unit = ccd->ccd_unit;
567 while (ci > cs->sc_cinfo) {
569 free(ci->ci_path, M_DEVBUF);
571 free(cs->sc_cinfo, M_DEVBUF);
576 ccdinterleave(cs, unit)
577 struct ccd_softc *cs;
580 struct ccdcinfo *ci, *smallci;
587 if (ccddebug & CCDB_INIT)
588 printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
592 * Allocate an interleave table. The worst case occurs when each
593 * of N disks is of a different size, resulting in N interleave
596 * Chances are this is too big, but we don't care.
598 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
599 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
603 * Trivial case: no interleave (actually interleave of disk size).
604 * Each table entry represents a single component in its entirety.
606 * An interleave of 0 may not be used with a mirror or parity setup.
608 if (cs->sc_ileave == 0) {
612 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
613 /* Allocate space for ii_index. */
614 ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
616 ii->ii_startblk = bn;
618 ii->ii_index[0] = ix;
619 bn += cs->sc_cinfo[ix].ci_size;
624 if (ccddebug & CCDB_INIT)
625 printiinfo(cs->sc_itable);
631 * The following isn't fast or pretty; it doesn't have to be.
635 for (ii = cs->sc_itable; ; ii++) {
637 * Allocate space for ii_index. We might allocate more than
640 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
644 * Locate the smallest of the remaining components
647 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
649 if (ci->ci_size > size &&
651 ci->ci_size < smallci->ci_size)) {
657 * Nobody left, all done
659 if (smallci == NULL) {
665 * Record starting logical block using an sc_ileave blocksize.
667 ii->ii_startblk = bn / cs->sc_ileave;
670 * Record starting component block using an sc_ileave
671 * blocksize. This value is relative to the beginning of
674 ii->ii_startoff = lbn;
677 * Determine how many disks take part in this interleave
678 * and record their indices.
681 for (ci = cs->sc_cinfo;
682 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
683 if (ci->ci_size >= smallci->ci_size) {
684 ii->ii_index[ix++] = ci - cs->sc_cinfo;
688 bn += ix * (smallci->ci_size - size);
689 lbn = smallci->ci_size / cs->sc_ileave;
690 size = smallci->ci_size;
693 if (ccddebug & CCDB_INIT)
694 printiinfo(cs->sc_itable);
/*
 * Open entry point (the d_open_t slot of ccd_cdevsw).  Validates the
 * requested partition against the in-core label and records it in
 * sc_openmask.
 */
700 ccdopen(dev, flags, fmt, p)
705 int unit = ccdunit(dev);
706 struct ccd_softc *cs;
707 struct disklabel *lp;
708 int error = 0, part, pmask;
711 if (ccddebug & CCDB_FOLLOW)
712 printf("ccdopen(%p, %x)\n", dev, flags);
716 cs = &ccd_softc[unit];
/* Take the per-unit lock; a non-zero result from ccdlock() is an error code. */
718 if ((error = ccdlock(cs)) != 0)
727 * If we're initialized, check to see if there are any other
728 * open partitions. If not, then it's safe to update
729 * the in-core disklabel.
731 if ((cs->sc_flags & CCDF_INITED) && (cs->sc_openmask == 0))
732 ccdgetdisklabel(dev);
734 /* Check that the partition exists. */
/* RAW_PART is exempt from the label check. */
735 if (part != RAW_PART && ((part >= lp->d_npartitions) ||
736 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
/* Mark this partition as open. */
741 cs->sc_openmask |= pmask;
/*
 * Close entry point (the d_close_t slot of ccd_cdevsw).  Clears the
 * partition's bit in sc_openmask under the per-unit lock.
 */
749 ccdclose(dev, flags, fmt, p)
754 int unit = ccdunit(dev);
755 struct ccd_softc *cs;
759 if (ccddebug & CCDB_FOLLOW)
760 printf("ccdclose(%p, %x)\n", dev, flags);
765 cs = &ccd_softc[unit];
/* A non-zero result from ccdlock() is an error code. */
767 if ((error = ccdlock(cs)) != 0)
772 /* ...that much closer to allowing unconfiguration... */
773 cs->sc_openmask &= ~(1 << part);
/*
 * Strategy entry point: validates and clips a request before it is handed
 * on for component I/O.
 * NOTE(review): the function header is not visible here; the debug printf
 * below identifies this as ccdstrategy(bp).
 */
782 int unit = ccdunit(bp->bio_dev);
783 struct ccd_softc *cs = &ccd_softc[unit];
786 struct disklabel *lp;
789 if (ccddebug & CCDB_FOLLOW)
790 printf("ccdstrategy(%p): unit %d\n", bp, unit);
/* A unit that has not been configured fails the request with ENXIO. */
792 if ((cs->sc_flags & CCDF_INITED) == 0) {
793 biofinish(bp, NULL, ENXIO);
797 /* If it's a nil transfer, wake up the top half now. */
798 if (bp->bio_bcount == 0) {
806 * Do bounds checking and adjust transfer. If there's an
807 * error, the bounds check will flag that for us.
/* wlabel is non-zero when either CCDF_WLABEL or CCDF_LABELLING is set. */
809 wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
/* Non-raw partitions are checked against the disklabel. */
810 if (ccdpart(bp->bio_dev) != RAW_PART) {
811 if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
/* NOTE(review): presumably the raw-partition arm, checked against sc_size directly -- confirm. */
816 int pbn; /* in sc_secsize chunks */
817 long sz; /* in sc_secsize chunks */
/* Convert the DEV_BSIZE block number and the byte count to ccg_secsize units. */
819 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
820 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
823 * If out of bounds return an error. If at the EOF point,
824 * simply read or write less.
827 if (pbn < 0 || pbn >= cs->sc_size) {
828 bp->bio_resid = bp->bio_bcount;
/* EINVAL is raised only when pbn is not exactly at sc_size. */
829 if (pbn != cs->sc_size)
830 biofinish(bp, NULL, EINVAL);
837 * If the request crosses EOF, truncate the request.
839 if (pbn + sz > cs->sc_size) {
840 bp->bio_bcount = (cs->sc_size - pbn) *
841 cs->sc_geom.ccg_secsize;
/* resid starts at the full count; ccdiodone() subtracts from it per completed piece. */
845 bp->bio_resid = bp->bio_bcount;
/*
 * Split one request into per-component buffers and issue them.
 * NOTE(review): the function header is not visible here; the debug printf
 * below identifies this as ccdstart(cs, bp).
 */
858 struct ccd_softc *cs;
862 struct ccdbuf *cbp[4];
863 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
866 struct partition *pp;
869 if (ccddebug & CCDB_FOLLOW)
870 printf("ccdstart(%p, %p)\n", cs, bp);
873 /* Record the transaction start */
874 devstat_start_transaction(&cs->device_stats);
877 * Translate the partition-relative block number to an absolute.
880 if (ccdpart(bp->bio_dev) != RAW_PART) {
881 pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
886 * Allocate component buffers and fire off the requests
/*
 * Each pass has ccdbuffer() fill cbp[]; rcount, the byte count of
 * cbp[0], is what the loop subtracts from bcount.
 */
889 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
890 ccdbuffer(cbp, cs, bp, bn, addr, bcount);
891 rcount = cbp[0]->cb_buf.bio_bcount;
893 if (cs->sc_cflags & CCDF_MIRROR) {
895 * Mirroring. Writes go to both disks, reads are
896 * taken from whichever disk seems most appropriate.
898 * We attempt to localize reads to the disk whos arm
899 * is nearest the read request. We ignore seeks due
900 * to writes when making this determination and we
901 * also try to avoid hogging.
903 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
/* Writes are issued on both buffers of the pair. */
904 BIO_STRATEGY(&cbp[0]->cb_buf, 0);
905 BIO_STRATEGY(&cbp[1]->cb_buf, 0);
/*
 * Reads: keep the current pick unless bn lies more than range
 * (sc_size / 16) away from the block recorded for it, in which
 * case flip to the other buffer of the pair.
 */
907 int pick = cs->sc_pick;
908 daddr_t range = cs->sc_size / 16;
910 if (bn < cs->sc_blk[pick] - range ||
911 bn > cs->sc_blk[pick] + range
913 cs->sc_pick = pick = 1 - pick;
/* Record where this read ends for the side that was chosen. */
915 cs->sc_blk[pick] = bn + btodb(rcount);
916 BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
/* NOTE(review): presumably the non-mirrored case -- confirm. */
922 BIO_STRATEGY(&cbp[0]->cb_buf, 0);
930 * Build a component buffer header.
933 ccdbuffer(cb, cs, bp, bn, addr, bcount)
935 struct ccd_softc *cs;
941 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
947 if (ccddebug & CCDB_IO)
948 printf("ccdbuffer(%p, %p, %d, %p, %ld)\n",
949 cs, bp, bn, addr, bcount);
952 * Determine which component bn falls in.
957 if (cs->sc_ileave == 0) {
959 * Serially concatenated and neither a mirror nor a parity
960 * config. This is a special case.
965 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
973 * Calculate cbn, the logical superblock (sc_ileave chunks),
974 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
977 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
978 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
981 * Figure out which interleave table to use.
983 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
984 if (ii->ii_startblk > cbn)
990 * off is the logical superblock relative to the beginning
991 * of this interleave block.
993 off = cbn - ii->ii_startblk;
996 * We must calculate which disk component to use (ccdisk),
997 * and recalculate cbn to be the superblock relative to
998 * the beginning of the component. This is typically done by
999 * adding 'off' and ii->ii_startoff together. However, 'off'
1000 * must typically be divided by the number of components in
1001 * this interleave array to properly convert it from a
1002 * CCD-relative logical superblock number to a
1003 * component-relative superblock number.
1005 if (ii->ii_ndisk == 1) {
1007 * When we have just one disk, it can't be a mirror
1008 * or a parity config.
1010 ccdisk = ii->ii_index[0];
1011 cbn = ii->ii_startoff + off;
1013 if (cs->sc_cflags & CCDF_MIRROR) {
1015 * We have forced a uniform mapping, resulting
1016 * in a single interleave array. We double
1017 * up on the first half of the available
1018 * components and our mirror is in the second
1019 * half. This only works with a single
1020 * interleave array because doubling up
1021 * doubles the number of sectors, so there
1022 * cannot be another interleave array because
1023 * the next interleave array's calculations
1026 int ndisk2 = ii->ii_ndisk / 2;
1027 ccdisk = ii->ii_index[off % ndisk2];
1028 cbn = ii->ii_startoff + off / ndisk2;
1029 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1030 } else if (cs->sc_cflags & CCDF_PARITY) {
1032 * XXX not implemented yet
1034 int ndisk2 = ii->ii_ndisk - 1;
1035 ccdisk = ii->ii_index[off % ndisk2];
1036 cbn = ii->ii_startoff + off / ndisk2;
1037 if (cbn % ii->ii_ndisk <= ccdisk)
1040 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1041 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1045 ci = &cs->sc_cinfo[ccdisk];
1048 * Convert cbn from a superblock to a normal block so it
1049 * can be used to calculate (along with cboff) the normal
1050 * block index into this particular disk.
1052 cbn *= cs->sc_ileave;
1056 * Fill in the component buf structure.
1058 cbp = getccdbuf(NULL);
1059 cbp->cb_buf.bio_cmd = bp->bio_cmd;
1060 cbp->cb_buf.bio_done = ccdiodone;
1061 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */
1062 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
1063 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1064 cbp->cb_buf.bio_data = addr;
1065 if (cs->sc_ileave == 0)
1066 cbc = dbtob((off_t)(ci->ci_size - cbn));
1068 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1069 cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
1070 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
1073 * context for ccdiodone
1076 cbp->cb_unit = cs - ccd_softc;
1077 cbp->cb_comp = ci - cs->sc_cinfo;
1080 if (ccddebug & CCDB_IO)
1081 printf(" dev %p(u%ld): cbp %p bn %d addr %p bcnt %ld\n",
1082 ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
1083 cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1084 cbp->cb_buf.bio_bcount);
1089 * Note: both I/O's setup when reading from mirror, but only one
1092 if (cs->sc_cflags & CCDF_MIRROR) {
1093 /* mirror, setup second I/O */
1094 cbp = getccdbuf(cb[0]);
1095 cbp->cb_buf.bio_dev = ci2->ci_dev;
1096 cbp->cb_comp = ci2 - cs->sc_cinfo;
1098 /* link together the ccdbuf's and clear "mirror done" flag */
1099 cb[0]->cb_mirror = cb[1];
1100 cb[1]->cb_mirror = cb[0];
1101 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1102 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
/*
 * Complete the original request once its component I/O is finished.
 * NOTE(review): the function header is not visible here; the debug printf
 * below identifies this as ccdintr(cs, bp).
 */
1108 struct ccd_softc *cs;
1112 if (ccddebug & CCDB_FOLLOW)
1113 printf("ccdintr(%p, %p)\n", cs, bp);
1116 * Request is done for better or worse, wakeup the top half.
/* On error the residual is reset to the full byte count. */
1118 if (bp->bio_flags & BIO_ERROR)
1119 bp->bio_resid = bp->bio_bcount;
/* Finish the bio, passing the unit's devstat record along. */
1120 biofinish(bp, &cs->device_stats, 0);
1124 * Called at interrupt time.
1125 * Mark the component as done and if all components are done,
1126 * take a ccd interrupt.
1132 struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1133 struct bio *bp = cbp->cb_obp;
1134 int unit = cbp->cb_unit;
1139 if (ccddebug & CCDB_FOLLOW)
1140 printf("ccdiodone(%p)\n", cbp);
1141 if (ccddebug & CCDB_IO) {
1142 printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1143 bp, bp->bio_bcount, bp->bio_resid);
1144 printf(" dev %p(u%d), cbp %p bn %d addr %p bcnt %ld\n",
1145 cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1146 cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1147 cbp->cb_buf.bio_bcount);
1151 * If an error occurred, report it. If this is a mirrored
1152 * configuration and the first of two possible reads, do not
1153 * set the error in the bp yet because the second read may
1157 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1158 const char *msg = "";
1160 if ((ccd_softc[unit].sc_cflags & CCDF_MIRROR) &&
1161 (cbp->cb_buf.bio_cmd == BIO_READ) &&
1162 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1164 * We will try our read on the other disk down
1165 * below, also reverse the default pick so if we
1166 * are doing a scan we do not keep hitting the
1169 struct ccd_softc *cs = &ccd_softc[unit];
1171 msg = ", trying other disk";
1172 cs->sc_pick = 1 - cs->sc_pick;
1173 cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1175 bp->bio_flags |= BIO_ERROR;
1176 bp->bio_error = cbp->cb_buf.bio_error ?
1177 cbp->cb_buf.bio_error : EIO;
1179 printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1180 unit, bp->bio_error, cbp->cb_comp,
1181 (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1185 * Process mirror. If we are writing, I/O has been initiated on both
1186 * buffers and we fall through only after both are finished.
1188 * If we are reading only one I/O is initiated at a time. If an
1189 * error occurs we initiate the second I/O and return, otherwise
1190 * we free the second I/O without initiating it.
1193 if (ccd_softc[unit].sc_cflags & CCDF_MIRROR) {
1194 if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1196 * When writing, handshake with the second buffer
1197 * to determine when both are done. If both are not
1198 * done, return here.
1200 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1201 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1208 * When reading, either dispose of the second buffer
1209 * or initiate I/O on the second buffer if an error
1210 * occurred with this one.
1212 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1213 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1214 cbp->cb_mirror->cb_pflags |=
1216 BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1221 putccdbuf(cbp->cb_mirror);
1229 * use bio_caller1 to determine how big the original request was rather
1230 * than bio_bcount, because bio_bcount may have been truncated for EOF.
1232 * XXX We check for an error, but we do not test the resid for an
1233 * aligned EOF condition. This may result in character & block
1234 * device access not recognizing EOF properly when read or written
1235 * sequentially, but will not affect filesystems.
1237 count = (long)cbp->cb_buf.bio_caller1;
1241 * If all done, "interrupt".
1243 bp->bio_resid -= count;
1244 if (bp->bio_resid < 0)
1245 panic("ccdiodone: count");
1246 if (bp->bio_resid == 0)
1247 ccdintr(&ccd_softc[unit], bp);
1252 ccdioctl(dev, cmd, data, flag, p)
1259 int unit = ccdunit(dev);
1260 int i, j, lookedup = 0, error = 0;
1262 struct ccd_softc *cs;
1263 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1264 struct ccddevice ccd;
1270 cs = &ccd_softc[unit];
1272 bzero(&ccd, sizeof(ccd));
1276 if (cs->sc_flags & CCDF_INITED)
1279 if ((flag & FWRITE) == 0)
1282 if ((error = ccdlock(cs)) != 0)
1285 /* Fill in some important bits. */
1286 ccd.ccd_unit = unit;
1287 ccd.ccd_interleave = ccio->ccio_ileave;
1288 if (ccd.ccd_interleave == 0 &&
1289 ((ccio->ccio_flags & CCDF_MIRROR) ||
1290 (ccio->ccio_flags & CCDF_PARITY))) {
1291 printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1292 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1294 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1295 (ccio->ccio_flags & CCDF_PARITY)) {
1296 printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1297 ccio->ccio_flags &= ~CCDF_PARITY;
1299 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1300 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1301 printf("ccd%d: mirror/parity forces uniform flag\n",
1303 ccio->ccio_flags |= CCDF_UNIFORM;
1305 ccd.ccd_flags = ccio->ccio_flags & CCDF_USERMASK;
1308 * Allocate space for and copy in the array of
1309 * component pathnames and device numbers.
1311 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1312 M_DEVBUF, M_WAITOK);
1313 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1314 M_DEVBUF, M_WAITOK);
1316 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1317 ccio->ccio_ndisks * sizeof(char **));
1319 free(vpp, M_DEVBUF);
1320 free(cpp, M_DEVBUF);
1326 if (ccddebug & CCDB_INIT)
1327 for (i = 0; i < ccio->ccio_ndisks; ++i)
1328 printf("ccdioctl: component %d: %p\n",
1332 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1334 if (ccddebug & CCDB_INIT)
1335 printf("ccdioctl: lookedup = %d\n", lookedup);
1337 if ((error = ccdlookup(cpp[i], p, &vpp[i])) != 0) {
1338 for (j = 0; j < lookedup; ++j)
1339 (void)vn_close(vpp[j], FREAD|FWRITE,
1341 free(vpp, M_DEVBUF);
1342 free(cpp, M_DEVBUF);
1350 ccd.ccd_ndev = ccio->ccio_ndisks;
1353 * Initialize the ccd. Fills in the softc for us.
1355 if ((error = ccdinit(&ccd, cpp, p)) != 0) {
1356 for (j = 0; j < lookedup; ++j)
1357 (void)vn_close(vpp[j], FREAD|FWRITE,
1359 bzero(&ccd_softc[unit], sizeof(struct ccd_softc));
1360 free(vpp, M_DEVBUF);
1361 free(cpp, M_DEVBUF);
1367 * The ccd has been successfully initialized, so
1368 * we can place it into the array and read the disklabel.
1370 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1371 ccio->ccio_unit = unit;
1372 ccio->ccio_size = cs->sc_size;
1373 ccdgetdisklabel(dev);
1380 if ((cs->sc_flags & CCDF_INITED) == 0)
1383 if ((flag & FWRITE) == 0)
1386 if ((error = ccdlock(cs)) != 0)
1389 /* Don't unconfigure if any other partitions are open */
1390 part = ccdpart(dev);
1391 pmask = (1 << part);
1392 if ((cs->sc_openmask & ~pmask)) {
1398 * Free ccd_softc information and clear entry.
1401 /* Close the components and free their pathnames. */
1402 for (i = 0; i < cs->sc_nccdisks; ++i) {
1404 * XXX: this close could potentially fail and
1405 * cause Bad Things. Maybe we need to force
1406 * the close to happen?
1409 if (ccddebug & CCDB_VNODE)
1410 vprint("CCDIOCCLR: vnode info",
1411 cs->sc_cinfo[i].ci_vp);
1413 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1415 free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1418 /* Free interleave index. */
1419 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1420 free(cs->sc_itable[i].ii_index, M_DEVBUF);
1422 /* Free component info and interleave table. */
1423 free(cs->sc_cinfo, M_DEVBUF);
1424 free(cs->sc_itable, M_DEVBUF);
1425 cs->sc_flags &= ~CCDF_INITED;
1428 * Free ccddevice information and clear entry.
1430 free(ccddevs[unit].ccd_cpp, M_DEVBUF);
1431 free(ccddevs[unit].ccd_vpp, M_DEVBUF);
1433 bcopy(&ccd, &ccddevs[unit], sizeof(ccd));
1436 * And remove the devstat entry.
1438 devstat_remove_entry(&cs->device_stats);
1440 /* This must be atomic. */
1443 bzero(cs, sizeof(struct ccd_softc));
1449 if ((cs->sc_flags & CCDF_INITED) == 0)
1452 *(struct disklabel *)data = cs->sc_label;
1456 if ((cs->sc_flags & CCDF_INITED) == 0)
1459 ((struct partinfo *)data)->disklab = &cs->sc_label;
1460 ((struct partinfo *)data)->part =
1461 &cs->sc_label.d_partitions[ccdpart(dev)];
1466 if ((cs->sc_flags & CCDF_INITED) == 0)
1469 if ((flag & FWRITE) == 0)
1472 if ((error = ccdlock(cs)) != 0)
1475 cs->sc_flags |= CCDF_LABELLING;
1477 error = setdisklabel(&cs->sc_label,
1478 (struct disklabel *)data, 0);
1480 if (cmd == DIOCWDINFO)
1481 error = writedisklabel(CCDLABELDEV(dev),
1485 cs->sc_flags &= ~CCDF_LABELLING;
1494 if ((cs->sc_flags & CCDF_INITED) == 0)
1497 if ((flag & FWRITE) == 0)
1499 if (*(int *)data != 0)
1500 cs->sc_flags |= CCDF_WLABEL;
1502 cs->sc_flags &= ~CCDF_WLABEL;
1516 struct ccd_softc *cs;
1519 if (ccdopen(dev, 0, S_IFCHR, curproc))
1522 cs = &ccd_softc[ccdunit(dev)];
1523 part = ccdpart(dev);
1525 if ((cs->sc_flags & CCDF_INITED) == 0)
1528 if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1531 size = cs->sc_label.d_partitions[part].p_size;
1533 if (ccdclose(dev, 0, S_IFCHR, curproc))
1544 /* Not implemented. */
1549 * Lookup the provided name in the filesystem. If the file exists,
1550 * is a valid block device, and isn't being used by anyone else,
1551 * set *vpp to the file's vnode.
1554 ccdlookup(path, p, vpp)
1557 struct vnode **vpp; /* result */
1559 struct nameidata nd;
/* Open the component read/write; NDINIT is told that path is a user-space string. */
1563 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
1564 flags = FREAD | FWRITE;
1565 if ((error = vn_open(&nd, &flags, 0)) != 0) {
/*
 * `&' binds tighter than `|', so the unparenthesised form
 * `ccddebug & CCDB_FOLLOW|CCDB_INIT' is evaluated as
 * `(ccddebug & CCDB_FOLLOW) | CCDB_INIT' and is true whenever
 * CCDB_INIT is nonzero, whatever ccddebug holds.  Parenthesise the
 * mask so the message is printed only when one of the two debug bits
 * is actually set in ccddebug.
 */
1567 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1568 printf("ccdlookup: vn_open error = %d\n", error);
/* A use count above one means somebody besides this lookup holds the vnode. */
1574 if (vp->v_usecount > 1) {
/* vn_isdisk() is handed &error, presumably so it can report why it refused. */
1579 if (!vn_isdisk(vp, &error))
1583 if (ccddebug & CCDB_VNODE)
1584 vprint("ccdlookup: vnode info", vp);
/*
 * Two exit sequences follow.  Both unlock the vnode and release the
 * pathname buffer held by the nameidata; the second one additionally
 * closes the vnode that vn_open() returned.
 */
1587 VOP_UNLOCK(vp, 0, p);
1588 NDFREE(&nd, NDF_ONLY_PNBUF);
1592 VOP_UNLOCK(vp, 0, p);
1593 NDFREE(&nd, NDF_ONLY_PNBUF);
1594 /* vn_close does vrele() for vp */
1595 (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
1600 * Read the disklabel from the ccd. If one is not present, fake one
1604 ccdgetdisklabel(dev)
1607 int unit = ccdunit(dev);
1608 struct ccd_softc *cs = &ccd_softc[unit];
1610 struct disklabel *lp = &cs->sc_label;
1611 struct ccdgeom *ccg = &cs->sc_geom;
/* Start from a zeroed in-core label (lp points at cs->sc_label) and fill in defaults. */
1613 bzero(lp, sizeof(*lp));
/* Size and geometry are copied from the softc and its ccdgeom record. */
1615 lp->d_secperunit = cs->sc_size;
1616 lp->d_secsize = ccg->ccg_secsize;
1617 lp->d_nsectors = ccg->ccg_nsectors;
1618 lp->d_ntracks = ccg->ccg_ntracks;
1619 lp->d_ncylinders = ccg->ccg_ncylinders;
/* Sectors per cylinder is derived from the two values just stored. */
1620 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
/*
 * Type and pack names.  strncpy() is bounded by the destination field
 * size and does not append a terminator when the source fills the field.
 */
1622 strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1623 lp->d_type = DTYPE_CCD;
1624 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1626 lp->d_interleave = 1;
/* Only the raw partition is set up by default; it starts at 0 and spans sc_size. */
1629 lp->d_partitions[RAW_PART].p_offset = 0;
1630 lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1631 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1632 lp->d_npartitions = RAW_PART + 1;
1634 lp->d_bbsize = BBSIZE; /* XXX */
1635 lp->d_sbsize = SBSIZE; /* XXX */
/* Stamp both magic fields, then checksum the finished label (same object as lp). */
1637 lp->d_magic = DISKMAGIC;
1638 lp->d_magic2 = DISKMAGIC;
1639 lp->d_checksum = dkcksum(&cs->sc_label);
1642 * Call the generic disklabel extraction routine.
/*
 * readdisklabel() is given the in-core label built above.  A non-NULL
 * return is treated as an error message, in which case
 * ccdmakedisklabel() adjusts the defaults for the unlabeled case.
 */
1644 errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1645 if (errstring != NULL)
1646 ccdmakedisklabel(cs);
1649 /* It's actually extremely common to have unlabeled ccds. */
/* The error string is printed only when the CCDB_LABEL debug bit is set. */
1650 if (ccddebug & CCDB_LABEL)
1651 if (errstring != NULL)
1652 printf("ccd%d: %s\n", unit, errstring);
1657 * Take care of things one might want to take care of in the event
1658 * that a disklabel isn't present.
1661 ccdmakedisklabel(cs)
1662 struct ccd_softc *cs;
/* Operates directly on the label embedded in the softc. */
1664 struct disklabel *lp = &cs->sc_label;
1667 * For historical reasons, if there's no disklabel present
1668 * the raw partition must be marked FS_BSDFFS.
1670 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
/*
 * Overwrite the pack name already in the label (ccdgetdisklabel() stores
 * "fictitious" there before calling this routine).
 */
1672 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1676 * Wait interruptibly for an exclusive lock.
1679 * Several drivers do this; it should be abstracted and made MP-safe.
1683 struct ccd_softc *cs;
1687 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
1688 cs->sc_flags |= CCDF_WANTED;
1689 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
1692 cs->sc_flags |= CCDF_LOCKED;
1697 * Unlock and wake up any waiters.
1701 struct ccd_softc *cs;
1704 cs->sc_flags &= ~CCDF_LOCKED;
1705 if ((cs->sc_flags & CCDF_WANTED) != 0) {
1706 cs->sc_flags &= ~CCDF_WANTED;
1714 struct ccdiinfo *ii;
1718 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
1719 printf(" itab[%d]: #dk %d sblk %d soff %d",
1720 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
1721 for (i = 0; i < ii->ii_ndisk; i++)
1722 printf(" %d", ii->ii_index[i]);