3 /* $NetBSD: ccd.c,v 1.22 1995/12/08 19:13:26 thorpej Exp $ */
6 * Copyright (c) 1995 Jason R. Thorpe.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project
21 * 4. The name of the author may not be used to endorse or promote products
22 * derived from this software without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
25 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * Copyright (c) 1988 University of Utah.
39 * Copyright (c) 1990, 1993
40 * The Regents of the University of California. All rights reserved.
42 * This code is derived from software contributed to Berkeley by
43 * the Systems Programming Group of the University of Utah Computer
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. All advertising materials mentioning features or use of this software
55 * must display the following acknowledgement:
56 * This product includes software developed by the University of
57 * California, Berkeley and its contributors.
58 * 4. Neither the name of the University nor the names of its contributors
59 * may be used to endorse or promote products derived from this software
60 * without specific prior written permission.
62 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * from: Utah $Hdr: cd.c 1.6 90/11/28$
76 * @(#)cd.c 8.2 (Berkeley) 11/16/93
80 * "Concatenated" disk driver.
82 * Dynamic configuration and disklabel support by:
83 * Jason R. Thorpe <thorpej@nas.nasa.gov>
84 * Numerical Aerodynamic Simulation Facility
86 * NASA Ames Research Center
87 * Moffett Field, CA 94035
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/kernel.h>
93 #include <sys/module.h>
96 #include <sys/malloc.h>
97 #include <sys/namei.h>
100 #include <sys/sysctl.h>
101 #include <sys/disklabel.h>
102 #include <ufs/ffs/fs.h>
103 #include <sys/devicestat.h>
104 #include <sys/fcntl.h>
105 #include <sys/vnode.h>
107 #include <sys/ccdvar.h>
109 MALLOC_DEFINE(M_CCD, "CCD driver", "Concatenated Disk driver");
111 #if defined(CCDDEBUG) && !defined(DEBUG)
116 #define CCDB_FOLLOW 0x01
117 #define CCDB_INIT 0x02
119 #define CCDB_LABEL 0x08
120 #define CCDB_VNODE 0x10
121 static int ccddebug = CCDB_FOLLOW | CCDB_INIT | CCDB_IO | CCDB_LABEL |
123 SYSCTL_INT(_debug, OID_AUTO, ccddebug, CTLFLAG_RW, &ccddebug, 0, "");
126 #define ccdunit(x) dkunit(x)
127 #define ccdpart(x) dkpart(x)
130 This is how mirroring works (only writes are special):
132 When initiating a write, ccdbuffer() returns two "struct ccdbuf *"s
133 linked together by the cb_mirror field. "cb_pflags &
134 CCDPF_MIRROR_DONE" is set to 0 on both of them.
136 When a component returns to ccdiodone(), it checks if "cb_pflags &
137 CCDPF_MIRROR_DONE" is set or not. If not, it sets the partner's
138 flag and returns. If it is, it means its partner has already
139 returned, so it will go to the regular cleanup.
144 struct bio cb_buf; /* new I/O buf */
145 struct bio *cb_obp; /* ptr. to original I/O buf */
146 struct ccdbuf *cb_freenext; /* free list link */
147 int cb_unit; /* target unit */
148 int cb_comp; /* target component */
149 int cb_pflags; /* mirror/parity status flag */
150 struct ccdbuf *cb_mirror; /* mirror counterpart */
153 /* bits in cb_pflags */
154 #define CCDPF_MIRROR_DONE 1 /* if set, mirror counterpart is done */
156 #define CCDLABELDEV(dev) \
157 (makedev(major((dev)), dkmakeminor(ccdunit((dev)), 0, RAW_PART)))
159 /* convenient macros for often-used statements */
160 #define IS_ALLOCATED(unit) (ccdfind(unit) != NULL)
161 #define IS_INITED(cs) (((cs)->sc_flags & CCDF_INITED) != 0)
163 static d_open_t ccdopen;
164 static d_close_t ccdclose;
165 static d_strategy_t ccdstrategy;
166 static d_ioctl_t ccdioctl;
167 static d_dump_t ccddump;
168 static d_psize_t ccdsize;
170 #define NCCDFREEHIWAT 16
172 #define CDEV_MAJOR 74
174 static struct cdevsw ccd_cdevsw = {
176 /* close */ ccdclose,
178 /* write */ physwrite,
179 /* ioctl */ ccdioctl,
182 /* strategy */ ccdstrategy,
184 /* maj */ CDEV_MAJOR,
189 static LIST_HEAD(, ccd_s) ccd_softc_list = LIST_HEAD_INITIALIZER(&ccd_softc_list);
191 static struct ccd_s *ccdfind(int);
192 static struct ccd_s *ccdnew(int);
193 static int ccddestroy(struct ccd_s *, struct proc *);
195 /* called during module initialization */
196 static void ccdattach(void);
197 static int ccd_modevent(module_t, int, void *);
199 /* called by biodone() at interrupt time */
200 static void ccdiodone(struct bio *bp);
202 static void ccdstart(struct ccd_s *, struct bio *);
203 static void ccdinterleave(struct ccd_s *, int);
204 static void ccdintr(struct ccd_s *, struct bio *);
205 static int ccdinit(struct ccd_s *, char **, struct thread *);
206 static int ccdlookup(char *, struct thread *p, struct vnode **);
207 static void ccdbuffer(struct ccdbuf **ret, struct ccd_s *,
208 struct bio *, daddr_t, caddr_t, long);
209 static void ccdgetdisklabel(dev_t);
210 static void ccdmakedisklabel(struct ccd_s *);
211 static int ccdlock(struct ccd_s *);
212 static void ccdunlock(struct ccd_s *);
215 static void printiinfo(struct ccdiinfo *);
218 /* Non-private for the benefit of libkvm. */
219 struct ccdbuf *ccdfreebufs;
220 static int numccdfreebufs;
223 * getccdbuf() - Allocate and zero a ccd buffer.
225 * This routine is called at splbio().
230 getccdbuf(struct ccdbuf *cpy)
235 * Allocate from freelist or malloc as necessary
237 if ((cbp = ccdfreebufs) != NULL) {
238 ccdfreebufs = cbp->cb_freenext;
241 cbp = malloc(sizeof(struct ccdbuf), M_DEVBUF, M_WAITOK);
245 * Used by mirroring code
248 bcopy(cpy, cbp, sizeof(struct ccdbuf));
250 bzero(cbp, sizeof(struct ccdbuf));
253 * independent struct bio initialization
260 * putccdbuf() - Free a ccd buffer.
262 * This routine is called at splbio().
267 putccdbuf(struct ccdbuf *cbp)
270 if (numccdfreebufs < NCCDFREEHIWAT) {
271 cbp->cb_freenext = ccdfreebufs;
275 free((caddr_t)cbp, M_DEVBUF);
281 * Number of blocks to leave untouched in front of a component partition.
282 * This is to avoid violating its disklabel area when it starts at the
283 * beginning of the slice.
285 #if !defined(CCD_OFFSET)
286 #define CCD_OFFSET 16
289 static struct ccd_s *
292 struct ccd_s *sc = NULL;
294 /* XXX: LOCK(unique unit numbers) */
295 LIST_FOREACH(sc, &ccd_softc_list, list) {
296 if (sc->sc_unit == unit)
299 /* XXX: UNLOCK(unique unit numbers) */
300 return ((sc == NULL) || (sc->sc_unit != unit) ? NULL : sc);
303 static struct ccd_s *
308 /* XXX: LOCK(unique unit numbers) */
309 if (IS_ALLOCATED(unit) || unit > DKMAXUNIT)
312 MALLOC(sc, struct ccd_s *, sizeof(*sc), M_CCD, M_WAITOK | M_ZERO);
314 LIST_INSERT_HEAD(&ccd_softc_list, sc, list);
315 /* XXX: UNLOCK(unique unit numbers) */
320 ccddestroy(struct ccd_s *sc, struct proc *p)
323 /* XXX: LOCK(unique unit numbers) */
324 LIST_REMOVE(sc, list);
325 /* XXX: UNLOCK(unique unit numbers) */
331 ccd_clone(void *arg, char *name, int namelen, dev_t *dev)
338 i = dev_stdclone(name, &s, "ccd", &u);
341 if (*s < 'a' || *s > 'h')
345 *dev = make_dev(&ccd_cdevsw, u * 8 + *s - 'a',
346 UID_ROOT, GID_OPERATOR, 0640, name);
350 * Called by main() during pseudo-device attachment. All we need
351 * to do is to add devsw entries.
357 EVENTHANDLER_REGISTER(dev_clone, ccd_clone, 0, 1000);
361 ccd_modevent(module_t mod, int type, void *data)
371 printf("ccd0: Unload not supported!\n");
375 default: /* MOD_SHUTDOWN etc */
381 DEV_MODULE(ccd, ccd_modevent, NULL);
384 ccdinit(struct ccd_s *cs, char **cpaths, struct thread *td)
386 struct ccdcinfo *ci = NULL; /* XXX */
392 struct partinfo dpart;
393 struct ccdgeom *ccg = &cs->sc_geom;
394 char tmppath[MAXPATHLEN];
398 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
399 printf("ccdinit: unit %d\n", cs->sc_unit);
404 /* Allocate space for the component info. */
405 cs->sc_cinfo = malloc(cs->sc_nccdisks * sizeof(struct ccdcinfo),
409 * Verify that each component piece exists and record
410 * relevant information about it.
414 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
416 ci = &cs->sc_cinfo[ix];
420 * Copy in the pathname of the component.
422 bzero(tmppath, sizeof(tmppath)); /* sanity */
423 if ((error = copyinstr(cpaths[ix], tmppath,
424 MAXPATHLEN, &ci->ci_pathlen)) != 0) {
426 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
427 printf("ccd%d: can't copy path, error = %d\n",
432 ci->ci_path = malloc(ci->ci_pathlen, M_DEVBUF, M_WAITOK);
433 bcopy(tmppath, ci->ci_path, ci->ci_pathlen);
435 ci->ci_dev = vn_todev(vp);
438 * Get partition information for the component.
440 if ((error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
441 FREAD, td->td_proc->p_ucred, td)) != 0) {
443 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
444 printf("ccd%d: %s: ioctl failed, error = %d\n",
445 cs->sc_unit, ci->ci_path, error);
449 if (dpart.part->p_fstype == FS_BSDFFS) {
451 ((dpart.disklab->d_secsize > maxsecsize) ?
452 dpart.disklab->d_secsize : maxsecsize);
453 size = dpart.part->p_size - CCD_OFFSET;
456 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
457 printf("ccd%d: %s: incorrect partition type\n",
458 cs->sc_unit, ci->ci_path);
465 * Calculate the size, truncating to an interleave
466 * boundary if necessary.
469 if (cs->sc_ileave > 1)
470 size -= size % cs->sc_ileave;
474 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
475 printf("ccd%d: %s: size == 0\n",
476 cs->sc_unit, ci->ci_path);
482 if (minsize == 0 || size < minsize)
489 * Don't allow the interleave to be smaller than
490 * the biggest component sector.
492 if ((cs->sc_ileave > 0) &&
493 (cs->sc_ileave < (maxsecsize / DEV_BSIZE))) {
495 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
496 printf("ccd%d: interleave must be at least %d\n",
497 cs->sc_unit, (maxsecsize / DEV_BSIZE));
504 * If uniform interleave is desired set all sizes to that of
505 * the smallest component. This will guarantee that a single
506 * interleave table is generated.
508 * Lost space must be taken into account when calculating the
509 * overall size. Half the space is lost when CCDF_MIRROR is
510 * specified. One disk is lost when CCDF_PARITY is specified.
512 if (cs->sc_flags & CCDF_UNIFORM) {
513 for (ci = cs->sc_cinfo;
514 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
515 ci->ci_size = minsize;
517 if (cs->sc_flags & CCDF_MIRROR) {
519 * Check to see if an even number of components
520 * have been specified. The interleave must also
521 * be non-zero in order for us to be able to
522 * guarantee the topology.
524 if (cs->sc_nccdisks % 2) {
525 printf("ccd%d: mirroring requires an even number of disks\n", cs->sc_unit );
529 if (cs->sc_ileave == 0) {
530 printf("ccd%d: an interleave must be specified when mirroring\n", cs->sc_unit);
534 cs->sc_size = (cs->sc_nccdisks/2) * minsize;
535 } else if (cs->sc_flags & CCDF_PARITY) {
536 cs->sc_size = (cs->sc_nccdisks-1) * minsize;
538 if (cs->sc_ileave == 0) {
539 printf("ccd%d: an interleave must be specified when using parity\n", cs->sc_unit);
543 cs->sc_size = cs->sc_nccdisks * minsize;
548 * Construct the interleave table.
550 ccdinterleave(cs, cs->sc_unit);
553 * Create pseudo-geometry based on 1MB cylinders. It's
556 ccg->ccg_secsize = maxsecsize;
557 ccg->ccg_ntracks = 1;
558 ccg->ccg_nsectors = 1024 * 1024 / ccg->ccg_secsize;
559 ccg->ccg_ncylinders = cs->sc_size / ccg->ccg_nsectors;
562 * Add a devstat entry for this device.
564 devstat_add_entry(&cs->device_stats, "ccd", cs->sc_unit,
565 ccg->ccg_secsize, DEVSTAT_ALL_SUPPORTED,
566 DEVSTAT_TYPE_STORARRAY |DEVSTAT_TYPE_IF_OTHER,
567 DEVSTAT_PRIORITY_ARRAY);
569 cs->sc_flags |= CCDF_INITED;
570 cs->sc_cflags = cs->sc_flags; /* So we can find out later... */
573 while (ci > cs->sc_cinfo) {
575 free(ci->ci_path, M_DEVBUF);
577 free(cs->sc_cinfo, M_DEVBUF);
582 ccdinterleave(struct ccd_s *cs, int unit)
584 struct ccdcinfo *ci, *smallci;
591 if (ccddebug & CCDB_INIT)
592 printf("ccdinterleave(%p): ileave %d\n", cs, cs->sc_ileave);
596 * Allocate an interleave table. The worst case occurs when each
597 * of N disks is of a different size, resulting in N interleave
600 * Chances are this is too big, but we don't care.
602 size = (cs->sc_nccdisks + 1) * sizeof(struct ccdiinfo);
603 cs->sc_itable = (struct ccdiinfo *)malloc(size, M_DEVBUF,
607 * Trivial case: no interleave (actually interleave of disk size).
608 * Each table entry represents a single component in its entirety.
610 * An interleave of 0 may not be used with a mirror or parity setup.
612 if (cs->sc_ileave == 0) {
616 for (ix = 0; ix < cs->sc_nccdisks; ix++) {
617 /* Allocate space for ii_index. */
618 ii->ii_index = malloc(sizeof(int), M_DEVBUF, M_WAITOK);
620 ii->ii_startblk = bn;
622 ii->ii_index[0] = ix;
623 bn += cs->sc_cinfo[ix].ci_size;
628 if (ccddebug & CCDB_INIT)
629 printiinfo(cs->sc_itable);
635 * The following isn't fast or pretty; it doesn't have to be.
639 for (ii = cs->sc_itable; ; ii++) {
641 * Allocate space for ii_index. We might allocate more than
644 ii->ii_index = malloc((sizeof(int) * cs->sc_nccdisks),
648 * Locate the smallest of the remaining components
651 for (ci = cs->sc_cinfo; ci < &cs->sc_cinfo[cs->sc_nccdisks];
653 if (ci->ci_size > size &&
655 ci->ci_size < smallci->ci_size)) {
661 * Nobody left, all done
663 if (smallci == NULL) {
669 * Record starting logical block using an sc_ileave blocksize.
671 ii->ii_startblk = bn / cs->sc_ileave;
674 * Record starting component block using an sc_ileave
675 * blocksize. This value is relative to the beginning of
678 ii->ii_startoff = lbn;
681 * Determine how many disks take part in this interleave
682 * and record their indices.
685 for (ci = cs->sc_cinfo;
686 ci < &cs->sc_cinfo[cs->sc_nccdisks]; ci++) {
687 if (ci->ci_size >= smallci->ci_size) {
688 ii->ii_index[ix++] = ci - cs->sc_cinfo;
692 bn += ix * (smallci->ci_size - size);
693 lbn = smallci->ci_size / cs->sc_ileave;
694 size = smallci->ci_size;
697 if (ccddebug & CCDB_INIT)
698 printiinfo(cs->sc_itable);
704 ccdopen(dev_t dev, int flags, int fmt, struct thread *td)
706 int unit = ccdunit(dev);
708 struct disklabel *lp;
709 int error = 0, part, pmask;
712 if (ccddebug & CCDB_FOLLOW)
713 printf("ccdopen(%p, %x)\n", dev, flags);
716 cs = IS_ALLOCATED(unit) ? ccdfind(unit) : ccdnew(unit);
718 if ((error = ccdlock(cs)) != 0)
727 * If we're initialized, check to see if there are any other
728 * open partitions. If not, then it's safe to update
729 * the in-core disklabel.
731 if (IS_INITED(cs) && (cs->sc_openmask == 0))
732 ccdgetdisklabel(dev);
734 /* Check that the partition exists. */
735 if (part != RAW_PART && ((part >= lp->d_npartitions) ||
736 (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
741 cs->sc_openmask |= pmask;
749 ccdclose(dev_t dev, int flags, int fmt, struct thread *td)
751 int unit = ccdunit(dev);
756 if (ccddebug & CCDB_FOLLOW)
757 printf("ccdclose(%p, %x)\n", dev, flags);
760 if (!IS_ALLOCATED(unit))
764 if ((error = ccdlock(cs)) != 0)
769 /* ...that much closer to allowing unconfiguration... */
770 cs->sc_openmask &= ~(1 << part);
771 /* collect "garbage" if possible */
772 if (!IS_INITED(cs) && (cs->sc_flags & CCDF_WANTED) == 0)
773 ccddestroy(cs, td->td_proc);
780 ccdstrategy(struct bio *bp)
782 int unit = ccdunit(bp->bio_dev);
783 struct ccd_s *cs = ccdfind(unit);
786 struct disklabel *lp;
789 if (ccddebug & CCDB_FOLLOW)
790 printf("ccdstrategy(%p): unit %d\n", bp, unit);
792 if (!IS_INITED(cs)) {
793 biofinish(bp, NULL, ENXIO);
797 /* If it's a nil transfer, wake up the top half now. */
798 if (bp->bio_bcount == 0) {
806 * Do bounds checking and adjust transfer. If there's an
807 * error, the bounds check will flag that for us.
809 wlabel = cs->sc_flags & (CCDF_WLABEL|CCDF_LABELLING);
810 if (ccdpart(bp->bio_dev) != RAW_PART) {
811 if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
816 int pbn; /* in sc_secsize chunks */
817 long sz; /* in sc_secsize chunks */
819 pbn = bp->bio_blkno / (cs->sc_geom.ccg_secsize / DEV_BSIZE);
820 sz = howmany(bp->bio_bcount, cs->sc_geom.ccg_secsize);
823 * If out of bounds return an error. If at the EOF point,
824 * simply read or write less.
827 if (pbn < 0 || pbn >= cs->sc_size) {
828 bp->bio_resid = bp->bio_bcount;
829 if (pbn != cs->sc_size)
830 biofinish(bp, NULL, EINVAL);
837 * If the request crosses EOF, truncate the request.
839 if (pbn + sz > cs->sc_size) {
840 bp->bio_bcount = (cs->sc_size - pbn) *
841 cs->sc_geom.ccg_secsize;
845 bp->bio_resid = bp->bio_bcount;
857 ccdstart(struct ccd_s *cs, struct bio *bp)
860 struct ccdbuf *cbp[4];
861 /* XXX! : 2 reads and 2 writes for RAID 4/5 */
864 struct partition *pp;
867 if (ccddebug & CCDB_FOLLOW)
868 printf("ccdstart(%p, %p)\n", cs, bp);
871 /* Record the transaction start */
872 devstat_start_transaction(&cs->device_stats);
875 * Translate the partition-relative block number to an absolute.
878 if (ccdpart(bp->bio_dev) != RAW_PART) {
879 pp = &cs->sc_label.d_partitions[ccdpart(bp->bio_dev)];
884 * Allocate component buffers and fire off the requests
887 for (bcount = bp->bio_bcount; bcount > 0; bcount -= rcount) {
888 ccdbuffer(cbp, cs, bp, bn, addr, bcount);
889 rcount = cbp[0]->cb_buf.bio_bcount;
891 if (cs->sc_cflags & CCDF_MIRROR) {
893 * Mirroring. Writes go to both disks, reads are
894 * taken from whichever disk seems most appropriate.
896 * We attempt to localize reads to the disk whose arm
897 * is nearest the read request. We ignore seeks due
898 * to writes when making this determination and we
899 * also try to avoid hogging.
901 if (cbp[0]->cb_buf.bio_cmd == BIO_WRITE) {
902 BIO_STRATEGY(&cbp[0]->cb_buf, 0);
903 BIO_STRATEGY(&cbp[1]->cb_buf, 0);
905 int pick = cs->sc_pick;
906 daddr_t range = cs->sc_size / 16;
908 if (bn < cs->sc_blk[pick] - range ||
909 bn > cs->sc_blk[pick] + range
911 cs->sc_pick = pick = 1 - pick;
913 cs->sc_blk[pick] = bn + btodb(rcount);
914 BIO_STRATEGY(&cbp[pick]->cb_buf, 0);
920 BIO_STRATEGY(&cbp[0]->cb_buf, 0);
928 * Build a component buffer header.
931 ccdbuffer(struct ccdbuf **cb, struct ccd_s *cs, struct bio *bp, daddr_t bn, caddr_t addr, long bcount)
933 struct ccdcinfo *ci, *ci2 = NULL; /* XXX */
939 if (ccddebug & CCDB_IO)
940 printf("ccdbuffer(%p, %p, %d, %p, %ld)\n",
941 cs, bp, bn, addr, bcount);
944 * Determine which component bn falls in.
949 if (cs->sc_ileave == 0) {
951 * Serially concatenated and neither a mirror nor a parity
952 * config. This is a special case.
957 for (ci = cs->sc_cinfo; cbn >= sblk + ci->ci_size; ci++)
965 * Calculate cbn, the logical superblock (sc_ileave chunks),
966 * and cboff, a normal block offset (DEV_BSIZE chunks) relative
969 cboff = cbn % cs->sc_ileave; /* DEV_BSIZE gran */
970 cbn = cbn / cs->sc_ileave; /* DEV_BSIZE * ileave gran */
973 * Figure out which interleave table to use.
975 for (ii = cs->sc_itable; ii->ii_ndisk; ii++) {
976 if (ii->ii_startblk > cbn)
982 * off is the logical superblock relative to the beginning
983 * of this interleave block.
985 off = cbn - ii->ii_startblk;
988 * We must calculate which disk component to use (ccdisk),
989 * and recalculate cbn to be the superblock relative to
990 * the beginning of the component. This is typically done by
991 * adding 'off' and ii->ii_startoff together. However, 'off'
992 * must typically be divided by the number of components in
993 * this interleave array to properly convert it from a
994 * CCD-relative logical superblock number to a
995 * component-relative superblock number.
997 if (ii->ii_ndisk == 1) {
999 * When we have just one disk, it can't be a mirror
1000 * or a parity config.
1002 ccdisk = ii->ii_index[0];
1003 cbn = ii->ii_startoff + off;
1005 if (cs->sc_cflags & CCDF_MIRROR) {
1007 * We have forced a uniform mapping, resulting
1008 * in a single interleave array. We double
1009 * up on the first half of the available
1010 * components and our mirror is in the second
1011 * half. This only works with a single
1012 * interleave array because doubling up
1013 * doubles the number of sectors, so there
1014 * cannot be another interleave array because
1015 * the next interleave array's calculations
1018 int ndisk2 = ii->ii_ndisk / 2;
1019 ccdisk = ii->ii_index[off % ndisk2];
1020 cbn = ii->ii_startoff + off / ndisk2;
1021 ci2 = &cs->sc_cinfo[ccdisk + ndisk2];
1022 } else if (cs->sc_cflags & CCDF_PARITY) {
1024 * XXX not implemented yet
1026 int ndisk2 = ii->ii_ndisk - 1;
1027 ccdisk = ii->ii_index[off % ndisk2];
1028 cbn = ii->ii_startoff + off / ndisk2;
1029 if (cbn % ii->ii_ndisk <= ccdisk)
1032 ccdisk = ii->ii_index[off % ii->ii_ndisk];
1033 cbn = ii->ii_startoff + off / ii->ii_ndisk;
1037 ci = &cs->sc_cinfo[ccdisk];
1040 * Convert cbn from a superblock to a normal block so it
1041 * can be used to calculate (along with cboff) the normal
1042 * block index into this particular disk.
1044 cbn *= cs->sc_ileave;
1048 * Fill in the component buf structure.
1050 cbp = getccdbuf(NULL);
1051 cbp->cb_buf.bio_cmd = bp->bio_cmd;
1052 cbp->cb_buf.bio_done = ccdiodone;
1053 cbp->cb_buf.bio_dev = ci->ci_dev; /* XXX */
1054 cbp->cb_buf.bio_blkno = cbn + cboff + CCD_OFFSET;
1055 cbp->cb_buf.bio_offset = dbtob(cbn + cboff + CCD_OFFSET);
1056 cbp->cb_buf.bio_data = addr;
1057 if (cs->sc_ileave == 0)
1058 cbc = dbtob((off_t)(ci->ci_size - cbn));
1060 cbc = dbtob((off_t)(cs->sc_ileave - cboff));
1061 cbp->cb_buf.bio_bcount = (cbc < bcount) ? cbc : bcount;
1062 cbp->cb_buf.bio_caller1 = (void*)cbp->cb_buf.bio_bcount;
1065 * context for ccdiodone
1068 cbp->cb_unit = cs->sc_unit;
1069 cbp->cb_comp = ci - cs->sc_cinfo;
1072 if (ccddebug & CCDB_IO)
1073 printf(" dev %p(u%ld): cbp %p bn %d addr %p bcnt %ld\n",
1074 ci->ci_dev, (unsigned long)(ci-cs->sc_cinfo), cbp,
1075 cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1076 cbp->cb_buf.bio_bcount);
1081 * Note: both I/Os are set up when reading from a mirror, but only one
1084 if (cs->sc_cflags & CCDF_MIRROR) {
1085 /* mirror, setup second I/O */
1086 cbp = getccdbuf(cb[0]);
1087 cbp->cb_buf.bio_dev = ci2->ci_dev;
1088 cbp->cb_comp = ci2 - cs->sc_cinfo;
1090 /* link together the ccdbuf's and clear "mirror done" flag */
1091 cb[0]->cb_mirror = cb[1];
1092 cb[1]->cb_mirror = cb[0];
1093 cb[0]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1094 cb[1]->cb_pflags &= ~CCDPF_MIRROR_DONE;
1099 ccdintr(struct ccd_s *cs, struct bio *bp)
1102 if (ccddebug & CCDB_FOLLOW)
1103 printf("ccdintr(%p, %p)\n", cs, bp);
1106 * Request is done for better or worse, wakeup the top half.
1108 if (bp->bio_flags & BIO_ERROR)
1109 bp->bio_resid = bp->bio_bcount;
1110 biofinish(bp, &cs->device_stats, 0);
1114 * Called at interrupt time.
1115 * Mark the component as done and if all components are done,
1116 * take a ccd interrupt.
1119 ccdiodone(struct bio *ibp)
1121 struct ccdbuf *cbp = (struct ccdbuf *)ibp;
1122 struct bio *bp = cbp->cb_obp;
1123 int unit = cbp->cb_unit;
1128 if (ccddebug & CCDB_FOLLOW)
1129 printf("ccdiodone(%p)\n", cbp);
1130 if (ccddebug & CCDB_IO) {
1131 printf("ccdiodone: bp %p bcount %ld resid %ld\n",
1132 bp, bp->bio_bcount, bp->bio_resid);
1133 printf(" dev %p(u%d), cbp %p bn %d addr %p bcnt %ld\n",
1134 cbp->cb_buf.bio_dev, cbp->cb_comp, cbp,
1135 cbp->cb_buf.bio_blkno, cbp->cb_buf.bio_data,
1136 cbp->cb_buf.bio_bcount);
1140 * If an error occurred, report it. If this is a mirrored
1141 * configuration and the first of two possible reads, do not
1142 * set the error in the bp yet because the second read may
1146 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1147 const char *msg = "";
1149 if ((ccdfind(unit)->sc_cflags & CCDF_MIRROR) &&
1150 (cbp->cb_buf.bio_cmd == BIO_READ) &&
1151 (cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1153 * We will try our read on the other disk down
1154 * below, also reverse the default pick so if we
1155 * are doing a scan we do not keep hitting the
1158 struct ccd_s *cs = ccdfind(unit);
1160 msg = ", trying other disk";
1161 cs->sc_pick = 1 - cs->sc_pick;
1162 cs->sc_blk[cs->sc_pick] = bp->bio_blkno;
1164 bp->bio_flags |= BIO_ERROR;
1165 bp->bio_error = cbp->cb_buf.bio_error ?
1166 cbp->cb_buf.bio_error : EIO;
1168 printf("ccd%d: error %d on component %d block %d (ccd block %d)%s\n",
1169 unit, bp->bio_error, cbp->cb_comp,
1170 (int)cbp->cb_buf.bio_blkno, bp->bio_blkno, msg);
1174 * Process mirror. If we are writing, I/O has been initiated on both
1175 * buffers and we fall through only after both are finished.
1177 * If we are reading only one I/O is initiated at a time. If an
1178 * error occurs we initiate the second I/O and return, otherwise
1179 * we free the second I/O without initiating it.
1182 if (ccdfind(unit)->sc_cflags & CCDF_MIRROR) {
1183 if (cbp->cb_buf.bio_cmd == BIO_WRITE) {
1185 * When writing, handshake with the second buffer
1186 * to determine when both are done. If both are not
1187 * done, return here.
1189 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1190 cbp->cb_mirror->cb_pflags |= CCDPF_MIRROR_DONE;
1197 * When reading, either dispose of the second buffer
1198 * or initiate I/O on the second buffer if an error
1199 * occurred with this one.
1201 if ((cbp->cb_pflags & CCDPF_MIRROR_DONE) == 0) {
1202 if (cbp->cb_buf.bio_flags & BIO_ERROR) {
1203 cbp->cb_mirror->cb_pflags |=
1205 BIO_STRATEGY(&cbp->cb_mirror->cb_buf, 0);
1210 putccdbuf(cbp->cb_mirror);
1218 * use bio_caller1 to determine how big the original request was rather
1219 * than bio_bcount, because bio_bcount may have been truncated for EOF.
1221 * XXX We check for an error, but we do not test the resid for an
1222 * aligned EOF condition. This may result in character & block
1223 * device access not recognizing EOF properly when read or written
1224 * sequentially, but will not affect filesystems.
1226 count = (long)cbp->cb_buf.bio_caller1;
1230 * If all done, "interrupt".
1232 bp->bio_resid -= count;
1233 if (bp->bio_resid < 0)
1234 panic("ccdiodone: count");
1235 if (bp->bio_resid == 0)
1236 ccdintr(ccdfind(unit), bp);
1241 ccdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
1243 int unit = ccdunit(dev);
1244 int i, j, lookedup = 0, error = 0;
1247 struct ccd_ioctl *ccio = (struct ccd_ioctl *)data;
1251 if (!IS_ALLOCATED(unit))
1260 if ((flag & FWRITE) == 0)
1263 if ((error = ccdlock(cs)) != 0)
1266 if (ccio->ccio_ndisks > CCD_MAXNDISKS)
1269 /* Fill in some important bits. */
1270 cs->sc_ileave = ccio->ccio_ileave;
1271 if (cs->sc_ileave == 0 &&
1272 ((ccio->ccio_flags & CCDF_MIRROR) ||
1273 (ccio->ccio_flags & CCDF_PARITY))) {
1274 printf("ccd%d: disabling mirror/parity, interleave is 0\n", unit);
1275 ccio->ccio_flags &= ~(CCDF_MIRROR | CCDF_PARITY);
1277 if ((ccio->ccio_flags & CCDF_MIRROR) &&
1278 (ccio->ccio_flags & CCDF_PARITY)) {
1279 printf("ccd%d: can't specify both mirror and parity, using mirror\n", unit);
1280 ccio->ccio_flags &= ~CCDF_PARITY;
1282 if ((ccio->ccio_flags & (CCDF_MIRROR | CCDF_PARITY)) &&
1283 !(ccio->ccio_flags & CCDF_UNIFORM)) {
1284 printf("ccd%d: mirror/parity forces uniform flag\n",
1286 ccio->ccio_flags |= CCDF_UNIFORM;
1288 cs->sc_flags = ccio->ccio_flags & CCDF_USERMASK;
1291 * Allocate space for and copy in the array of
1292 * component pathnames and device numbers.
1294 cpp = malloc(ccio->ccio_ndisks * sizeof(char *),
1295 M_DEVBUF, M_WAITOK);
1296 vpp = malloc(ccio->ccio_ndisks * sizeof(struct vnode *),
1297 M_DEVBUF, M_WAITOK);
1299 error = copyin((caddr_t)ccio->ccio_disks, (caddr_t)cpp,
1300 ccio->ccio_ndisks * sizeof(char **));
1302 free(vpp, M_DEVBUF);
1303 free(cpp, M_DEVBUF);
1309 if (ccddebug & CCDB_INIT)
1310 for (i = 0; i < ccio->ccio_ndisks; ++i)
1311 printf("ccdioctl: component %d: %p\n",
1315 for (i = 0; i < ccio->ccio_ndisks; ++i) {
1317 if (ccddebug & CCDB_INIT)
1318 printf("ccdioctl: lookedup = %d\n", lookedup);
1320 if ((error = ccdlookup(cpp[i], td, &vpp[i])) != 0) {
1321 for (j = 0; j < lookedup; ++j)
1322 (void)vn_close(vpp[j], FREAD|FWRITE,
1323 td->td_proc->p_ucred, td);
1324 free(vpp, M_DEVBUF);
1325 free(cpp, M_DEVBUF);
1332 cs->sc_nccdisks = ccio->ccio_ndisks;
1335 * Initialize the ccd. Fills in the softc for us.
1337 if ((error = ccdinit(cs, cpp, td)) != 0) {
1338 for (j = 0; j < lookedup; ++j)
1339 (void)vn_close(vpp[j], FREAD|FWRITE,
1340 td->td_proc->p_ucred, td);
1342 * We can't ccddestroy() cs just yet, because nothing
1343 * prevents a user-level app from doing another ioctl()
1344 * without closing the device first, therefore
1345 * declare unit null and void and let ccdclose()
1346 * destroy it when it is safe to do so.
1348 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1349 free(vpp, M_DEVBUF);
1350 free(cpp, M_DEVBUF);
1356 * The ccd has been successfully initialized, so
1357 * we can place it into the array and read the disklabel.
1359 ccio->ccio_unit = unit;
1360 ccio->ccio_size = cs->sc_size;
1361 ccdgetdisklabel(dev);
1371 if ((flag & FWRITE) == 0)
1374 if ((error = ccdlock(cs)) != 0)
1377 /* Don't unconfigure if any other partitions are open */
1378 part = ccdpart(dev);
1379 pmask = (1 << part);
1380 if ((cs->sc_openmask & ~pmask)) {
1385 /* Declare unit null and void (reset all flags) */
1386 cs->sc_flags &= (CCDF_WANTED | CCDF_LOCKED);
1388 /* Close the components and free their pathnames. */
1389 for (i = 0; i < cs->sc_nccdisks; ++i) {
1391 * XXX: this close could potentially fail and
1392 * cause Bad Things. Maybe we need to force
1393 * the close to happen?
1396 if (ccddebug & CCDB_VNODE)
1397 vprint("CCDIOCCLR: vnode info",
1398 cs->sc_cinfo[i].ci_vp);
1400 (void)vn_close(cs->sc_cinfo[i].ci_vp, FREAD|FWRITE,
1401 td->td_proc->p_ucred, td);
1402 free(cs->sc_cinfo[i].ci_path, M_DEVBUF);
1405 /* Free interleave index. */
1406 for (i = 0; cs->sc_itable[i].ii_ndisk; ++i)
1407 free(cs->sc_itable[i].ii_index, M_DEVBUF);
1409 /* Free component info and interleave table. */
1410 free(cs->sc_cinfo, M_DEVBUF);
1411 free(cs->sc_itable, M_DEVBUF);
1412 free(cs->sc_vpp, M_DEVBUF);
1414 /* And remove the devstat entry. */
1415 devstat_remove_entry(&cs->device_stats);
1417 /* This must be atomic. */
1427 struct ccdconf *conf = (struct ccdconf *)data;
1428 struct ccd_s *tmpcs;
1429 struct ccd_s *ubuf = conf->buffer;
1431 /* XXX: LOCK(unique unit numbers) */
1432 LIST_FOREACH(tmpcs, &ccd_softc_list, list)
1433 if (IS_INITED(tmpcs))
1436 if (conf->size == 0) {
1437 conf->size = sizeof(struct ccd_s) * ninit;
1439 } else if ((conf->size / sizeof(struct ccd_s) != ninit) ||
1440 (conf->size % sizeof(struct ccd_s) != 0)) {
1441 /* XXX: UNLOCK(unique unit numbers) */
1446 LIST_FOREACH(tmpcs, &ccd_softc_list, list) {
1447 if (!IS_INITED(tmpcs))
1449 error = copyout(tmpcs, --ubuf,
1450 sizeof(struct ccd_s));
1452 /* XXX: UNLOCK(unique unit numbers) */
1455 /* XXX: UNLOCK(unique unit numbers) */
1465 struct ccdcpps *cpps = (struct ccdcpps *)data;
1466 char *ubuf = cpps->buffer;
1469 for (i = 0; i < cs->sc_nccdisks; ++i)
1470 len += cs->sc_cinfo[i].ci_pathlen;
1472 if (cpps->size == 0) {
1475 } else if (cpps->size != len) {
1479 for (i = 0; i < cs->sc_nccdisks; ++i) {
1480 len = cs->sc_cinfo[i].ci_pathlen;
1481 error = copyout(cs->sc_cinfo[i].ci_path, ubuf,
1494 *(struct disklabel *)data = cs->sc_label;
1501 ((struct partinfo *)data)->disklab = &cs->sc_label;
1502 ((struct partinfo *)data)->part =
1503 &cs->sc_label.d_partitions[ccdpart(dev)];
1511 if ((flag & FWRITE) == 0)
1514 if ((error = ccdlock(cs)) != 0)
1517 cs->sc_flags |= CCDF_LABELLING;
1519 error = setdisklabel(&cs->sc_label,
1520 (struct disklabel *)data, 0);
1522 if (cmd == DIOCWDINFO)
1523 error = writedisklabel(CCDLABELDEV(dev),
1527 cs->sc_flags &= ~CCDF_LABELLING;
1539 if ((flag & FWRITE) == 0)
1541 if (*(int *)data != 0)
1542 cs->sc_flags |= CCDF_WLABEL;
1544 cs->sc_flags &= ~CCDF_WLABEL;
1560 if (ccdopen(dev, 0, S_IFCHR, curthread))
1563 cs = ccdfind(ccdunit(dev));
1564 part = ccdpart(dev);
1569 if (cs->sc_label.d_partitions[part].p_fstype != FS_SWAP)
1572 size = cs->sc_label.d_partitions[part].p_size;
1574 if (ccdclose(dev, 0, S_IFCHR, curthread))
1584 /* Not implemented. */
1589 * Lookup the provided name in the filesystem. If the file exists,
1590 * is a valid block device, and isn't being used by anyone else,
1591 * set *vpp to the file's vnode.
/*
 * ccdlookup: open the pathname in path for reading and writing and vet
 * the resulting vnode before it is used as a ccd component.
 *
 * path - pathname of the component; it is handed to NDINIT as a
 *        UIO_USERSPACE address, so it is treated as a user-space pointer.
 * td   - thread passed to NDINIT, VOP_UNLOCK and vn_close; the credentials
 *        of its process are used for the close.
 * vpp  - output vnode pointer (the store into it is not among the lines
 *        visible in this excerpt).
 */
1594 ccdlookup(char *path, struct thread *td, struct vnode **vpp)
1596 struct nameidata nd;
/* Plain LOOKUP of path with the FOLLOW flag set. */
1600 NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, td);
/* Ask for read and write access; vn_open() receives flags by reference. */
1601 flags = FREAD | FWRITE;
1602 if ((error = vn_open(&nd, &flags, 0)) != 0) {
/* The failure is only logged when follow or init debugging is enabled. */
1604 if (ccddebug & (CCDB_FOLLOW|CCDB_INIT))
1605 printf("ccdlookup: vn_open error = %d\n", error);
/* A use count above one is treated as the vnode already having another user. */
1611 if (vp->v_usecount > 1) {
/* Test that vp is a disk; vn_isdisk() is handed &error for its result code. */
1616 if (!vn_isdisk(vp, &error))
/* Dump the vnode only when vnode debugging is enabled. */
1620 if (ccddebug & CCDB_VNODE)
1621 vprint("ccdlookup: vnode info", vp);
/*
 * First exit sequence: unlock the vnode and release the pathname buffer
 * (NDF_ONLY_PNBUF). No close appears among the lines shown here.
 */
1624 VOP_UNLOCK(vp, 0, td);
1625 NDFREE(&nd, NDF_ONLY_PNBUF);
/*
 * Second exit sequence: the same unlock and release, followed by closing
 * the vnode that was opened read/write above.
 */
1629 VOP_UNLOCK(vp, 0, td);
1630 NDFREE(&nd, NDF_ONLY_PNBUF);
1631 /* vn_close does vrele() for vp */
1632 (void)vn_close(vp, FREAD|FWRITE, td->td_proc->p_ucred, td);
1637 * Read the disklabel from the ccd. If one is not present, fake one
/*
 * ccdgetdisklabel: fill in the in-core disklabel (cs->sc_label) of the ccd
 * selected by dev.
 *
 * A default label is first built from the ccd size and geometry, then
 * readdisklabel() is called on the label device. If readdisklabel()
 * returns a non-NULL error string, ccdmakedisklabel() is applied to the
 * default label.
 *
 * dev - device number; the unit is derived with ccdunit() and the softc
 *       is found with ccdfind().
 */
1641 ccdgetdisklabel(dev_t dev)
1643 int unit = ccdunit(dev);
1644 struct ccd_s *cs = ccdfind(unit);
1646 struct disklabel *lp = &cs->sc_label;
1647 struct ccdgeom *ccg = &cs->sc_geom;
/* Start from an all-zero label so every field not set below is zero. */
1649 bzero(lp, sizeof(*lp));
/* Size comes from the softc, geometry from the ccdgeom record. */
1651 lp->d_secperunit = cs->sc_size;
1652 lp->d_secsize = ccg->ccg_secsize;
1653 lp->d_nsectors = ccg->ccg_nsectors;
1654 lp->d_ntracks = ccg->ccg_ntracks;
1655 lp->d_ncylinders = ccg->ccg_ncylinders;
/* Sectors per cylinder is derived from the two values just copied. */
1656 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
/* Identification strings; each copy is bounded by the size of its field. */
1658 strncpy(lp->d_typename, "ccd", sizeof(lp->d_typename));
1659 lp->d_type = DTYPE_CCD;
1660 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
1662 lp->d_interleave = 1;
/*
 * The raw partition spans the whole ccd (offset 0, size sc_size) and is
 * the last partition described by this default label.
 */
1665 lp->d_partitions[RAW_PART].p_offset = 0;
1666 lp->d_partitions[RAW_PART].p_size = cs->sc_size;
1667 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
1668 lp->d_npartitions = RAW_PART + 1;
1670 lp->d_bbsize = BBSIZE; /* XXX */
1671 lp->d_sbsize = SBSIZE; /* XXX */
/*
 * Magic numbers and checksum go in last, after the fields above have been
 * filled in; lp and &cs->sc_label refer to the same label.
 */
1673 lp->d_magic = DISKMAGIC;
1674 lp->d_magic2 = DISKMAGIC;
1675 lp->d_checksum = dkcksum(&cs->sc_label);
1678 * Call the generic disklabel extraction routine.
/* A non-NULL return is an error string; fall back to the default label. */
1680 errstring = readdisklabel(CCDLABELDEV(dev), &cs->sc_label);
1681 if (errstring != NULL)
1682 ccdmakedisklabel(cs);
1685 /* It's actually extremely common to have unlabeled ccds. */
/* So the error string is only printed when label debugging is enabled. */
1686 if (ccddebug & CCDB_LABEL)
1687 if (errstring != NULL)
1688 printf("ccd%d: %s\n", unit, errstring);
1693 * Take care of things one might want to take care of in the event
1694 * that a disklabel isn't present.
/*
 * ccdmakedisklabel: adjust the in-core label of cs for the case where no
 * label could be read. Two fields are changed: the raw partition type
 * becomes FS_BSDFFS and the pack name becomes the string default label.
 *
 * cs - softc whose sc_label is modified in place.
 */
1697 ccdmakedisklabel(struct ccd_s *cs)
/* lp aliases the label embedded in the softc. */
1699 struct disklabel *lp = &cs->sc_label;
1702 * For historical reasons, if there's no disklabel present
1703 * the raw partition must be marked FS_BSDFFS.
1705 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
1707 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
1711 * Wait interruptibly for an exclusive lock.
1714 * Several drivers do this; it should be abstracted and made MP-safe.
/*
 * ccdlock: acquire the per-unit lock flag of cs, sleeping while it is held.
 *
 * cs - softc whose sc_flags carries the CCDF_LOCKED and CCDF_WANTED bits;
 *      its address is also the sleep channel.
 *
 * The result of tsleep() is captured in error and tested for nonzero; the
 * statements that return to the caller are not among the lines shown here.
 */
1717 ccdlock(struct ccd_s *cs)
/* Loop because the flag is re-tested each time the sleep ends. */
1721 while ((cs->sc_flags & CCDF_LOCKED) != 0) {
/* Record that a waiter exists before sleeping. */
1722 cs->sc_flags |= CCDF_WANTED;
/* PCATCH makes the sleep interruptible; timeout 0 means no time limit. */
1723 if ((error = tsleep(cs, PRIBIO | PCATCH, "ccdlck", 0)) != 0)
/* The flag is clear at this point, so take the lock. */
1726 cs->sc_flags |= CCDF_LOCKED;
1731 * Unlock and wake up any waiters.
/*
 * ccdunlock: release the per-unit lock flag of cs.
 *
 * cs - softc whose sc_flags carries the CCDF_LOCKED and CCDF_WANTED bits.
 */
1734 ccdunlock(struct ccd_s *cs)
/* Drop the lock first, then deal with any recorded waiter. */
1737 cs->sc_flags &= ~CCDF_LOCKED;
1738 if ((cs->sc_flags & CCDF_WANTED) != 0) {
/*
 * Clear the waiter mark. NOTE(review): the wakeup of sleepers on cs is
 * presumably the next statement but is not visible here; confirm.
 */
1739 cs->sc_flags &= ~CCDF_WANTED;
/*
 * printiinfo: print the interleave table that starts at ii.
 *
 * ii - first entry of an array of struct ccdiinfo; the walk stops at the
 *      first entry whose ii_ndisk is zero, so the array must end with
 *      such an entry.
 */
1746 printiinfo(struct ccdiinfo *ii)
/* ix counts entries for the printout while ii advances through the array. */
1750 for (ix = 0; ii->ii_ndisk; ix++, ii++) {
/* Per entry: its position, disk count, start block and start offset. */
1751 printf(" itab[%d]: #dk %d sblk %d soff %d",
1752 ix, ii->ii_ndisk, ii->ii_startblk, ii->ii_startoff);
/* Then the ii_ndisk values stored in ii_index for this entry. */
1753 for (i = 0; i < ii->ii_ndisk; i++)
1754 printf(" %d", ii->ii_index[i]);