2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
7 * Written by Greg Lehey
9 * This software is distributed under the so-called ``Berkeley
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
40 * $Id: vinumvar.h,v 1.21 1999/10/12 04:39:08 grog Exp grog $
45 #include <dev/vinum/vinumstate.h>
47 * Some configuration maxima. They're an enum because
48 * we can't define global constants. Sorry about that.
50 * These aren't as bad as they look: most of them are soft limits.
54 VINUM_HEADER = 512, /* size of header on disk */
55 MAXCONFIGLINE = 1024, /* maximum size of a single config line */
56 MINVINUMSLICE = 1048576, /* minimum size of a slice */
58 CDEV_MAJOR = 91, /* major number for character device */
59 BDEV_MAJOR = 25, /* and block device */
61 ROUND_ROBIN_READPOL = -1, /* round robin read policy */
63 /* type field in minor number */
64 VINUM_VOLUME_TYPE = 0,
68 VINUM_SUPERDEV_TYPE = 4, /* super device. */
69 VINUM_RAWPLEX_TYPE = 5, /* anonymous plex */
70 VINUM_RAWSD_TYPE = 6, /* anonymous subdisk */
72 /* Shifts for the individual fields in the device */
73 VINUM_TYPE_SHIFT = 28,
75 VINUM_PLEX_SHIFT = 16,
82 * Shifts for the second half of raw plex and
85 VINUM_RAWPLEX_SHIFT = 8, /* shift the second half this much */
86 VINUM_RAWPLEX_WIDTH = 12, /* width of second half */
90 MAXPLEX = 8, /* maximum number of plexes in a volume */
91 MAXSD = 256, /* maximum number of subdisks in a plex */
92 MAXDRIVENAME = 32, /* maximum length of a device name */
93 MAXSDNAME = 64, /* maximum length of a subdisk name */
94 MAXPLEXNAME = 64, /* maximum length of a plex name */
95 MAXVOLNAME = 64, /* maximum length of a volume name */
96 MAXNAME = 64, /* maximum length of any name */
100 * Define a minor device number.
101 * This is not used directly; instead, it's
102 * called by the other macros.
/*
 * VINUMMINOR(v, p, s, t): assemble a vinum minor number from its
 * individual fields.
 *
 *   v	value for the volume field  (shifted by VINUM_VOL_SHIFT)
 *   p	value for the plex field    (shifted by VINUM_PLEX_SHIFT)
 *   s	value for the subdisk field (shifted by VINUM_SD_SHIFT)
 *   t	value for the type field    (shifted by VINUM_TYPE_SHIFT)
 *
 * Every argument is parenthesized before it is shifted, so that an
 * argument containing a low-precedence operator (for example
 * "a | b" or "c ? x : y") is shifted as a whole.  The arguments are
 * not masked: the caller must pass values that fit their field.
 */
#define VINUMMINOR(v,p,s,t) ( ((v) << VINUM_VOL_SHIFT)			\
			     | ((p) << VINUM_PLEX_SHIFT)		\
			     | ((s) << VINUM_SD_SHIFT)			\
			     | ((t) << VINUM_TYPE_SHIFT) )
/*
 * Device numbers for vinum objects.  Both macros pass the minor
 * number built by VINUMMINOR (v, p, s, t) to makedev ():
 * VINUMBDEV pairs it with the block major BDEV_MAJOR, VINUMCDEV
 * with the character major CDEV_MAJOR.
 */
#define VINUMBDEV(v,p,s,t)						\
	makedev (BDEV_MAJOR, VINUMMINOR (v, p, s, t))
#define VINUMCDEV(v,p,s,t)						\
	makedev (CDEV_MAJOR, VINUMMINOR (v, p, s, t))
113 #define VINUM_BLOCK_PLEX(p) makedev (BDEV_MAJOR, \
114 (VINUM_RAWPLEX_TYPE << VINUM_TYPE_SHIFT) \
116 | ((p & ~0xff) << 8) )
118 #define VINUM_CHAR_PLEX(p) makedev (CDEV_MAJOR, \
119 (VINUM_RAWPLEX_TYPE << VINUM_TYPE_SHIFT) \
121 | ((p & ~0xff) << 8) )
123 #define VINUM_BLOCK_SD(s) makedev (BDEV_MAJOR, \
124 (VINUM_RAWSD_TYPE << VINUM_TYPE_SHIFT) \
126 | ((s & ~0xff) << 8) )
128 #define VINUM_CHAR_SD(s) makedev (CDEV_MAJOR, \
129 (VINUM_RAWSD_TYPE << VINUM_TYPE_SHIFT) \
131 | ((s & ~0xff) << 8) )
/*
 * MASK(nbits): an int with the low nbits bits set and all other
 * bits clear; MASK (3) is 7, MASK (0) is 0.  The mask is built by
 * shifting the int constant 1, so nbits must be smaller than the
 * number of value bits in an int.
 */
#define MASK(nbits) ((1 << (nbits)) - 1)
/*
 * VINUMRMINOR(d, t): create the minor number of a raw device of
 * type t from its index d.
 *
 * The index is stored in two pieces.  The low VINUM_VOL_WIDTH bits
 * go into the volume field.  The remaining bits are moved up by
 * VINUM_RAWPLEX_SHIFT, the same shift of 8 that VINUM_BLOCK_PLEX,
 * VINUM_CHAR_PLEX, VINUM_BLOCK_SD and VINUM_CHAR_SD apply to the
 * second half of the index.  The former shift of
 * VINUM_PLEX_SHIFT + VINUM_VOL_WIDTH pushed those bits out of the
 * word for any index of 256 or more.  Results for smaller indices
 * are unchanged.
 *
 * Both arguments are parenthesized so that expressions such as
 * "a | b" are treated as a single value.  The upper piece is not
 * masked; NOTE(review): presumably callers keep d within
 * VINUM_VOL_WIDTH + VINUM_RAWPLEX_WIDTH bits - confirm.
 */
#define VINUMRMINOR(d,t) ( (((d) & MASK (VINUM_VOL_WIDTH)) << VINUM_VOL_SHIFT) \
			  | (((d) & ~MASK (VINUM_VOL_WIDTH))		\
			     << VINUM_RAWPLEX_SHIFT)			\
			  | ((t) << VINUM_TYPE_SHIFT) )
/*
 * VINUMRBDEV(d, t): block device number for raw index d of type t,
 * i.e. makedev () applied to BDEV_MAJOR and the minor number from
 * VINUMRMINOR (d, t).
 */
#define VINUMRBDEV(d,t)							\
	makedev (BDEV_MAJOR, VINUMRMINOR (d, t))
/*
 * DEVTYPE(x): extract the object type from device number x.  The
 * minor number of x is shifted right by VINUM_TYPE_SHIFT and the
 * three low bits of the result are kept.
 */
#define DEVTYPE(x) (0x7 & (minor (x) >> VINUM_TYPE_SHIFT))
148 * This mess is used to catch people who compile
149 * a debug vinum(8) and non-debug kernel module,
150 * or the other way round.
154 #define VINUM_SUPERDEV VINUMMINOR (1, 0, 0, VINUM_SUPERDEV_TYPE) /* superdevice number */
155 #define VINUM_WRONGSUPERDEV VINUMMINOR (2, 0, 0, VINUM_SUPERDEV_TYPE) /* non-debug superdevice number */
157 #define VINUM_SUPERDEV VINUMMINOR (2, 0, 0, VINUM_SUPERDEV_TYPE) /* superdevice number */
158 #define VINUM_WRONGSUPERDEV VINUMMINOR (1, 0, 0, VINUM_SUPERDEV_TYPE) /* debug superdevice number */
161 #define VINUM_DAEMON_DEV VINUMMINOR (0, 0, 0, VINUM_SUPERDEV_TYPE) /* daemon superdevice number */
164 * the number of object entries to cater for initially, and also the
165 * value by which they are incremented. It doesn't take long
166 * to extend them, so theoretically we could start with 1 of each, but
167 * it's untidy to allocate such small areas. These values are
168 * probably too small.
174 INITIAL_SUBDISKS = 16,
175 INITIAL_SUBDISKS_IN_PLEX = 4, /* number of subdisks to allocate to a plex */
176 INITIAL_SUBDISKS_IN_DRIVE = 4, /* number of subdisks to allocate to a drive */
177 INITIAL_DRIVE_FREELIST = 16, /* number of entries in drive freelist */
178 PLEX_REGION_TABLE_SIZE = 8, /* number of entries in plex region tables */
179 INITIAL_LOCKS = 64, /* number of locks to allocate to a plex */
180 MAX_REVIVE_BLOCKSIZE = 65536, /* maximum revive block size */
181 DEFAULT_REVIVE_BLOCKSIZE = 16384, /* default revive block size */
182 VINUMHOSTNAMELEN = 32, /* host name field in label */
188 * 31 30 28 27 20 19 18 16 15 8 7 0
189 * |-----------------------------------------------------------------------------------------------|
190 * |X | Type | Subdisk number | X| Plex | Major number | volume number |
191 * |-----------------------------------------------------------------------------------------------|
195 * The fields in the minor number are interpreted as follows:
197 * Volume: Only type and volume number are relevant
198 * Plex in volume: type, plex number in volume and volume number are relevant
199 * raw plex: type, plex number is made of bits 27-16 and 7-0
200 * raw subdisk: type, subdisk number is made of bits 27-16 and 7-0
203 /* This doesn't get used. Consider removing it. */
206 * CARE. These fields assume a big-endian word. On a
207 * little-endian system, they're the wrong way around
209 unsigned volume:8; /* up to 256 volumes */
210 unsigned major:8; /* this is where the major number fits */
211 unsigned plex:3; /* up to 8 plexes per volume */
212 unsigned unused:1; /* up for grabs */
213 unsigned sd:8; /* up to 256 subdisks per plex */
214 unsigned type:3; /* type of object */
224 unsigned signbit:1; /* to make 32 bits */
227 #define VINUM_DIR "/dev/vinum"
228 #define VINUM_RDIR "/dev/rvinum"
231 * These definitions help catch
232 * userland/kernel mismatches.
235 #define VINUM_WRONGSUPERDEV_NAME VINUM_DIR"/control" /* normal super device */
236 #define VINUM_SUPERDEV_NAME VINUM_DIR"/Control" /* debug super device */
238 #define VINUM_WRONGSUPERDEV_NAME VINUM_DIR"/Control" /* debug super device */
239 #define VINUM_SUPERDEV_NAME VINUM_DIR"/control" /* normal super device */
241 #define VINUM_DAEMON_DEV_NAME VINUM_DIR"/controld" /* super device for daemon only */
244 * Flags for all objects. Most of them only apply to
245 * specific objects, but we have space for all in any
249 VF_LOCKED = 1, /* somebody has locked access to this object */
250 VF_LOCKING = 2, /* we want access to this object */
251 VF_OPEN = 4, /* object has openers */
252 VF_WRITETHROUGH = 8, /* volume: write through */
253 VF_INITED = 0x10, /* unit has been initialized */
254 VF_WLABEL = 0x20, /* label area is writable */
255 VF_LABELLING = 0x40, /* unit is currently being labelled */
256 VF_WANTED = 0x80, /* someone is waiting to obtain a lock */
257 VF_RAW = 0x100, /* raw volume (no file system) */
258 VF_LOADED = 0x200, /* module is loaded */
259 VF_CONFIGURING = 0x400, /* somebody is changing the config */
260 VF_WILL_CONFIGURE = 0x800, /* somebody wants to change the config */
261 VF_CONFIG_INCOMPLETE = 0x1000, /* haven't finished changing the config */
262 VF_CONFIG_SETUPSTATE = 0x2000, /* set a volume up if all plexes are empty */
263 VF_READING_CONFIG = 0x4000, /* we're reading config database from disk */
264 VF_FORCECONFIG = 0x8000, /* configure drives even with different names */
265 VF_NEWBORN = 0x10000, /* for objects: we've just created it */
266 VF_CONFIGURED = 0x20000, /* for drives: we read the config */
267 VF_STOPPING = 0x40000, /* for vinum_conf: stop on last close */
268 VF_DAEMONOPEN = 0x80000, /* the daemon has us open (only superdev) */
269 VF_CREATED = 0x100000, /* for volumes: freshly created, more than new */
270 VF_HOTSPARE = 0x200000, /* for drives: use as hot spare */
273 /* Global configuration information for the vinum subsystem */
275 /* Pointers to vinum structures */
279 struct volume *volume;
281 /* the number allocated */
282 int drives_allocated;
283 int subdisks_allocated;
284 int plexes_allocated;
285 int volumes_allocated;
287 /* and the number currently in use */
295 #define VINUM_MAXACTIVE 256 /* maximum number of active requests */
296 int active; /* current number of requests outstanding */
297 int maxactive; /* maximum number of requests ever outstanding */
304 /* Use these defines to simplify code */
305 #define DRIVE vinum_conf.drive
306 #define SD vinum_conf.sd
307 #define PLEX vinum_conf.plex
308 #define VOL vinum_conf.volume
309 #define VFLAGS vinum_conf.flags
314 * Vinum drives start with this structure:
317 * |--------------------------------------|
318 * | PDP-11 memorial boot block | 0
319 * |--------------------------------------|
320 * | Disk label, maybe | 1
321 * |--------------------------------------|
322 * | Slice definition (vinum_hdr) | 8
323 * |--------------------------------------|
325 * | Configuration info, first copy | 9
327 * |--------------------------------------|
329 * | Configuration info, second copy | 9 + size of config
331 * |--------------------------------------|
334 /* Sizes and offsets of our information */
336 VINUM_LABEL_OFFSET = 4096, /* offset of vinum label */
337 VINUMHEADERLEN = 512, /* size of vinum label */
338 VINUM_CONFIG_OFFSET = 4608, /* offset of first config copy */
339 MAXCONFIG = 65536, /* and size of config copy */
340 DATASTART = (MAXCONFIG * 2 + VINUM_CONFIG_OFFSET) / DEV_BSIZE /* this is where the data starts */
344 * hostname is 256 bytes long, but we don't need to shlep
345 * multiple copies in vinum. We use the host name just
346 * to identify this system, and 32 bytes should be ample
351 char sysname[VINUMHOSTNAMELEN]; /* system name at time of creation */
352 char name[MAXDRIVENAME]; /* our name of the drive */
353 struct timeval date_of_birth; /* the time it was created */
354 struct timeval last_update; /* and the time of last update */
356 * total size in bytes of the drive. This value
357 * includes the headers.
363 long long magic; /* we're long on magic numbers */
364 #define VINUM_MAGIC 22322600044678729LL /* should be this */
365 #define VINUM_NOMAGIC 22322600044678990LL /* becomes this after obliteration */
367 * Size in bytes of each copy of the
368 * configuration info. This must be a multiple
369 * of the sector size.
372 struct vinum_label label; /* unique label */
375 /* Information returned from read_drive_label */
376 enum drive_label_info {
377 DL_CANT_OPEN, /* invalid partition */
378 DL_NOT_OURS, /* valid partition, but no vinum label */
379 DL_DELETED_LABEL, /* valid partition, deleted label found */
380 DL_WRONG_DRIVE, /* drive name doesn't match */
381 DL_OURS /* valid partition and label found */
384 /*** Drive definitions ***/
386 * A drive corresponds to a disk slice. We use a different term to show
387 * the difference in usage: it doesn't have to be a slice, and could
388 * theoretically be a complete, unpartitioned disk
392 enum drivestate state; /* current state */
393 int flags; /* flags */
394 int subdisks_allocated; /* number of entries in sd */
395 int subdisks_used; /* and the number used */
396 int blocksize; /* size of fs blocks */
397 int pid; /* of locker */
398 u_int64_t sectors_available; /* number of sectors still available */
400 int lasterror; /* last error on drive */
401 int driveno; /* index of drive in vinum_conf */
402 int opencount; /* number of up subdisks */
403 u_int64_t reads; /* number of reads on this drive */
404 u_int64_t writes; /* number of writes on this drive */
405 u_int64_t bytes_read; /* number of bytes read */
406 u_int64_t bytes_written; /* number of bytes written */
407 char devicename[MAXDRIVENAME]; /* name of the slice it's on */
408 struct vnode *vp; /* vnode pointer */
410 struct vinum_label label; /* and the label information */
411 struct partinfo partinfo; /* partition information */
412 int freelist_size; /* number of entries alloced in free list */
413 int freelist_entries; /* number of entries used in free list */
414 struct drive_freelist { /* sorted list of free space on drive */
415 u_int64_t offset; /* offset of entry */
416 u_int64_t sectors; /* and length in sectors */
418 #define DRIVE_MAXACTIVE 10 /* maximum number of active requests */
419 int active; /* current number of requests outstanding */
420 int maxactive; /* maximum number of requests ever outstanding */
422 char lockfilename[16]; /* name of file from which we were locked */
423 int lockline; /* and the line number */
427 /*** Subdisk definitions ***/
430 enum sdstate state; /* state */
432 int lasterror; /* last error occurred */
433 /* offsets in blocks */
434 int64_t driveoffset; /* offset on drive */
436 * plexoffset is the offset from the beginning of the
437 * plex to the very first part of the subdisk, in
438 * sectors. For striped and RAID-5 plexes, only
439 * the first stripe is located at this offset
441 int64_t plexoffset; /* offset in plex */
442 u_int64_t sectors; /* and length in sectors */
443 int plexno; /* index of plex, if it belongs */
444 int driveno; /* index of the drive on which it is located */
445 int sdno; /* our index in vinum_conf */
446 int plexsdno; /* and our number in our plex */
447 /* (undefined if no plex) */
448 u_int64_t reads; /* number of reads on this subdisk */
449 u_int64_t writes; /* number of writes on this subdisk */
450 u_int64_t bytes_read; /* number of bytes read */
451 u_int64_t bytes_written; /* number of bytes written */
452 /* revive parameters */
453 u_int64_t revived; /* block number of current revive request */
454 int revive_blocksize; /* revive block size (bytes) */
455 int revive_interval; /* and time to wait between transfers */
456 struct request *waitlist; /* list of requests waiting on revive op */
457 /* init parameters */
458 u_int64_t initialized; /* block number of current init request */
459 int init_blocksize; /* init block size (bytes) */
460 int init_interval; /* and time to wait between transfers */
461 char name[MAXSDNAME]; /* name of subdisk */
464 /*** Plex definitions ***/
466 /* kinds of plex organization */
468 plex_disorg, /* disorganized */
469 plex_concat, /* concatenated plex */
470 plex_striped, /* striped plex */
471 plex_raid5 /* RAID5 plex */
475 enum plexorg organization; /* Plex organization */
476 enum plexstate state; /* and current state */
477 u_int64_t length; /* total length of plex (sectors) */
479 int stripesize; /* size of stripe or raid band, in sectors */
480 int subdisks; /* number of associated subdisks */
481 int subdisks_allocated; /* number of subdisks allocated space for */
482 int *sdnos; /* list of component subdisks */
483 int plexno; /* index of plex in vinum_conf */
484 int volno; /* index of volume */
485 int volplexno; /* number of plex in volume */
486 /* Lock information */
487 int alloclocks; /* number of locks allocated */
488 int usedlocks; /* number currently in use */
489 int lockwaits; /* and number of waits for locks */
490 struct rangelock *lock; /* ranges of locked addresses */
491 u_int64_t checkblock; /* block number for check parity op */
492 u_int64_t rebuildblock; /* block number for rebuild parity op */
494 u_int64_t reads; /* number of reads on this plex */
495 u_int64_t writes; /* number of writes on this plex */
496 u_int64_t bytes_read; /* number of bytes read */
497 u_int64_t bytes_written; /* number of bytes written */
498 u_int64_t recovered_reads; /* number of recovered read operations */
499 u_int64_t degraded_writes; /* number of degraded writes */
500 u_int64_t parityless_writes; /* number of parityless writes */
501 u_int64_t multiblock; /* requests that needed more than one block */
502 u_int64_t multistripe; /* requests that needed more than one stripe */
503 int sddowncount; /* number of subdisks down */
504 char name[MAXPLEXNAME]; /* name of plex */
507 /*** Volume definitions ***/
509 /* Address range definitions, for locking volumes */
511 daddr_t stripe; /* address + 1 of the range being locked */
512 struct buf *bp; /* user's buffer pointer */
513 int plexno; /* and number of plex it affects */
517 enum volumestate state; /* current state */
518 int plexes; /* number of plexes */
519 int preferred_plex; /* plex to read from, -1 for round-robin */
521 * index of plex used for last read, for
525 int volno; /* volume number */
526 int flags; /* status and configuration flags */
527 int openflags; /* flags supplied to last open(2) */
528 u_int64_t size; /* size of volume */
529 int blocksize; /* logical block size */
530 int active; /* number of outstanding requests active */
531 int subops; /* and the number of suboperations */
533 u_int64_t bytes_read; /* number of bytes read */
534 u_int64_t bytes_written; /* number of bytes written */
535 u_int64_t reads; /* number of reads on this volume */
536 u_int64_t writes; /* number of writes on this volume */
537 u_int64_t recovered_reads; /* reads recovered from another plex */
539 * Unlike subdisks in the plex, space for the
540 * plex pointers is static.
542 int plex[MAXPLEX]; /* index of plexes */
543 char name[MAXVOLNAME]; /* name of volume */
544 struct disklabel label; /* for DIOCGPART */
548 * Table expansion. Expand table, which contains oldcount
549 * entries of type element, by increment entries, and change
550 * oldcount accordingly
552 #define EXPAND(table, element, oldcount, increment) \
554 expand_table ((void **) &table, \
555 oldcount * sizeof (element), \
556 (oldcount + increment) * sizeof (element) ); \
557 oldcount += increment; \
560 /* Information on vinum's memory usage */
562 int mallocs; /* number of malloced blocks */
563 int total_malloced; /* total amount malloced */
564 int highwater; /* maximum number of mallocs */
565 struct mc *malloced; /* pointer to kernel table */
568 #define MCFILENAMELEN 16
575 char file[MCFILENAMELEN];
579 * These enums are used by the state transition
580 * routines. They're in bit map format:
582 * Bit 0: Other plexes in the volume are down
583 * Bit 1: Other plexes in the volume are up
584 * Bit 2: The current plex is up
585 * Maybe they should be local to
589 volplex_onlyusdown = 0, /* 0: we're the only plex, and we're down */
590 volplex_alldown, /* 1: another plex is down, and so are we */
591 volplex_otherup, /* 2: another plex is up */
592 volplex_otherupdown, /* 3: other plexes are up and down */
593 volplex_onlyus, /* 4: we're up and alone */
594 volplex_onlyusup, /* 5: only we are up, others are down */
595 volplex_allup, /* 6: all plexes are up */
596 volplex_someup /* 7: some plexes are up, including us */
599 /* state map for plex */
602 sd_downstate = 2, /* SD is down */
603 sd_crashedstate = 4, /* SD is crashed */
604 sd_obsoletestate = 8, /* SD is obsolete */
605 sd_stalestate = 16, /* SD is stale */
606 sd_rebornstate = 32, /* SD is reborn */
607 sd_upstate = 64, /* SD is up */
608 sd_initstate = 128, /* SD is initializing */
609 sd_initializedstate = 256, /* SD is initialized */
610 sd_otherstate = 512, /* SD is in some other state */
614 * This is really just a parameter to pass to
615 * set_<foo>_state, but since it needs to be known
616 * in the external definitions, we need to define
620 setstate_none = 0, /* no flags */
621 setstate_force = 1, /* force the state change */
622 setstate_configuring = 2, /* we're currently configuring, don't save */
625 /* Operations for parityops to perform. */
632 /* Debugging stuff */
634 DEBUG_ADDRESSES = 1, /* show buffer information during requests */
635 DEBUG_NUMOUTPUT = 2, /* show the value of vp->v_numoutput */
636 DEBUG_RESID = 4, /* go into debugger in complete_rqe */
637 DEBUG_LASTREQS = 8, /* keep a circular buffer of last requests */
638 DEBUG_REVIVECONFLICT = 16, /* print info about revive conflicts */
639 DEBUG_EOFINFO = 32, /* print info about EOF detection */
640 DEBUG_MEMFREE = 64, /* keep info about Frees */
641 DEBUG_BIGDRIVE = 128, /* pretend our drives are 100 times the size */
642 DEBUG_REMOTEGDB = 256, /* go into remote gdb */
643 DEBUG_WARNINGS = 512, /* log various relatively harmless warnings */
647 #define longjmp LongJmp /* test our longjmps */
650 /* Local Variables: */
651 /* fill-column: 50 */