share/doc/papers/bufbio/bio.ms

   1 .\" ----------------------------------------------------------------------------
   2 .\" "THE BEER-WARE LICENSE" (Revision 42):
   3 .\" <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
   4 .\" can do whatever you want with this stuff. If we meet some day, and you think
   5 .\" this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
   6 .\" ----------------------------------------------------------------------------
   7 .\"
   8 .\" $FreeBSD$
   9 .\"
  10 .if n .ftr C R
  11 .nr PI 2n
  12 .TL
  13 The case for struct bio
  14 .br
  15 - or -
  16 .br
  17 A road map for a stackable BIO subsystem in FreeBSD
  18 .AU
  19 Poul-Henning Kamp <phk@FreeBSD.org>
  20 .AI
  21 The FreeBSD Project
  22 .AB
  23 Historically, the only translation performed on I/O requests after
  24 they they left the file-system layer were logical sub disk implementation
  25 done in the device driver.  No universal standard for how sub disks are
  26 configured and implemented exists, in fact pretty much every single platform
  27 and operating system have done it their own way.  As FreeBSD migrates to
  28 other platforms it needs to understand these local conventions to be
  29 able to co-exist with other operating systems on the same disk.
  30 .PP
  31 Recently a number of technologies like RAID have expanded the
  32 concept of "a disk" a fair bit and while these technologies initially
  33 were implemented in separate hardware they increasingly migrate into
  34 the operating systems as standard functionality.
  35 .PP
  36 Both of these factors indicate the need for a structured approach to
  37 systematic "geometry manipulation" facilities in FreeBSD.
  38 .PP
  39 This paper contains the road-map for a stackable "BIO" system in
  40 FreeBSD, which will support these facilities.
  41 .AE
  42 .NH
  43 The miseducation of \fCstruct buf\fP.
  44 .PP
  45 To fully appreciate the topic, I include a little historic overview
  46 of struct buf, it is a most enlightening case of not exactly bit-rot
  47 but more appropriately design-rot.
  48 .PP
  49 In the beginning, which for this purpose extends until virtual
  50 memory is was introduced into UNIX, all disk I/O were done from or
  51 to a struct buf.  In the 6th edition sources, as printed in Lions
  52 Book, struct buf looks like this:
  53 .DS
  54 .ft C
  55 .ps -1
  56 struct buf
  57 {
  58    int     b_flags;        /* see defines below */
  59    struct  buf *b_forw;    /* headed by devtab of b_dev */
  60    struct  buf *b_back;    /*  '  */
  61    struct  buf *av_forw;   /* position on free list, */
  62    struct  buf *av_back;   /*     if not BUSY*/
  63    int     b_dev;          /* major+minor device name */
  64    int     b_wcount;       /* transfer count (usu. words) */
  65    char    *b_addr;        /* low order core address */
  66    char    *b_xmem;        /* high order core address */
  67    char    *b_blkno;       /* block # on device */
  68    char    b_error;        /* returned after I/O */
  69    char    *b_resid;       /* words not transferred after
  70                                            error */
  71 } buf[NBUF];
  72 .ps +1
  73 .ft P
  74 .DE
  75 .PP
  76 At this point in time, struct buf had only two functions:
  77 To act as a cache
  78 and to transport I/O operations to device drivers.  For the purpose of
  79 this document, the cache functionality is uninteresting and will be
  80 ignored.
  81 .PP
  82 The I/O operations functionality consists of three parts:
  83 .IP "" 5n
  84 \(bu Where in Ram/Core is the data located (b_addr, b_xmem, b_wcount).
  85 .IP
  86 \(bu Where on disk is the data located (b_dev, b_blkno)
  87 .IP
  88 \(bu Request and result information (b_flags, b_error, b_resid)
  89 .PP
  90 In addition to this, the av_forw and av_back elements are
  91 used by the disk device drivers to put requests on a linked list.
  92 All in all the majority of struct buf is involved with the I/O
  93 aspect and only a few fields relate exclusively to the cache aspect.
  94 .PP
  95 If we step forward to the BSD 4.4-Lite-2 release, struct buf has grown
  96 a bit here or there:
  97 .DS
  98 .ft C
  99 .ps -1
 100 struct buf {
 101         LIST_ENTRY(buf) b_hash;         /* Hash chain. */
 102         LIST_ENTRY(buf) b_vnbufs;       /* Buffer's associated vnode. */
 103         TAILQ_ENTRY(buf) b_freelist;    /* Free list position if not active. */
 104         struct  buf *b_actf, **b_actb;  /* Device driver queue when active. */
 105         struct  proc *b_proc;           /* Associated proc; NULL if kernel. */
 106         volatile long   b_flags;        /* B_* flags. */
 107         int     b_error;                /* Errno value. */
 108         long    b_bufsize;              /* Allocated buffer size. */
 109         long    b_bcount;               /* Valid bytes in buffer. */
 110         long    b_resid;                /* Remaining I/O. */
 111         dev_t   b_dev;                  /* Device associated with buffer. */
 112         struct {
 113                 caddr_t b_addr;         /* Memory, superblocks, indirect etc. */
 114         } b_un;
 115         void    *b_saveaddr;            /* Original b_addr for physio. */
 116         daddr_t b_lblkno;               /* Logical block number. */
 117         daddr_t b_blkno;                /* Underlying physical block number. */
 118                                         /* Function to call upon completion. */
 119         void    (*b_iodone) __P((struct buf *));
 120         struct  vnode *b_vp;            /* Device vnode. */
 121         long    b_pfcent;               /* Center page when swapping cluster. */
 122                                         /* XXX pfcent should be int; overld. */
 123         int     b_dirtyoff;             /* Offset in buffer of dirty region. */
 124         int     b_dirtyend;             /* Offset of end of dirty region. */
 125         struct  ucred *b_rcred;         /* Read credentials reference. */
 126         struct  ucred *b_wcred;         /* Write credentials reference. */
 127         int     b_validoff;             /* Offset in buffer of valid region. */
 128         int     b_validend;             /* Offset of end of valid region. */
 129 };
 130 .ps +1
 131 .ft P
 132 .DE
 133 .PP
 134 The main piece of action is the addition of vnodes, a VM system and a
 135 prototype LFS filesystem, all of which needed some handles on struct
 136 buf.  Comparison will show that the I/O aspect of struct buf is in
 137 essence unchanged, the length field is now in bytes instead of words,
 138 the linked list the drivers can use has been renamed (b_actf,
 139 b_actb) and a b_iodone pointer for callback notification has been added
 140 but otherwise there is no change to the fields which
 141 represent the I/O aspect.  All the new fields relate to the cache
 142 aspect, link buffers to the VM system, provide hacks for file-systems
 143 (b_lblkno) etc etc.
 144 .PP
 145 By the time we get to FreeBSD 3.0 more stuff has grown on struct buf:
 146 .DS
 147 .ft C
 148 .ps -1
 149 struct buf {
 150         LIST_ENTRY(buf) b_hash;         /* Hash chain. */
 151         LIST_ENTRY(buf) b_vnbufs;       /* Buffer's associated vnode. */
 152         TAILQ_ENTRY(buf) b_freelist;    /* Free list position if not active. */
 153         TAILQ_ENTRY(buf) b_act;         /* Device driver queue when active. *new* */
 154         struct  proc *b_proc;           /* Associated proc; NULL if kernel. */
 155         long    b_flags;                /* B_* flags. */
 156         unsigned short b_qindex;        /* buffer queue index */
 157         unsigned char b_usecount;       /* buffer use count */
 158         int     b_error;                /* Errno value. */
 159         long    b_bufsize;              /* Allocated buffer size. */
 160         long    b_bcount;               /* Valid bytes in buffer. */
 161         long    b_resid;                /* Remaining I/O. */
 162         dev_t   b_dev;                  /* Device associated with buffer. */
 163         caddr_t b_data;                 /* Memory, superblocks, indirect etc. */
 164         caddr_t b_kvabase;              /* base kva for buffer */
 165         int     b_kvasize;              /* size of kva for buffer */
 166         daddr_t b_lblkno;               /* Logical block number. */
 167         daddr_t b_blkno;                /* Underlying physical block number. */
 168         off_t   b_offset;               /* Offset into file */
 169                                         /* Function to call upon completion. */
 170         void    (*b_iodone) __P((struct buf *));
 171                                         /* For nested b_iodone's. */
 172         struct  iodone_chain *b_iodone_chain;
 173         struct  vnode *b_vp;            /* Device vnode. */
 174         int     b_dirtyoff;             /* Offset in buffer of dirty region. */
 175         int     b_dirtyend;             /* Offset of end of dirty region. */
 176         struct  ucred *b_rcred;         /* Read credentials reference. */
 177         struct  ucred *b_wcred;         /* Write credentials reference. */
 178         int     b_validoff;             /* Offset in buffer of valid region. */
 179         int     b_validend;             /* Offset of end of valid region. */
 180         daddr_t b_pblkno;               /* physical block number */
 181         void    *b_saveaddr;            /* Original b_addr for physio. */
 182         caddr_t b_savekva;              /* saved kva for transfer while bouncing */
 183         void    *b_driver1;             /* for private use by the driver */
 184         void    *b_driver2;             /* for private use by the driver */
 185         void    *b_spc;
 186         union   cluster_info {
 187                 TAILQ_HEAD(cluster_list_head, buf) cluster_head;
 188                 TAILQ_ENTRY(buf) cluster_entry;
 189         } b_cluster;
 190         struct  vm_page *b_pages[btoc(MAXPHYS)];
 191         int             b_npages;
 192         struct  workhead b_dep;         /* List of filesystem dependencies. */
 193 };
 194 .ps +1
 195 .ft P
 196 .DE
 197 .PP
 198 Still we find that the I/O aspect of struct buf is in essence unchanged.  A couple of fields have been added which allows the driver to hang local data off the buf while working on it have been added (b_driver1, b_driver2) and a "physical block number" (b_pblkno) have been added.
 199 .PP
 200 This p_blkno is relevant, it has been added because the disklabel/slice
 201 code have been abstracted out of the device drivers, the filesystem
 202 ask for b_blkno, the slice/label code translates this into b_pblkno
 203 which the device driver operates on.
 204 .PP
 205 After this point some minor cleanups have happened, some unused fields
 206 have been removed etc but the I/O aspect of struct buf is still only
 207 a fraction of the entire structure: less than a quarter of the
 208 bytes in a struct buf are used for the I/O aspect and struct buf
 209 seems to continue to grow and grow.
 210 .PP
 211 Since version 6 as documented in Lions book, a three significant pieces
 212 of code have emerged which need to do non-trivial translations of
 213 the I/O request before it reaches the device drivers: CCD, slice/label
 214 and Vinum.  They all basically do the same: they map I/O requests from
 215 a logical space to a physical space, and the mappings they perform
 216 can be 1:1 or 1:N.  \**
 217 .FS
 218 It is interesting to note that Lions in his comments to the \fCrkaddr\fP
 219 routine (p. 16-2) writes \fIThe code in this procedure incorporates
 220 a special feature for files which extend over more than one disk
 221 drive.  This feature is described in the UPM Section "RK(IV)".  Its
 222 usefulness seems to be restricted.\fP  This more than hints at the
 223 presence already then of various hacks to stripe/span multiple devices.
 224 .FE
 225 .PP
 226 The 1:1 mapping of the slice/label code is rather trivial, and the
 227 addition of the b_pblkno field catered for the majority of the issues
 228 this resulted in, leaving but one:  Reads or writes to the magic "disklabel"
 229 or equally magic "MBR" sectors on a disk must be caught, examined and in
 230 some cases modified before being passed on to the device driver.  This need
 231 resulted in the addition of the b_iodone_chain field which adds a limited
 232 ability to stack I/O operations;
 233 .PP
 234 The 1:N mapping of CCD and Vinum are far more interesting.  These two
 235 subsystems look like a device driver, but rather than drive some piece
 236 of hardware, they allocate new struct buf data structures populates
 237 these and pass them on to other device drivers.
 238 .PP
 239 Apart from it being inefficient to lug about a 348 bytes data structure
 240 when 80 bytes would have done, it also leads to significant code rot
 241 when programmers don't know what to do about the remaining fields or
 242 even worse: "borrow" a field or two for their own uses.
 243 .PP
 244 .ID
 245 .if t .PSPIC bufsize.eps
 246 .if n [graph not available in this format]
 247 .DE
 248 .I
 249 Conclusions:
 250 .IP "" 5n
 251 \(bu Struct buf is victim of chronic bloat.
 252 .IP
 253 \(bu The I/O aspect of
 254 struct buf is practically constant and only about \(14 of the total bytes.
 255 .IP
 256 \(bu Struct buf currently have several users, vinum, ccd and to
 257 limited extent diskslice/label, which
 258 need only the I/O aspect, not the vnode, caching or VM linkage.
 259 .IP
 260 .I
 261 The I/O aspect of struct buf should be put in a separate \fCstruct bio\fP.
 262 .R
 263 .NH 1
 264 Implications for future struct buf improvements
 265 .PP
 266 Concerns have been raised about the implications this separation
 267 will have for future work on struct buf, I will try to address
 268 these concerns here.
 269 .PP
 270 As the existence and popularity of vinum and ccd proves, there is
 271 a legitimate and valid requirement to be able to do I/O operations
 272 which are not initiated by a vnode or filesystem operation.
 273 In other words, an I/O request is a fully valid entity in its own
 274 right and should be treated like that.
 275 .PP
 276 Without doubt, the I/O request has to be tuned to fit the needs
 277 of struct buf users in the best possible way, and consequently
 278 any future changes in struct buf are likely to affect the I/O request
 279 semantics.
 280 .PP
 281 One particular change which has been proposed is to drop the present
 282 requirement that a struct buf be mapped contiguously into kernel
 283 address space.  The argument goes that since many modern drivers use
 284 physical address DMA to transfer the data maintaining such a mapping
 285 is needless overhead.
 286 .PP
 287 Of course some drivers will still need to be able to access the
 288 buffer in kernel address space and some kind of compatibility
 289 must be provided there.
 290 .PP
 291 The question is, if such a change is made impossible by the
 292 separation of the I/O aspect into its own data structure?
 293 .PP
 294 The answer to this is ``no''.
 295 Anything that could be added to or done with
 296 the I/O aspect of struct buf can also be added to or done
 297 with the I/O aspect if it lives in a new "struct bio".
 298 .NH 1
 299 Implementing a \fCstruct bio\fP
 300 .PP
 301 The first decision to be made was who got to use the name "struct buf",
 302 and considering the fact that it is the I/O aspect which gets separated
 303 out and that it only covers about \(14 of the bytes in struct buf,
 304 obviously the new structure for the I/O aspect gets a new name.
 305 Examining the naming in the kernel, the "bio" prefix seemed a given,
 306 for instance, the function to signal completion of an I/O request is
 307 already named "biodone()".
 308 .PP
 309 Making the transition smooth is obviously also a priority and after
 310 some prototyping \**
 311 .FS
 312 The software development technique previously known as "Trial & Error".
 313 .FE
 314 it was found that a totally transparent transition could be made by
 315 embedding a copy of the new "struct bio" as the first element of "struct buf"
 316 and by using cpp(1) macros to alias the fields to the legacy struct buf
 317 names.
 318 .NH 2
 319 The b_flags problem.
 320 .PP
 321 Struct bio was defined by examining all code existing in the driver tree
 322 and finding all the struct buf fields which were legitimately used (as
 323 opposed to "hi-jacked" fields).
 324 One field was found to have "dual-use": the b_flags field.
 325 This required special attention.
 326 Examination showed that b_flags were used for three things:
 327 .IP "" 5n
 328 \(bu Communication of the I/O command (READ, WRITE, FORMAT, DELETE)
 329 .IP
 330 \(bu Communication of ordering and error status
 331 .IP
 332 \(bu General status for non I/O aspect consumers of struct buf.
 333 .PP
 334 For historic reasons B_WRITE was defined to be zero, which lead to
 335 confusion and bugs, this pushed the decision to have a separate
 336 "b_iocmd" field in struct buf and struct bio for communicating
 337 only the action to be performed.
 338 .PP
 339 The ordering and error status bits were put in a new flag field "b_ioflag".
 340 This has left sufficiently many now unused bits in b_flags that the b_xflags element
 341 can now be merged back into b_flags.
 342 .NH 2
 343 Definition of struct bio
 344 .PP
 345 With the cleanup of b_flags in place, the definition of struct bio looks like this:
 346 .DS
 347 .ft C
 348 .ps -1
 349 struct bio {
 350         u_int   bio_cmd;                /* I/O operation. */
 351         dev_t   bio_dev;                /* Device to do I/O on. */
 352         daddr_t bio_blkno;              /* Underlying physical block number. */
 353         off_t   bio_offset;             /* Offset into file. */
 354         long    bio_bcount;             /* Valid bytes in buffer. */
 355         caddr_t bio_data;               /* Memory, superblocks, indirect etc. */
 356         u_int   bio_flags;              /* BIO_ flags. */
 357         struct buf      *_bio_buf;      /* Parent buffer. */
 358         int     bio_error;              /* Errno for BIO_ERROR. */
 359         long    bio_resid;              /* Remaining I/O in bytes. */
 360         void    (*bio_done) __P((struct buf *));
 361         void    *bio_driver1;           /* Private use by the callee. */
 362         void    *bio_driver2;           /* Private use by the callee. */
 363         void    *bio_caller1;           /* Private use by the caller. */
 364         void    *bio_caller2;           /* Private use by the caller. */
 365         TAILQ_ENTRY(bio) bio_queue;     /* Disksort queue. */
 366         daddr_t bio_pblkno;               /* physical block number */
 367         struct  iodone_chain *bio_done_chain;
 368 };
 369 .ps +1
 370 .ft P
 371 .DE
 372 .NH 2
 373 Definition of struct buf
 374 .PP
 375 After adding a struct bio to struct buf and the fields aliased into it
 376 struct buf looks like this:
 377 .DS
 378 .ft C
 379 .ps -1
 380 struct buf {
 381         /* XXX: b_io must be the first element of struct buf for now /phk */
 382         struct bio b_io;                /* "Builtin" I/O request. */
 383 #define b_bcount        b_io.bio_bcount
 384 #define b_blkno         b_io.bio_blkno
 385 #define b_caller1       b_io.bio_caller1
 386 #define b_caller2       b_io.bio_caller2
 387 #define b_data          b_io.bio_data
 388 #define b_dev           b_io.bio_dev
 389 #define b_driver1       b_io.bio_driver1
 390 #define b_driver2       b_io.bio_driver2
 391 #define b_error         b_io.bio_error
 392 #define b_iocmd         b_io.bio_cmd
 393 #define b_iodone        b_io.bio_done
 394 #define b_iodone_chain  b_io.bio_done_chain
 395 #define b_ioflags       b_io.bio_flags
 396 #define b_offset        b_io.bio_offset
 397 #define b_pblkno        b_io.bio_pblkno
 398 #define b_resid         b_io.bio_resid
 399         LIST_ENTRY(buf) b_hash;         /* Hash chain. */
 400         TAILQ_ENTRY(buf) b_vnbufs;      /* Buffer's associated vnode. */
 401         TAILQ_ENTRY(buf) b_freelist;    /* Free list position if not active. */
 402         TAILQ_ENTRY(buf) b_act;         /* Device driver queue when active. *new* */
 403         long    b_flags;                /* B_* flags. */
 404         unsigned short b_qindex;        /* buffer queue index */
 405         unsigned char b_xflags;         /* extra flags */
 406 [...]
 407 .ps +1
 408 .ft P
 409 .DE
 410 .PP
 411 Putting the struct bio as the first element in struct buf during a transition
 412 period allows a pointer to either to be cast to a pointer of the other,
 413 which means that certain pieces of code can be left un-converted with the
 414 use of a couple of casts while the remaining pieces of code are tested.
 415 The ccd and vinum modules have been left un-converted like this for now.
 416 .PP
 417 This is basically where FreeBSD-current stands today.
 418 .PP
 419 The next step is to substitute struct bio for struct buf in all the code
 420 which only care about the I/O aspect: device drivers, diskslice/label.
 421 The patch to do this is up for review. \**
 422 .FS
 423 And can be found at http://phk.freebsd.dk/misc
 424 .FE
 425 and consists mainly of systematic substitutions like these
 426 .DS
 427 .ft C
 428 s/struct buf/struct bio/
 429 s/b_flags/bio_flags/
 430 s/b_bcount/bio_bcount/
 431 &c &c
 432 .ft P
 433 .DE
 434 .NH 2
 435 Future work
 436 .PP
 437 It can be successfully argued that the cpp(1) macros used for aliasing
 438 above are ugly and should be expanded in place.  It would certainly
 439 be trivial to do so, but not by definition worthwhile.
 440 .PP
 441 Retaining the aliasing for the b_* and bio_* name-spaces this way
 442 leaves us with considerable flexibility in modifying the future
 443 interaction between the two.  The DEV_STRATEGY() macro is the single
 444 point where a struct buf is turned into a struct bio and launched
 445 into the drivers to full-fill the I/O request and this provides us
 446 with a single isolated location for performing non-trivial translations.
 447 .PP
 448 As an example of this flexibility:  It has been proposed to essentially
 449 drop the b_blkno field and use the b_offset field to communicate the
 450 on-disk location of the data.  b_blkno is a 32bit offset of B_DEVSIZE
 451 (512) bytes sectors which allows us to address two terabytes worth
 452 of data.  Using b_offset as a 64 bit byte-address would not only allow
 453 us to address 8 million times larger disks, it would also make it
 454 possible to accommodate disks which use non-power-of-two sector-size,
 455 Audio CD-ROMs for instance.
 456 .PP
 457 The above mentioned flexibility makes an implementation almost trivial:
 458 .IP "" 5n
 459 \(bu Add code to DEV_STRATEGY() to populate b_offset from b_blkno in the
 460 cases where it is not valid.  Today it is only valid for a struct buf
 461 marked B_PHYS.
 462 .IP
 463 \(bu Change diskslice/label, ccd, vinum and device drivers to use b_offset
 464 instead of b_blkno.
 465 .IP
 466 \(bu Remove the bio_blkno field from struct bio, add it to struct buf as
 467 b_blkno and remove the cpp(1) macro which aliased it into struct bio.
 468 .PP
 469 Another possible transition could be to not have a "built-in" struct bio
 470 in struct buf.  If for some reason struct bio grows fields of no relevance
 471 to struct buf it might be cheaper to remove struct bio from struct buf,
 472 un-alias the fields and have DEV_STRATEGY() allocate a struct bio and populate
 473 the relevant fields from struct buf.
 474 This would also be entirely transparent to both users of struct buf and
 475 struct bio as long as we retain the aliasing mechanism and DEV_STRATEGY().
 476 .bp
 477 .NH 1
 478 Towards a stackable BIO subsystem.
 479 .PP
 480 Considering that we now have three distinct pieces of code living
 481 in the nowhere between DEV_STRATEGY() and the device drivers:
 482 diskslice/label, ccd and vinum, it is not unreasonable to start
 483 to look for a more structured and powerful API for these pieces
 484 of code.
 485 .PP
 486 In traditional UNIX semantics a "disk" is a one-dimensional array of
 487 512 byte sectors which can be read or written.  Support for sectors
 488 of multiple of 512 bytes were implemented with a sort of "don't ask-don't tell" policy where system administrator would specify a larger minimum sector-size
 489 to the filesystem, and things would "just work", but no formal communication about the size of the smallest transfer possible were exchanged between the disk driver and the filesystem.
 490 .PP
 491 A truly generalised concept of a disk needs to be more flexible and more
 492 expressive.  For instance, a user of a disk will want to know:
 493 .IP "" 5n
 494 \(bu What is the sector size.  Sector-size these days may not be a power
 495 of two, for instance Audio CDs have 2352 byte "sectors".
 496 .IP
 497 \(bu How many sectors are there.
 498 .IP
 499 \(bu Is writing of sectors supported.
 500 .IP
 501 \(bu Is freeing of sectors supported.  This is important for flash based
 502 devices where a wear-distribution software or hardware function uses
 503 the information about which sectors are actually in use to optimise the
 504 usage of the slow erase function to a minimum.
 505 .IP
 506 \(bu Is opening this device in a specific mode, (read-only or read-write)
 507 allowed.  The VM system and the file-systems generally assume that nobody
 508 writes to "their storage" under their feet, and therefore opens which
 509 would make that possible should be rejected.
 510 .IP
 511 \(bu What is the "native" geometry of this device (Sectors/Heads/Cylinders).
 512 This is useful for staying compatible with badly designed on-disk formats
 513 from other operating systems.
 514 .PP
 515 Obviously, all of these properties are dynamic in the sense that in
 516 these days disks are removable devices, and they may therefore change
 517 at any time.  While some devices like CD-ROMs can lock the media in
 518 place with a special command, this cannot be done for all devices,
 519 in particular it cannot be done with normal floppy disk drives.
 520 .PP
 521 If we adopt such a model for disk, retain the existing "strategy/biodone" model of I/O scheduling and decide to use a modular or stackable approach to
 522 geometry translations we find that nearly endless flexibility emerges:
 523 Mirroring, RAID, striping, interleaving, disk-labels and sub-disks, all of
 524 these techniques would get a common framework to operate in.
 525 .PP
 526 In practice of course, such a scheme must not complicate the use of or
 527 installation of FreeBSD.  The code will have to act and react exactly
 528 like the current code but fortunately the current behaviour is not at
 529 all hard to emulate so implementation-wise this is a non-issue.
 530 .PP
 531 But lets look at some drawings to see what this means in practice.
 532 .PP
 533 Today the plumbing might look like this on a machine:
 534 .DS
 535 .PS
 536         Ad0: box "disk (ad0)"
 537                 arrow up from Ad0.n
 538                 SL0: box "slice/label"
 539         Ad1: box "disk (ad1)" with .w at Ad0.e + (.2,0)
 540                 arrow up from Ad1.n
 541                 SL1: box "slice/label"
 542         Ad2: box "disk (ad2)" with .w at Ad1.e + (.2,0)
 543                 arrow up from Ad2.n
 544                 SL2: box "slice/label"
 545         Ad3: box "disk (ad3)" with .w at Ad2.e + (.2,0)
 546                 arrow up from Ad3.n
 547                 SL3: box "slice/label"
 548         DML: box dashed width 4i height .9i with .sw at SL0.sw + (-.2,-.2)
 549         "Disk-mini-layer" with .n at DML.s + (0, .1)
 550
 551         V: box "vinum" at 1/2 <SL1.n, SL2.n> + (0,1.2)
 552
 553         A0A: arrow up from 1/4 <SL0.nw, SL0.ne>
 554         A0B: arrow up from 2/4 <SL0.nw, SL0.ne>
 555         A0E: arrow up from 3/4 <SL0.nw, SL0.ne>
 556         A1C: arrow up from 2/4 <SL1.nw, SL1.ne>
 557                 arrow to 1/3 <V.sw, V.se>
 558         A2C: arrow up from 2/4 <SL2.nw, SL2.ne>
 559                 arrow to 2/3 <V.sw, V.se>
 560         A3A: arrow up from 1/4 <SL3.nw, SL3.ne>
 561         A3E: arrow up from 2/4 <SL3.nw, SL3.ne>
 562         A3F: arrow up from 3/4 <SL3.nw, SL3.ne>
 563
 564         "ad0s1a" with .s at A0A.n + (0, .1)
 565         "ad0s1b" with .s at A0B.n + (0, .3)
 566         "ad0s1e" with .s at A0E.n + (0, .5)
 567         "ad1s1c" with .s at A1C.n + (0, .1)
 568         "ad2s1c" with .s at A2C.n + (0, .1)
 569         "ad3s4a" with .s at A3A.n + (0, .1)
 570         "ad3s4e" with .s at A3E.n + (0, .3)
 571         "ad3s4f" with .s at A3F.n + (0, .5)
 572
 573         V1: arrow up from 1/4 <V.nw, V.ne>
 574         V2: arrow up from 2/4 <V.nw, V.ne>
 575         V3: arrow up from 3/4 <V.nw, V.ne>
 576         "V1" with .s at V1.n + (0, .1)
 577         "V2" with .s at V2.n + (0, .1)
 578         "V3" with .s at V3.n + (0, .1)
 579
 580 .PE
 581 .DE
 582 .PP
 583 And while this drawing looks nice and clean, the code underneat isn't.
 584 With a stackable BIO implementation, the picture would look like this:
 585 .DS
 586 .PS
 587         Ad0: box "disk (ad0)"
 588                 arrow up from Ad0.n
 589                 M0: box "MBR"
 590                 arrow up
 591                 B0: box "BSD"
 592
 593         A0A: arrow up from 1/4 <B0.nw, B0.ne>
 594         A0B: arrow up from 2/4 <B0.nw, B0.ne>
 595         A0E: arrow up from 3/4 <B0.nw, B0.ne>
 596
 597         Ad1: box "disk (ad1)" with .w at Ad0.e + (.2,0)
 598         Ad2: box "disk (ad2)" with .w at Ad1.e + (.2,0)
 599         Ad3: box "disk (ad3)" with .w at Ad2.e + (.2,0)
 600                 arrow up from Ad3.n
 601                 SL3: box "MBR"
 602                 arrow up
 603                 B3: box "BSD"
 604
 605         V: box "vinum" at 1/2 <Ad1.n, Ad2.n> + (0,.8)
 606         arrow from Ad1.n to 1/3 <V.sw, V.se>
 607         arrow from Ad2.n to 2/3 <V.sw, V.se>
 608
 609         A3A: arrow from 1/4 <B3.nw, B3.ne>
 610         A3E: arrow from 2/4 <B3.nw, B3.ne>
 611         A3F: arrow from 3/4 <B3.nw, B3.ne>
 612
 613         "ad0s1a" with .s at A0A.n + (0, .1)
 614         "ad0s1b" with .s at A0B.n + (0, .3)
 615         "ad0s1e" with .s at A0E.n + (0, .5)
 616         "ad3s4a" with .s at A3A.n + (0, .1)
 617         "ad3s4e" with .s at A3E.n + (0, .3)
 618         "ad3s4f" with .s at A3F.n + (0, .5)
 619
 620         V1: arrow up from 1/4 <V.nw, V.ne>
 621         V2: arrow up from 2/4 <V.nw, V.ne>
 622         V3: arrow up from 3/4 <V.nw, V.ne>
 623         "V1" with .s at V1.n + (0, .1)
 624         "V2" with .s at V2.n + (0, .1)
 625         "V3" with .s at V3.n + (0, .1)
 626
 627 .PE
 628 .DE
 629 .PP
 630 The first thing we notice is that the disk mini-layer is gone, instead
 631 separate modules for the Microsoft style MBR and the BSD style disklabel
 632 are now stacked over the disk.  We can also see that Vinum no longer
 633 needs to go though the BSD/MBR layers if it wants access to the entire
 634 physical disk, it can be stacked right over the disk.
 635 .PP
 636 Now, imagine that a ZIP drive is connected to the machine, and the
 637 user loads a ZIP disk in it.  First the device driver notices the
 638 new disk and instantiates a new disk:
 639 .DS
 640 .PS
 641         box "disk (da0)"
 642 .PE
 643 .DE
 644 .PP
 645 A number of the geometry modules have registered as "auto-discovering"
 646 and will be polled sequentially to see if any of them recognise what
 647 is on this disk.  The MBR module finds a MBR in sector 0 and attach
 648 an instance of itself to the disk:
 649 .DS
 650 .PS
 651         D: box "disk (da0)"
 652         arrow up from D.n
 653         M: box "MBR"
 654         M1: arrow up from 1/3 <M.nw, M.ne>
 655         M2: arrow up from 2/3 <M.nw, M.ne>
 656 .PE
 657 .DE
 658 .PP
 659 It finds two "slices" in the MBR and creates two new "disks" one for
 660 each of these.  The polling of modules is repeated and this time the
 661 BSD label module recognises a FreeBSD label on one of the slices and
 662 attach itself:
 663 .DS
 664 .PS
 665         D: box "disk (da0)"
 666         arrow "O" up from D.n
 667         M: box "MBR"
 668         M1: line up .3i from 1/3 <M.nw, M.ne>
 669                 arrow "O" left
 670         M2: arrow "O" up from 2/3 <M.nw, M.ne>
 671         B: box "BSD"
 672         B1: arrow "O" up from 1/4 <B.nw, B.ne>
 673         B2: arrow "O" up from 2/4 <B.nw, B.ne>
 674         B3: arrow "O" up from 3/4 <B.nw, B.ne>
 675
 676 .PE
 677 .DE
 678 .PP
 679 The BSD module finds three partitions, creates them as disks and the
 680 polling is repeated for each of these.  No modules recognise these
 681 and the process ends.  In theory one could have a module recognise
 682 the UFS superblock and extract from there the path to mount the disk
 683 on, but this is probably better implemented in a general "device-daemon"
 684 in user-land.
 685 .PP
 686 On this last drawing I have marked with "O" the "disks" which can be
 687 accessed from user-land or kernel.  The VM and file-systems generally
 688 prefer to have exclusive write access to the disk sectors they use,
 689 so we need to enforce this policy.  Since we cannot know what transformation
 690 a particular module implements, we need to ask the modules if the open
 691 is OK, and they may need to ask their neighbours before they can answer.
 692 .PP
 693 We decide to mount a filesystem on one of the BSD partitions at the very top.
 694 The open request is passed to the BSD module, which finds that none of
 695 the other open partitions (there are none) overlap this one, so far no
 696 objections.  It then passes the open to the MBR module, which goes through
 697 basically the same procedure finds no objections and pass the request to
 698 the disk driver, which since it was not previously open approves of the
 699 open.
 700 .PP
 701 Next we mount a filesystem on the next BSD partition.  The
 702 BSD module again checks for overlapping open partitions and find none.
 703 This time however, it finds that it has already opened the "downstream"
 704 in R/W mode so it does not need to ask for permission for that again
 705 so the open is OK.
 706 .PP
 707 Next we mount a msdos filesystem on the other MBR slice.  This is the
 708 same case, the MBR finds no overlapping open slices and has already
 709 opened "downstream" so the open is OK.
 710 .PP
 711 If we now try to open the other slice for writing, the one which has the
 712 BSD module attached already.  The open is passed to the MBR module which
 713 notes that the device is already opened for writing by a module (the BSD
 714 module) and consequently the open is refused.
 715 .PP
 716 While this sounds complicated it actually took less than 200 lines of
 717 code to implement in a prototype implementation.
 718 .PP
 719 Now, the user ejects the ZIP disk.  If the hardware can give a notification
 720 of intent to eject, a call-up from the driver can try to get devices synchronised
 721 and closed, this is pretty trivial.  If the hardware just disappears like
 722 a unplugged parallel zip drive, a floppy disk or a PC-card, we have no
 723 choice but to dismantle the setup.  The device driver sends a "gone" notification to the MBR module, which replicates this upwards to the mounted msdosfs
 724 and the BSD module.  The msdosfs unmounts forcefully, invalidates any blocks
 725 in the buf/vm system and returns.  The BSD module replicates the "gone" to
 726 the two mounted file-systems which in turn unmounts forcefully, invalidates
 727 blocks and return, after which the BSD module releases any resources held
 728 and returns, the MBR module releases any resources held and returns and all
 729 traces of the device have been removed.
 730 .PP
 731 Now, let us get a bit more complicated.  We add another disk and mirror
 732 two of the MBR slices:
 733 .DS
 734 .PS
 735         D0: box "disk (da0)"
 736
 737         arrow "O" up from D0.n
 738         M0: box "MBR"
 739         M01: line up .3i from 1/3 <M0.nw, M0.ne>
 740                 arrow "O" left
 741         M02: arrow "O" up from 2/3 <M0.nw, M0.ne>
 742
 743         D1: box "disk (da1)" with .w at D0.e + (.2,0)
 744         arrow "O" up from D1.n
 745         M1: box "MBR"
 746         M11: line up .3i from 1/3 <M1.nw, M1.ne>
 747                 line "O" left
 748         M11a: arrow up .2i
 749
 750         I: box "Mirror" with .s at 1/2 <M02.n, M11a.n>
 751         arrow "O" up
 752         BB: box "BSD"
 753         BB1: arrow "O" up from 1/4 <BB.nw, BB.ne>
 754         BB2: arrow "O" up from 2/4 <BB.nw, BB.ne>
 755         BB3: arrow "O" up from 3/4 <BB.nw, BB.ne>
 756
 757         M12: arrow "O" up from 2/3 <M1.nw, M1.ne>
 758         B: box "BSD"
 759         B1: arrow "O" up from 1/4 <B.nw, B.ne>
 760         B2: arrow "O" up from 2/4 <B.nw, B.ne>
 761         B3: arrow "O" up from 3/4 <B.nw, B.ne>
 762 .PE
 763 .DE
 764 .PP
 765 Now assuming that we lose disk da0, the notification goes up like before
 766 but the mirror module still has a valid mirror from disk da1, so it
 767 doesn't propagate the "gone" notification further up and the three
 768 file-systems mounted are not affected.
 769 .PP
 770 It is possible to modify the graph while in action, as long as the
 771 modules know that they will not affect any I/O in progress.  This is
 772 very handy for moving things around.  At any of the arrows we can
 773 insert a mirroring module, since it has a 1:1 mapping from input
 774 to output.  Next we can add another copy to the mirror, give the
 775 mirror time to sync the two copies.  Detach the first mirror copy
 776 and remove the mirror module.  We have now in essence moved a partition
 777 from one disk to another transparently.
 778 .NH 1
 779 Getting stackable BIO layers from where we are today.
 780 .PP
 781 Most of the infrastructure is in place now to implement stackable
 782 BIO layers:
 783 .IP "" 5n
 784 \(bu The dev_t change gave us a public structure where
 785 information about devices can be put.  This enabled us to get rid
 786 of all the NFOO limits on the number of instances of a particular
 787 driver/device, and significantly cleaned up the vnode aliasing for
 788 device vnodes.
 789 .IP
 790 \(bu The disk-mini-layer has
 791 taken the knowledge about diskslice/labels out of the
 792 majority of the disk-drivers, saving on average 100 lines of code per
 793 driver.
 794 .IP
 795 \(bu The struct bio/buf divorce is giving us an IO request of manageable
 796 size which can be modified without affecting all the filesystem and
 797 VM system users of struct buf.
 798 .PP
 799 The missing bits are:
 800 .IP "" 5n
 801 \(bu changes to struct bio to make it more
 802 stackable.  This mostly relates to the handling of the biodone()
 803 event, something which will be transparent to all current users
 804 of struct buf/bio.
 805 .IP
 806 \(bu code to stich modules together and to pass events and notifications
 807 between them.
 808 .NH 1
 809 An Implementation plan for stackable BIO layers
 810 .PP
 811 My plan for implementation stackable BIO layers is to first complete
 812 the struct bio/buf divorce with the already mentioned patch.
 813 .PP
 814 The next step is to re-implement the monolithic disk-mini-layer so
 815 that it becomes the stackable BIO system.  Vinum and CCD and all
 816 other consumers should not be unable to tell the difference between
 817 the current and the new disk-mini-layer.  The new implementation
 818 will initially use a static stacking to remain compatible with the
 819 current behaviour.  This will be the next logical checkpoint commit.
 820 .PP
 821 The next step is to make the stackable layers configurable,
 822 to provide the means to initialise the stacking and to subsequently
 823 change it.  This will be the next logical checkpoint commit.
 824 .PP
 825 At this point new functionality can be added inside the stackable
 826 BIO system: CCD can be re-implemented as a mirror module and a stripe
 827 module.  Vinum can be integrated either as one "macro-module" or
 828 as separate functions in separate modules.  Also modules for other
 829 purposes can be added, sub-disk handling for Solaris, MacOS, etc
 830 etc.  These modules can be committed one at a time.