2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
7 * Written by Greg Lehey
9 * This software is distributed under the so-called ``Berkeley
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
40 * $Id: vinumrequest.c,v 1.25 1999/10/12 04:38:20 grog Exp grog $
44 #include <dev/vinum/vinumhdr.h>
45 #include <dev/vinum/request.h>
46 #include <sys/resourcevar.h>
/*
 * Forward declarations for the request-building and completion routines
 * defined in this file.  NOTE(review): this extract is garbled — original
 * file line numbers are fused into each line, and the parameter lists of
 * bre() and bre5() are truncated here; confirm against the full source.
 */
48 enum requeststatus bre(struct request *rq,
52 enum requeststatus bre5(struct request *rq,
56 enum requeststatus build_read_request(struct request *rq, int volplexno);
57 enum requeststatus build_write_request(struct request *rq);
58 enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
59 int find_alternate_sd(struct request *rq);
60 int check_range_covered(struct request *);
61 void complete_rqe(struct buf *bp);
62 void complete_raid5_write(struct rqelement *);
63 int abortrequest(struct request *rq, int error);
64 void sdio_done(struct buf *bp);
65 int vinum_bounds_check(struct buf *bp, struct volume *vol);
66 caddr_t allocdatabuf(struct rqelement *rqe);
67 void freedatabuf(struct rqelement *rqe);
/*
 * Circular trace buffer of recent request activity, filled in by logrq()
 * below.  rqip points at the next free slot and wraps at RQINFO_SIZE.
 */
70 struct rqinfo rqinfo[RQINFO_SIZE];
71 struct rqinfo *rqip = rqinfo;
/*
 * logrq: record one event in the circular rqinfo trace buffer for
 * post-mortem debugging.  Depending on the event type, copy either the
 * buf header, the request element, or the range lock into the slot.
 * NOTE(review): several lines (switch head, breaks, wrap-around reset)
 * are missing from this extract.
 */
74 logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)
78 microtime(&rqip->timestamp); /* when did this happen? */
80 rqip->bp = ubp; /* user buffer */
/* Buffer-header events: snapshot the whole struct buf. */
83 case loginfo_user_bpl:
84 case loginfo_sdio: /* subdisk I/O */
85 case loginfo_sdiol: /* subdisk I/O launch */
86 case loginfo_sdiodone: /* subdisk I/O complete */
87 bcopy(info.bp, &rqip->info.b, sizeof(struct buf));
88 rqip->devmajor = major(info.bp->b_dev);
89 rqip->devminor = minor(info.bp->b_dev);
/* Request-element events: snapshot the rqelement instead. */
94 case loginfo_raid5_data:
95 case loginfo_raid5_parity:
96 bcopy(info.rqe, &rqip->info.rqe, sizeof(struct rqelement));
97 rqip->devmajor = major(info.rqe->b.b_dev);
98 rqip->devminor = minor(info.rqe->b.b_dev);
/* Lock-wait events: record the range lock being waited on. */
101 case loginfo_lockwait:
104 bcopy(info.lockinfo, &rqip->info.lockinfo, sizeof(struct rangelock));
112 if (rqip >= &rqinfo[RQINFO_SIZE]) /* wrap around */
/*
 * vinumstrategy: driver strategy entry point.  Dispatch the I/O by the
 * minor-device type encoded in bp->b_dev: raw subdisks go to sdio,
 * drives are rejected, volumes are state- and bounds-checked and then
 * passed (with plexes) to vinumstart.  NOTE(review): the sdio()/
 * vinumstart() calls, biodone() error exits, and closing braces fall on
 * lines missing from this extract.
 */
120 vinumstrategy(struct buf *bp)
123 struct volume *vol = NULL;
125 switch (DEVTYPE(bp->b_dev)) {
127 case VINUM_RAWSD_TYPE:
132 * In fact, vinum doesn't handle drives: they're
133 * handled directly by the disk drivers
135 case VINUM_DRIVE_TYPE:
137 bp->b_error = EIO; /* I/O error */
138 bp->b_flags |= B_ERROR;
142 case VINUM_VOLUME_TYPE: /* volume I/O */
143 volno = Volno(bp->b_dev);
145 if (vol->state != volume_up) { /* can't access this volume */
146 bp->b_error = EIO; /* I/O error */
147 bp->b_flags |= B_ERROR;
/* vinum_bounds_check returns <= 0 for EOF or error; nothing to transfer. */
151 if (vinum_bounds_check(bp, vol) <= 0) { /* don't like them bounds */
152 biodone(bp); /* have nothing to do with this */
157 * Plex I/O is pretty much the same as volume I/O
158 * for a single plex. Indicate this by passing a NULL
159 * pointer (set above) for the volume
161 case VINUM_PLEX_TYPE:
162 case VINUM_RAWPLEX_TYPE:
163 bp->b_resid = bp->b_bcount; /* transfer everything */
170 * Start a transfer. Return -1 on error,
171 * 0 if OK, 1 if we need to retry.
172 * Parameter reviveok is set when doing
173 * transfers for revives: it allows transfers to
174 * be started immediately when a revive is in
175 * progress. During revive, normal transfers
176 * are queued if they share address space with
177 * a currently active revive operation.
/*
 * vinumstart: build and launch the low-level requests for one user I/O.
 * Validates the transfer length, allocates a struct request, decides
 * whether this is volume or plex I/O, then builds either a read request
 * (choosing a plex: preferred or round-robin) or a write request (all
 * plexes), and hands the result to launch_requests().
 * NOTE(review): error-exit biodone() calls, freerq() cleanup and several
 * closing braces fall on lines missing from this extract.
 */
180 vinumstart(struct buf *bp, int reviveok)
183 int maxplex; /* maximum number of plexes to handle */
185 struct request *rq; /* build up our request here */
186 enum requeststatus status;
189 if (debug & DEBUG_LASTREQS)
190 logrq(loginfo_user_bp, (union rqinfou) bp, bp);
/* Transfers must be a whole number of sectors. */
193 if ((bp->b_bcount % DEV_BSIZE) != 0) { /* bad length */
194 bp->b_error = EINVAL; /* invalid size */
195 bp->b_flags |= B_ERROR;
199 rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
200 if (rq == NULL) { /* can't do it */
201 bp->b_error = ENOMEM; /* can't get memory */
202 bp->b_flags |= B_ERROR;
206 bzero(rq, sizeof(struct request));
209 * Note the volume ID. This can be NULL, which
210 * the request building functions use as an
211 * indication for single plex I/O
213 rq->bp = bp; /* and the user buffer struct */
215 if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) { /* it's a volume, */
216 rq->volplex.volno = Volno(bp->b_dev); /* get the volume number */
217 vol = &VOL[rq->volplex.volno]; /* and point to it */
218 vol->active++; /* one more active request */
219 maxplex = vol->plexes; /* consider all its plexes */
221 vol = NULL; /* no volume */
222 rq->volplex.plexno = Plexno(bp->b_dev); /* point to the plex */
223 rq->isplex = 1; /* note that it's a plex */
224 maxplex = 1; /* just the one plex */
227 if (bp->b_flags & B_READ) {
240 vol->bytes_read += bp->b_bcount;
241 plexno = vol->preferred_plex; /* get the plex to use */
242 if (plexno < 0) { /* round robin */
243 plexno = vol->last_plex_read;
244 vol->last_plex_read++;
245 if (vol->last_plex_read >= vol->plexes) /* got to the end? */
246 vol->last_plex_read = 0; /* wrap around */
248 status = build_read_request(rq, plexno); /* build a request */
/* Plex read: build the request list directly with bre(). */
250 daddr_t diskaddr = bp->b_blkno; /* start offset of transfer */
251 status = bre(rq, /* build a request list */
254 diskaddr + (bp->b_bcount / DEV_BSIZE));
257 if ((status > REQUEST_RECOVERED) /* can't satisfy it */
258 ||(bp->b_flags & B_DONE)) { /* XXX shouldn't get this without bad status */
259 if (status == REQUEST_DOWN) { /* not enough subdisks */
260 bp->b_error = EIO; /* I/O error */
261 bp->b_flags |= B_ERROR;
267 return launch_requests(rq, reviveok); /* now start the requests if we can */
270 * This is a write operation. We write to all
271 * plexes. If this is a RAID 5 plex, we must also
272 * update the parity stripe.
277 vol->bytes_written += bp->b_bcount;
278 status = build_write_request(rq); /* Not all the subdisks are up */
279 } else { /* plex I/O */
282 diskstart = bp->b_blkno; /* start offset of transfer */
286 bp->b_blkno + (bp->b_bcount / DEV_BSIZE)); /* build requests for the plex */
288 if ((status > REQUEST_RECOVERED) /* can't satisfy it */
289 ||(bp->b_flags & B_DONE)) { /* XXX shouldn't get this without bad status */
290 if (status == REQUEST_DOWN) { /* not enough subdisks */
291 bp->b_error = EIO; /* I/O error */
292 bp->b_flags |= B_ERROR;
294 if ((bp->b_flags & B_DONE) == 0)
299 return launch_requests(rq, reviveok); /* now start the requests if we can */
304 * Call the low-level strategy routines to
305 * perform the requests in a struct request
/*
 * launch_requests: fire off all low-level requests in a struct request.
 * First handles revive conflicts by queueing the request on the subdisk
 * waitlist; then sets the active counts up front (the bottom half frees
 * the request when the count reaches 0), throttles against per-drive and
 * global limits, and calls BUF_STRATEGY on each good request element.
 * NOTE(review): splx(), log() format argument lists and several closing
 * braces fall on lines missing from this extract.
 */
308 launch_requests(struct request *rq, int reviveok)
312 int rqno; /* loop index */
313 struct rqelement *rqe; /* current element */
317 * First find out whether we're reviving, and the
318 * request contains a conflict. If so, we hang
319 * the request off plex->waitlist of the first
320 * plex we find which is reviving
322 if ((rq->flags & XFR_REVIVECONFLICT) /* possible revive conflict */
323 &&(!reviveok)) { /* and we don't want to do it now, */
325 struct request *waitlist; /* point to the waitlist */
328 if (sd->waitlist != NULL) { /* something there already, */
329 waitlist = sd->waitlist;
330 while (waitlist->next != NULL) /* find the end */
331 waitlist = waitlist->next;
332 waitlist->next = rq; /* hook our request there */
334 sd->waitlist = rq; /* hook our request at the front */
337 if (debug & DEBUG_REVIVECONFLICT)
339 "Revive conflict sd %d: %x\n%s dev %d.%d, offset 0x%x, length %ld\n",
342 rq->bp->b_flags & B_READ ? "Read" : "Write",
343 major(rq->bp->b_dev),
344 minor(rq->bp->b_dev),
348 return 0; /* and get out of here */
350 rq->active = 0; /* nothing yet */
352 if (debug & DEBUG_ADDRESSES)
354 "Request: %x\n%s dev %d.%d, offset 0x%x, length %ld\n",
356 rq->bp->b_flags & B_READ ? "Read" : "Write",
357 major(rq->bp->b_dev),
358 minor(rq->bp->b_dev),
361 vinum_conf.lastrq = (int) rq;
362 vinum_conf.lastbuf = rq->bp;
363 if (debug & DEBUG_LASTREQS)
364 logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp);
368 * We have a potential race condition here: between firing off each
369 * request, we need to check that we're not overloading the system,
370 * and possibly sleep. But the bottom half releases the request
371 * when the active count goes to 0, so we need to set the total
372 * active count in advance.
374 for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) { /* through the whole request chain */
375 rqg->active = rqg->count; /* they're all active */
376 for (rqno = 0; rqno < rqg->count; rqno++) {
377 rqe = &rqg->rqe[rqno];
378 if (rqe->flags & XFR_BAD_SUBDISK) /* this subdisk is bad, */
379 rqg->active--; /* one less active request */
381 if (rqg->active) /* we have at least one active request, */
382 rq->active++; /* one more active request group */
385 /* Now fire off the requests */
386 s = splbio(); /* lock out the interrupt routines */
387 for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) { /* through the whole request chain */
388 for (rqno = 0; rqno < rqg->count; rqno++) {
389 rqe = &rqg->rqe[rqno];
390 if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* this subdisk is good, */
391 /* Check that we're not overloading things */
392 drive = &DRIVE[rqe->driveno]; /* look at drive */
393 while ((drive->active >= DRIVE_MAXACTIVE) /* it has too much to do already, */
394 ||(vinum_conf.active >= VINUM_MAXACTIVE)) /* or too many requests globally */
395 tsleep(&launch_requests, PRIBIO, "vinbuf", 0); /* wait for it to subside, XXX: should PCATCH */
/* Track high-water marks for statistics. */
397 if (drive->active >= drive->maxactive)
398 drive->maxactive = drive->active;
400 if (vinum_conf.active >= vinum_conf.maxactive)
401 vinum_conf.maxactive = vinum_conf.active;
404 if (debug & DEBUG_ADDRESSES)
406 " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
407 rqe->b.b_flags & B_READ ? "Read" : "Write",
411 (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
414 if (debug & DEBUG_NUMOUTPUT)
416 " vinumstart sd %d numoutput %ld\n",
418 rqe->b.b_vp->v_numoutput);
419 if (debug & DEBUG_LASTREQS)
420 logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp);
423 if ((rqe->b.b_flags & B_READ) == 0)
424 rqe->b.b_vp->v_numoutput++; /* one more output going */
426 rqe->b.b_flags |= B_ORDERED; /* stick to the request order */
428 /* fire off the request */
429 BUF_STRATEGY(&rqe->b, 0);
438 * define the low-level requests needed to perform a
439 * high-level I/O operation for a specific plex 'plexno'.
441 * Return REQUEST_OK if all subdisks involved in the request are up,
442 * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the
443 * request is at least partially outside the bounds of the subdisks.
445 * Modify the pointer *diskstart to point to the end address. On
446 * read, return on the first bad subdisk, so that the caller
447 * (build_read_request) can try alternatives.
449 * On entry to this routine, the rqg structures are not assigned. The
450 * assignment is performed by expandrq(). Strictly speaking, the
451 * elements rqe->sdno of all entries should be set to -1, since 0
452 * (from bzero) is a valid subdisk number. We avoid this problem by
453 * initializing the ones we use, and not looking at the others (index
/*
 * bre ("build request elements"): translate a plex-relative address range
 * [*diskaddr, diskend) into low-level request elements, one rqgroup per
 * contiguous piece, according to the plex organization (concatenated,
 * striped, or RAID-5 via bre5).  On return *diskaddr has been advanced
 * past the part of the range that could be mapped.  Returns REQUEST_OK,
 * or REQUEST_DEGRADED / REQUEST_EOF / REQUEST_ENOMEM / REQUEST_DOWN as
 * described in the block comment above.  NOTE(review): the case labels
 * for the switch on plex->organization and several closing braces fall
 * on lines missing from this extract.
 */
457 bre(struct request *rq,
465 struct buf *bp; /* user's bp */
467 enum requeststatus status; /* return value */
468 daddr_t plexoffset; /* offset of transfer in plex */
469 daddr_t stripebase; /* base address of stripe (1st subdisk) */
470 daddr_t stripeoffset; /* offset in stripe */
471 daddr_t blockoffset; /* offset in stripe on subdisk */
472 struct rqelement *rqe; /* point to this request information */
473 daddr_t diskstart = *diskaddr; /* remember where this transfer starts */
474 enum requeststatus s; /* temp return value */
476 bp = rq->bp; /* buffer pointer */
477 status = REQUEST_OK; /* return value: OK until proven otherwise */
478 plex = &PLEX[plexno]; /* point to the plex */
480 switch (plex->organization) {
/* Concatenated plex: walk the subdisks in address order. */
482 sd = NULL; /* (keep compiler quiet) */
483 for (sdno = 0; sdno < plex->subdisks; sdno++) {
484 sd = &SD[plex->sdnos[sdno]];
485 if (*diskaddr < sd->plexoffset) /* we must have a hole, */
486 status = REQUEST_DEGRADED; /* note the fact */
487 if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */
488 rqg = allocrqg(rq, 1); /* space for the request */
489 if (rqg == NULL) { /* malloc failed */
490 bp->b_flags |= B_ERROR;
491 bp->b_error = ENOMEM;
493 return REQUEST_ENOMEM;
495 rqg->plexno = plexno;
497 rqe = &rqg->rqe[0]; /* point to the element */
498 rqe->rqg = rqg; /* group */
499 rqe->sdno = sd->sdno; /* put in the subdisk number */
500 plexoffset = *diskaddr; /* start offset in plex */
501 rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */
502 rqe->useroffset = plexoffset - diskstart; /* start offset in user buffer */
504 rqe->datalen = min(diskend - *diskaddr, /* number of sectors to transfer in this sd */
505 sd->sectors - rqe->sdoffset);
506 rqe->groupoffset = 0; /* no groups for concatenated plexes */
508 rqe->buflen = rqe->datalen; /* buffer length is data buffer length */
510 rqe->driveno = sd->driveno;
511 if (sd->state != sd_up) { /* *now* we find the sd is down */
512 s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
513 if (s == REQUEST_DOWN) { /* down? */
514 rqe->flags = XFR_BAD_SUBDISK; /* yup */
515 if (rq->bp->b_flags & B_READ) /* read request, */
516 return REQUEST_DEGRADED; /* give up here */
518 * If we're writing, don't give up
519 * because of a bad subdisk. Go
520 * through to the bitter end, but note
521 * which ones we can't access.
523 status = REQUEST_DEGRADED; /* can't do it all */
526 *diskaddr += rqe->datalen; /* bump the address */
527 if (build_rq_buffer(rqe, plex)) { /* build the buffer */
529 bp->b_flags |= B_ERROR;
530 bp->b_error = ENOMEM;
532 return REQUEST_ENOMEM; /* can't do it */
535 if (*diskaddr == diskend) /* we're finished, */
536 break; /* get out of here */
539 * We've got to the end of the plex. Have we got to the end of
540 * the transfer? It would seem that having an offset beyond the
541 * end of the subdisk is an error, but in fact it can happen if
542 * the volume has another plex of different size. There's a valid
543 * question as to why you would want to do this, but currently
546 * In a previous version, I returned REQUEST_DOWN here. I think
547 * REQUEST_EOF is more appropriate now.
549 if (diskend > sd->sectors + sd->plexoffset) /* pointing beyond EOF? */
550 status = REQUEST_EOF;
/* Striped plex: map stripe by stripe until the range is exhausted. */
555 while (*diskaddr < diskend) { /* until we get it all sorted out */
556 if (*diskaddr >= plex->length) /* beyond the end of the plex */
557 return REQUEST_EOF; /* can't continue */
559 /* The offset of the start address from the start of the stripe. */
560 stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks);
562 /* The plex-relative address of the start of the stripe. */
563 stripebase = *diskaddr - stripeoffset;
565 /* The number of the subdisk in which the start is located. */
566 sdno = stripeoffset / plex->stripesize;
568 /* The offset from the beginning of the stripe on this subdisk. */
569 blockoffset = stripeoffset % plex->stripesize;
571 sd = &SD[plex->sdnos[sdno]]; /* the subdisk in question */
572 rqg = allocrqg(rq, 1); /* space for the request */
573 if (rqg == NULL) { /* malloc failed */
574 bp->b_flags |= B_ERROR;
575 bp->b_error = ENOMEM;
577 return REQUEST_ENOMEM;
579 rqg->plexno = plexno;
581 rqe = &rqg->rqe[0]; /* point to the element */
583 rqe->sdoffset = stripebase / plex->subdisks + blockoffset; /* start offset in this subdisk */
584 rqe->useroffset = *diskaddr - diskstart; /* The offset of the start in the user buffer */
586 rqe->datalen = min(diskend - *diskaddr, /* the amount remaining to transfer */
587 plex->stripesize - blockoffset); /* and the amount left in this stripe */
588 rqe->groupoffset = 0; /* no groups for striped plexes */
590 rqe->buflen = rqe->datalen; /* buffer length is data buffer length */
592 rqe->sdno = sd->sdno; /* put in the subdisk number */
593 rqe->driveno = sd->driveno;
595 if (sd->state != sd_up) { /* *now* we find the sd is down */
596 s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
597 if (s == REQUEST_DOWN) { /* down? */
598 rqe->flags = XFR_BAD_SUBDISK; /* yup */
599 if (rq->bp->b_flags & B_READ) /* read request, */
600 return REQUEST_DEGRADED; /* give up here */
602 * If we're writing, don't give up
603 * because of a bad subdisk. Go through
604 * to the bitter end, but note which
605 * ones we can't access.
607 status = REQUEST_DEGRADED; /* can't do it all */
611 * It would seem that having an offset
612 * beyond the end of the subdisk is an
613 * error, but in fact it can happen if the
614 * volume has another plex of different
615 * size. There's a valid question as to why
616 * you would want to do this, but currently
619 if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */
620 rqe->datalen = sd->sectors - rqe->sdoffset; /* truncate */
622 if (debug & DEBUG_EOFINFO) { /* tell on the request */
624 "vinum: EOF on plex %s, sd %s offset %x (user offset %x)\n",
630 "vinum: stripebase %x, stripeoffset %x, blockoffset %x\n",
637 if (build_rq_buffer(rqe, plex)) { /* build the buffer */
639 bp->b_flags |= B_ERROR;
640 bp->b_error = ENOMEM;
642 return REQUEST_ENOMEM; /* can't do it */
644 *diskaddr += rqe->datalen; /* look at the remainder */
645 if ((*diskaddr < diskend) /* didn't finish the request on this stripe */
646 &&(*diskaddr < plex->length)) { /* and there's more to come */
647 plex->multiblock++; /* count another one */
648 if (sdno == plex->subdisks - 1) /* last subdisk, */
649 plex->multistripe++; /* another stripe as well */
/* RAID-5 plex: handled by its own builder. */
656 * RAID5 is complicated enough to have
660 status = bre5(rq, plexno, diskaddr, diskend);
664 log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization);
665 status = REQUEST_DOWN; /* can't access it */
672 * Build up a request structure for reading volumes.
673 * This function is not needed for plex reads, since there's
674 * no recovery if a plex read can't be satisified.
/*
 * build_read_request: build the request elements for a volume read from
 * plex 'plexindex'.  If the chosen plex cannot satisfy part of the range
 * (down or degraded), retry the unsatisfied part against each other plex
 * in the volume, counting successful retries in vol->recovered_reads.
 * Returns the final requeststatus; REQUEST_DOWN if no plex could supply
 * a missing piece.  NOTE(review): the switch head on 'status', some case
 * arms and closing braces fall on lines missing from this extract.
 */
677 build_read_request(struct request *rq, /* request */
679 { /* index in the volume's plex table */
681 daddr_t startaddr; /* offset of previous part of transfer */
682 daddr_t diskaddr; /* offset of current part of transfer */
683 daddr_t diskend; /* and end offset of transfer */
684 int plexno; /* plex index in vinum_conf */
685 struct rqgroup *rqg; /* point to the request we're working on */
686 struct volume *vol; /* volume in question */
687 int recovered = 0; /* set if we recover a read */
688 enum requeststatus status = REQUEST_OK;
689 int plexmask; /* bit mask of plexes, for recovery */
691 bp = rq->bp; /* buffer pointer */
692 diskaddr = bp->b_blkno; /* start offset of transfer */
693 diskend = diskaddr + (bp->b_bcount / DEV_BSIZE); /* and end offset of transfer */
694 rqg = &rq->rqg[plexindex]; /* plex request */
695 vol = &VOL[rq->volplex.volno]; /* point to volume */
697 while (diskaddr < diskend) { /* build up request components */
698 startaddr = diskaddr;
699 status = bre(rq, vol->plex[plexindex], &diskaddr, diskend); /* build up a request */
704 case REQUEST_RECOVERED:
706 * XXX FIXME if we have more than one plex, and we can
707 * satisfy the request from another, don't use the
708 * recovered request, since it's more expensive.
716 * If we get here, our request is not complete. Try
717 * to fill in the missing parts from another plex.
718 * This can happen multiple times in this function,
719 * and we reinitialize the plex mask each time, since
720 * we could have a hole in our plexes.
723 case REQUEST_DOWN: /* can't access the plex */
724 case REQUEST_DEGRADED: /* can't access the plex */
725 plexmask = ((1 << vol->plexes) - 1) /* all plexes in the volume */
726 &~(1 << plexindex); /* except for the one we were looking at */
727 for (plexno = 0; plexno < vol->plexes; plexno++) {
728 if (plexmask == 0) /* no plexes left to try */
729 return REQUEST_DOWN; /* failed */
730 diskaddr = startaddr; /* start at the beginning again */
731 if (plexmask & (1 << plexno)) { /* we haven't tried this plex yet */
732 bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */
733 if (diskaddr > startaddr) { /* we satisfied another part */
734 recovered = 1; /* we recovered from the problem */
735 status = REQUEST_OK; /* don't complain about it */
740 if (diskaddr == startaddr) /* didn't get any further, */
744 vol->recovered_reads += recovered; /* adjust our recovery count */
750 * Build up a request structure for writes.
751 * Return 0 if all subdisks involved in the request are up, 1 if some
752 * subdisks are not up, and -1 if the request is at least partially
753 * outside the bounds of the subdisks.
/*
 * build_write_request: build request elements for a volume write by
 * calling bre() once per plex (writes go to every plex).  The combined
 * status is the best (minimum) result over all plexes: the write is
 * usable if at least one plex can take it.  NOTE(review): the trailing
 * bre() arguments and the return fall on lines missing from this extract.
 */
756 build_write_request(struct request *rq)
759 daddr_t diskstart; /* offset of current part of transfer */
760 daddr_t diskend; /* and end offset of transfer */
761 int plexno; /* plex index in vinum_conf */
762 struct volume *vol; /* volume in question */
763 enum requeststatus status;
765 bp = rq->bp; /* buffer pointer */
766 vol = &VOL[rq->volplex.volno]; /* point to volume */
767 diskend = bp->b_blkno + (bp->b_bcount / DEV_BSIZE); /* end offset of transfer */
768 status = REQUEST_DOWN; /* assume the worst */
769 for (plexno = 0; plexno < vol->plexes; plexno++) {
770 diskstart = bp->b_blkno; /* start offset of transfer */
772 * Build requests for the plex.
773 * We take the best possible result here (min,
774 * not max): we're happy if we can write at all
776 status = min(status, bre(rq,
784 /* Fill in the struct buf part of a request element. */
/*
 * build_rq_buffer: fill in the struct buf embedded in a request element
 * so it can be passed to the drive's strategy routine: copy the relevant
 * flags from the user buffer, lock the buf, set the completion callback
 * (complete_rqe), and point b_data either at a freshly malloced buffer
 * (XFR_MALLOCED) or directly into the user buffer.  For the bad subdisk
 * of a recovery read, the group region is zeroed so the XOR accumulation
 * starts clean.  Returns REQUEST_OK or REQUEST_ENOMEM.
 * NOTE(review): some declarations and the final return fall on lines
 * missing from this extract.
 */
786 build_rq_buffer(struct rqelement *rqe, struct plex *plex)
788 struct sd *sd; /* point to subdisk */
791 struct buf *ubp; /* user (high level) buffer header */
793 vol = &VOL[rqe->rqg->rq->volplex.volno];
794 sd = &SD[rqe->sdno]; /* point to subdisk */
796 ubp = rqe->rqg->rq->bp; /* pointer to user buffer header */
798 /* Initialize the buf struct */
799 bp->b_flags = ubp->b_flags & (B_NOCACHE | B_READ | B_ASYNC); /* copy these flags from user bp */
800 bp->b_flags |= B_CALL; /* inform us when it's done */
801 BUF_LOCKINIT(bp); /* get a lock for the buffer */
802 BUF_LOCK(bp, LK_EXCLUSIVE); /* and lock it */
804 bp->b_iodone = complete_rqe; /* by calling us here */
806 * You'd think that we wouldn't need to even
807 * build the request buffer for a dead subdisk,
808 * but in some cases we need information like
809 * the user buffer address. Err on the side of
810 * generosity and supply what we can. That
811 * obviously doesn't include drive information
812 * when the drive is dead.
814 if ((rqe->flags & XFR_BAD_SUBDISK) == 0) { /* subdisk is accessible, */
815 bp->b_dev = DRIVE[rqe->driveno].vp->v_rdev; /* drive device */
816 bp->b_vp = DRIVE[rqe->driveno].vp; /* drive vnode */
818 bp->b_blkno = rqe->sdoffset + sd->driveoffset; /* start address */
819 bp->b_bcount = rqe->buflen << DEV_BSHIFT; /* number of bytes to transfer */
820 bp->b_resid = bp->b_bcount; /* and it's still all waiting */
821 bp->b_bufsize = bp->b_bcount; /* and buffer size */
822 bp->b_rcred = FSCRED; /* we have the file system credentials */
823 bp->b_wcred = FSCRED; /* we have the file system credentials */
825 if (rqe->flags & XFR_MALLOCED) { /* this operation requires a malloced buffer */
826 bp->b_data = Malloc(bp->b_bcount); /* get a buffer to put it in */
827 if (bp->b_data == NULL) { /* failed */
828 abortrequest(rqe->rqg->rq, ENOMEM);
829 return REQUEST_ENOMEM; /* no memory */
833 * Point directly to user buffer data. This means
834 * that we don't need to do anything when we have
835 * finished the transfer
837 bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE;
839 * On a recovery read, we perform an XOR of
840 * all blocks to the user buffer. To make
841 * this work, we first clean out the buffer
843 if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK))
844 == (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) { /* bad subdisk of a recovery read */
845 int length = rqe->grouplen << DEV_BSHIFT; /* and count involved */
846 char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */
848 bzero(data, length); /* clean it out */
854 * Abort a request: free resources and complete the
855 * user request with the specified error
/*
 * abortrequest: fail the user request with 'error', free all resources
 * held by the request (freerq), and return the error code so callers can
 * propagate it.  NOTE(review): the b_error assignment and biodone() call
 * fall on lines missing from this extract.
 */
858 abortrequest(struct request *rq, int error)
860 struct buf *bp = rq->bp; /* user buffer */
862 bp->b_flags |= B_ERROR;
864 freerq(rq); /* free everything we're doing */
866 return error; /* and give up */
870 * Check that our transfer will cover the
871 * complete address space of the user request.
873 * Return 1 if it can, otherwise 0
876 check_range_covered(struct request *rq)
881 /* Perform I/O on a subdisk */
/*
 * NOTE(review): the function header (presumably `void sdio(struct buf
 * *bp)') is missing from this extract, along with the error exits
 * (biodone), the Malloc NULL check head, and the splbio/splx bracketing
 * around BUF_STRATEGY — confirm against the full source.  This routine
 * clones the incoming buf into a struct sdbuf aimed at the underlying
 * drive, trims the transfer at the end of the subdisk, and fires it off
 * with sdio_done() as the completion callback.
 */
892 if (debug & DEBUG_LASTREQS)
893 logrq(loginfo_sdio, (union rqinfou) bp, bp);
895 sd = &SD[Sdno(bp->b_dev)]; /* point to the subdisk */
896 drive = &DRIVE[sd->driveno];
900 * We allow access to any kind of subdisk as long as we can expect
901 * to get the I/O performed.
903 if (sd->state < sd_empty) { /* nothing to talk to, */
904 bp->b_flags |= B_ERROR;
910 sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
912 bp->b_flags |= B_ERROR;
913 bp->b_error = ENOMEM;
917 bzero(sbp, sizeof(struct sdbuf)); /* start with nothing */
918 sbp->b.b_flags = bp->b_flags | B_CALL; /* inform us when it's done */
919 sbp->b.b_bufsize = bp->b_bufsize; /* buffer size */
920 sbp->b.b_bcount = bp->b_bcount; /* number of bytes to transfer */
921 sbp->b.b_resid = bp->b_resid; /* and amount waiting */
922 sbp->b.b_dev = DRIVE[sd->driveno].vp->v_rdev; /* device */
923 sbp->b.b_data = bp->b_data; /* data buffer */
924 sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
925 sbp->b.b_iodone = sdio_done; /* come here on completion */
926 BUF_LOCKINIT(&sbp->b); /* get a lock for the buffer */
927 BUF_LOCK(&sbp->b, LK_EXCLUSIVE); /* and lock it */
929 sbp->b.b_vp = DRIVE[sd->driveno].vp; /* vnode */
930 sbp->bp = bp; /* note the address of the original header */
931 sbp->sdno = sd->sdno; /* note for statistics */
932 sbp->driveno = sd->driveno;
933 endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE; /* final sector offset */
934 if (endoffset > sd->sectors) { /* beyond the end */
935 sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
936 if (sbp->b.b_bcount <= 0) { /* nothing to transfer */
937 bp->b_resid = bp->b_bcount; /* nothing transferred */
943 if ((sbp->b.b_flags & B_READ) == 0) /* write */
944 sbp->b.b_vp->v_numoutput++; /* one more output going */
946 if (debug & DEBUG_ADDRESSES)
948 " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
949 sbp->b.b_flags & B_READ ? "Read" : "Write",
953 (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
954 (int) sbp->b.b_blkno,
956 if (debug & DEBUG_NUMOUTPUT)
958 " vinumstart sd %d numoutput %ld\n",
960 sbp->b.b_vp->v_numoutput);
964 if (debug & DEBUG_LASTREQS)
965 logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
967 BUF_STRATEGY(&sbp->b, 0);
972 * Simplified version of bounds_check_with_label
973 * Determine the size of the transfer, and make sure it is
974 * within the boundaries of the partition. Adjust transfer
975 * if needed, and signal errors or early completion.
977 * Volumes are simpler than disk slices: they only contain
978 * one component (though we call them a, b and c to make
979 * system utilities happy), and they always take up the
980 * complete space of the "partition".
982 * I'm still not happy with this: why should the label be
983 * protected? If it weren't so damned difficult to write
984 * one in the first place (because it's protected), it wouldn't
988 vinum_bounds_check(struct buf *bp, struct volume *vol)
990 int maxsize = vol->size; /* size of the partition (sectors) */
991 int size = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* size of this request (sectors) */
993 /* Would this transfer overwrite the disk label? */
994 if (bp->b_blkno <= LABELSECTOR /* starts before or at the label */
996 && bp->b_blkno + size > LABELSECTOR /* and finishes after */
998 && (!(vol->flags & VF_RAW)) /* and it's not raw */
999 &&major(bp->b_dev) == BDEV_MAJOR /* and it's the block device */
1000 && (bp->b_flags & B_READ) == 0 /* and it's a write */
1001 && (!vol->flags & (VF_WLABEL | VF_LABELLING))) { /* and we're not allowed to write the label */
1002 bp->b_error = EROFS; /* read-only */
1003 bp->b_flags |= B_ERROR;
1006 if (size == 0) /* no transfer specified, */
1007 return 0; /* treat as EOF */
1008 /* beyond partition? */
1009 if (bp->b_blkno < 0 /* negative start */
1010 || bp->b_blkno + size > maxsize) { /* or goes beyond the end of the partition */
1011 /* if exactly at end of disk, return an EOF */
1012 if (bp->b_blkno == maxsize) {
1013 bp->b_resid = bp->b_bcount;
1016 /* or truncate if part of it fits */
1017 size = maxsize - bp->b_blkno;
1018 if (size <= 0) { /* nothing to transfer */
1019 bp->b_error = EINVAL;
1020 bp->b_flags |= B_ERROR;
1023 bp->b_bcount = size << DEV_BSHIFT;
1025 bp->b_pblkno = bp->b_blkno;
1030 * Allocate a request group and hook
1031 * it in in the list for rq
/*
 * allocrqg: allocate a request group with room for 'elements' request
 * elements and append it to the chain anchored at rq->rqg (rq->lrqg
 * tracks the tail).  Returns the zeroed group, or NULL if Malloc fails
 * (the caller must handle that).  NOTE(review): the element-initializing
 * loop and the return fall on lines missing from this extract.
 */
1034 allocrqg(struct request *rq, int elements)
1036 struct rqgroup *rqg; /* the one we're going to allocate */
1037 int size = sizeof(struct rqgroup) + elements * sizeof(struct rqelement);
1039 rqg = (struct rqgroup *) Malloc(size);
1040 if (rqg != NULL) { /* malloc OK, */
1041 if (rq->rqg) /* we already have requests */
1042 rq->lrqg->next = rqg; /* hang it off the end */
1043 else /* first request */
1044 rq->rqg = rqg; /* at the start */
1045 rq->lrqg = rqg; /* this one is the last in the list */
1047 bzero(rqg, size); /* no old junk */
1048 rqg->rq = rq; /* point back to the parent request */
1049 rqg->count = elements; /* number of requests in the group */
1055 * Deallocate a request group out of a chain. We do
1056 * this by linear search: the chain is short, this
1057 * almost never happens, and currently it can only
1058 * happen to the first member of the chain.
/*
 * deallocrqg: release any range lock held by the group, unhook it from
 * its request's chain (head case or linear search), and free it.
 * NOTE(review): the loop-advance statement, the log() arguments, and the
 * Free() call fall on lines missing from this extract.
 */
1061 deallocrqg(struct rqgroup *rqg)
1063 struct rqgroup *rqgc = rqg->rq->rqg; /* point to the request chain */
1065 if (rqg->lock) /* got a lock? */
1066 unlockrange(rqg->plexno, rqg->lock); /* yes, free it */
1067 if (rqgc == rqg) /* we're first in line */
1068 rqg->rq->rqg = rqg->next; /* unhook ourselves */
1070 while ((rqgc->next != NULL) /* find the group */
1071 &&(rqgc->next != rqg))
1073 if (rqgc->next == NULL)
1075 "vinum deallocrqg: rqg %p not found in request %p\n",
1079 rqgc->next = rqg->next; /* make the chain jump over us */