/*-
 * Copyright (c) 1997, 1998, 1999
 * Nan Yang Computer Services Limited.  All rights reserved.
 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
 * Written by Greg Lehey
 *
 * This software is distributed under the so-called ``Berkeley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Nan Yang Computer
 * 4. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 */
#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>
#include <sys/resourcevar.h>

enum requeststatus bre(struct request *rq,
enum requeststatus bre5(struct request *rq,
enum requeststatus build_read_request(struct request *rq, int volplexno);
enum requeststatus build_write_request(struct request *rq);
enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
int find_alternate_sd(struct request *rq);
int check_range_covered(struct request *);
void complete_rqe(struct buf *bp);
void complete_raid5_write(struct rqelement *);
int abortrequest(struct request *rq, int error);
void sdio_done(struct buf *bp);
int vinum_bounds_check(struct buf *bp, struct volume *vol);
caddr_t allocdatabuf(struct rqelement *rqe);
void freedatabuf(struct rqelement *rqe);

struct rqinfo rqinfo[RQINFO_SIZE];
struct rqinfo *rqip = rqinfo;
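
/*
 * Note: rqinfo is a circular trace buffer.  logrq() below fills in
 * successive entries, and rqip wraps back to the start of the array
 * when it passes &rqinfo[RQINFO_SIZE], so the table always holds the
 * most recent RQINFO_SIZE request events for post-mortem debugging.
 */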
logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)
    microtime(&rqip->timestamp);                            /* when did this happen? */
    rqip->bp = ubp;                                         /* user buffer */

    case loginfo_user_bpl:
    case loginfo_sdio:                                      /* subdisk I/O */
    case loginfo_sdiol:                                     /* subdisk I/O launch */
    case loginfo_sdiodone:                                  /* subdisk I/O complete */
        bcopy(info.bp, &rqip->info.b, sizeof(struct buf));
        rqip->devmajor = major(info.bp->b_dev);
        rqip->devminor = minor(info.bp->b_dev);

    case loginfo_raid5_data:
    case loginfo_raid5_parity:
        bcopy(info.rqe, &rqip->info.rqe, sizeof(struct rqelement));
        rqip->devmajor = major(info.rqe->b.b_dev);
        rqip->devminor = minor(info.rqe->b.b_dev);

    case loginfo_lockwait:
        bcopy(info.lockinfo, &rqip->info.lockinfo, sizeof(struct rangelock));

    if (rqip >= &rqinfo[RQINFO_SIZE])                       /* wrap around */
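
/*
 * vinumstrategy is the entry point for I/O on a vinum device: it
 * dispatches on the device type encoded in the minor number.  Volume
 * I/O is bounds-checked and started here, plex I/O is treated like a
 * single-plex volume, and drive devices return EIO, since drives are
 * handled directly by the underlying disk drivers.
 */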
vinumstrategy(struct buf *bp)
    struct volume *vol = NULL;

    switch (DEVTYPE(bp->b_dev)) {
    case VINUM_RAWSD_TYPE:

        /*
         * In fact, vinum doesn't handle drives: they're
         * handled directly by the disk drivers.
         */
    case VINUM_DRIVE_TYPE:
        bp->b_error = EIO;                                  /* I/O error */
        bp->b_flags |= B_ERROR;

    case VINUM_VOLUME_TYPE:                                 /* volume I/O */
        volno = Volno(bp->b_dev);
        if (vol->state != volume_up) {                      /* can't access this volume */
            bp->b_error = EIO;                              /* I/O error */
            bp->b_flags |= B_ERROR;
        if (vinum_bounds_check(bp, vol) <= 0) {             /* don't like them bounds */
            biodone(bp);                                    /* have nothing to do with this */

        /*
         * Plex I/O is pretty much the same as volume I/O
         * for a single plex.  Indicate this by passing a NULL
         * pointer (set above) for the volume.
         */
    case VINUM_PLEX_TYPE:
    case VINUM_RAWPLEX_TYPE:
        bp->b_resid = bp->b_bcount;                         /* transfer everything */

/*
 * Start a transfer.  Return -1 on error,
 * 0 if OK, 1 if we need to retry.
 * Parameter reviveok is set when doing
 * transfers for revives: it allows transfers to
 * be started immediately when a revive is in
 * progress.  During revive, normal transfers
 * are queued if they share address space with
 * a currently active revive operation.
 */
vinumstart(struct buf *bp, int reviveok)
    int maxplex;                                            /* maximum number of plexes to handle */
    struct request *rq;                                     /* build up our request here */
    enum requeststatus status;

    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_user_bp, (union rqinfou) bp, bp);

    if ((bp->b_bcount % DEV_BSIZE) != 0) {                  /* bad length */
        bp->b_error = EINVAL;                               /* invalid size */
        bp->b_flags |= B_ERROR;

    rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
    if (rq == NULL) {                                       /* can't do it */
        bp->b_error = ENOMEM;                               /* can't get memory */
        bp->b_flags |= B_ERROR;

    bzero(rq, sizeof(struct request));

    /*
     * Note the volume ID.  This can be NULL, which
     * the request building functions use as an
     * indication for single plex I/O.
     */
    rq->bp = bp;                                            /* and the user buffer struct */

    if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) {          /* it's a volume, */
        rq->volplex.volno = Volno(bp->b_dev);               /* get the volume number */
        vol = &VOL[rq->volplex.volno];                      /* and point to it */
        vol->active++;                                      /* one more active request */
        maxplex = vol->plexes;                              /* consider all its plexes */
        vol = NULL;                                         /* no volume */
        rq->volplex.plexno = Plexno(bp->b_dev);             /* point to the plex */
        rq->isplex = 1;                                     /* note that it's a plex */
        maxplex = 1;                                        /* just the one plex */

    if (bp->b_flags & B_READ) {
        /*
         * This is a read request.  Decide
         * which plex to read from.
         *
         * There's a potential race condition here,
         * since we're not locked, and we could end
         * up multiply incrementing the round-robin
         * counter.  This doesn't have any serious
         */
        vol->bytes_read += bp->b_bcount;
        plexno = vol->preferred_plex;                       /* get the plex to use */
        if (plexno < 0) {                                   /* round robin */
            plexno = vol->last_plex_read;
            vol->last_plex_read++;
            if (vol->last_plex_read >= vol->plexes)         /* got to the end? */
                vol->last_plex_read = 0;                    /* wrap around */
        status = build_read_request(rq, plexno);            /* build a request */
        daddr_t diskaddr = bp->b_blkno;                     /* start offset of transfer */
        status = bre(rq,                                    /* build a request list */
            diskaddr + (bp->b_bcount / DEV_BSIZE));

        if ((status > REQUEST_RECOVERED)                    /* can't satisfy it */
            || (bp->b_flags & B_DONE)) {                    /* XXX shouldn't get this without bad status */
            if (status == REQUEST_DOWN) {                   /* not enough subdisks */
                bp->b_error = EIO;                          /* I/O error */
                bp->b_flags |= B_ERROR;
        return launch_requests(rq, reviveok);               /* now start the requests if we can */

        /*
         * This is a write operation.  We write to all
         * plexes.  If this is a RAID 5 plex, we must also
         * update the parity stripe.
         */
        vol->bytes_written += bp->b_bcount;
        status = build_write_request(rq);                   /* not all the subdisks may be up */
    } else {                                                /* plex I/O */
        diskstart = bp->b_blkno;                            /* start offset of transfer */
            bp->b_blkno + (bp->b_bcount / DEV_BSIZE));      /* build requests for the plex */

        if ((status > REQUEST_RECOVERED)                    /* can't satisfy it */
            || (bp->b_flags & B_DONE)) {                    /* XXX shouldn't get this without bad status */
            if (status == REQUEST_DOWN) {                   /* not enough subdisks */
                bp->b_error = EIO;                          /* I/O error */
                bp->b_flags |= B_ERROR;
            if ((bp->b_flags & B_DONE) == 0)
        return launch_requests(rq, reviveok);               /* now start the requests if we can */
/*
 * Call the low-level strategy routines to
 * perform the requests in a struct request.
 */
launch_requests(struct request *rq, int reviveok)
    int rqno;                                               /* loop index */
    struct rqelement *rqe;                                  /* current element */

    /*
     * First find out whether we're reviving, and the
     * request contains a conflict.  If so, we hang
     * the request off plex->waitlist of the first
     * plex we find which is reviving.
     */
    if ((rq->flags & XFR_REVIVECONFLICT)                    /* possible revive conflict */
        && (!reviveok)) {                                   /* and we don't want to do it now, */
        struct request *waitlist;                           /* point to the waitlist */

        if (sd->waitlist != NULL) {                         /* something there already, */
            waitlist = sd->waitlist;
            while (waitlist->next != NULL)                  /* find the end */
                waitlist = waitlist->next;
            waitlist->next = rq;                            /* hook our request there */
            sd->waitlist = rq;                              /* hook our request at the front */

        if (debug & DEBUG_REVIVECONFLICT)
                "Revive conflict sd %d: %x\n%s dev %d.%d, offset 0x%x, length %ld\n",
                rq->bp->b_flags & B_READ ? "Read" : "Write",
                major(rq->bp->b_dev),
                minor(rq->bp->b_dev),
        return 0;                                           /* and get out of here */

    rq->active = 0;                                         /* nothing yet */

    if (debug & DEBUG_ADDRESSES)
            "Request: %x\n%s dev %d.%d, offset 0x%x, length %ld\n",
            rq->bp->b_flags & B_READ ? "Read" : "Write",
            major(rq->bp->b_dev),
            minor(rq->bp->b_dev),
    vinum_conf.lastrq = (int) rq;
    vinum_conf.lastbuf = rq->bp;
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp);
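
    /*
     * The request is organized as a chain of request groups (rqg),
     * each holding an array of request elements (rqe), one per
     * subdisk transfer.  Walk the chain and launch each element that
     * isn't flagged as being on a bad subdisk.
     */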
    for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) {     /* through the whole request chain */
        rqg->active = rqg->count;                           /* they're all active */
        for (rqno = 0; rqno < rqg->count; rqno++) {
            rqe = &rqg->rqe[rqno];
            if (rqe->flags & XFR_BAD_SUBDISK)               /* this subdisk is bad, */
                rqg->active--;                              /* one less active request */
            else {                                          /* we can do it */
                if ((rqe->b.b_flags & B_READ) == 0)
                    rqe->b.b_vp->v_numoutput++;             /* one more output going */
                rqe->b.b_flags |= B_ORDERED;                /* stick to the request order */

                if (debug & DEBUG_ADDRESSES)
                        " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
                        rqe->b.b_flags & B_READ ? "Read" : "Write",
                        (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
                if (debug & DEBUG_NUMOUTPUT)
                        " vinumstart sd %d numoutput %ld\n",
                        rqe->b.b_vp->v_numoutput);
                if (debug & DEBUG_LASTREQS)
                    logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp);

                /* fire off the request */
                BUF_STRATEGY(&rqe->b, 0);

        if (rqg->active)                                    /* we have at least one active request, */
            rq->active++;                                   /* one more active request group */
/*
 * Define the low-level requests needed to perform a
 * high-level I/O operation for a specific plex 'plexno'.
 *
 * Return REQUEST_OK if all subdisks involved in the request are up,
 * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the
 * request is at least partially outside the bounds of the subdisks.
 *
 * Modify the pointer *diskstart to point to the end address.  On
 * read, return on the first bad subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the rqg structures are not assigned.  The
 * assignment is performed by expandrq().  Strictly speaking, the
 * elements rqe->sdno of all entries should be set to -1, since 0
 * (from bzero) is a valid subdisk number.  We avoid this problem by
 * initializing the ones we use, and not looking at the others (index
 */
bre(struct request *rq,
    struct buf *bp;                                         /* user's bp */
    enum requeststatus status;                              /* return value */
    daddr_t plexoffset;                                     /* offset of transfer in plex */
    daddr_t stripebase;                                     /* base address of stripe (1st subdisk) */
    daddr_t stripeoffset;                                   /* offset in stripe */
    daddr_t blockoffset;                                    /* offset in stripe on subdisk */
    struct rqelement *rqe;                                  /* point to this request information */
    daddr_t diskstart = *diskaddr;                          /* remember where this transfer starts */
    enum requeststatus s;                                   /* temp return value */

    bp = rq->bp;                                            /* buffer pointer */
    status = REQUEST_OK;                                    /* return value: OK until proven otherwise */
    plex = &PLEX[plexno];                                   /* point to the plex */

    switch (plex->organization) {
        sd = NULL;                                          /* (keep compiler quiet) */
        for (sdno = 0; sdno < plex->subdisks; sdno++) {
            sd = &SD[plex->sdnos[sdno]];
            if (*diskaddr < sd->plexoffset)                 /* we must have a hole, */
                status = REQUEST_DEGRADED;                  /* note the fact */
            if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */
                rqg = allocrqg(rq, 1);                      /* space for the request */
                if (rqg == NULL) {                          /* malloc failed */
                    bp->b_flags |= B_ERROR;
                    bp->b_error = ENOMEM;
                    return REQUEST_ENOMEM;
                rqg->plexno = plexno;

                rqe = &rqg->rqe[0];                         /* point to the element */
                rqe->rqg = rqg;                             /* group */
                rqe->sdno = sd->sdno;                       /* put in the subdisk number */
                plexoffset = *diskaddr;                     /* start offset in plex */
                rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */
                rqe->useroffset = plexoffset - diskstart;   /* start offset in user buffer */
                rqe->datalen = min(diskend - *diskaddr,     /* number of sectors to transfer in this sd */
                    sd->sectors - rqe->sdoffset);
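                /*
                 * Worked example (illustrative values): if this
                 * subdisk starts at plex offset 1000 and holds 2000
                 * sectors, and a transfer that began at plex offset
                 * 800 has reached *diskaddr 1500, then
                 * sdoffset = 1500 - 1000 = 500,
                 * useroffset = 1500 - 800 = 700, and at most
                 * 2000 - 500 = 1500 sectors can come from this
                 * subdisk.
                 */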
                rqe->groupoffset = 0;                       /* no groups for concatenated plexes */
                rqe->buflen = rqe->datalen;                 /* buffer length is data buffer length */
                rqe->driveno = sd->driveno;
                if (sd->state != sd_up) {                   /* *now* we find the sd is down */
                    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
                    if (s == REQUEST_DOWN) {                /* down? */
                        rqe->flags = XFR_BAD_SUBDISK;       /* yup */
                        if (rq->bp->b_flags & B_READ)       /* read request, */
                            return REQUEST_DEGRADED;        /* give up here */
                        /*
                         * If we're writing, don't give up
                         * because of a bad subdisk.  Go
                         * through to the bitter end, but note
                         * which ones we can't access.
                         */
                        status = REQUEST_DEGRADED;          /* can't do it all */
                *diskaddr += rqe->datalen;                  /* bump the address */
                if (build_rq_buffer(rqe, plex)) {           /* build the buffer */
                    bp->b_flags |= B_ERROR;
                    bp->b_error = ENOMEM;
                    return REQUEST_ENOMEM;                  /* can't do it */
            if (*diskaddr == diskend)                       /* we're finished, */
                break;                                      /* get out of here */
        /*
         * We've got to the end of the plex.  Have we got to the end of
         * the transfer?  It would seem that having an offset beyond the
         * end of the subdisk is an error, but in fact it can happen if
         * the volume has another plex of different size.  There's a valid
         * question as to why you would want to do this, but currently
         * it's allowed.
         *
         * In a previous version, I returned REQUEST_DOWN here.  I think
         * REQUEST_EOF is more appropriate now.
         */
        if (diskend > sd->sectors + sd->plexoffset)         /* pointing beyond EOF? */
            status = REQUEST_EOF;
        while (*diskaddr < diskend) {                       /* until we get it all sorted out */
            if (*diskaddr >= plex->length)                  /* beyond the end of the plex */
                return REQUEST_EOF;                         /* can't continue */

            /* The offset of the start address from the start of the stripe. */
            stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks);

            /* The plex-relative address of the start of the stripe. */
            stripebase = *diskaddr - stripeoffset;

            /* The number of the subdisk in which the start is located. */
            sdno = stripeoffset / plex->stripesize;

            /* The offset from the beginning of the stripe on this subdisk. */
            blockoffset = stripeoffset % plex->stripesize;
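
            /*
             * Worked example (illustrative values): with a stripe
             * size of 128 sectors and 4 subdisks, a full stripe
             * covers 128 * 4 = 512 sectors of plex address space.
             * For *diskaddr = 1000: stripeoffset = 1000 % 512 = 488,
             * stripebase = 1000 - 488 = 512, sdno = 488 / 128 = 3,
             * and blockoffset = 488 % 128 = 104.  The subdisk-relative
             * start computed below is then 512 / 4 + 104 = 232.
             */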
            sd = &SD[plex->sdnos[sdno]];                    /* the subdisk in question */
            rqg = allocrqg(rq, 1);                          /* space for the request */
            if (rqg == NULL) {                              /* malloc failed */
                bp->b_flags |= B_ERROR;
                bp->b_error = ENOMEM;
                return REQUEST_ENOMEM;
            rqg->plexno = plexno;

            rqe = &rqg->rqe[0];                             /* point to the element */
            rqe->sdoffset = stripebase / plex->subdisks + blockoffset; /* start offset in this subdisk */
            rqe->useroffset = *diskaddr - diskstart;        /* the offset of the start in the user buffer */
            rqe->datalen = min(diskend - *diskaddr,         /* the amount remaining to transfer */
                plex->stripesize - blockoffset);            /* and the amount left in this stripe */
            rqe->groupoffset = 0;                           /* no groups for striped plexes */
            rqe->buflen = rqe->datalen;                     /* buffer length is data buffer length */
            rqe->sdno = sd->sdno;                           /* put in the subdisk number */
            rqe->driveno = sd->driveno;

            if (sd->state != sd_up) {                       /* *now* we find the sd is down */
                s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
                if (s == REQUEST_DOWN) {                    /* down? */
                    rqe->flags = XFR_BAD_SUBDISK;           /* yup */
                    if (rq->bp->b_flags & B_READ)           /* read request, */
                        return REQUEST_DEGRADED;            /* give up here */
                    /*
                     * If we're writing, don't give up
                     * because of a bad subdisk.  Go through
                     * to the bitter end, but note which
                     * ones we can't access.
                     */
                    status = REQUEST_DEGRADED;              /* can't do it all */
            /*
             * It would seem that having an offset
             * beyond the end of the subdisk is an
             * error, but in fact it can happen if the
             * volume has another plex of different
             * size.  There's a valid question as to why
             * you would want to do this, but currently
             * it's allowed.
             */
            if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */
                rqe->datalen = sd->sectors - rqe->sdoffset; /* truncate */
                if (debug & DEBUG_EOFINFO) {                /* tell on the request */
                        "vinum: EOF on plex %s, sd %s offset %x (user offset %x)\n",
                        "vinum: stripebase %x, stripeoffset %x, blockoffset %x\n",

            if (build_rq_buffer(rqe, plex)) {               /* build the buffer */
                bp->b_flags |= B_ERROR;
                bp->b_error = ENOMEM;
                return REQUEST_ENOMEM;                      /* can't do it */
            *diskaddr += rqe->datalen;                      /* look at the remainder */
            if ((*diskaddr < diskend)                       /* didn't finish the request on this stripe */
                && (*diskaddr < plex->length)) {            /* and there's more to come */
                plex->multiblock++;                         /* count another one */
                if (sdno == plex->subdisks - 1)             /* last subdisk, */
                    plex->multistripe++;                    /* another stripe as well */
        /*
         * RAID5 is complicated enough to have
         * its own function, bre5.
         */
        status = bre5(rq, plexno, diskaddr, diskend);

        log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization);
        status = REQUEST_DOWN;                              /* can't access it */
/*
 * Build up a request structure for reading volumes.
 * This function is not needed for plex reads, since there's
 * no recovery if a plex read can't be satisfied.
 */
build_read_request(struct request *rq,                      /* request */
{                                                           /* index in the volume's plex table */
    daddr_t startaddr;                                      /* offset of previous part of transfer */
    daddr_t diskaddr;                                       /* offset of current part of transfer */
    daddr_t diskend;                                        /* and end offset of transfer */
    int plexno;                                             /* plex index in vinum_conf */
    struct rqgroup *rqg;                                    /* point to the request we're working on */
    struct volume *vol;                                     /* volume in question */
    int recovered = 0;                                      /* set if we recover a read */
    enum requeststatus status = REQUEST_OK;
    int plexmask;                                           /* bit mask of plexes, for recovery */

    bp = rq->bp;                                            /* buffer pointer */
    diskaddr = bp->b_blkno;                                 /* start offset of transfer */
    diskend = diskaddr + (bp->b_bcount / DEV_BSIZE);        /* and end offset of transfer */
    rqg = &rq->rqg[plexindex];                              /* plex request */
    vol = &VOL[rq->volplex.volno];                          /* point to volume */

    while (diskaddr < diskend) {                            /* build up request components */
        startaddr = diskaddr;
        status = bre(rq, vol->plex[plexindex], &diskaddr, diskend); /* build up a request */

        case REQUEST_RECOVERED:
            /*
             * XXX FIXME if we have more than one plex, and we can
             * satisfy the request from another, don't use the
             * recovered request, since it's more expensive.
             */

            /*
             * If we get here, our request is not complete.  Try
             * to fill in the missing parts from another plex.
             * This can happen multiple times in this function,
             * and we reinitialize the plex mask each time, since
             * we could have a hole in our plexes.
             */
        case REQUEST_DOWN:                                  /* can't access the plex */
        case REQUEST_DEGRADED:                              /* can't access the plex */
            plexmask = ((1 << vol->plexes) - 1)             /* all plexes in the volume */
                & ~(1 << plexindex);                        /* except for the one we were looking at */
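            /*
             * Worked example (illustrative values): with
             * vol->plexes = 3 and plexindex = 1,
             * plexmask = 0b111 & ~0b010 = 0b101, i.e. plexes 0 and 2
             * remain candidates for the retry loop below.
             */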
            for (plexno = 0; plexno < vol->plexes; plexno++) {
                if (plexmask == 0)                          /* no plexes left to try */
                    return REQUEST_DOWN;                    /* failed */
                diskaddr = startaddr;                       /* start at the beginning again */
                if (plexmask & (1 << plexno)) {             /* we haven't tried this plex yet */
                    bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */
                    if (diskaddr > startaddr) {             /* we satisfied another part */
                        recovered = 1;                      /* we recovered from the problem */
                        status = REQUEST_OK;                /* don't complain about it */
            if (diskaddr == startaddr)                      /* didn't get any further, */

    vol->recovered_reads += recovered;                      /* adjust our recovery count */
/*
 * Build up a request structure for writes.
 * Return 0 if all subdisks involved in the request are up, 1 if some
 * subdisks are not up, and -1 if the request is at least partially
 * outside the bounds of the subdisks.
 */
build_write_request(struct request *rq)
    daddr_t diskstart;                                      /* offset of current part of transfer */
    daddr_t diskend;                                        /* and end offset of transfer */
    int plexno;                                             /* plex index in vinum_conf */
    struct volume *vol;                                     /* volume in question */
    enum requeststatus status;

    bp = rq->bp;                                            /* buffer pointer */
    vol = &VOL[rq->volplex.volno];                          /* point to volume */
    diskend = bp->b_blkno + (bp->b_bcount / DEV_BSIZE);     /* end offset of transfer */
    status = REQUEST_DOWN;                                  /* assume the worst */
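
    /*
     * enum requeststatus is ordered from best (REQUEST_OK) to worst,
     * so taking the min of the status over all plexes below keeps
     * the best result: the write is considered to have succeeded if
     * any one plex could take it.
     */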
    for (plexno = 0; plexno < vol->plexes; plexno++) {
        diskstart = bp->b_blkno;                            /* start offset of transfer */
        /*
         * Build requests for the plex.
         * We take the best possible result here (min,
         * not max): we're happy if we can write at all.
         */
        status = min(status, bre(rq,
/* Fill in the struct buf part of a request element. */
build_rq_buffer(struct rqelement *rqe, struct plex *plex)
    struct sd *sd;                                          /* point to subdisk */
    struct buf *ubp;                                        /* user (high level) buffer header */

    vol = &VOL[rqe->rqg->rq->volplex.volno];
    sd = &SD[rqe->sdno];                                    /* point to subdisk */
    ubp = rqe->rqg->rq->bp;                                 /* pointer to user buffer header */

    /* Initialize the buf struct */
    bp->b_flags = ubp->b_flags & (B_NOCACHE | B_READ | B_ASYNC); /* copy these flags from user bp */
    bp->b_flags |= B_CALL;                                  /* inform us when it's done */
    BUF_LOCKINIT(bp);                                       /* get a lock for the buffer */
    BUF_LOCK(bp, LK_EXCLUSIVE);                             /* and lock it */
    bp->b_iodone = complete_rqe;                            /* by calling us here */

    /*
     * You'd think that we wouldn't need to even
     * build the request buffer for a dead subdisk,
     * but in some cases we need information like
     * the user buffer address.  Err on the side of
     * generosity and supply what we can.  That
     * obviously doesn't include drive information
     * when the drive is dead.
     */
    if ((rqe->flags & XFR_BAD_SUBDISK) == 0) {              /* subdisk is accessible, */
        bp->b_dev = DRIVE[rqe->driveno].vp->v_rdev;         /* drive device */
        bp->b_vp = DRIVE[rqe->driveno].vp;                  /* drive vnode */

    bp->b_blkno = rqe->sdoffset + sd->driveoffset;          /* start address */
    bp->b_bcount = rqe->buflen << DEV_BSHIFT;               /* number of bytes to transfer */
    bp->b_resid = bp->b_bcount;                             /* and it's still all waiting */
    bp->b_bufsize = bp->b_bcount;                           /* and buffer size */
    bp->b_rcred = FSCRED;                                   /* we have the file system credentials */
    bp->b_wcred = FSCRED;                                   /* we have the file system credentials */
    if (rqe->flags & XFR_MALLOCED) {                        /* this operation requires a malloced buffer */
        bp->b_data = Malloc(bp->b_bcount);                  /* get a buffer to put it in */
        if (bp->b_data == NULL) {                           /* failed */
            abortrequest(rqe->rqg->rq, ENOMEM);
            return REQUEST_ENOMEM;                          /* no memory */
        /*
         * Point directly to user buffer data.  This means
         * that we don't need to do anything when we have
         * finished the transfer.
         */
        bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE;

    /*
     * On a recovery read, we perform an XOR of
     * all blocks to the user buffer.  To make
     * this work, we first clean out the buffer.
     */
    if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK))
        == (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) {         /* bad subdisk of a recovery read */
        int length = rqe->grouplen << DEV_BSHIFT;           /* and count involved */
        char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */

        bzero(data, length);                                /* clean it out */
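        /*
         * Zeroing works because XOR with zero is the identity
         * (x ^ 0 == x): the surviving blocks can then be XORed into
         * this buffer one by one to reconstruct the data of the bad
         * subdisk.
         */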
/*
 * Abort a request: free resources and complete the
 * user request with the specified error.
 */
abortrequest(struct request *rq, int error)
    struct buf *bp = rq->bp;                                /* user buffer */

    bp->b_flags |= B_ERROR;
    freerq(rq);                                             /* free everything we're doing */
    return error;                                           /* and give up */

/*
 * Check that our transfer will cover the
 * complete address space of the user request.
 *
 * Return 1 if it can, otherwise 0.
 */
check_range_covered(struct request *rq)
/* Perform I/O on a subdisk */
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_sdio, (union rqinfou) bp, bp);

    sd = &SD[Sdno(bp->b_dev)];                              /* point to the subdisk */
    drive = &DRIVE[sd->driveno];

    /*
     * We allow access to any kind of subdisk as long as we can expect
     * to get the I/O performed.
     */
    if (sd->state < sd_empty) {                             /* nothing to talk to, */
        bp->b_flags |= B_ERROR;

    sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
        bp->b_flags |= B_ERROR;
        bp->b_error = ENOMEM;

    bzero(sbp, sizeof(struct sdbuf));                       /* start with nothing */
    sbp->b.b_flags = bp->b_flags | B_CALL;                  /* inform us when it's done */
    sbp->b.b_bufsize = bp->b_bufsize;                       /* buffer size */
    sbp->b.b_bcount = bp->b_bcount;                         /* number of bytes to transfer */
    sbp->b.b_resid = bp->b_resid;                           /* and amount waiting */
    sbp->b.b_dev = DRIVE[sd->driveno].vp->v_rdev;           /* device */
    sbp->b.b_data = bp->b_data;                             /* data buffer */
    sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
    sbp->b.b_iodone = sdio_done;                            /* come here on completion */
    BUF_LOCKINIT(&sbp->b);                                  /* get a lock for the buffer */
    BUF_LOCK(&sbp->b, LK_EXCLUSIVE);                        /* and lock it */
    sbp->b.b_vp = DRIVE[sd->driveno].vp;                    /* vnode */
    sbp->bp = bp;                                           /* note the address of the original header */
    sbp->sdno = sd->sdno;                                   /* note for statistics */
    sbp->driveno = sd->driveno;
    endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE;  /* final sector offset */
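    /*
     * Worked example (illustrative values): for a subdisk of 10000
     * sectors, a transfer starting at sector 9990 with b_bcount
     * 8192 bytes (16 sectors) gives endoffset = 10006.  The trim
     * below removes 6 * DEV_BSIZE = 3072 bytes, leaving a 10-sector
     * transfer that ends exactly at the end of the subdisk.
     */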
    if (endoffset > sd->sectors) {                          /* beyond the end */
        sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
        if (sbp->b.b_bcount <= 0) {                         /* nothing to transfer */
            bp->b_resid = bp->b_bcount;                     /* nothing transferred */

    if ((sbp->b.b_flags & B_READ) == 0)                     /* write */
        sbp->b.b_vp->v_numoutput++;                         /* one more output going */

    if (debug & DEBUG_ADDRESSES)
            " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
            sbp->b.b_flags & B_READ ? "Read" : "Write",
            (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
            (int) sbp->b.b_blkno,
    if (debug & DEBUG_NUMOUTPUT)
            " vinumstart sd %d numoutput %ld\n",
            sbp->b.b_vp->v_numoutput);

    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
    BUF_STRATEGY(&sbp->b, 0);
/*
 * Simplified version of bounds_check_with_label.
 * Determine the size of the transfer, and make sure it is
 * within the boundaries of the partition.  Adjust transfer
 * if needed, and signal errors or early completion.
 *
 * Volumes are simpler than disk slices: they only contain
 * one component (though we call them a, b and c to make
 * system utilities happy), and they always take up the
 * complete space of the "partition".
 *
 * I'm still not happy with this: why should the label be
 * protected?  If it weren't so damned difficult to write
 * one in the first place (because it's protected), it wouldn't
 * be a problem.
 */
vinum_bounds_check(struct buf *bp, struct volume *vol)
    int maxsize = vol->size;                                /* size of the partition (sectors) */
    int size = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* size of this request (sectors) */
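
    /*
     * Example: with the usual DEV_BSIZE of 512 (DEV_BSHIFT 9), a
     * b_bcount of 8192 bytes gives (8192 + 511) >> 9 = 16 sectors;
     * rounding up means a partial final sector still counts as a
     * whole sector for the bounds check.
     */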
    /* Would this transfer overwrite the disk label? */
    if (bp->b_blkno <= LABELSECTOR                          /* starts before or at the label */
        && bp->b_blkno + size > LABELSECTOR                 /* and finishes after */
        && (!(vol->flags & VF_RAW))                         /* and it's not raw */
        && major(bp->b_dev) == BDEV_MAJOR                   /* and it's the block device */
        && (bp->b_flags & B_READ) == 0                      /* and it's a write */
        && (!(vol->flags & (VF_WLABEL | VF_LABELLING)))) {  /* and we're not allowed to write the label */
        bp->b_error = EROFS;                                /* read-only */
        bp->b_flags |= B_ERROR;
    if (size == 0)                                          /* no transfer specified, */
        return 0;                                           /* treat as EOF */
    /* beyond partition? */
    if (bp->b_blkno < 0                                     /* negative start */
        || bp->b_blkno + size > maxsize) {                  /* or goes beyond the end of the partition */
        /* if exactly at end of disk, return an EOF */
        if (bp->b_blkno == maxsize) {
            bp->b_resid = bp->b_bcount;
        /* or truncate if part of it fits */
        size = maxsize - bp->b_blkno;
        if (size <= 0) {                                    /* nothing to transfer */
            bp->b_error = EINVAL;
            bp->b_flags |= B_ERROR;
        bp->b_bcount = size << DEV_BSHIFT;
    bp->b_pblkno = bp->b_blkno;
/*
 * Allocate a request group and hook
 * it into the list for rq.
 */
allocrqg(struct request *rq, int elements)
    struct rqgroup *rqg;                                    /* the one we're going to allocate */
    int size = sizeof(struct rqgroup) + elements * sizeof(struct rqelement);
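
    /*
     * The rqelement array lives in the same allocation, directly
     * after the struct rqgroup header, so a single Malloc covers the
     * group and its 'elements' request elements; rqg->rqe[] indexes
     * into that trailing array.
     */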
    rqg = (struct rqgroup *) Malloc(size);
    if (rqg != NULL) {                                      /* malloc OK, */
        if (rq->rqg)                                        /* we already have requests */
            rq->lrqg->next = rqg;                           /* hang it off the end */
        else                                                /* first request */
            rq->rqg = rqg;                                  /* at the start */
        rq->lrqg = rqg;                                     /* this one is the last in the list */

        bzero(rqg, size);                                   /* no old junk */
        rqg->rq = rq;                                       /* point back to the parent request */
        rqg->count = elements;                              /* number of requests in the group */

/*
 * Deallocate a request group out of a chain.  We do
 * this by linear search: the chain is short, this
 * almost never happens, and currently it can only
 * happen to the first member of the chain.
 */
deallocrqg(struct rqgroup *rqg)
    struct rqgroup *rqgc = rqg->rq->rqg;                    /* point to the request chain */

    if (rqg->lock)                                          /* got a lock? */
        unlockrange(rqg->plexno, rqg->lock);                /* yes, free it */
    if (rqgc == rqg)                                        /* we're first in line */
        rqg->rq->rqg = rqg->next;                           /* unhook ourselves */
    while ((rqgc->next != NULL)                             /* find the group */
        && (rqgc->next != rqg))
    if (rqgc->next == NULL)
            "vinum deallocrqg: rqg %p not found in request %p\n",
    rqgc->next = rqg->next;                                 /* make the chain jump over us */