sys/dev/vinum/vinuminterrupt.c

   1 /* vinuminterrupt.c: bottom half of the driver */
   2
   3 /*-
   4  * Copyright (c) 1997, 1998, 1999
   5  *      Nan Yang Computer Services Limited.  All rights reserved.
   6  *
   7  *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
   8  *
   9  *  Written by Greg Lehey
  10  *
  11  *  This software is distributed under the so-called ``Berkeley
  12  *  License'':
  13  *
  14  * Redistribution and use in source and binary forms, with or without
  15  * modification, are permitted provided that the following conditions
  16  * are met:
  17  * 1. Redistributions of source code must retain the above copyright
  18  *    notice, this list of conditions and the following disclaimer.
  19  * 2. Redistributions in binary form must reproduce the above copyright
  20  *    notice, this list of conditions and the following disclaimer in the
  21  *    documentation and/or other materials provided with the distribution.
  22  * 3. All advertising materials mentioning features or use of this software
  23  *    must display the following acknowledgement:
  24  *      This product includes software developed by Nan Yang Computer
  25  *      Services Limited.
  26  * 4. Neither the name of the Company nor the names of its contributors
  27  *    may be used to endorse or promote products derived from this software
  28  *    without specific prior written permission.
  29  *
  30  * This software is provided ``as is'', and any express or implied
  31  * warranties, including, but not limited to, the implied warranties of
  32  * merchantability and fitness for a particular purpose are disclaimed.
  33  * In no event shall the company or contributors be liable for any
  34  * direct, indirect, incidental, special, exemplary, or consequential
  35  * damages (including, but not limited to, procurement of substitute
  36  * goods or services; loss of use, data, or profits; or business
  37  * interruption) however caused and on any theory of liability, whether
  38  * in contract, strict liability, or tort (including negligence or
  39  * otherwise) arising in any way out of the use of this software, even if
  40  * advised of the possibility of such damage.
  41  *
  42  * $Id$
  43  * $FreeBSD$
  44  */
  45
  46 #include <dev/vinum/vinumhdr.h>
  47 #include <dev/vinum/request.h>
  48 #include <sys/resourcevar.h>
  49
  50 void complete_raid5_write(struct rqelement *);
  51 void complete_rqe(struct buf *bp);
  52 void sdio_done(struct buf *bp);
  53
  54 /*
  55  * Take a completed buffer, transfer the data back if
  56  * it's a read, and complete the high-level request
  57  * if this is the last subrequest.
  58  *
  59  * The bp parameter is in fact a struct rqelement, which
  60  * includes a couple of extras at the end.
  61  */
  62 void
  63 complete_rqe(struct buf *bp)
  64 {
  65     struct rqelement *rqe;
  66     struct request *rq;
  67     struct rqgroup *rqg;
  68     struct buf *ubp;                                        /* user buffer */
  69     struct drive *drive;
  70
  71     rqe = (struct rqelement *) bp;                          /* point to the element element that completed */
  72     rqg = rqe->rqg;                                         /* and the request group */
  73     rq = rqg->rq;                                           /* and the complete request */
  74     ubp = rq->bp;                                           /* user buffer */
  75
  76 #ifdef VINUMDEBUG
  77     if (debug & DEBUG_LASTREQS)
  78         logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
  79 #endif
  80     drive = &DRIVE[rqe->driveno];
  81     drive->active--;                                        /* one less outstanding I/O on this drive */
  82     vinum_conf.active--;                                    /* one less outstanding I/O globally */
  83     if ((drive->active == (DRIVE_MAXACTIVE - 1))            /* we were at the drive limit */
  84     ||(vinum_conf.active == VINUM_MAXACTIVE))               /* or the global limit */
  85         wakeup(&launch_requests);                           /* let another one at it */
  86     if ((bp->b_flags & B_ERROR) != 0) {                     /* transfer in error */
  87         if (bp->b_error != 0)                               /* did it return a number? */
  88             rq->error = bp->b_error;                        /* yes, put it in. */
  89         else if (rq->error == 0)                            /* no: do we have one already? */
  90             rq->error = EIO;                                /* no: catchall "I/O error" */
  91         SD[rqe->sdno].lasterror = rq->error;
  92         if (bp->b_flags & B_READ) {
  93             log(LOG_ERR, "%s: fatal read I/O error\n", SD[rqe->sdno].name);
  94             set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
  95         } else {                                            /* write operation */
  96             log(LOG_ERR, "%s: fatal write I/O error\n", SD[rqe->sdno].name);
  97             set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
  98         }
  99         if (rq->error == ENXIO) {                           /* the drive's down too */
 100             log(LOG_ERR, "%s: fatal drive I/O error\n", DRIVE[rqe->driveno].label.name);
 101             DRIVE[rqe->driveno].lasterror = rq->error;
 102             set_drive_state(rqe->driveno,                   /* take the drive down */
 103                 drive_down,
 104                 setstate_force);
 105         }
 106     }
 107     /* Now update the statistics */
 108     if (bp->b_flags & B_READ) {                             /* read operation */
 109         DRIVE[rqe->driveno].reads++;
 110         DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
 111         SD[rqe->sdno].reads++;
 112         SD[rqe->sdno].bytes_read += bp->b_bcount;
 113         PLEX[rqe->rqg->plexno].reads++;
 114         PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
 115     } else {                                                /* write operation */
 116         DRIVE[rqe->driveno].writes++;
 117         DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
 118         SD[rqe->sdno].writes++;
 119         SD[rqe->sdno].bytes_written += bp->b_bcount;
 120         PLEX[rqe->rqg->plexno].writes++;
 121         PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
 122     }
 123     rqg->active--;                                          /* one less request active */
 124     if (rqg->flags & XFR_RECOVERY_READ) {                   /* recovery read, */
 125         int *sdata;                                         /* source */
 126         int *data;                                          /* and group data */
 127         int length;                                         /* and count involved */
 128         int count;                                          /* loop counter */
 129         struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */
 130
 131         /* XOR destination is the user data */
 132         sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
 133         data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
 134         length = urqe->grouplen << (DEV_BSHIFT - 2);        /* and count involved */
 135
 136         for (count = 0; count < length; count++)
 137             data[count] ^= sdata[count];
 138
 139         /*
 140          * In a normal read, we will normally read directly
 141          * into the user buffer.  This doesn't work if
 142          * we're also doing a recovery, so we have to
 143          * copy it
 144          */
 145         if (rqe->flags & XFR_NORMAL_READ) {                 /* normal read as well, */
 146             char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
 147             char *dst;
 148
 149             dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
 150             length = rqe->datalen << DEV_BSHIFT;            /* and count involved */
 151             bcopy(src, dst, length);                        /* move it */
 152         }
 153     } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 5 group write operation  */
 154     &&(rqg->active == 0))                                   /* and we've finished phase 1 */
 155         complete_raid5_write(rqe);
 156     if (rqg->active == 0)                                   /* request group finished, */
 157         rq->active--;                                       /* one less */
 158     if (rq->active == 0) {                                  /* request finished, */
 159 #if VINUMDEBUG
 160         if (debug & DEBUG_RESID) {
 161             if (ubp->b_resid != 0)                          /* still something to transfer? */
 162                 Debugger("resid");
 163         }
 164 #endif
 165
 166         if (rq->error) {                                    /* did we have an error? */
 167             if (rq->isplex) {                               /* plex operation, */
 168                 ubp->b_flags |= B_ERROR;                    /* yes, propagate to user */
 169                 ubp->b_error = rq->error;
 170             } else                                          /* try to recover */
 171                 queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
 172         } else {
 173             ubp->b_resid = 0;                               /* completed our transfer */
 174             if (rq->isplex == 0)                            /* volume request, */
 175                 VOL[rq->volplex.volno].active--;            /* another request finished */
 176             biodone(ubp);                                   /* top level buffer completed */
 177             freerq(rq);                                     /* return the request storage */
 178         }
 179     }
 180 }
 181
 182 /* Free a request block and anything hanging off it */
 183 void
 184 freerq(struct request *rq)
 185 {
 186     struct rqgroup *rqg;
 187     struct rqgroup *nrqg;                                   /* next in chain */
 188     int rqno;
 189
 190     for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) {          /* through the whole request chain */
 191         if (rqg->lock)                                      /* got a lock? */
 192             unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
 193         for (rqno = 0; rqno < rqg->count; rqno++)
 194             if ((rqg->rqe[rqno].flags & XFR_MALLOCED)       /* data buffer was malloced, */
 195             &&rqg->rqe[rqno].b.b_data)                      /* and the allocation succeeded */
 196                 Free(rqg->rqe[rqno].b.b_data);              /* free it */
 197         nrqg = rqg->next;                                   /* note the next one */
 198         Free(rqg);                                          /* and free this one */
 199     }
 200     Free(rq);                                               /* free the request itself */
 201 }
 202
 203 /* I/O on subdisk completed */
 204 void
 205 sdio_done(struct buf *bp)
 206 {
 207     struct sdbuf *sbp;
 208
 209     sbp = (struct sdbuf *) bp;
 210     if (sbp->b.b_flags & B_ERROR) {                         /* had an error */
 211         sbp->bp->b_flags |= B_ERROR;                        /* propagate upwards */
 212         sbp->bp->b_error = sbp->b.b_error;
 213     }
 214 #ifdef VINUMDEBUG
 215     if (debug & DEBUG_LASTREQS)
 216         logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
 217 #endif
 218     sbp->bp->b_resid = sbp->b.b_resid;                      /* copy the resid field */
 219     /* Now update the statistics */
 220     if (bp->b_flags & B_READ) {                             /* read operation */
 221         DRIVE[sbp->driveno].reads++;
 222         DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
 223         SD[sbp->sdno].reads++;
 224         SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
 225     } else {                                                /* write operation */
 226         DRIVE[sbp->driveno].writes++;
 227         DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
 228         SD[sbp->sdno].writes++;
 229         SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
 230     }
 231     biodone(sbp->bp);                                       /* complete the caller's I/O */
 232     Free(sbp);
 233 }
 234
 235 /* Start the second phase of a RAID5 group write operation. */
 236 void
 237 complete_raid5_write(struct rqelement *rqe)
 238 {
 239     int *sdata;                                             /* source */
 240     int *pdata;                                             /* and parity block data */
 241     int length;                                             /* and count involved */
 242     int count;                                              /* loop counter */
 243     int rqno;                                               /* request index */
 244     int rqoffset;                                           /* offset of request data from parity data */
 245     struct buf *bp;                                         /* user buffer header */
 246     struct request *rq;                                     /* pointer to our request */
 247     struct rqgroup *rqg;                                    /* and to the request group */
 248     struct rqelement *prqe;                                 /* point to the parity block */
 249     struct drive *drive;                                    /* drive to access */
 250
 251     rqg = rqe->rqg;                                         /* and to our request group */
 252     rq = rqg->rq;                                           /* point to our request */
 253     bp = rq->bp;                                            /* user's buffer header */
 254     prqe = &rqg->rqe[0];                                    /* point to the parity block */
 255
 256     /*
 257      * If we get to this function, we have normal or
 258      * degraded writes, or a combination of both.  We do
 259      * the same thing in each case: we perform an
 260      * exclusive or to the parity block.  The only
 261      * difference is the origin of the data and the
 262      * address range.
 263      */
 264
 265     if (rqe->flags & XFR_DEGRADED_WRITE) {                  /* do the degraded write stuff */
 266         pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
 267         bzero(pdata, prqe->grouplen << DEV_BSHIFT);         /* start with nothing in the parity block */
 268
 269         /* Now get what data we need from each block */
 270         for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
 271             /*
 272              * This can do with improvement.  If we're doing
 273              * both a degraded and a normal write, we don't
 274              * need to xor (nor to read) the part of the block
 275              * that we're going to overwrite.  FIXME XXX
 276              */
 277             rqe = &rqg->rqe[rqno];                          /* this request */
 278             sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
 279             length = rqe->grouplen << (DEV_BSHIFT - 2);     /* and count involved */
 280
 281             /*
 282              * add the data block to the parity block.  Before
 283              * we started the request, we zeroed the parity
 284              * block, so the result of adding all the other
 285              * blocks and the block we want to write will be
 286              * the correct parity block.
 287              */
 288             for (count = 0; count < length; count++)
 289                 pdata[count] ^= sdata[count];
 290             if ((rqe->flags & XFR_MALLOCED)                 /* the buffer was malloced, */
 291             &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {     /* and we have no normal write, */
 292                 Free(rqe->b.b_data);                        /* free it now */
 293                 rqe->flags &= ~XFR_MALLOCED;
 294             }
 295         }
 296     }
 297     if (rqg->flags & XFR_NORMAL_WRITE) {                    /* do normal write stuff */
 298         /* Get what data we need from each block */
 299         for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
 300             rqe = &rqg->rqe[rqno];                          /* this request */
 301             if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
 302                 == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
 303                 sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
 304                 rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
 305                 pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
 306                 length = rqe->datalen << (DEV_BSHIFT - 2);  /* and count involved */
 307                 /*
 308                  * "remove" the old data block
 309                  * from the parity block
 310                  */
 311                 if ((pdata < ((int *) prqe->b.b_data))
 312                     || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
 313                     || (sdata < ((int *) rqe->b.b_data))
 314                     || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
 315                     panic("complete_raid5_write: bounds overflow");
 316                 for (count = 0; count < length; count++)
 317                     pdata[count] ^= sdata[count];
 318
 319                 /* "add" the new data block */
 320                 sdata = (int *) (&bp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
 321                 if ((sdata < ((int *) bp->b_data))
 322                     || (&sdata[length] > ((int *) (bp->b_data + bp->b_bcount))))
 323                     panic("complete_raid5_write: bounds overflow");
 324                 for (count = 0; count < length; count++)
 325                     pdata[count] ^= sdata[count];
 326
 327                 /* Free the malloced buffer */
 328                 if (rqe->flags & XFR_MALLOCED) {            /* the buffer was malloced, */
 329                     Free(rqe->b.b_data);                    /* free it */
 330                     rqe->flags &= ~XFR_MALLOCED;
 331                 } else
 332                     panic("complete_raid5_write: malloc conflict");
 333
 334                 if ((rqe->b.b_flags & B_READ)               /* this was a read */
 335                 &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
 336                     rqe->b.b_flags &= ~(B_READ | B_DONE);   /* we're writing now */
 337                     rqe->b.b_flags |= B_CALL;               /* call us when you're done */
 338                     rqe->b.b_iodone = complete_rqe;         /* by calling us here */
 339                     rqe->flags &= ~XFR_PARITYOP;            /* reset flags that brought us here */
 340                     rqe->b.b_data = &bp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
 341                     rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
 342                     rqe->b.b_bufsize = rqe->b.b_bcount;     /* don't claim more */
 343                     rqe->b.b_resid = rqe->b.b_bcount;       /* nothing transferred */
 344                     rqe->b.b_blkno += rqe->dataoffset;      /* point to the correct block */
 345                     rqg->active++;                          /* another active request */
 346                     rqe->b.b_vp->v_numoutput++;             /* one more output going */
 347                     drive = &DRIVE[rqe->driveno];           /* drive to access */
 348
 349                                                             /* We can't sleep here, so we just increment the counters. */
 350                     drive->active++;
 351                     if (drive->active >= drive->maxactive)
 352                         drive->maxactive = drive->active;
 353                     vinum_conf.active++;
 354                     if (vinum_conf.active >= vinum_conf.maxactive)
 355                         vinum_conf.maxactive = vinum_conf.active;
 356 #if VINUMDEBUG
 357                     if (debug & DEBUG_ADDRESSES)
 358                         log(LOG_DEBUG,
 359                             "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
 360                             rqe->b.b_flags & B_READ ? "Read" : "Write",
 361                             major(rqe->b.b_dev),
 362                             minor(rqe->b.b_dev),
 363                             rqe->sdno,
 364                             (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
 365                             rqe->b.b_blkno,
 366                             rqe->b.b_bcount);
 367                     if (debug & DEBUG_NUMOUTPUT)
 368                         log(LOG_DEBUG,
 369                             "  raid5.2 sd %d numoutput %ld\n",
 370                             rqe->sdno,
 371                             rqe->b.b_vp->v_numoutput);
 372                     if (debug & DEBUG_LASTREQS)
 373                         logrq(loginfo_raid5_data, (union rqinfou) rqe, bp);
 374 #endif
 375                     BUF_STRATEGY(&rqe->b, 0);
 376                 }
 377             }
 378         }
 379     }
 380     /* Finally, write the parity block */
 381     rqe = &rqg->rqe[0];
 382     rqe->b.b_flags &= ~(B_READ | B_DONE);                   /* we're writing now */
 383     rqe->b.b_flags |= B_CALL;                               /* call us when you're done */
 384     rqe->b.b_iodone = complete_rqe;                         /* by calling us here */
 385     rqg->flags &= ~XFR_PARITYOP;                            /* reset flags that brought us here */
 386     rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;            /* length to write */
 387     rqe->b.b_bufsize = rqe->b.b_bcount;                     /* don't claim we have more */
 388     rqe->b.b_resid = rqe->b.b_bcount;                       /* nothing transferred */
 389     rqg->active++;                                          /* another active request */
 390     rqe->b.b_vp->v_numoutput++;                             /* one more output going */
 391     drive = &DRIVE[rqe->driveno];                           /* drive to access */
 392
 393     /* We can't sleep here, so we just increment the counters. */
 394     drive->active++;
 395     if (drive->active >= drive->maxactive)
 396         drive->maxactive = drive->active;
 397     vinum_conf.active++;
 398     if (vinum_conf.active >= vinum_conf.maxactive)
 399         vinum_conf.maxactive = vinum_conf.active;
 400
 401 #if VINUMDEBUG
 402     if (debug & DEBUG_ADDRESSES)
 403         log(LOG_DEBUG,
 404             "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
 405             rqe->b.b_flags & B_READ ? "Read" : "Write",
 406             major(rqe->b.b_dev),
 407             minor(rqe->b.b_dev),
 408             rqe->sdno,
 409             (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
 410             rqe->b.b_blkno,
 411             rqe->b.b_bcount);
 412     if (debug & DEBUG_NUMOUTPUT)
 413         log(LOG_DEBUG,
 414             "  raid5.3 sd %d numoutput %ld\n",
 415             rqe->sdno,
 416             rqe->b.b_vp->v_numoutput);
 417     if (debug & DEBUG_LASTREQS)
 418         logrq(loginfo_raid5_parity, (union rqinfou) rqe, bp);
 419 #endif
 420     BUF_STRATEGY(&rqe->b, 0);
 421 }