/* vinuminterrupt.c: bottom half of the driver */

/*-
 * Copyright (c) 1997, 1998, 1999
 *      Nan Yang Computer Services Limited.  All rights reserved.
 *
 *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
 *
 *  Written by Greg Lehey
 *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Nan Yang Computer
 *      Services Limited.
 * 4. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinuminterrupt.c,v 1.14 2001/05/23 23:03:37 grog Exp grog $
 * $FreeBSD$
 */

#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>
#include <sys/resourcevar.h>

void complete_raid5_write(struct rqelement *);
void complete_rqe(struct buf *bp);
void sdio_done(struct buf *bp);

/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
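/*
 * A sketch of why the cast below is safe: request.h lays out
 * struct rqelement with the buffer header as its first member,
 * approximately
 *
 *      struct rqelement {
 *          struct buf b;            buffer header, must come first
 *          struct rqgroup *rqg;     parent request group
 *          ...
 *      };
 *
 * so the struct buf pointer handed to b_iodone is also a valid
 * pointer to the enclosing struct rqelement.
 */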
void
complete_rqe(struct buf *bp)
{
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct buf *ubp;                                        /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity;                                          /* for error messages */

    rqe = (struct rqelement *) bp;                          /* point to the element that completed */
    rqg = rqe->rqg;                                         /* and the request group */
    rq = rqg->rq;                                           /* and the complete request */
    ubp = rq->bp;                                           /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;                                        /* one less outstanding I/O on this drive */
    vinum_conf.active--;                                    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))            /* we were at the drive limit */
    ||(vinum_conf.active == VINUM_MAXACTIVE))               /* or the global limit */
        wakeup(&launch_requests);                           /* let another one at it */
    if ((bp->b_io.bio_flags & BIO_ERROR) != 0) {            /* transfer in error */
        gravity = "";
        sd = &SD[rqe->sdno];

        if (bp->b_error != 0)                               /* did it return a number? */
            rq->error = bp->b_error;                        /* yes, put it in. */
        else if (rq->error == 0)                            /* no: do we have one already? */
            rq->error = EIO;                                /* no: catchall "I/O error" */
        sd->lasterror = rq->error;
        if (bp->b_iocmd == BIO_READ) {                      /* read operation */
            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
                gravity = " fatal";
                set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
            }
            log(LOG_ERR,
                "%s:%s read error, block %lld for %ld bytes\n",
                sd->name,
                gravity,
                (long long) bp->b_blkno,
                bp->b_bcount);
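            /*
             * An illustrative rendering of the message above
             * (the subdisk name "vol.p0.s0" is hypothetical):
             *
             *     vol.p0.s0: fatal read error, block 2048 for 16384 bytes
             */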
        } else {                                            /* write operation */
            if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
                gravity = " fatal";
                set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
            }
            log(LOG_ERR,
                "%s:%s write error, block %lld for %ld bytes\n",
                sd->name,
                gravity,
                (long long) bp->b_blkno,
                bp->b_bcount);
        }
        log(LOG_ERR,
            "%s: user buffer block %lld for %ld bytes\n",
            sd->name,
            (long long) ubp->b_blkno,
            ubp->b_bcount);
        if (rq->error == ENXIO) {                           /* the drive's down too */
            log(LOG_ERR,
                "%s: fatal drive I/O error, block %lld for %ld bytes\n",
                DRIVE[rqe->driveno].label.name,
                (long long) bp->b_blkno,
                bp->b_bcount);
            DRIVE[rqe->driveno].lasterror = rq->error;
            set_drive_state(rqe->driveno,                   /* take the drive down */
                drive_down,
                setstate_force);
        }
    }
    /* Now update the statistics */
    if (bp->b_iocmd == BIO_READ) {                          /* read operation */
        DRIVE[rqe->driveno].reads++;
        DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
        SD[rqe->sdno].reads++;
        SD[rqe->sdno].bytes_read += bp->b_bcount;
        PLEX[rqe->rqg->plexno].reads++;
        PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
        if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
            VOL[PLEX[rqe->rqg->plexno].volno].reads++;
            VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
        }
    } else {                                                /* write operation */
        DRIVE[rqe->driveno].writes++;
        DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
        SD[rqe->sdno].writes++;
        SD[rqe->sdno].bytes_written += bp->b_bcount;
        PLEX[rqe->rqg->plexno].writes++;
        PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
        if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
            VOL[PLEX[rqe->rqg->plexno].volno].writes++;
            VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
        }
    }
    if (rqg->flags & XFR_RECOVERY_READ) {                   /* recovery read, */
        int *sdata;                                         /* source */
        int *data;                                          /* and group data */
        int length;                                         /* and count involved */
        int count;                                          /* loop counter */
        struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */

        /* XOR destination is the user data */
        sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* data we just read */
        data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
        length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

        for (count = 0; count < length; count++)
            data[count] ^= sdata[count];
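
        /*
         * A sketch of the arithmetic: with parity defined as
         *
         *     p = d0 ^ d1 ^ ... ^ dn
         *
         * XORing every surviving block of the stripe (parity
         * included) into the bad subdisk's buffer leaves
         * exactly the missing block, e.g. for lost d1:
         *
         *     d1 = p ^ d0 ^ d2 ^ ... ^ dn
         *
         * Each completing request element contributes one term;
         * when the last one arrives, the reconstruction is done.
         */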

        /*
         * On a normal read we read directly into the
         * user buffer.  That doesn't work if we're also
         * doing a recovery, so in that case we have to
         * copy the data across.
         */
        if (rqe->flags & XFR_NORMAL_READ) {                 /* normal read as well, */
            char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
            char *dst;

            dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
            length = rqe->datalen << DEV_BSHIFT;            /* and count involved */
            bcopy(src, dst, length);                        /* move it */
        }
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID-4/5 group write operation */
    &&(rqg->active == 1))                                   /* and this is the last active request */
        complete_raid5_write(rqe);
    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--;                                          /* this request now finished */
    if (rqg->active == 0) {                                 /* request group finished, */
        rq->active--;                                       /* one less */
        if (rqg->lock) {                                    /* got a lock? */
            unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
            rqg->lock = 0;
        }
    }
    if (rq->active == 0) {                                  /* request finished, */
#ifdef VINUMDEBUG
        if (debug & DEBUG_RESID) {
            if (ubp->b_resid != 0)                          /* still something to transfer? */
                Debugger("resid");
        }
#endif

        if (rq->error) {                                    /* did we have an error? */
            if (rq->isplex) {                               /* plex operation, */
                ubp->b_io.bio_flags |= BIO_ERROR;           /* yes, propagate to user */
                ubp->b_error = rq->error;
            } else                                          /* volume operation: try to recover */
                queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
        } else {
            ubp->b_resid = 0;                               /* completed our transfer */
            if (rq->isplex == 0)                            /* volume request, */
                VOL[rq->volplex.volno].active--;            /* another request finished */
            if (rq->flags & XFR_COPYBUF) {
                Free(ubp->b_data);
                ubp->b_data = rq->save_data;
            }
            bufdone(ubp);                                   /* top level buffer completed */
            freerq(rq);                                     /* return the request storage */
        }
    }
}

/* Free a request block and anything hanging off it */
void
freerq(struct request *rq)
{
    struct rqgroup *rqg;
    struct rqgroup *nrqg;                                   /* next in chain */
    int rqno;

    for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) {          /* through the whole request chain */
        if (rqg->lock)                                      /* got a lock? */
            unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
        for (rqno = 0; rqno < rqg->count; rqno++) {
            if ((rqg->rqe[rqno].flags & XFR_MALLOCED)       /* data buffer was malloced, */
            &&rqg->rqe[rqno].b.b_data)                      /* and the allocation succeeded */
                Free(rqg->rqe[rqno].b.b_data);              /* free it */
            if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) {     /* locked this buffer, */
                BUF_UNLOCK(&rqg->rqe[rqno].b);              /* unlock it again */
                BUF_LOCKFREE(&rqg->rqe[rqno].b);
            }
        }
        nrqg = rqg->next;                                   /* note the next one */
        Free(rqg);                                          /* and free this one */
    }
    Free(rq);                                               /* free the request itself */
}

/* I/O on subdisk completed */
void
sdio_done(struct buf *bp)
{
    struct sdbuf *sbp;

    sbp = (struct sdbuf *) bp;
    if (sbp->b.b_io.bio_flags & BIO_ERROR) {                /* had an error */
        sbp->bp->b_io.bio_flags |= BIO_ERROR;               /* propagate upwards */
        sbp->bp->b_error = sbp->b.b_error;
    }
#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
#endif
    sbp->bp->b_resid = sbp->b.b_resid;                      /* copy the resid field */
    /* Now update the statistics */
    if (bp->b_iocmd == BIO_READ) {                          /* read operation */
        DRIVE[sbp->driveno].reads++;
        DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
        SD[sbp->sdno].reads++;
        SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
    } else {                                                /* write operation */
        DRIVE[sbp->driveno].writes++;
        DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
        SD[sbp->sdno].writes++;
        SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
    }
    bufdone(sbp->bp);                                       /* complete the caller's I/O */
    BUF_UNLOCK(&sbp->b);
    BUF_LOCKFREE(&sbp->b);
    Free(sbp);
}

/* Start the second phase of a RAID-4 or RAID-5 group write operation. */
void
complete_raid5_write(struct rqelement *rqe)
{
    int *sdata;                                             /* source */
    int *pdata;                                             /* and parity block data */
    int length;                                             /* and count involved */
    int count;                                              /* loop counter */
    int rqno;                                               /* request index */
    int rqoffset;                                           /* offset of request data from parity data */
    struct buf *ubp;                                        /* user buffer header */
    struct request *rq;                                     /* pointer to our request */
    struct rqgroup *rqg;                                    /* and to the request group */
    struct rqelement *prqe;                                 /* pointer to the parity block */
    struct drive *drive;                                    /* drive to access */

    rqg = rqe->rqg;                                         /* point to our request group */
    rq = rqg->rq;                                           /* and to our request */
    ubp = rq->bp;                                           /* user's buffer header */
    prqe = &rqg->rqe[0];                                    /* point to the parity block */

    /*
     * If we get to this function, we have normal or
     * degraded writes, or a combination of both.  We do
     * the same thing in each case: we perform an
     * exclusive or to the parity block.  The only
     * difference is the origin of the data and the
     * address range.
     */
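    /*
     * In equation form, a sketch of the two cases handled below:
     *
     *   degraded write:  p = d1 ^ d2 ^ ... ^ dn
     *                    (the parity block is zeroed and rebuilt
     *                    by folding in the group's data blocks)
     *
     *   normal write:    p_new = p_old ^ d_old ^ d_new
     *                    (read-modify-write: the old data is
     *                    XORed out of the parity and the new
     *                    data XORed in)
     */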
    if (rqe->flags & XFR_DEGRADED_WRITE) {                  /* do the degraded write stuff */
        pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
        bzero(pdata, prqe->grouplen << DEV_BSHIFT);         /* start with nothing in the parity block */

        /* Now get what data we need from each block */
        for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
            rqe = &rqg->rqe[rqno];                          /* this request */
            sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
            length = rqe->grouplen << (DEV_BSHIFT - 2);     /* and count involved */
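            /*
             * (The shift assumes sizeof(int) == 4: grouplen <<
             * (DEV_BSHIFT - 2) is then equivalent to the
             * grouplen * (DEV_BSIZE / sizeof(int)) form used
             * elsewhere in this file.)
             */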

            /*
             * Add the data block to the parity block.  Before
             * we started the request, we zeroed the parity
             * block, so the result of adding all the other
             * blocks and the block we want to write will be
             * the correct parity block.
             */
            for (count = 0; count < length; count++)
                pdata[count] ^= sdata[count];
            if ((rqe->flags & XFR_MALLOCED)                 /* the buffer was malloced, */
            &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {     /* and we have no normal write, */
                Free(rqe->b.b_data);                        /* free it now */
                rqe->flags &= ~XFR_MALLOCED;
            }
        }
    }
    if (rqg->flags & XFR_NORMAL_WRITE) {                    /* do normal write stuff */
        /* Get what data we need from each block */
        for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
            rqe = &rqg->rqe[rqno];                          /* this request */
            if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
                == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
                sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
                rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
                pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
                length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

                /*
                 * "Remove" the old data block
                 * from the parity block.
                 */
                if ((pdata < ((int *) prqe->b.b_data))
                    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
                    || (sdata < ((int *) rqe->b.b_data))
                    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
                    panic("complete_raid5_write: bounds overflow");
                for (count = 0; count < length; count++)
                    pdata[count] ^= sdata[count];

                /* "Add" the new data block. */
                sdata = (int *) (&ubp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
                if ((sdata < ((int *) ubp->b_data))
                    || (&sdata[length] > ((int *) (ubp->b_data + ubp->b_bcount))))
                    panic("complete_raid5_write: bounds overflow");
                for (count = 0; count < length; count++)
                    pdata[count] ^= sdata[count];

                /* Free the malloced buffer */
                if (rqe->flags & XFR_MALLOCED) {            /* the buffer was malloced, */
                    Free(rqe->b.b_data);                    /* free it */
                    rqe->flags &= ~XFR_MALLOCED;
                } else
                    panic("complete_raid5_write: malloc conflict");

                if ((rqe->b.b_iocmd == BIO_READ)            /* this was a read */
                &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
                    rqe->b.b_flags &= ~B_DONE;              /* start a new request */
                    rqe->b.b_iocmd = BIO_WRITE;             /* we're writing now */
                    rqe->b.b_iodone = complete_rqe;         /* call us here when done */
                    rqe->flags &= ~XFR_PARITYOP;            /* reset flags that brought us here */
                    rqe->b.b_data = &ubp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
                    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
                    rqe->b.b_bufsize = rqe->b.b_bcount;     /* don't claim more */
                    rqe->b.b_resid = rqe->b.b_bcount;       /* nothing transferred */
                    rqe->b.b_blkno += rqe->dataoffset;      /* point to the correct block */
                    rqg->active++;                          /* another active request */
                    drive = &DRIVE[rqe->driveno];           /* drive to access */

                    /* We can't sleep here, so we just increment the counters. */
                    drive->active++;
                    if (drive->active >= drive->maxactive)
                        drive->maxactive = drive->active;
                    vinum_conf.active++;
                    if (vinum_conf.active >= vinum_conf.maxactive)
                        vinum_conf.maxactive = vinum_conf.active;
#ifdef VINUMDEBUG
                    if (debug & DEBUG_ADDRESSES)
                        log(LOG_DEBUG,
                            "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
                            rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
                            major(rqe->b.b_dev),
                            minor(rqe->b.b_dev),
                            rqe->sdno,
                            (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
                            (long long) rqe->b.b_blkno,
                            rqe->b.b_bcount);
                    if (debug & DEBUG_LASTREQS)
                        logrq(loginfo_raid5_data, (union rqinfou) rqe, ubp);
#endif
                    DEV_STRATEGY(&rqe->b, 0);
                }
            }
        }
    }
    /* Finally, write the parity block */
    rqe = &rqg->rqe[0];
    rqe->b.b_flags &= ~B_DONE;                              /* we're not done */
    rqe->b.b_iocmd = BIO_WRITE;                             /* we're writing now */
    rqe->b.b_iodone = complete_rqe;                         /* call us here when done */
    rqg->flags &= ~XFR_PARITYOP;                            /* reset flags that brought us here */
    rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;            /* length to write */
    rqe->b.b_bufsize = rqe->b.b_bcount;                     /* don't claim we have more */
    rqe->b.b_resid = rqe->b.b_bcount;                       /* nothing transferred */
    rqg->active++;                                          /* another active request */
    drive = &DRIVE[rqe->driveno];                           /* drive to access */

    /* We can't sleep here, so we just increment the counters. */
    drive->active++;
    if (drive->active >= drive->maxactive)
        drive->maxactive = drive->active;
    vinum_conf.active++;
    if (vinum_conf.active >= vinum_conf.maxactive)
        vinum_conf.maxactive = vinum_conf.active;

#ifdef VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
        log(LOG_DEBUG,
            "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%llx, length %ld\n",
            rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
            major(rqe->b.b_dev),
            minor(rqe->b.b_dev),
            rqe->sdno,
            (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
            (long long) rqe->b.b_blkno,
            rqe->b.b_bcount);
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubp);
#endif
    DEV_STRATEGY(&rqe->b, 0);
}

/* Local Variables: */
/* fill-column: 50 */
/* End: */