/* vinuminterrupt.c: bottom half of the driver */

/*-
 * Copyright (c) 1997, 1998, 1999
 *      Nan Yang Computer Services Limited.  All rights reserved.
 *
 *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
 *
 *  Written by Greg Lehey
 *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Nan Yang Computer
 *      Services Limited.
 * 4. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id$
 * $FreeBSD$
 */

#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>
#include <sys/resourcevar.h>

void complete_raid5_write(struct rqelement *);
void complete_rqe(struct buf *bp);
void sdio_done(struct buf *bp);

/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
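/*
 * A sketch of the layout that the cast below relies on; the
 * authoritative definition lives in dev/vinum/request.h, and
 * only the members used here are shown:
 *
 *  struct rqelement {
 *      struct buf b;        must come first, so the struct buf *
 *                           handed to us can be cast back to the
 *                           enclosing rqelement
 *      struct rqgroup *rqg; back pointer to the request group
 *      ...                  per-transfer bookkeeping
 *  };
 */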
void
complete_rqe(struct buf *bp)
{
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct buf *ubp;                                        /* user buffer */
    struct drive *drive;

    rqe = (struct rqelement *) bp;                          /* point to the element that completed */
    rqg = rqe->rqg;                                         /* and the request group */
    rq = rqg->rq;                                           /* and the complete request */
    ubp = rq->bp;                                           /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;                                        /* one less outstanding I/O on this drive */
    vinum_conf.active--;                                    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))            /* we were at the drive limit */
    ||(vinum_conf.active == VINUM_MAXACTIVE))               /* or the global limit */
        wakeup(&launch_requests);                           /* let another one at it */
    if ((bp->b_flags & B_ERROR) != 0) {                     /* transfer in error */
        if (bp->b_error != 0)                               /* did it return a number? */
            rq->error = bp->b_error;                        /* yes, put it in. */
        else if (rq->error == 0)                            /* no: do we have one already? */
            rq->error = EIO;                                /* no: catchall "I/O error" */
        SD[rqe->sdno].lasterror = rq->error;
        if (bp->b_flags & B_READ) {
            log(LOG_ERR, "%s: fatal read I/O error\n", SD[rqe->sdno].name);
            set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
        } else {                                            /* write operation */
            log(LOG_ERR, "%s: fatal write I/O error\n", SD[rqe->sdno].name);
            set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
        }
        if (rq->error == ENXIO) {                           /* the drive's down too */
            log(LOG_ERR, "%s: fatal drive I/O error\n", DRIVE[rqe->driveno].label.name);
            DRIVE[rqe->driveno].lasterror = rq->error;
            set_drive_state(rqe->driveno,                   /* take the drive down */
                drive_down,
                setstate_force);
        }
    }
    /* Now update the statistics */
    if (bp->b_flags & B_READ) {                             /* read operation */
        DRIVE[rqe->driveno].reads++;
        DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
        SD[rqe->sdno].reads++;
        SD[rqe->sdno].bytes_read += bp->b_bcount;
        PLEX[rqe->rqg->plexno].reads++;
        PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
    } else {                                                /* write operation */
        DRIVE[rqe->driveno].writes++;
        DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
        SD[rqe->sdno].writes++;
        SD[rqe->sdno].bytes_written += bp->b_bcount;
        PLEX[rqe->rqg->plexno].writes++;
        PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
    }
    rqg->active--;                                          /* one less request active */
    if (rqg->flags & XFR_RECOVERY_READ) {                   /* recovery read, */
        int *sdata;                                         /* source */
        int *data;                                          /* and group data */
        int length;                                         /* and count involved */
        int count;                                          /* loop counter */
        struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */

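        /*
         * For reference, the reconstruction identity at work
         * here: with parity P = D_1 ^ ... ^ D_n, the block on
         * the bad subdisk is the XOR of the parity block and
         * all surviving data blocks.  Each completing read XORs
         * its contribution into the bad subdisk's buffer, so
         * after the last one that buffer holds the rebuilt data.
         */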
        /* XOR destination is the user data */
        sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
        data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
        length = urqe->grouplen << (DEV_BSHIFT - 2);        /* and count involved */

        for (count = 0; count < length; count++)
            data[count] ^= sdata[count];

        /*
         * In a normal read, we will normally read directly
         * into the user buffer.  This doesn't work if
         * we're also doing a recovery, so we have to
         * copy it.
         */
        if (rqe->flags & XFR_NORMAL_READ) {                 /* normal read as well, */
            char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
            char *dst;

            dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
            length = rqe->datalen << DEV_BSHIFT;            /* and count involved */
            bcopy(src, dst, length);                        /* move it */
        }
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 5 group write operation */
    &&(rqg->active == 0))                                   /* and we've finished phase 1 */
        complete_raid5_write(rqe);
    if (rqg->active == 0)                                   /* request group finished, */
        rq->active--;                                       /* one less */
    if (rq->active == 0) {                                  /* request finished, */
#ifdef VINUMDEBUG
        if (debug & DEBUG_RESID) {
            if (ubp->b_resid != 0)                          /* still something to transfer? */
                Debugger("resid");
        }
#endif

        if (rq->error) {                                    /* did we have an error? */
            if (rq->isplex) {                               /* plex operation, */
                ubp->b_flags |= B_ERROR;                    /* yes, propagate to user */
                ubp->b_error = rq->error;
            } else                                          /* try to recover */
                queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
        } else {
            ubp->b_resid = 0;                               /* completed our transfer */
            if (rq->isplex == 0)                            /* volume request, */
                VOL[rq->volplex.volno].active--;            /* another request finished */
            biodone(ubp);                                   /* top level buffer completed */
            freerq(rq);                                     /* return the request storage */
        }
    }
}

/* Free a request block and anything hanging off it */
void
freerq(struct request *rq)
{
    struct rqgroup *rqg;
    struct rqgroup *nrqg;                                   /* next in chain */
    int rqno;

    for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) {          /* through the whole request chain */
        if (rqg->lock)                                      /* got a lock? */
            unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
        for (rqno = 0; rqno < rqg->count; rqno++)
            if ((rqg->rqe[rqno].flags & XFR_MALLOCED)       /* data buffer was malloced, */
            &&rqg->rqe[rqno].b.b_data)                      /* and the allocation succeeded */
                Free(rqg->rqe[rqno].b.b_data);              /* free it */
        nrqg = rqg->next;                                   /* note the next one */
        Free(rqg);                                          /* and free this one */
    }
    Free(rq);                                               /* free the request itself */
}

/* I/O on subdisk completed */
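/*
 * As in complete_rqe, this relies on struct sdbuf (declared in
 * dev/vinum/request.h) beginning with a struct buf, so the
 * incoming pointer can be cast to the containing structure.
 */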
void
sdio_done(struct buf *bp)
{
    struct sdbuf *sbp;

    sbp = (struct sdbuf *) bp;
    if (sbp->b.b_flags & B_ERROR) {                         /* had an error */
        sbp->bp->b_flags |= B_ERROR;                        /* propagate upwards */
        sbp->bp->b_error = sbp->b.b_error;
    }
#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
#endif
    sbp->bp->b_resid = sbp->b.b_resid;                      /* copy the resid field */
    /* Now update the statistics */
    if (bp->b_flags & B_READ) {                             /* read operation */
        DRIVE[sbp->driveno].reads++;
        DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
        SD[sbp->sdno].reads++;
        SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
    } else {                                                /* write operation */
        DRIVE[sbp->driveno].writes++;
        DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
        SD[sbp->sdno].writes++;
        SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
    }
    biodone(sbp->bp);                                       /* complete the caller's I/O */
    Free(sbp);
}

/* Start the second phase of a RAID5 group write operation. */
void
complete_raid5_write(struct rqelement *rqe)
{
    int *sdata;                                             /* source */
    int *pdata;                                             /* and parity block data */
    int length;                                             /* and count involved */
    int count;                                              /* loop counter */
    int rqno;                                               /* request index */
    int rqoffset;                                           /* offset of request data from parity data */
    struct buf *bp;                                         /* user buffer header */
    struct request *rq;                                     /* pointer to our request */
    struct rqgroup *rqg;                                    /* and to the request group */
    struct rqelement *prqe;                                 /* pointer to the parity block */
    struct drive *drive;                                    /* drive to access */

    rqg = rqe->rqg;                                         /* point to our request group */
    rq = rqg->rq;                                           /* and to our request */
    bp = rq->bp;                                            /* user's buffer header */
    prqe = &rqg->rqe[0];                                    /* point to the parity block */

    /*
     * If we get to this function, we have normal or
     * degraded writes, or a combination of both.  We do
     * the same thing in each case: we perform an
     * exclusive OR into the parity block.  The only
     * difference is the origin of the data and the
     * address range.
     */
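
    /*
     * For reference, the parity arithmetic below in XOR terms
     * (P is the parity block, D_i the data blocks):
     *
     *   degraded write:  P = D_1 ^ D_2 ^ ... ^ D_n
     *                    (rebuild parity from the data blocks,
     *                    with the new data standing in for the
     *                    bad subdisk's block)
     *
     *   normal write:    P' = P ^ D_old ^ D_new
     *                    (cancel the old data out of the old
     *                    parity, then fold in the new data)
     */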

    if (rqe->flags & XFR_DEGRADED_WRITE) {                  /* do the degraded write stuff */
        pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
        bzero(pdata, prqe->grouplen << DEV_BSHIFT);         /* start with nothing in the parity block */

        /* Now get what data we need from each block */
        for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
            /*
             * This could do with improvement.  If we're doing
             * both a degraded and a normal write, we don't
             * need to xor (nor to read) the part of the block
             * that we're going to overwrite.  FIXME XXX
             */
            rqe = &rqg->rqe[rqno];                          /* this request */
            sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
            length = rqe->grouplen << (DEV_BSHIFT - 2);     /* and count involved */

            /*
             * Add the data block to the parity block.  Before
             * we started the request, we zeroed the parity
             * block, so the result of adding all the other
             * blocks and the block we want to write will be
             * the correct parity block.
             */
            for (count = 0; count < length; count++)
                pdata[count] ^= sdata[count];
            if ((rqe->flags & XFR_MALLOCED)                 /* the buffer was malloced, */
            &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {     /* and we have no normal write, */
                Free(rqe->b.b_data);                        /* free it now */
                rqe->flags &= ~XFR_MALLOCED;
            }
        }
    }
    if (rqg->flags & XFR_NORMAL_WRITE) {                    /* do normal write stuff */
        /* Get what data we need from each block */
        for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
            rqe = &rqg->rqe[rqno];                          /* this request */
            if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
                == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
                sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
                rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
                pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
                length = rqe->datalen << (DEV_BSHIFT - 2);  /* and count involved */
                /*
                 * "Remove" the old data block
                 * from the parity block.
                 */
                if ((pdata < ((int *) prqe->b.b_data))
                    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
                    || (sdata < ((int *) rqe->b.b_data))
                    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
                    panic("complete_raid5_write: bounds overflow");
                for (count = 0; count < length; count++)
                    pdata[count] ^= sdata[count];

                /* "Add" the new data block */
                sdata = (int *) (&bp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
                if ((sdata < ((int *) bp->b_data))
                    || (&sdata[length] > ((int *) (bp->b_data + bp->b_bcount))))
                    panic("complete_raid5_write: bounds overflow");
                for (count = 0; count < length; count++)
                    pdata[count] ^= sdata[count];

                /* Free the malloced buffer */
                if (rqe->flags & XFR_MALLOCED) {            /* the buffer was malloced, */
                    Free(rqe->b.b_data);                    /* free it */
                    rqe->flags &= ~XFR_MALLOCED;
                } else
                    panic("complete_raid5_write: malloc conflict");

                if ((rqe->b.b_flags & B_READ)               /* this was a read */
                &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
                    rqe->b.b_flags &= ~(B_READ | B_DONE);   /* we're writing now */
                    rqe->b.b_flags |= B_CALL;               /* call us when you're done */
                    rqe->b.b_iodone = complete_rqe;         /* by calling us here */
                    rqe->flags &= ~XFR_PARITYOP;            /* reset flags that brought us here */
                    rqe->b.b_data = &bp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
                    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
                    rqe->b.b_bufsize = rqe->b.b_bcount;     /* don't claim more */
                    rqe->b.b_resid = rqe->b.b_bcount;       /* nothing transferred */
                    rqe->b.b_blkno += rqe->dataoffset;      /* point to the correct block */
                    rqg->active++;                          /* another active request */
                    rqe->b.b_vp->v_numoutput++;             /* one more output going */
                    drive = &DRIVE[rqe->driveno];           /* drive to access */

                    /* We can't sleep here, so we just increment the counters. */
                    drive->active++;
                    if (drive->active >= drive->maxactive)
                        drive->maxactive = drive->active;
                    vinum_conf.active++;
                    if (vinum_conf.active >= vinum_conf.maxactive)
                        vinum_conf.maxactive = vinum_conf.active;
#ifdef VINUMDEBUG
                    if (debug & DEBUG_ADDRESSES)
                        log(LOG_DEBUG,
                            "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
                            rqe->b.b_flags & B_READ ? "Read" : "Write",
                            major(rqe->b.b_dev),
                            minor(rqe->b.b_dev),
                            rqe->sdno,
                            (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
                            rqe->b.b_blkno,
                            rqe->b.b_bcount);
                    if (debug & DEBUG_NUMOUTPUT)
                        log(LOG_DEBUG,
                            "  raid5.2 sd %d numoutput %ld\n",
                            rqe->sdno,
                            rqe->b.b_vp->v_numoutput);
                    if (debug & DEBUG_LASTREQS)
                        logrq(loginfo_raid5_data, (union rqinfou) rqe, bp);
#endif
                    BUF_STRATEGY(&rqe->b, 0);
                }
            }
        }
    }
    /* Finally, write the parity block */
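    /*
     * The parity rqe still holds the buffer read in phase 1,
     * now containing the updated parity; its header is reused
     * in place: clear the read flags, reinstall the completion
     * callback and resubmit the same buffer as a write.
     */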
    rqe = &rqg->rqe[0];
    rqe->b.b_flags &= ~(B_READ | B_DONE);                   /* we're writing now */
    rqe->b.b_flags |= B_CALL;                               /* call us when you're done */
    rqe->b.b_iodone = complete_rqe;                         /* by calling us here */
    rqg->flags &= ~XFR_PARITYOP;                            /* reset flags that brought us here */
    rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;            /* length to write */
    rqe->b.b_bufsize = rqe->b.b_bcount;                     /* don't claim we have more */
    rqe->b.b_resid = rqe->b.b_bcount;                       /* nothing transferred */
    rqg->active++;                                          /* another active request */
    rqe->b.b_vp->v_numoutput++;                             /* one more output going */
    drive = &DRIVE[rqe->driveno];                           /* drive to access */

    /* We can't sleep here, so we just increment the counters. */
    drive->active++;
    if (drive->active >= drive->maxactive)
        drive->maxactive = drive->active;
    vinum_conf.active++;
    if (vinum_conf.active >= vinum_conf.maxactive)
        vinum_conf.maxactive = vinum_conf.active;

#ifdef VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
        log(LOG_DEBUG,
            "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
            rqe->b.b_flags & B_READ ? "Read" : "Write",
            major(rqe->b.b_dev),
            minor(rqe->b.b_dev),
            rqe->sdno,
            (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
            rqe->b.b_blkno,
            rqe->b.b_bcount);
    if (debug & DEBUG_NUMOUTPUT)
        log(LOG_DEBUG,
            "  raid5.3 sd %d numoutput %ld\n",
            rqe->sdno,
            rqe->b.b_vp->v_numoutput);
    if (debug & DEBUG_LASTREQS)
        logrq(loginfo_raid5_parity, (union rqinfou) rqe, bp);
#endif
    BUF_STRATEGY(&rqe->b, 0);
}