1 /* vinuminterrupt.c: bottom half of the driver */
4 * Copyright (c) 1997, 1998, 1999
5 * Nan Yang Computer Services Limited. All rights reserved.
7 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
9 * Written by Greg Lehey
11 * This software is distributed under the so-called ``Berkeley
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by Nan Yang Computer
26 * 4. Neither the name of the Company nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
30 * This software is provided ``as is'', and any express or implied
31 * warranties, including, but not limited to, the implied warranties of
32 * merchantability and fitness for a particular purpose are disclaimed.
33 * In no event shall the company or contributors be liable for any
34 * direct, indirect, incidental, special, exemplary, or consequential
35 * damages (including, but not limited to, procurement of substitute
36 * goods or services; loss of use, data, or profits; or business
37 * interruption) however caused and on any theory of liability, whether
38 * in contract, strict liability, or tort (including negligence or
39 * otherwise) arising in any way out of the use of this software, even if
40 * advised of the possibility of such damage.
46 #include <dev/vinum/vinumhdr.h>
47 #include <dev/vinum/request.h>
48 #include <sys/resourcevar.h>
/* Prototypes for the completion routines defined later in this file. */
50 void complete_raid5_write(struct rqelement *); /* starts the second phase of a RAID-5 group write */
51 void complete_rqe(struct buf *bp); /* completion handler for one request element */
52 void sdio_done(struct buf *bp); /* completion handler for subdisk I/O */
55 * Take a completed buffer, transfer the data back if
56 * it's a read, and complete the high-level request
57 * if this is the last subrequest.
59 * The bp parameter is in fact a struct rqelement, which
60 * includes a couple of extras at the end.
/*
 * complete_rqe: completion handler for a single request element.
 *
 * bp is cast back to the struct rqelement that contains it. From the
 * element we reach its request group (rqg), the overall request (rq)
 * and the user buffer (ubp). The visible code then, in order:
 *
 *  - decrements the per-drive and global active counters and wakes
 *    sleepers on launch_requests when one of the limit tests matches;
 *  - on B_ERROR, records an error number in rq->error and in the
 *    subdisk, logs it, and forces the subdisk state to sd_crashed
 *    (read) or sd_stale (write); ENXIO additionally logs against the
 *    drive and calls set_drive_state;
 *  - updates read/write counters and byte totals for the drive,
 *    subdisk and plex;
 *  - decrements rqg->active;
 *  - for a recovery read, XORs this element's group data into the
 *    buffer of the bad subdisk element, and copies any normally read
 *    part into the user buffer with bcopy;
 *  - for a group write whose last element has finished, calls
 *    complete_raid5_write;
 *  - when the group is finished decrements rq->active, and when the
 *    request is finished handles rq->error, calls biodone on the user
 *    buffer and frees the request with freerq.
 *
 * NOTE(review): the embedded line numbers skip, so some original lines
 * (braces, short declarations such as rq, rqg, drive and dst, and any
 * preprocessor guards) are not visible in this listing. The exact
 * nesting of the final completion steps cannot be confirmed from here.
 */
63 complete_rqe(struct buf *bp)
65 struct rqelement *rqe;
68 struct buf *ubp; /* user buffer */
71 rqe = (struct rqelement *) bp; /* point to the request element that completed */
72 rqg = rqe->rqg; /* and the request group */
73 rq = rqg->rq; /* and the complete request */
74 ubp = rq->bp; /* user buffer */
77 if (debug & DEBUG_LASTREQS)
78 logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
80 drive = &DRIVE[rqe->driveno];
81 drive->active--; /* one less outstanding I/O on this drive */
82 vinum_conf.active--; /* one less outstanding I/O globally */
/*
 * NOTE(review): the drive test compares against DRIVE_MAXACTIVE - 1
 * but the global test compares against VINUM_MAXACTIVE with no - 1.
 * Confirm against the sleep condition in the launch code that this
 * asymmetry is intended.
 */
83 if ((drive->active == (DRIVE_MAXACTIVE - 1)) /* we were at the drive limit */
84 ||(vinum_conf.active == VINUM_MAXACTIVE)) /* or the global limit */
85 wakeup(&launch_requests); /* let another one at it */
86 if ((bp->b_flags & B_ERROR) != 0) { /* transfer in error */
87 if (bp->b_error != 0) /* did it return a number? */
88 rq->error = bp->b_error; /* yes, put it in. */
89 else if (rq->error == 0) /* no: do we have one already? */
90 rq->error = EIO; /* no: catchall "I/O error" */
91 SD[rqe->sdno].lasterror = rq->error;
92 if (bp->b_flags & B_READ) {
93 log(LOG_ERR, "%s: fatal read I/O error\n", SD[rqe->sdno].name);
94 set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
95 } else { /* write operation */
96 log(LOG_ERR, "%s: fatal write I/O error\n", SD[rqe->sdno].name);
97 set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
99 if (rq->error == ENXIO) { /* the drive's down too */
100 log(LOG_ERR, "%s: fatal drive I/O error\n", DRIVE[rqe->driveno].label.name);
101 DRIVE[rqe->driveno].lasterror = rq->error;
102 set_drive_state(rqe->driveno, /* take the drive down */
107 /* Now update the statistics */
108 if (bp->b_flags & B_READ) { /* read operation */
109 DRIVE[rqe->driveno].reads++;
110 DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
111 SD[rqe->sdno].reads++;
112 SD[rqe->sdno].bytes_read += bp->b_bcount;
113 PLEX[rqe->rqg->plexno].reads++;
114 PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
115 } else { /* write operation */
116 DRIVE[rqe->driveno].writes++;
117 DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
118 SD[rqe->sdno].writes++;
119 SD[rqe->sdno].bytes_written += bp->b_bcount;
120 PLEX[rqe->rqg->plexno].writes++;
121 PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
123 rqg->active--; /* one less request active */
124 if (rqg->flags & XFR_RECOVERY_READ) { /* recovery read, */
125 int *sdata; /* source */
126 int *data; /* and group data */
127 int length; /* and count involved */
128 int count; /* loop counter */
129 struct rqelement *urqe = &rqg->rqe[rqg->badsdno]; /* rqe of the bad subdisk */
131 /* XOR destination is the user data */
132 sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
133 data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
/* presumably converts a sector count to a count of 4-byte ints; confirm sizeof(int) == 4 */
134 length = urqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */
136 for (count = 0; count < length; count++)
137 data[count] ^= sdata[count];
140 * In a normal read, we will normally read directly
141 * into the user buffer. This doesn't work if
142 * we're also doing a recovery, so we have to
145 if (rqe->flags & XFR_NORMAL_READ) { /* normal read as well, */
146 char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
149 dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
150 length = rqe->datalen << DEV_BSHIFT; /* and count involved */
151 bcopy(src, dst, length); /* move it */
153 } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 5 group write operation */
154 &&(rqg->active == 0)) /* and we've finished phase 1 */
155 complete_raid5_write(rqe);
156 if (rqg->active == 0) /* request group finished, */
157 rq->active--; /* one less */
158 if (rq->active == 0) { /* request finished, */
160 if (debug & DEBUG_RESID) {
161 if (ubp->b_resid != 0) /* still something to transfer? */
166 if (rq->error) { /* did we have an error? */
167 if (rq->isplex) { /* plex operation, */
168 ubp->b_flags |= B_ERROR; /* yes, propagate to user */
169 ubp->b_error = rq->error;
170 } else /* try to recover */
171 queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
/* NOTE(review): the line between the error handling above and the steps below is not visible; confirm whether they run only when rq->error is zero */
173 ubp->b_resid = 0; /* completed our transfer */
174 if (rq->isplex == 0) /* volume request, */
175 VOL[rq->volplex.volno].active--; /* another request finished */
176 biodone(ubp); /* top level buffer completed */
177 freerq(rq); /* return the request storage */
182 /* Free a request block and anything hanging off it */
/*
 * freerq: release a request and everything chained from it.
 *
 * Walks the list of request groups starting at rq->rqg. For each group
 * it releases the range lock if one is held, frees the data buffer of
 * every element that is flagged XFR_MALLOCED and has a non-null b_data,
 * and then frees the group itself. Finally it frees rq.
 *
 * NOTE(review): the declarations of rqg and rqno and the braces are on
 * lines that are not visible in this listing.
 */
184 freerq(struct request *rq)
187 struct rqgroup *nrqg; /* next in chain */
190 for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) { /* through the whole request chain */
191 if (rqg->lock) /* got a lock? */
192 unlockrange(rqg->plexno, rqg->lock); /* yes, free it */
193 for (rqno = 0; rqno < rqg->count; rqno++)
194 if ((rqg->rqe[rqno].flags & XFR_MALLOCED) /* data buffer was malloced, */
195 &&rqg->rqe[rqno].b.b_data) /* and the allocation succeeded */
196 Free(rqg->rqe[rqno].b.b_data); /* free it */
197 nrqg = rqg->next; /* save the link before the group is freed */
198 Free(rqg); /* and free this one */
200 Free(rq); /* free the request itself */
203 /* I/O on subdisk completed */
/*
 * sdio_done: completion handler for I/O issued on a subdisk.
 *
 * bp is cast to the struct sdbuf that contains it (member b); the
 * sdbuf also carries a pointer to the caller's buffer (member bp).
 * Any error and the residual count are copied to the caller's buffer,
 * the drive and subdisk statistics are updated, and the caller's
 * buffer is completed with biodone.
 *
 * NOTE(review): no release of sbp is visible in this listing. The
 * embedded line numbers skip after the biodone call, so confirm
 * against the full source that the sdbuf is freed.
 */
205 sdio_done(struct buf *bp)
209 sbp = (struct sdbuf *) bp;
210 if (sbp->b.b_flags & B_ERROR) { /* had an error */
211 sbp->bp->b_flags |= B_ERROR; /* propagate upwards */
212 sbp->bp->b_error = sbp->b.b_error;
215 if (debug & DEBUG_LASTREQS)
216 logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
218 sbp->bp->b_resid = sbp->b.b_resid; /* copy the resid field */
219 /* Now update the statistics */
220 if (bp->b_flags & B_READ) { /* read operation */
221 DRIVE[sbp->driveno].reads++;
222 DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
223 SD[sbp->sdno].reads++;
224 SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
225 } else { /* write operation */
226 DRIVE[sbp->driveno].writes++;
227 DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
228 SD[sbp->sdno].writes++;
229 SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
231 biodone(sbp->bp); /* complete the caller's I/O */
235 /* Start the second phase of a RAID5 group write operation. */
/*
 * complete_raid5_write: second phase of a RAID-5 group write.
 *
 * Called from complete_rqe once every element of a write group has
 * finished its first phase. Element 0 of the group is treated as the
 * parity block (prqe). The visible code:
 *
 *  - for a degraded write, zeroes the group region of the parity
 *    buffer and XORs the group region of every data element (index 1
 *    upward) into it, freeing malloced data buffers when the group has
 *    no normal write;
 *  - for a normal write, for each good data element flagged
 *    XFR_NORMAL_WRITE, XORs the old data out of the parity buffer and
 *    the new user data into it, panicking if any pointer would fall
 *    outside its buffer, then frees a malloced buffer;
 *  - turns elements that were reads and are not on a bad subdisk into
 *    writes of the user data and issues them with BUF_STRATEGY;
 *  - finally sets up and issues the parity write.
 *
 * Each reissued buf has B_CALL set and b_iodone pointing at
 * complete_rqe, and rqg->active is incremented for it.
 *
 * NOTE(review): the embedded line numbers skip, so several original
 * lines are not visible here (braces, the increments mentioned by the
 * counter comments, the log calls, and the statement that selects the
 * element used for the parity write).
 */
237 complete_raid5_write(struct rqelement *rqe)
239 int *sdata; /* source */
240 int *pdata; /* and parity block data */
241 int length; /* and count involved */
242 int count; /* loop counter */
243 int rqno; /* request index */
244 int rqoffset; /* offset of request data from parity data */
245 struct buf *bp; /* user buffer header */
246 struct request *rq; /* pointer to our request */
247 struct rqgroup *rqg; /* and to the request group */
248 struct rqelement *prqe; /* point to the parity block */
249 struct drive *drive; /* drive to access */
251 rqg = rqe->rqg; /* and to our request group */
252 rq = rqg->rq; /* point to our request */
253 bp = rq->bp; /* user's buffer header */
254 prqe = &rqg->rqe[0]; /* point to the parity block */
257 * If we get to this function, we have normal or
258 * degraded writes, or a combination of both. We do
259 * the same thing in each case: we perform an
260 * exclusive or to the parity block. The only
261 * difference is the origin of the data and the
/*
 * NOTE(review): this test reads rqe->flags, whereas the normal-write
 * test further down reads rqg->flags. Confirm which level is intended.
 */
265 if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */
266 pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
267 bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */
269 /* Now get what data we need from each block */
270 for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */
272 * This can do with improvement. If we're doing
273 * both a degraded and a normal write, we don't
274 * need to xor (nor to read) the part of the block
275 * that we're going to overwrite. FIXME XXX
277 rqe = &rqg->rqe[rqno]; /* this request */
278 sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
/* presumably a sector count converted to a count of 4-byte ints; confirm sizeof(int) == 4 */
279 length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */
282 * add the data block to the parity block. Before
283 * we started the request, we zeroed the parity
284 * block, so the result of adding all the other
285 * blocks and the block we want to write will be
286 * the correct parity block.
288 for (count = 0; count < length; count++)
289 pdata[count] ^= sdata[count];
290 if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */
291 &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */
292 Free(rqe->b.b_data); /* free it now */
293 rqe->flags &= ~XFR_MALLOCED; /* so that it is not freed a second time */
297 if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */
298 /* Get what data we need from each block */
299 for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */
300 rqe = &rqg->rqe[rqno]; /* this request */
301 if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
302 == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */
303 sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
304 rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
305 pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
306 length = rqe->datalen << (DEV_BSHIFT - 2); /* and count involved */
308 * "remove" the old data block
309 * from the parity block
/* both the parity range and the old-data range must lie inside their buffers */
311 if ((pdata < ((int *) prqe->b.b_data))
312 || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
313 || (sdata < ((int *) rqe->b.b_data))
314 || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
315 panic("complete_raid5_write: bounds overflow");
316 for (count = 0; count < length; count++)
317 pdata[count] ^= sdata[count];
319 /* "add" the new data block */
320 sdata = (int *) (&bp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
321 if ((sdata < ((int *) bp->b_data))
322 || (&sdata[length] > ((int *) (bp->b_data + bp->b_bcount))))
323 panic("complete_raid5_write: bounds overflow");
324 for (count = 0; count < length; count++)
325 pdata[count] ^= sdata[count];
327 /* Free the malloced buffer */
328 if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */
329 Free(rqe->b.b_data); /* free it */
330 rqe->flags &= ~XFR_MALLOCED;
/* NOTE(review): the condition guarding the panic below is on a line not visible in this listing */
332 panic("complete_raid5_write: malloc conflict");
334 if ((rqe->b.b_flags & B_READ) /* this was a read */
335 &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */
336 rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */
337 rqe->b.b_flags |= B_CALL; /* call us when you're done */
338 rqe->b.b_iodone = complete_rqe; /* by calling us here */
339 rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */
340 rqe->b.b_data = &bp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
341 rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
342 rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim more */
343 rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */
344 rqe->b.b_blkno += rqe->dataoffset; /* point to the correct block */
345 rqg->active++; /* another active request */
346 rqe->b.b_vp->v_numoutput++; /* one more output going */
347 drive = &DRIVE[rqe->driveno]; /* drive to access */
349 /* We can't sleep here, so we just increment the counters. */
/* the two tests below raise the recorded high-water marks (maxactive) when reached */
351 if (drive->active >= drive->maxactive)
352 drive->maxactive = drive->active;
354 if (vinum_conf.active >= vinum_conf.maxactive)
355 vinum_conf.maxactive = vinum_conf.active;
357 if (debug & DEBUG_ADDRESSES)
359 " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
360 rqe->b.b_flags & B_READ ? "Read" : "Write",
364 (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
367 if (debug & DEBUG_NUMOUTPUT)
369 " raid5.2 sd %d numoutput %ld\n",
371 rqe->b.b_vp->v_numoutput);
372 if (debug & DEBUG_LASTREQS)
373 logrq(loginfo_raid5_data, (union rqinfou) rqe, bp);
375 BUF_STRATEGY(&rqe->b, 0);
380 /* Finally, write the parity block */
/* NOTE(review): the statement that points rqe at the parity element is not visible in this listing; presumably rqe refers to element 0 from here on, confirm */
382 rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */
383 rqe->b.b_flags |= B_CALL; /* call us when you're done */
384 rqe->b.b_iodone = complete_rqe; /* by calling us here */
385 rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */
386 rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */
387 rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim we have more */
388 rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */
389 rqg->active++; /* another active request */
390 rqe->b.b_vp->v_numoutput++; /* one more output going */
391 drive = &DRIVE[rqe->driveno]; /* drive to access */
393 /* We can't sleep here, so we just increment the counters. */
395 if (drive->active >= drive->maxactive)
396 drive->maxactive = drive->active;
398 if (vinum_conf.active >= vinum_conf.maxactive)
399 vinum_conf.maxactive = vinum_conf.active;
402 if (debug & DEBUG_ADDRESSES)
404 " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
405 rqe->b.b_flags & B_READ ? "Read" : "Write",
409 (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
412 if (debug & DEBUG_NUMOUTPUT)
414 " raid5.3 sd %d numoutput %ld\n",
416 rqe->b.b_vp->v_numoutput);
417 if (debug & DEBUG_LASTREQS)
418 logrq(loginfo_raid5_parity, (union rqinfou) rqe, bp);
420 BUF_STRATEGY(&rqe->b, 0);