/*-
 * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/eventhandler.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/sched.h>
#include <geom/raid3/g_raid3.h>

FEATURE(geom_raid3, "GEOM RAID-3 functionality");

static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
u_int g_raid3_debug = 0;
TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
static u_int g_raid3_timeout = 4;
TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
static u_int g_raid3_idletime = 5;
TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
    &g_raid3_idletime, 0, "Mark components as clean when idling");
static u_int g_raid3_disconnect_on_failure = 1;
TUNABLE_INT("kern.geom.raid3.disconnect_on_failure",
    &g_raid3_disconnect_on_failure);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
    &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_raid3_syncreqs = 2;
TUNABLE_INT("kern.geom.raid3.sync_requests", &g_raid3_syncreqs);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
    &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
static u_int g_raid3_use_malloc = 0;
TUNABLE_INT("kern.geom.raid3.use_malloc", &g_raid3_use_malloc);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN,
    &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9).");

static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");

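/*
 * Wrapper around msleep(9) which logs, at debug level 4, when a thread
 * goes to sleep on an identifier and when it wakes up again.
 */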
#define MSLEEP(ident, mtx, priority, wmesg, timeout)    do {            \
        G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));        \
        msleep((ident), (mtx), (priority), (wmesg), (timeout));         \
        G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));        \
} while (0)

static eventhandler_tag g_raid3_post_sync = NULL;
static int g_raid3_shutdown = 0;

static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);

struct g_class g_raid3_class = {
        .name = G_RAID3_CLASS_NAME,
        .version = G_VERSION,
        .ctlreq = g_raid3_config,
        .taste = g_raid3_taste,
        .destroy_geom = g_raid3_destroy_geom,
        .init = g_raid3_init,
        .fini = g_raid3_fini
};

static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
static int g_raid3_register_request(struct bio *pbp);
static void g_raid3_sync_release(struct g_raid3_softc *sc);

static const char *
g_raid3_disk_state2str(int state)
{

        switch (state) {
        case G_RAID3_DISK_STATE_NODISK:
                return ("NODISK");
        case G_RAID3_DISK_STATE_NONE:
                return ("NONE");
        case G_RAID3_DISK_STATE_NEW:
                return ("NEW");
        case G_RAID3_DISK_STATE_ACTIVE:
                return ("ACTIVE");
        case G_RAID3_DISK_STATE_STALE:
                return ("STALE");
        case G_RAID3_DISK_STATE_SYNCHRONIZING:
                return ("SYNCHRONIZING");
        case G_RAID3_DISK_STATE_DISCONNECTED:
                return ("DISCONNECTED");
        default:
                return ("INVALID");
        }
}

static const char *
g_raid3_device_state2str(int state)
{

        switch (state) {
        case G_RAID3_DEVICE_STATE_STARTING:
                return ("STARTING");
        case G_RAID3_DEVICE_STATE_DEGRADED:
                return ("DEGRADED");
        case G_RAID3_DEVICE_STATE_COMPLETE:
                return ("COMPLETE");
        default:
                return ("INVALID");
        }
}

const char *
g_raid3_get_diskname(struct g_raid3_disk *disk)
{

        if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
                return ("[unknown]");
        return (disk->d_name);
}

static void *
g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags)
{
        void *ptr;
        enum g_raid3_zones zone;

        if (g_raid3_use_malloc ||
            (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
                ptr = malloc(size, M_RAID3, flags);
        else {
                ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone,
                    &sc->sc_zones[zone], flags);
                sc->sc_zones[zone].sz_requested++;
                if (ptr == NULL)
                        sc->sc_zones[zone].sz_failed++;
        }
        return (ptr);
}

static void
g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size)
{
        enum g_raid3_zones zone;

        if (g_raid3_use_malloc ||
            (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
                free(ptr, M_RAID3);
        else {
                uma_zfree_arg(sc->sc_zones[zone].sz_zone,
                    ptr, &sc->sc_zones[zone]);
        }
}

static int
g_raid3_uma_ctor(void *mem, int size, void *arg, int flags)
{
        struct g_raid3_zone *sz = arg;

        if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max)
                return (ENOMEM);
        sz->sz_inuse++;
        return (0);
}

static void
g_raid3_uma_dtor(void *mem, int size, void *arg)
{
        struct g_raid3_zone *sz = arg;

        sz->sz_inuse--;
}

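/*
 * XOR `size' bytes from `src' into `dst'.  The loop is unrolled to
 * process 128 bytes (sixteen 64-bit words) per iteration, so the size
 * must be a multiple of 128 bytes (asserted below).
 */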
#define g_raid3_xor(src, dst, size)                                     \
        _g_raid3_xor((uint64_t *)(src),                                 \
            (uint64_t *)(dst), (size_t)size)
static void
_g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size)
{

        KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
        for (; size > 0; size -= 128) {
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
        }
}

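/*
 * Return true if the bio's data buffer contains only zeros.  Used on the
 * VERIFY code path: after XOR-ing all components together, the result
 * must be all-zero if parity is consistent.
 */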
static int
g_raid3_is_zero(struct bio *bp)
{
        static const uint64_t zeros[] = {
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        };
        u_char *addr;
        ssize_t size;

        size = bp->bio_length;
        addr = (u_char *)bp->bio_data;
        for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
                if (bcmp(addr, zeros, sizeof(zeros)) != 0)
                        return (0);
        }
        return (1);
}

/*
 * --- Event handling functions ---
 * Events in geom_raid3 are used to maintain disk and device status
 * from a single thread, which simplifies locking.
 */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

        free(ep, M_RAID3);
}

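/*
 * Queue an event for the worker thread and wake it up.  With
 * G_RAID3_EVENT_DONTWAIT the call returns immediately; otherwise the
 * device lock is dropped and the caller sleeps until the worker marks
 * the event with G_RAID3_EVENT_DONE, then the event's error is returned.
 */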
int
g_raid3_event_send(void *arg, int state, int flags)
{
        struct g_raid3_softc *sc;
        struct g_raid3_disk *disk;
        struct g_raid3_event *ep;
        int error;

        ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
        G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
        if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
                disk = NULL;
                sc = arg;
        } else {
                disk = arg;
                sc = disk->d_softc;
        }
        ep->e_disk = disk;
        ep->e_state = state;
        ep->e_flags = flags;
        ep->e_error = 0;
        mtx_lock(&sc->sc_events_mtx);
        TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
        mtx_unlock(&sc->sc_events_mtx);
        G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
        mtx_lock(&sc->sc_queue_mtx);
        wakeup(sc);
        wakeup(&sc->sc_queue);
        mtx_unlock(&sc->sc_queue_mtx);
        if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
                return (0);
        sx_assert(&sc->sc_lock, SX_XLOCKED);
        G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
        sx_xunlock(&sc->sc_lock);
        while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
                mtx_lock(&sc->sc_events_mtx);
                MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
                    hz * 5);
        }
        error = ep->e_error;
        g_raid3_event_free(ep);
        sx_xlock(&sc->sc_lock);
        return (error);
}

static struct g_raid3_event *
g_raid3_event_get(struct g_raid3_softc *sc)
{
        struct g_raid3_event *ep;

        mtx_lock(&sc->sc_events_mtx);
        ep = TAILQ_FIRST(&sc->sc_events);
        mtx_unlock(&sc->sc_events_mtx);
        return (ep);
}

static void
g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
{

        mtx_lock(&sc->sc_events_mtx);
        TAILQ_REMOVE(&sc->sc_events, ep, e_next);
        mtx_unlock(&sc->sc_events_mtx);
}

static void
g_raid3_event_cancel(struct g_raid3_disk *disk)
{
        struct g_raid3_softc *sc;
        struct g_raid3_event *ep, *tmpep;

        sc = disk->d_softc;
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        mtx_lock(&sc->sc_events_mtx);
        TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
                if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
                        continue;
                if (ep->e_disk != disk)
                        continue;
                TAILQ_REMOVE(&sc->sc_events, ep, e_next);
                if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
                        g_raid3_event_free(ep);
                else {
                        ep->e_error = ECANCELED;
                        wakeup(ep);
                }
        }
        mtx_unlock(&sc->sc_events_mtx);
}

/*
 * Return the number of disks in the given state.
 * If state is equal to -1, count all connected disks.
 */
u_int
g_raid3_ndisks(struct g_raid3_softc *sc, int state)
{
        struct g_raid3_disk *disk;
        u_int n, ndisks;

        sx_assert(&sc->sc_lock, SX_LOCKED);

        for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
                disk = &sc->sc_disks[n];
                if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
                        continue;
                if (state == -1 || disk->d_state == state)
                        ndisks++;
        }
        return (ndisks);
}

static u_int
g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
{
        struct bio *bp;
        u_int nreqs = 0;

        mtx_lock(&sc->sc_queue_mtx);
        TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
                if (bp->bio_from == cp)
                        nreqs++;
        }
        mtx_unlock(&sc->sc_queue_mtx);
        return (nreqs);
}

static int
g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
{

        if (cp->index > 0) {
                G_RAID3_DEBUG(2,
                    "I/O requests for %s exist, can't destroy it now.",
                    cp->provider->name);
                return (1);
        }
        if (g_raid3_nrequests(sc, cp) > 0) {
                G_RAID3_DEBUG(2,
                    "I/O requests for %s in queue, can't destroy it now.",
                    cp->provider->name);
                return (1);
        }
        return (0);
}

static void
g_raid3_destroy_consumer(void *arg, int flags __unused)
{
        struct g_consumer *cp;

        g_topology_assert();

        cp = arg;
        G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
        g_detach(cp);
        g_destroy_consumer(cp);
}

static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{
        struct g_provider *pp;
        int retaste_wait;

        g_topology_assert();

        cp->private = NULL;
        if (g_raid3_is_busy(sc, cp))
                return;
        G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
        pp = cp->provider;
        retaste_wait = 0;
        if (cp->acw == 1) {
                if ((pp->geom->flags & G_GEOM_WITHER) == 0)
                        retaste_wait = 1;
        }
        G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
            -cp->acw, -cp->ace, 0);
        if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
                g_access(cp, -cp->acr, -cp->acw, -cp->ace);
        if (retaste_wait) {
                /*
                 * After the retaste event was sent (inside g_access()), we
                 * can send an event to detach and destroy the consumer.
                 * A class which already has a consumer attached to the given
                 * provider will not receive a retaste event for it.
                 * This is how retaste events are ignored for consumers that
                 * were opened for writing: the consumer is detached and
                 * destroyed only after the retaste event has been sent.
                 */
                g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
                return;
        }
        G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
        g_detach(cp);
        g_destroy_consumer(cp);
}

static int
g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
{
        struct g_consumer *cp;
        int error;

        g_topology_assert_not();
        KASSERT(disk->d_consumer == NULL,
            ("Disk already connected (device %s).", disk->d_softc->sc_name));

        g_topology_lock();
        cp = g_new_consumer(disk->d_softc->sc_geom);
        error = g_attach(cp, pp);
        if (error != 0) {
                g_destroy_consumer(cp);
                g_topology_unlock();
                return (error);
        }
        error = g_access(cp, 1, 1, 1);
        g_topology_unlock();
        if (error != 0) {
                g_detach(cp);
                g_destroy_consumer(cp);
                G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
                    pp->name, error);
                return (error);
        }
        disk->d_consumer = cp;
        disk->d_consumer->private = disk;
        disk->d_consumer->index = 0;
        G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
        return (0);
}

static void
g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{

        g_topology_assert();

        if (cp == NULL)
                return;
        if (cp->provider != NULL)
                g_raid3_kill_consumer(sc, cp);
        else
                g_destroy_consumer(cp);
}

/*
 * Initialize the disk: create a consumer, attach it to the provider and
 * open access (r1w1e1) to it.
 */
static struct g_raid3_disk *
g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md, int *errorp)
{
        struct g_raid3_disk *disk;
        int error;

        disk = &sc->sc_disks[md->md_no];
        error = g_raid3_connect_disk(disk, pp);
        if (error != 0) {
                if (errorp != NULL)
                        *errorp = error;
                return (NULL);
        }
        disk->d_state = G_RAID3_DISK_STATE_NONE;
        disk->d_flags = md->md_dflags;
        if (md->md_provider[0] != '\0')
                disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
        disk->d_sync.ds_consumer = NULL;
        disk->d_sync.ds_offset = md->md_sync_offset;
        disk->d_sync.ds_offset_done = md->md_sync_offset;
        disk->d_genid = md->md_genid;
        disk->d_sync.ds_syncid = md->md_syncid;
        if (errorp != NULL)
                *errorp = 0;
        return (disk);
}

static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
        struct g_raid3_softc *sc;

        g_topology_assert_not();
        sc = disk->d_softc;
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
                return;
        g_raid3_event_cancel(disk);
        switch (disk->d_state) {
        case G_RAID3_DISK_STATE_SYNCHRONIZING:
                if (sc->sc_syncdisk != NULL)
                        g_raid3_sync_stop(sc, 1);
                /* FALLTHROUGH */
        case G_RAID3_DISK_STATE_NEW:
        case G_RAID3_DISK_STATE_STALE:
        case G_RAID3_DISK_STATE_ACTIVE:
                g_topology_lock();
                g_raid3_disconnect_consumer(sc, disk->d_consumer);
                g_topology_unlock();
                disk->d_consumer = NULL;
                break;
        default:
                KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
                    g_raid3_get_diskname(disk),
                    g_raid3_disk_state2str(disk->d_state)));
        }
        disk->d_state = G_RAID3_DISK_STATE_NODISK;
}

static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
        struct g_raid3_event *ep;
        struct g_raid3_disk *disk;
        struct g_geom *gp;
        struct g_consumer *cp;
        u_int n;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        gp = sc->sc_geom;
        if (sc->sc_provider != NULL)
                g_raid3_destroy_provider(sc);
        for (n = 0; n < sc->sc_ndisks; n++) {
                disk = &sc->sc_disks[n];
                if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
                        disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
                        g_raid3_update_metadata(disk);
                        g_raid3_destroy_disk(disk);
                }
        }
        while ((ep = g_raid3_event_get(sc)) != NULL) {
                g_raid3_event_remove(sc, ep);
                if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
                        g_raid3_event_free(ep);
                else {
                        ep->e_error = ECANCELED;
                        ep->e_flags |= G_RAID3_EVENT_DONE;
                        G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
                        mtx_lock(&sc->sc_events_mtx);
                        wakeup(ep);
                        mtx_unlock(&sc->sc_events_mtx);
                }
        }
        callout_drain(&sc->sc_callout);
        cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
        g_topology_lock();
        if (cp != NULL)
                g_raid3_disconnect_consumer(sc, cp);
        g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
        G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
        g_wither_geom(gp, ENXIO);
        g_topology_unlock();
        if (!g_raid3_use_malloc) {
                uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
                uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
                uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
        }
        mtx_destroy(&sc->sc_queue_mtx);
        mtx_destroy(&sc->sc_events_mtx);
        sx_xunlock(&sc->sc_lock);
        sx_destroy(&sc->sc_lock);
}

static void
g_raid3_orphan(struct g_consumer *cp)
{
        struct g_raid3_disk *disk;

        g_topology_assert();

        disk = cp->private;
        if (disk == NULL)
                return;
        disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
        g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
            G_RAID3_EVENT_DONTWAIT);
}

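/*
 * Write the given metadata (or, when md is NULL, zeros) to the last
 * sector of the component.  On failure the disk is marked broken and,
 * if kern.geom.raid3.disconnect_on_failure is set and the device is
 * COMPLETE, a disconnect event is sent for it.
 */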
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
        struct g_raid3_softc *sc;
        struct g_consumer *cp;
        off_t offset, length;
        u_char *sector;
        int error = 0;

        g_topology_assert_not();
        sc = disk->d_softc;
        sx_assert(&sc->sc_lock, SX_LOCKED);

        cp = disk->d_consumer;
        KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
        KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
        KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
            ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
            cp->acw, cp->ace));
        length = cp->provider->sectorsize;
        offset = cp->provider->mediasize - length;
        sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
        if (md != NULL)
                raid3_metadata_encode(md, sector);
        error = g_write_data(cp, offset, sector, length);
        free(sector, M_RAID3);
        if (error != 0) {
                if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
                        G_RAID3_DEBUG(0, "Cannot write metadata on %s "
                            "(device=%s, error=%d).",
                            g_raid3_get_diskname(disk), sc->sc_name, error);
                        disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
                } else {
                        G_RAID3_DEBUG(1, "Cannot write metadata on %s "
                            "(device=%s, error=%d).",
                            g_raid3_get_diskname(disk), sc->sc_name, error);
                }
                if (g_raid3_disconnect_on_failure &&
                    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
                        sc->sc_bump_id |= G_RAID3_BUMP_GENID;
                        g_raid3_event_send(disk,
                            G_RAID3_DISK_STATE_DISCONNECTED,
                            G_RAID3_EVENT_DONTWAIT);
                }
        }
        return (error);
}

int
g_raid3_clear_metadata(struct g_raid3_disk *disk)
{
        int error;

        g_topology_assert_not();
        sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);

        error = g_raid3_write_metadata(disk, NULL);
        if (error == 0) {
                G_RAID3_DEBUG(2, "Metadata on %s cleared.",
                    g_raid3_get_diskname(disk));
        } else {
                G_RAID3_DEBUG(0,
                    "Cannot clear metadata on disk %s (error=%d).",
                    g_raid3_get_diskname(disk), error);
        }
        return (error);
}

void
g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
        struct g_raid3_softc *sc;
        struct g_provider *pp;

        sc = disk->d_softc;
        strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
        md->md_version = G_RAID3_VERSION;
        strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
        md->md_id = sc->sc_id;
        md->md_all = sc->sc_ndisks;
        md->md_genid = sc->sc_genid;
        md->md_mediasize = sc->sc_mediasize;
        md->md_sectorsize = sc->sc_sectorsize;
        md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
        md->md_no = disk->d_no;
        md->md_syncid = disk->d_sync.ds_syncid;
        md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
        if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
                md->md_sync_offset = 0;
        else {
                md->md_sync_offset =
                    disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1);
        }
        if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
                pp = disk->d_consumer->provider;
        else
                pp = NULL;
        if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
                strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
        else
                bzero(md->md_provider, sizeof(md->md_provider));
        if (pp != NULL)
                md->md_provsize = pp->mediasize;
        else
                md->md_provsize = 0;
}

void
g_raid3_update_metadata(struct g_raid3_disk *disk)
{
        struct g_raid3_softc *sc;
        struct g_raid3_metadata md;
        int error;

        g_topology_assert_not();
        sc = disk->d_softc;
        sx_assert(&sc->sc_lock, SX_LOCKED);

        g_raid3_fill_metadata(disk, &md);
        error = g_raid3_write_metadata(disk, &md);
        if (error == 0) {
                G_RAID3_DEBUG(2, "Metadata on %s updated.",
                    g_raid3_get_diskname(disk));
        } else {
                G_RAID3_DEBUG(0,
                    "Cannot update metadata on disk %s (error=%d).",
                    g_raid3_get_diskname(disk), error);
        }
}

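/*
 * The syncid is bumped on the remaining ACTIVE/SYNCHRONIZING disks (and
 * written to their metadata) so that a disk carrying an older syncid can
 * later be recognized as stale and resynchronized.  g_raid3_bump_genid()
 * below does the same with the generation id, which is bumped when a
 * disk is dropped because of an I/O error.
 */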
static void
g_raid3_bump_syncid(struct g_raid3_softc *sc)
{
        struct g_raid3_disk *disk;
        u_int n;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);
        KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
            ("%s called with no active disks (device=%s).", __func__,
            sc->sc_name));

        sc->sc_syncid++;
        G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
            sc->sc_syncid);
        for (n = 0; n < sc->sc_ndisks; n++) {
                disk = &sc->sc_disks[n];
                if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
                    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
                        disk->d_sync.ds_syncid = sc->sc_syncid;
                        g_raid3_update_metadata(disk);
                }
        }
}

static void
g_raid3_bump_genid(struct g_raid3_softc *sc)
{
        struct g_raid3_disk *disk;
        u_int n;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);
        KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
            ("%s called with no active disks (device=%s).", __func__,
            sc->sc_name));

        sc->sc_genid++;
        G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
            sc->sc_genid);
        for (n = 0; n < sc->sc_ndisks; n++) {
                disk = &sc->sc_disks[n];
                if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
                    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
                        disk->d_genid = sc->sc_genid;
                        g_raid3_update_metadata(disk);
                }
        }
}

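/*
 * If no write has arrived for kern.geom.raid3.idletime seconds, mark the
 * active components clean so that they do not have to be synchronized
 * after a crash; g_raid3_unidle() marks them dirty again on the next
 * write.  Returns the number of seconds left before the device may be
 * marked idle, or 0.
 */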
static int
g_raid3_idle(struct g_raid3_softc *sc, int acw)
{
        struct g_raid3_disk *disk;
        u_int i;
        int timeout;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        if (sc->sc_provider == NULL)
                return (0);
        if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
                return (0);
        if (sc->sc_idle)
                return (0);
        if (sc->sc_writes > 0)
                return (0);
        if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
                timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
                if (!g_raid3_shutdown && timeout > 0)
                        return (timeout);
        }
        sc->sc_idle = 1;
        for (i = 0; i < sc->sc_ndisks; i++) {
                disk = &sc->sc_disks[i];
                if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
                        continue;
                G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
                    g_raid3_get_diskname(disk), sc->sc_name);
                disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
                g_raid3_update_metadata(disk);
        }
        return (0);
}

static void
g_raid3_unidle(struct g_raid3_softc *sc)
{
        struct g_raid3_disk *disk;
        u_int i;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
                return;
        sc->sc_idle = 0;
        sc->sc_last_write = time_uptime;
        for (i = 0; i < sc->sc_ndisks; i++) {
                disk = &sc->sc_disks[i];
                if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
                        continue;
                G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
                    g_raid3_get_diskname(disk), sc->sc_name);
                disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
                g_raid3_update_metadata(disk);
        }
}

/*
 * Treat the bio_driver1 field in the parent bio as the list head and the
 * bio_caller1 field in each child bio as the pointer to the next element
 * on the list.
 */
#define G_RAID3_HEAD_BIO(pbp)   (pbp)->bio_driver1

#define G_RAID3_NEXT_BIO(cbp)   (cbp)->bio_caller1

#define G_RAID3_FOREACH_BIO(pbp, bp)                                    \
        for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;                \
            (bp) = G_RAID3_NEXT_BIO(bp))

#define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)                        \
        for ((bp) = G_RAID3_HEAD_BIO(pbp);                              \
            (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);        \
            (bp) = (tmpbp))

static void
g_raid3_init_bio(struct bio *pbp)
{

        G_RAID3_HEAD_BIO(pbp) = NULL;
}

static void
g_raid3_remove_bio(struct bio *cbp)
{
        struct bio *pbp, *bp;

        pbp = cbp->bio_parent;
        if (G_RAID3_HEAD_BIO(pbp) == cbp)
                G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
        else {
                G_RAID3_FOREACH_BIO(pbp, bp) {
                        if (G_RAID3_NEXT_BIO(bp) == cbp) {
                                G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
                                break;
                        }
                }
        }
        G_RAID3_NEXT_BIO(cbp) = NULL;
}

static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
        struct bio *pbp, *bp;

        g_raid3_remove_bio(sbp);
        pbp = dbp->bio_parent;
        G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
        if (G_RAID3_HEAD_BIO(pbp) == dbp)
                G_RAID3_HEAD_BIO(pbp) = sbp;
        else {
                G_RAID3_FOREACH_BIO(pbp, bp) {
                        if (G_RAID3_NEXT_BIO(bp) == dbp) {
                                G_RAID3_NEXT_BIO(bp) = sbp;
                                break;
                        }
                }
        }
        G_RAID3_NEXT_BIO(dbp) = NULL;
}

static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
        struct bio *bp, *pbp;
        size_t size;

        pbp = cbp->bio_parent;
        pbp->bio_children--;
        KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
        size = pbp->bio_length / (sc->sc_ndisks - 1);
        g_raid3_free(sc, cbp->bio_data, size);
        if (G_RAID3_HEAD_BIO(pbp) == cbp) {
                G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
                G_RAID3_NEXT_BIO(cbp) = NULL;
                g_destroy_bio(cbp);
        } else {
                G_RAID3_FOREACH_BIO(pbp, bp) {
                        if (G_RAID3_NEXT_BIO(bp) == cbp)
                                break;
                }
                if (bp != NULL) {
                        KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
                            ("NULL bp->bio_driver1"));
                        G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
                        G_RAID3_NEXT_BIO(cbp) = NULL;
                }
                g_destroy_bio(cbp);
        }
}

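/*
 * Clone the parent bio and give the clone a private data buffer of
 * bio_length / (sc_ndisks - 1) bytes (a single component's share), then
 * append it to the parent's list of children.
 */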
static struct bio *
g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
{
        struct bio *bp, *cbp;
        size_t size;
        int memflag;

        cbp = g_clone_bio(pbp);
        if (cbp == NULL)
                return (NULL);
        size = pbp->bio_length / (sc->sc_ndisks - 1);
        if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
                memflag = M_WAITOK;
        else
                memflag = M_NOWAIT;
        cbp->bio_data = g_raid3_alloc(sc, size, memflag);
        if (cbp->bio_data == NULL) {
                pbp->bio_children--;
                g_destroy_bio(cbp);
                return (NULL);
        }
        G_RAID3_NEXT_BIO(cbp) = NULL;
        if (G_RAID3_HEAD_BIO(pbp) == NULL)
                G_RAID3_HEAD_BIO(pbp) = cbp;
        else {
                G_RAID3_FOREACH_BIO(pbp, bp) {
                        if (G_RAID3_NEXT_BIO(bp) == NULL) {
                                G_RAID3_NEXT_BIO(bp) = cbp;
                                break;
                        }
                }
        }
        return (cbp);
}

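/*
 * Split a write over the components: walking the children in list order,
 * each child receives one "atom" (sectorsize / (sc_ndisks - 1) bytes) of
 * every parent sector, and the parity child, unless NOPARITY is set, is
 * built as the XOR of all data children before the requests are sent.
 */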
static void
g_raid3_scatter(struct bio *pbp)
{
        struct g_raid3_softc *sc;
        struct g_raid3_disk *disk;
        struct bio *bp, *cbp, *tmpbp;
        off_t atom, cadd, padd, left;
        int first;

        sc = pbp->bio_to->geom->softc;
        bp = NULL;
        if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
                /*
                 * Find bio for which we should calculate data.
                 */
                G_RAID3_FOREACH_BIO(pbp, cbp) {
                        if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
                                bp = cbp;
                                break;
                        }
                }
                KASSERT(bp != NULL, ("NULL parity bio."));
        }
        atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
        cadd = padd = 0;
        for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
                G_RAID3_FOREACH_BIO(pbp, cbp) {
                        if (cbp == bp)
                                continue;
                        bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
                        padd += atom;
                }
                cadd += atom;
        }
        if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
                /*
                 * Calculate parity.
                 */
                first = 1;
                G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
                        if (cbp == bp)
                                continue;
                        if (first) {
                                bcopy(cbp->bio_data, bp->bio_data,
                                    bp->bio_length);
                                first = 0;
                        } else {
                                g_raid3_xor(cbp->bio_data, bp->bio_data,
                                    bp->bio_length);
                        }
                        if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
                                g_raid3_destroy_bio(sc, cbp);
                }
        }
        G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
                struct g_consumer *cp;

                disk = cbp->bio_caller2;
                cp = disk->d_consumer;
                cbp->bio_to = cp->provider;
                G_RAID3_LOGREQ(3, cbp, "Sending request.");
                KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
                    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
                    cp->acr, cp->acw, cp->ace));
                cp->index++;
                sc->sc_writes++;
                g_io_request(cbp, cp);
        }
}

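/*
 * Complete a parent READ: if exactly one child failed, or the extra
 * parity request is present, reconstruct the missing data by XOR-ing the
 * remaining children, then interleave the child buffers back into the
 * parent's buffer and deliver it.
 */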
static void
g_raid3_gather(struct bio *pbp)
{
        struct g_raid3_softc *sc;
        struct g_raid3_disk *disk;
        struct bio *xbp, *fbp, *cbp;
        off_t atom, cadd, padd, left;

        sc = pbp->bio_to->geom->softc;
        /*
         * Find the bio for which we have to calculate data.
         * While going through this path, check if all requests succeeded;
         * if not, deny the whole request.
         * If we're in COMPLETE mode, we allow one request to fail, and if
         * we find one, we resend it to the parity consumer.
         * If there is more than one failed request, we deny the whole
         * request.
         */
        xbp = fbp = NULL;
        G_RAID3_FOREACH_BIO(pbp, cbp) {
                if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
                        KASSERT(xbp == NULL, ("More than one parity bio."));
                        xbp = cbp;
                }
                if (cbp->bio_error == 0)
                        continue;
                /*
                 * Found failed request.
                 */
                if (fbp == NULL) {
                        if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
                                /*
                                 * We are already in degraded mode, so we can't
                                 * accept any failures.
                                 */
                                if (pbp->bio_error == 0)
                                        pbp->bio_error = cbp->bio_error;
                        } else {
                                fbp = cbp;
                        }
                } else {
                        /*
                         * Next failed request, that's too many.
                         */
                        if (pbp->bio_error == 0)
                                pbp->bio_error = fbp->bio_error;
                }
                disk = cbp->bio_caller2;
                if (disk == NULL)
                        continue;
                if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
                        disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
                        G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
                            cbp->bio_error);
                } else {
                        G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
                            cbp->bio_error);
                }
                if (g_raid3_disconnect_on_failure &&
                    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
                        sc->sc_bump_id |= G_RAID3_BUMP_GENID;
                        g_raid3_event_send(disk,
                            G_RAID3_DISK_STATE_DISCONNECTED,
                            G_RAID3_EVENT_DONTWAIT);
                }
        }
        if (pbp->bio_error != 0)
                goto finish;
        if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
                pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
                if (xbp != fbp)
                        g_raid3_replace_bio(xbp, fbp);
                g_raid3_destroy_bio(sc, fbp);
        } else if (fbp != NULL) {
                struct g_consumer *cp;

                /*
                 * One request failed, so send the same request to
                 * the parity consumer.
                 */
                disk = pbp->bio_driver2;
                if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
                        pbp->bio_error = fbp->bio_error;
                        goto finish;
                }
                pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
                pbp->bio_inbed--;
                fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
                if (disk->d_no == sc->sc_ndisks - 1)
                        fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
                fbp->bio_error = 0;
                fbp->bio_completed = 0;
                fbp->bio_children = 0;
                fbp->bio_inbed = 0;
                cp = disk->d_consumer;
                fbp->bio_caller2 = disk;
                fbp->bio_to = cp->provider;
                G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
                KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
                    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
                    cp->acr, cp->acw, cp->ace));
                cp->index++;
                g_io_request(fbp, cp);
                return;
        }
        if (xbp != NULL) {
                /*
                 * Calculate parity.
                 */
                G_RAID3_FOREACH_BIO(pbp, cbp) {
                        if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
                                continue;
                        g_raid3_xor(cbp->bio_data, xbp->bio_data,
                            xbp->bio_length);
                }
                xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
                if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
                        if (!g_raid3_is_zero(xbp)) {
                                g_raid3_parity_mismatch++;
                                pbp->bio_error = EIO;
                                goto finish;
                        }
                        g_raid3_destroy_bio(sc, xbp);
                }
        }
        atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
        cadd = padd = 0;
        for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
                G_RAID3_FOREACH_BIO(pbp, cbp) {
                        bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
                        pbp->bio_completed += atom;
                        padd += atom;
                }
                cadd += atom;
        }
finish:
        if (pbp->bio_error == 0)
                G_RAID3_LOGREQ(3, pbp, "Request finished.");
        else {
                if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
                        G_RAID3_LOGREQ(1, pbp, "Verification error.");
                else
                        G_RAID3_LOGREQ(0, pbp, "Request failed.");
        }
        pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
        while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
                g_raid3_destroy_bio(sc, cbp);
        g_io_deliver(pbp, pbp->bio_error);
}

static void
g_raid3_done(struct bio *bp)
{
        struct g_raid3_softc *sc;

        sc = bp->bio_from->geom->softc;
        bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
        G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
        mtx_lock(&sc->sc_queue_mtx);
        bioq_insert_head(&sc->sc_queue, bp);
        mtx_unlock(&sc->sc_queue_mtx);
        wakeup(sc);
        wakeup(&sc->sc_queue);
}

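/*
 * Called for every completed child of a regular request.  Once all
 * children have come back, READs are handed to g_raid3_gather() while
 * WRITE/DELETE requests are finalized and delivered here.
 */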
static void
g_raid3_regular_request(struct bio *cbp)
{
        struct g_raid3_softc *sc;
        struct g_raid3_disk *disk;
        struct bio *pbp;

        g_topology_assert_not();

        pbp = cbp->bio_parent;
        sc = pbp->bio_to->geom->softc;
        cbp->bio_from->index--;
        if (cbp->bio_cmd == BIO_WRITE)
                sc->sc_writes--;
        disk = cbp->bio_from->private;
        if (disk == NULL) {
                g_topology_lock();
                g_raid3_kill_consumer(sc, cbp->bio_from);
                g_topology_unlock();
        }

        G_RAID3_LOGREQ(3, cbp, "Request finished.");
        pbp->bio_inbed++;
        KASSERT(pbp->bio_inbed <= pbp->bio_children,
            ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
            pbp->bio_children));
        if (pbp->bio_inbed != pbp->bio_children)
                return;
        switch (pbp->bio_cmd) {
        case BIO_READ:
                g_raid3_gather(pbp);
                break;
        case BIO_WRITE:
        case BIO_DELETE:
            {
                int error = 0;

                pbp->bio_completed = pbp->bio_length;
                while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
                        if (cbp->bio_error == 0) {
                                g_raid3_destroy_bio(sc, cbp);
                                continue;
                        }

                        if (error == 0)
                                error = cbp->bio_error;
                        else if (pbp->bio_error == 0) {
                                /*
                                 * Next failed request, that's too many.
                                 */
                                pbp->bio_error = error;
                        }

                        disk = cbp->bio_caller2;
                        if (disk == NULL) {
                                g_raid3_destroy_bio(sc, cbp);
                                continue;
                        }

                        if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
                                disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
                                G_RAID3_LOGREQ(0, cbp,
                                    "Request failed (error=%d).",
                                    cbp->bio_error);
                        } else {
                                G_RAID3_LOGREQ(1, cbp,
                                    "Request failed (error=%d).",
                                    cbp->bio_error);
                        }
                        if (g_raid3_disconnect_on_failure &&
                            sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
                                sc->sc_bump_id |= G_RAID3_BUMP_GENID;
                                g_raid3_event_send(disk,
                                    G_RAID3_DISK_STATE_DISCONNECTED,
                                    G_RAID3_EVENT_DONTWAIT);
                        }
                        g_raid3_destroy_bio(sc, cbp);
                }
                if (pbp->bio_error == 0)
                        G_RAID3_LOGREQ(3, pbp, "Request finished.");
                else
                        G_RAID3_LOGREQ(0, pbp, "Request failed.");
                pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
                pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
                bioq_remove(&sc->sc_inflight, pbp);
                /* Release delayed sync requests if possible. */
                g_raid3_sync_release(sc);
                g_io_deliver(pbp, pbp->bio_error);
                break;
            }
        }
}

static void
g_raid3_sync_done(struct bio *bp)
{
        struct g_raid3_softc *sc;

        G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
        sc = bp->bio_from->geom->softc;
        bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
        mtx_lock(&sc->sc_queue_mtx);
        bioq_insert_head(&sc->sc_queue, bp);
        mtx_unlock(&sc->sc_queue_mtx);
        wakeup(sc);
        wakeup(&sc->sc_queue);
}

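/*
 * BIO_FLUSH is not striped: the request is cloned once per active
 * component.  All clones are allocated before any is sent, so that an
 * allocation failure can still be reported to the caller as ENOMEM.
 */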
static void
g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
{
        struct bio_queue_head queue;
        struct g_raid3_disk *disk;
        struct g_consumer *cp;
        struct bio *cbp;
        u_int i;

        bioq_init(&queue);
        for (i = 0; i < sc->sc_ndisks; i++) {
                disk = &sc->sc_disks[i];
                if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
                        continue;
                cbp = g_clone_bio(bp);
                if (cbp == NULL) {
                        for (cbp = bioq_first(&queue); cbp != NULL;
                            cbp = bioq_first(&queue)) {
                                bioq_remove(&queue, cbp);
                                g_destroy_bio(cbp);
                        }
                        if (bp->bio_error == 0)
                                bp->bio_error = ENOMEM;
                        g_io_deliver(bp, bp->bio_error);
                        return;
                }
                bioq_insert_tail(&queue, cbp);
                cbp->bio_done = g_std_done;
                cbp->bio_caller1 = disk;
                cbp->bio_to = disk->d_consumer->provider;
        }
        for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
                bioq_remove(&queue, cbp);
                G_RAID3_LOGREQ(3, cbp, "Sending request.");
                disk = cbp->bio_caller1;
                cbp->bio_caller1 = NULL;
                cp = disk->d_consumer;
                KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
                    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
                    cp->acr, cp->acw, cp->ace));
                g_io_request(cbp, disk->d_consumer);
        }
}

static void
g_raid3_start(struct bio *bp)
{
        struct g_raid3_softc *sc;

        sc = bp->bio_to->geom->softc;
        /*
         * If sc == NULL or there are no valid disks, provider's error
         * should be set and g_raid3_start() should not be called at all.
         */
        KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
            sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
            ("Provider's error should be set (error=%d)(device=%s).",
            bp->bio_to->error, bp->bio_to->name));
        G_RAID3_LOGREQ(3, bp, "Request received.");

        switch (bp->bio_cmd) {
        case BIO_READ:
        case BIO_WRITE:
        case BIO_DELETE:
                break;
        case BIO_FLUSH:
                g_raid3_flush(sc, bp);
                return;
        case BIO_GETATTR:
        default:
                g_io_deliver(bp, EOPNOTSUPP);
                return;
        }
        mtx_lock(&sc->sc_queue_mtx);
        bioq_insert_tail(&sc->sc_queue, bp);
        mtx_unlock(&sc->sc_queue_mtx);
        G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
        wakeup(sc);
}

1470 /*
1471  * Return TRUE if the given request is colliding with an in-progress
1472  * synchronization request.
1473  */
1474 static int
1475 g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp)
1476 {
1477         struct g_raid3_disk *disk;
1478         struct bio *sbp;
1479         off_t rstart, rend, sstart, send;
1480         int i;
1481
1482         disk = sc->sc_syncdisk;
1483         if (disk == NULL)
1484                 return (0);
1485         rstart = bp->bio_offset;
1486         rend = bp->bio_offset + bp->bio_length;
1487         for (i = 0; i < g_raid3_syncreqs; i++) {
1488                 sbp = disk->d_sync.ds_bios[i];
1489                 if (sbp == NULL)
1490                         continue;
1491                 sstart = sbp->bio_offset;
1492                 send = sbp->bio_length;
1493                 if (sbp->bio_cmd == BIO_WRITE) {
1494                         sstart *= sc->sc_ndisks - 1;
1495                         send *= sc->sc_ndisks - 1;
1496                 }
1497                 send += sstart;
1498                 if (rend > sstart && rstart < send)
1499                         return (1);
1500         }
1501         return (0);
1502 }
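
/*
 * Worked example (assuming sc_ndisks == 3, i.e. two data components): a
 * synchronization WRITE at component offset 1024 with length 512 covers the
 * logical range [1024 * 2, 1024 * 2 + 512 * 2) = [2048, 3072), so a regular
 * request for [2560, 2816) collides with it and must be delayed.
 */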
1503
1504 /*
1505  * Return TRUE if the given sync request is colliding with an in-progress regular
1506  * request.
1507  */
1508 static int
1509 g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp)
1510 {
1511         off_t rstart, rend, sstart, send;
1512         struct bio *bp;
1513
1514         if (sc->sc_syncdisk == NULL)
1515                 return (0);
1516         sstart = sbp->bio_offset;
1517         send = sstart + sbp->bio_length;
1518         TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
1519                 rstart = bp->bio_offset;
1520                 rend = bp->bio_offset + bp->bio_length;
1521                 if (rend > sstart && rstart < send)
1522                         return (1);
1523         }
1524         return (0);
1525 }
1526
1527 /*
1528  * Puts the request onto the delayed queue.
1529  */
1530 static void
1531 g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp)
1532 {
1533
1534         G_RAID3_LOGREQ(2, bp, "Delaying request.");
1535         bioq_insert_head(&sc->sc_regular_delayed, bp);
1536 }
1537
1538 /*
1539  * Puts the synchronization request onto the delayed queue.
1540  */
1541 static void
1542 g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp)
1543 {
1544
1545         G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
1546         bioq_insert_tail(&sc->sc_sync_delayed, bp);
1547 }
1548
1549 /*
1550  * Releases delayed regular requests which no longer collide with sync
1551  * requests.
1552  */
1553 static void
1554 g_raid3_regular_release(struct g_raid3_softc *sc)
1555 {
1556         struct bio *bp, *bp2;
1557
1558         TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
1559                 if (g_raid3_sync_collision(sc, bp))
1560                         continue;
1561                 bioq_remove(&sc->sc_regular_delayed, bp);
1562                 G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
1563                 mtx_lock(&sc->sc_queue_mtx);
1564                 bioq_insert_head(&sc->sc_queue, bp);
1565 #if 0
1566                 /*
1567                  * wakeup() is not needed, because this function is called from
1568                  * the worker thread.
1569                  */
1570                 wakeup(&sc->sc_queue);
1571 #endif
1572                 mtx_unlock(&sc->sc_queue_mtx);
1573         }
1574 }
1575
1576 /*
1577  * Releases delayed sync requests which no longer collide with regular
1578  * requests.
1579  */
1580 static void
1581 g_raid3_sync_release(struct g_raid3_softc *sc)
1582 {
1583         struct bio *bp, *bp2;
1584
1585         TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
1586                 if (g_raid3_regular_collision(sc, bp))
1587                         continue;
1588                 bioq_remove(&sc->sc_sync_delayed, bp);
1589                 G_RAID3_LOGREQ(2, bp,
1590                     "Releasing delayed synchronization request.");
1591                 g_io_request(bp, bp->bio_from);
1592         }
1593 }
1594
1595 /*
1596  * Handle synchronization requests.
1597  * Every synchronization request is a two-step process: first, a READ request
1598  * is sent to the active provider and then a WRITE request (with the data just
1599  * read) is sent to the provider being synchronized.  When the WRITE is
1600  * finished, a new synchronization request is sent.
1601  */
1602 static void
1603 g_raid3_sync_request(struct bio *bp)
1604 {
1605         struct g_raid3_softc *sc;
1606         struct g_raid3_disk *disk;
1607
1608         bp->bio_from->index--;
1609         sc = bp->bio_from->geom->softc;
1610         disk = bp->bio_from->private;
1611         if (disk == NULL) {
1612                 sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
1613                 g_topology_lock();
1614                 g_raid3_kill_consumer(sc, bp->bio_from);
1615                 g_topology_unlock();
1616                 free(bp->bio_data, M_RAID3);
1617                 g_destroy_bio(bp);
1618                 sx_xlock(&sc->sc_lock);
1619                 return;
1620         }
1621
1622         /*
1623          * Synchronization request.
1624          */
1625         switch (bp->bio_cmd) {
1626         case BIO_READ:
1627             {
1628                 struct g_consumer *cp;
1629                 u_char *dst, *src;
1630                 off_t left;
1631                 u_int atom;
1632
1633                 if (bp->bio_error != 0) {
1634                         G_RAID3_LOGREQ(0, bp,
1635                             "Synchronization request failed (error=%d).",
1636                             bp->bio_error);
1637                         g_destroy_bio(bp);
1638                         return;
1639                 }
1640                 G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1641                 atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1642                 dst = src = bp->bio_data;
1643                 if (disk->d_no == sc->sc_ndisks - 1) {
1644                         u_int n;
1645
1646                         /* Parity component. */
1647                         for (left = bp->bio_length; left > 0;
1648                             left -= sc->sc_sectorsize) {
1649                                 bcopy(src, dst, atom);
1650                                 src += atom;
1651                                 for (n = 1; n < sc->sc_ndisks - 1; n++) {
1652                                         g_raid3_xor(src, dst, atom);
1653                                         src += atom;
1654                                 }
1655                                 dst += atom;
1656                         }
1657                 } else {
1658                         /* Regular component. */
1659                         src += atom * disk->d_no;
1660                         for (left = bp->bio_length; left > 0;
1661                             left -= sc->sc_sectorsize) {
1662                                 bcopy(src, dst, atom);
1663                                 src += sc->sc_sectorsize;
1664                                 dst += atom;
1665                         }
1666                 }
1667                 bp->bio_driver1 = bp->bio_driver2 = NULL;
1668                 bp->bio_pflags = 0;
1669                 bp->bio_offset /= sc->sc_ndisks - 1;
1670                 bp->bio_length /= sc->sc_ndisks - 1;
1671                 bp->bio_cmd = BIO_WRITE;
1672                 bp->bio_cflags = 0;
1673                 bp->bio_children = bp->bio_inbed = 0;
1674                 cp = disk->d_consumer;
1675                 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1676                     ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1677                     cp->acr, cp->acw, cp->ace));
1678                 cp->index++;
1679                 g_io_request(bp, cp);
1680                 return;
1681             }
1682         case BIO_WRITE:
1683             {
1684                 struct g_raid3_disk_sync *sync;
1685                 off_t boffset, moffset;
1686                 void *data;
1687                 int i;
1688
1689                 if (bp->bio_error != 0) {
1690                         G_RAID3_LOGREQ(0, bp,
1691                             "Synchronization request failed (error=%d).",
1692                             bp->bio_error);
1693                         g_destroy_bio(bp);
1694                         sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1695                         g_raid3_event_send(disk,
1696                             G_RAID3_DISK_STATE_DISCONNECTED,
1697                             G_RAID3_EVENT_DONTWAIT);
1698                         return;
1699                 }
1700                 G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1701                 sync = &disk->d_sync;
1702                 if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
1703                     sync->ds_consumer == NULL ||
1704                     (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1705                         /* Don't send more synchronization requests. */
1706                         sync->ds_inflight--;
1707                         if (sync->ds_bios != NULL) {
1708                                 i = (int)(uintptr_t)bp->bio_caller1;
1709                                 sync->ds_bios[i] = NULL;
1710                         }
1711                         free(bp->bio_data, M_RAID3);
1712                         g_destroy_bio(bp);
1713                         if (sync->ds_inflight > 0)
1714                                 return;
1715                         if (sync->ds_consumer == NULL ||
1716                             (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1717                                 return;
1718                         }
1719                         /*
1720                          * Disk up-to-date, activate it.
1721                          */
1722                         g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
1723                             G_RAID3_EVENT_DONTWAIT);
1724                         return;
1725                 }
1726
1727                 /* Send next synchronization request. */
1728                 data = bp->bio_data;
1729                 bzero(bp, sizeof(*bp));
1730                 bp->bio_cmd = BIO_READ;
1731                 bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
1732                 bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1733                 sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1734                 bp->bio_done = g_raid3_sync_done;
1735                 bp->bio_data = data;
1736                 bp->bio_from = sync->ds_consumer;
1737                 bp->bio_to = sc->sc_provider;
1738                 G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1739                 sync->ds_consumer->index++;
1740                 /*
1741                  * Delay the request if it is colliding with a regular request.
1742                  */
1743                 if (g_raid3_regular_collision(sc, bp))
1744                         g_raid3_sync_delay(sc, bp);
1745                 else
1746                         g_io_request(bp, sync->ds_consumer);
1747
1748                 /* Release delayed requests if possible. */
1749                 g_raid3_regular_release(sc);
1750
1751                 /* Find the smallest offset. */
1752                 moffset = sc->sc_mediasize;
1753                 for (i = 0; i < g_raid3_syncreqs; i++) {
1754                         bp = sync->ds_bios[i];
1755                         boffset = bp->bio_offset;
1756                         if (bp->bio_cmd == BIO_WRITE)
1757                                 boffset *= sc->sc_ndisks - 1;
1758                         if (boffset < moffset)
1759                                 moffset = boffset;
1760                 }
1761                 if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
1762                         /* Update offset_done every 100 * MAXPHYS bytes. */
1763                         sync->ds_offset_done = moffset;
1764                         g_raid3_update_metadata(disk);
1765                 }
1766                 return;
1767             }
1768         default:
1769                 KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
1770                     bp->bio_cmd, sc->sc_name));
1771                 break;
1772         }
1773 }
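
/*
 * Illustrative sketch (kept under #if 0; userland-style C with hypothetical
 * names): how RAID-3 splits one sector into "atoms" across the data
 * components and derives the parity component as their byte-wise XOR.  This
 * is the layout the BIO_READ case above reassembles while synchronizing a
 * component: any single missing atom equals the XOR of all the others.
 */
#if 0
#include <string.h>

#define SECTORSIZE      512
#define NDISKS          3                       /* Two data + one parity. */
#define ATOM            (SECTORSIZE / (NDISKS - 1))

static void
raid3_encode_sector(const unsigned char *sector,
    unsigned char comp[NDISKS][ATOM])
{
        unsigned int n, i;

        /* Data component n holds bytes [n * ATOM, (n + 1) * ATOM). */
        for (n = 0; n < NDISKS - 1; n++)
                memcpy(comp[n], sector + n * ATOM, ATOM);
        /* The parity component is the XOR of all data components. */
        memcpy(comp[NDISKS - 1], comp[0], ATOM);
        for (n = 1; n < NDISKS - 1; n++)
                for (i = 0; i < ATOM; i++)
                        comp[NDISKS - 1][i] ^= comp[n][i];
}
#endif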
1774
1775 static int
1776 g_raid3_register_request(struct bio *pbp)
1777 {
1778         struct g_raid3_softc *sc;
1779         struct g_raid3_disk *disk;
1780         struct g_consumer *cp;
1781         struct bio *cbp, *tmpbp;
1782         off_t offset, length;
1783         u_int n, ndisks;
1784         int round_robin, verify;
1785
1786         ndisks = 0;
1787         sc = pbp->bio_to->geom->softc;
1788         if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
1789             sc->sc_syncdisk == NULL) {
1790                 g_io_deliver(pbp, EIO);
1791                 return (0);
1792         }
1793         g_raid3_init_bio(pbp);
1794         length = pbp->bio_length / (sc->sc_ndisks - 1);
1795         offset = pbp->bio_offset / (sc->sc_ndisks - 1);
1796         round_robin = verify = 0;
1797         switch (pbp->bio_cmd) {
1798         case BIO_READ:
1799                 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
1800                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1801                         pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
1802                         verify = 1;
1803                         ndisks = sc->sc_ndisks;
1804                 } else {
1805                         verify = 0;
1806                         ndisks = sc->sc_ndisks - 1;
1807                 }
1808                 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
1809                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1810                         round_robin = 1;
1811                 } else {
1812                         round_robin = 0;
1813                 }
1814                 KASSERT(!round_robin || !verify,
1815                     ("ROUND-ROBIN and VERIFY are mutually exclusive."));
1816                 pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
1817                 break;
1818         case BIO_WRITE:
1819         case BIO_DELETE:
1820                 /*
1821                  * Delay the request if it is colliding with a synchronization
1822                  * request.
1823                  */
1824                 if (g_raid3_sync_collision(sc, pbp)) {
1825                         g_raid3_regular_delay(sc, pbp);
1826                         return (0);
1827                 }
1828
1829                 if (sc->sc_idle)
1830                         g_raid3_unidle(sc);
1831                 else
1832                         sc->sc_last_write = time_uptime;
1833
1834                 ndisks = sc->sc_ndisks;
1835                 break;
1836         }
1837         for (n = 0; n < ndisks; n++) {
1838                 disk = &sc->sc_disks[n];
1839                 cbp = g_raid3_clone_bio(sc, pbp);
1840                 if (cbp == NULL) {
1841                         while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1842                                 g_raid3_destroy_bio(sc, cbp);
1843                         /*
1844                          * To prevent deadlock, we must pass the ENOMEM
1845                          * back up for failed requests from any of our
1846                          * consumers.  Our own sync requests can stick
1847                          * around, as they are finite.
1848                          */
1849                         if ((pbp->bio_cflags &
1850                             G_RAID3_BIO_CFLAG_REGULAR) != 0) {
1851                                 g_io_deliver(pbp, ENOMEM);
1852                                 return (0);
1853                         }
1854                         return (ENOMEM);
1855                 }
1856                 cbp->bio_offset = offset;
1857                 cbp->bio_length = length;
1858                 cbp->bio_done = g_raid3_done;
1859                 switch (pbp->bio_cmd) {
1860                 case BIO_READ:
1861                         if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1862                                 /*
1863                                  * Replace invalid component with the parity
1864                                  * component.
1865                                  */
1866                                 disk = &sc->sc_disks[sc->sc_ndisks - 1];
1867                                 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1868                                 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1869                         } else if (round_robin &&
1870                             disk->d_no == sc->sc_round_robin) {
1871                                 /*
1872                                  * In round-robin mode skip one data component
1873                                  * and use the parity component when reading.
1874                                  */
1875                                 pbp->bio_driver2 = disk;
1876                                 disk = &sc->sc_disks[sc->sc_ndisks - 1];
1877                                 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1878                                 sc->sc_round_robin++;
1879                                 round_robin = 0;
1880                         } else if (verify && disk->d_no == sc->sc_ndisks - 1) {
1881                                 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1882                         }
1883                         break;
1884                 case BIO_WRITE:
1885                 case BIO_DELETE:
1886                         if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
1887                             disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
1888                                 if (n == ndisks - 1) {
1889                                         /*
1890                                          * Active parity component, mark it as such.
1891                                          */
1892                                         cbp->bio_cflags |=
1893                                             G_RAID3_BIO_CFLAG_PARITY;
1894                                 }
1895                         } else {
1896                                 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1897                                 if (n == ndisks - 1) {
1898                                         /*
1899                                          * Parity component is not connected,
1900                                          * so destroy its request.
1901                                          */
1902                                         pbp->bio_pflags |=
1903                                             G_RAID3_BIO_PFLAG_NOPARITY;
1904                                         g_raid3_destroy_bio(sc, cbp);
1905                                         cbp = NULL;
1906                                 } else {
1907                                         cbp->bio_cflags |=
1908                                             G_RAID3_BIO_CFLAG_NODISK;
1909                                         disk = NULL;
1910                                 }
1911                         }
1912                         break;
1913                 }
1914                 if (cbp != NULL)
1915                         cbp->bio_caller2 = disk;
1916         }
1917         switch (pbp->bio_cmd) {
1918         case BIO_READ:
1919                 if (round_robin) {
1920                         /*
1921                          * If we are in round-robin mode and 'round_robin' is
1922                          * still 1, it means that we skipped the parity component
1923                          * for this read and must reset the sc_round_robin field.
1924                          */
1925                         sc->sc_round_robin = 0;
1926                 }
1927                 G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1928                         disk = cbp->bio_caller2;
1929                         cp = disk->d_consumer;
1930                         cbp->bio_to = cp->provider;
1931                         G_RAID3_LOGREQ(3, cbp, "Sending request.");
1932                         KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1933                             ("Consumer %s not opened (r%dw%de%d).",
1934                             cp->provider->name, cp->acr, cp->acw, cp->ace));
1935                         cp->index++;
1936                         g_io_request(cbp, cp);
1937                 }
1938                 break;
1939         case BIO_WRITE:
1940         case BIO_DELETE:
1941                 /*
1942                  * Put the request onto the inflight queue, so we can check
1943                  * whether new synchronization requests collide with it.
1944                  */
1945                 bioq_insert_tail(&sc->sc_inflight, pbp);
1946
1947                 /*
1948                  * Bump syncid on first write.
1949                  */
1950                 if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
1951                         sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
1952                         g_raid3_bump_syncid(sc);
1953                 }
1954                 g_raid3_scatter(pbp);
1955                 break;
1956         }
1957         return (0);
1958 }
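
/*
 * Worked example of the offset/length scaling above (assuming sc_ndisks ==
 * 5, i.e. four data components): a 4096-byte request at logical offset 8192
 * becomes a 1024-byte request at offset 2048 on every component; for writes,
 * g_raid3_scatter() supplies the fifth (parity) component with the XOR of
 * the four data pieces.
 */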
1959
1960 static int
1961 g_raid3_can_destroy(struct g_raid3_softc *sc)
1962 {
1963         struct g_geom *gp;
1964         struct g_consumer *cp;
1965
1966         g_topology_assert();
1967         gp = sc->sc_geom;
1968         if (gp->softc == NULL)
1969                 return (1);
1970         LIST_FOREACH(cp, &gp->consumer, consumer) {
1971                 if (g_raid3_is_busy(sc, cp))
1972                         return (0);
1973         }
1974         gp = sc->sc_sync.ds_geom;
1975         LIST_FOREACH(cp, &gp->consumer, consumer) {
1976                 if (g_raid3_is_busy(sc, cp))
1977                         return (0);
1978         }
1979         G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1980             sc->sc_name);
1981         return (1);
1982 }
1983
1984 static int
1985 g_raid3_try_destroy(struct g_raid3_softc *sc)
1986 {
1987
1988         g_topology_assert_not();
1989         sx_assert(&sc->sc_lock, SX_XLOCKED);
1990
1991         if (sc->sc_rootmount != NULL) {
1992                 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
1993                     sc->sc_rootmount);
1994                 root_mount_rel(sc->sc_rootmount);
1995                 sc->sc_rootmount = NULL;
1996         }
1997
1998         g_topology_lock();
1999         if (!g_raid3_can_destroy(sc)) {
2000                 g_topology_unlock();
2001                 return (0);
2002         }
2003         sc->sc_geom->softc = NULL;
2004         sc->sc_sync.ds_geom->softc = NULL;
2005         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
2006                 g_topology_unlock();
2007                 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
2008                     &sc->sc_worker);
2009                 /* Unlock sc_lock here, as it can be destroyed after wakeup. */
2010                 sx_xunlock(&sc->sc_lock);
2011                 wakeup(&sc->sc_worker);
2012                 sc->sc_worker = NULL;
2013         } else {
2014                 g_topology_unlock();
2015                 g_raid3_destroy_device(sc);
2016                 free(sc->sc_disks, M_RAID3);
2017                 free(sc, M_RAID3);
2018         }
2019         return (1);
2020 }
2021
2022 /*
2023  * Worker thread.
2024  */
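/*
 * The loop below services, in strict order: state events (which must be
 * handled before any I/O) and then queued I/O requests.  With an empty
 * queue it sleeps for at most the idle timeout computed by g_raid3_idle().
 * When g_raid3_register_request() fails with ENOMEM, the request is
 * re-queued and the loop first processes any completed regular or
 * synchronization requests, which may free memory, before retrying.
 */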
2025 static void
2026 g_raid3_worker(void *arg)
2027 {
2028         struct g_raid3_softc *sc;
2029         struct g_raid3_event *ep;
2030         struct bio *bp;
2031         int timeout;
2032
2033         sc = arg;
2034         thread_lock(curthread);
2035         sched_prio(curthread, PRIBIO);
2036         thread_unlock(curthread);
2037
2038         sx_xlock(&sc->sc_lock);
2039         for (;;) {
2040                 G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
2041                 /*
2042                  * First take a look at events.
2043                  * It is important to handle events before any I/O requests.
2044                  */
2045                 ep = g_raid3_event_get(sc);
2046                 if (ep != NULL) {
2047                         g_raid3_event_remove(sc, ep);
2048                         if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
2049                                 /* Update only device status. */
2050                                 G_RAID3_DEBUG(3,
2051                                     "Running event for device %s.",
2052                                     sc->sc_name);
2053                                 ep->e_error = 0;
2054                                 g_raid3_update_device(sc, 1);
2055                         } else {
2056                                 /* Update disk status. */
2057                                 G_RAID3_DEBUG(3, "Running event for disk %s.",
2058                                      g_raid3_get_diskname(ep->e_disk));
2059                                 ep->e_error = g_raid3_update_disk(ep->e_disk,
2060                                     ep->e_state);
2061                                 if (ep->e_error == 0)
2062                                         g_raid3_update_device(sc, 0);
2063                         }
2064                         if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
2065                                 KASSERT(ep->e_error == 0,
2066                                     ("Error cannot be handled."));
2067                                 g_raid3_event_free(ep);
2068                         } else {
2069                                 ep->e_flags |= G_RAID3_EVENT_DONE;
2070                                 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
2071                                     ep);
2072                                 mtx_lock(&sc->sc_events_mtx);
2073                                 wakeup(ep);
2074                                 mtx_unlock(&sc->sc_events_mtx);
2075                         }
2076                         if ((sc->sc_flags &
2077                             G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2078                                 if (g_raid3_try_destroy(sc)) {
2079                                         curthread->td_pflags &= ~TDP_GEOM;
2080                                         G_RAID3_DEBUG(1, "Thread exiting.");
2081                                         kproc_exit(0);
2082                                 }
2083                         }
2084                         G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
2085                         continue;
2086                 }
2087                 /*
2088                  * Check whether we can mark the array as CLEAN and, if we
2089                  * cannot, how many seconds we should wait.
2090                  */
2091                 timeout = g_raid3_idle(sc, -1);
2092                 /*
2093                  * Now I/O requests.
2094                  */
2095                 /* Get first request from the queue. */
2096                 mtx_lock(&sc->sc_queue_mtx);
2097                 bp = bioq_first(&sc->sc_queue);
2098                 if (bp == NULL) {
2099                         if ((sc->sc_flags &
2100                             G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2101                                 mtx_unlock(&sc->sc_queue_mtx);
2102                                 if (g_raid3_try_destroy(sc)) {
2103                                         curthread->td_pflags &= ~TDP_GEOM;
2104                                         G_RAID3_DEBUG(1, "Thread exiting.");
2105                                         kproc_exit(0);
2106                                 }
2107                                 mtx_lock(&sc->sc_queue_mtx);
2108                         }
2109                         sx_xunlock(&sc->sc_lock);
2110                         /*
2111                          * XXX: We can miss an event here, because an event
2112                          *      can be added without sx-device-lock and without
2113                          *      mtx-queue-lock. Maybe I should just stop using a
2114                          *      dedicated mutex for event synchronization and
2115                          *      stick with the queue lock?
2116                          *      The event will hang here until the next I/O
2117                          *      request or the next event is received.
2118                          */
2119                         MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
2120                             timeout * hz);
2121                         sx_xlock(&sc->sc_lock);
2122                         G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
2123                         continue;
2124                 }
2125 process:
2126                 bioq_remove(&sc->sc_queue, bp);
2127                 mtx_unlock(&sc->sc_queue_mtx);
2128
2129                 if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
2130                     (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
2131                         g_raid3_sync_request(bp);       /* READ */
2132                 } else if (bp->bio_to != sc->sc_provider) {
2133                         if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
2134                                 g_raid3_regular_request(bp);
2135                         else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
2136                                 g_raid3_sync_request(bp);       /* WRITE */
2137                         else {
2138                                 KASSERT(0,
2139                                     ("Invalid request cflags=0x%hhx to=%s.",
2140                                     bp->bio_cflags, bp->bio_to->name));
2141                         }
2142                 } else if (g_raid3_register_request(bp) != 0) {
2143                         mtx_lock(&sc->sc_queue_mtx);
2144                         bioq_insert_head(&sc->sc_queue, bp);
2145                         /*
2146                          * We are short on memory, so let's see if there are
2147                          * finished requests we can free.
2148                          */
2149                         TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
2150                                 if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
2151                                         goto process;
2152                         }
2153                         /*
2154                          * No finished regular requests, so at least keep
2155                          * synchronization running.
2156                          */
2157                         TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
2158                                 if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
2159                                         goto process;
2160                         }
2161                         sx_xunlock(&sc->sc_lock);
2162                         MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
2163                             "r3:lowmem", hz / 10);
2164                         sx_xlock(&sc->sc_lock);
2165                 }
2166                 G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
2167         }
2168 }
2169
2170 static void
2171 g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
2172 {
2173
2174         sx_assert(&sc->sc_lock, SX_LOCKED);
2175         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
2176                 return;
2177         if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
2178                 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2179                     g_raid3_get_diskname(disk), sc->sc_name);
2180                 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2181         } else if (sc->sc_idle &&
2182             (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
2183                 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2184                     g_raid3_get_diskname(disk), sc->sc_name);
2185                 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2186         }
2187 }
2188
2189 static void
2190 g_raid3_sync_start(struct g_raid3_softc *sc)
2191 {
2192         struct g_raid3_disk *disk;
2193         struct g_consumer *cp;
2194         struct bio *bp;
2195         int error;
2196         u_int n;
2197
2198         g_topology_assert_not();
2199         sx_assert(&sc->sc_lock, SX_XLOCKED);
2200
2201         KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
2202             ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
2203             sc->sc_state));
2204         KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
2205             sc->sc_name, sc->sc_state));
2206         disk = NULL;
2207         for (n = 0; n < sc->sc_ndisks; n++) {
2208                 if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
2209                         continue;
2210                 disk = &sc->sc_disks[n];
2211                 break;
2212         }
2213         if (disk == NULL)
2214                 return;
2215
2216         sx_xunlock(&sc->sc_lock);
2217         g_topology_lock();
2218         cp = g_new_consumer(sc->sc_sync.ds_geom);
2219         error = g_attach(cp, sc->sc_provider);
2220         KASSERT(error == 0,
2221             ("Cannot attach to %s (error=%d).", sc->sc_name, error));
2222         error = g_access(cp, 1, 0, 0);
2223         KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
2224         g_topology_unlock();
2225         sx_xlock(&sc->sc_lock);
2226
2227         G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
2228             g_raid3_get_diskname(disk));
2229         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
2230                 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2231         KASSERT(disk->d_sync.ds_consumer == NULL,
2232             ("Sync consumer already exists (device=%s, disk=%s).",
2233             sc->sc_name, g_raid3_get_diskname(disk)));
2234
2235         disk->d_sync.ds_consumer = cp;
2236         disk->d_sync.ds_consumer->private = disk;
2237         disk->d_sync.ds_consumer->index = 0;
2238         sc->sc_syncdisk = disk;
2239
2240         /*
2241          * Allocate memory for synchronization bios and initialize them.
2242          */
2243         disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
2244             M_RAID3, M_WAITOK);
2245         for (n = 0; n < g_raid3_syncreqs; n++) {
2246                 bp = g_alloc_bio();
2247                 disk->d_sync.ds_bios[n] = bp;
2248                 bp->bio_parent = NULL;
2249                 bp->bio_cmd = BIO_READ;
2250                 bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
2251                 bp->bio_cflags = 0;
2252                 bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
2253                 bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
2254                 disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
2255                 bp->bio_done = g_raid3_sync_done;
2256                 bp->bio_from = disk->d_sync.ds_consumer;
2257                 bp->bio_to = sc->sc_provider;
2258                 bp->bio_caller1 = (void *)(uintptr_t)n;
2259         }
2260
2261         /* Set the number of in-flight synchronization requests. */
2262         disk->d_sync.ds_inflight = g_raid3_syncreqs;
2263
2264         /*
2265          * Fire off first synchronization requests.
2266          */
2267         for (n = 0; n < g_raid3_syncreqs; n++) {
2268                 bp = disk->d_sync.ds_bios[n];
2269                 G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
2270                 disk->d_sync.ds_consumer->index++;
2271                 /*
2272                  * Delay the request if it is colliding with a regular request.
2273                  */
2274                 if (g_raid3_regular_collision(sc, bp))
2275                         g_raid3_sync_delay(sc, bp);
2276                 else
2277                         g_io_request(bp, disk->d_sync.ds_consumer);
2278         }
2279 }
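
/*
 * Example of the pipeline primed above (assuming g_raid3_syncreqs == 2 and
 * ds_offset == 0): bio[0] is set up for the logical range [0, MAXPHYS) and
 * bio[1] for [MAXPHYS, 2 * MAXPHYS); as each WRITE completes,
 * g_raid3_sync_request() reinitializes the bio for the next chunk, keeping
 * g_raid3_syncreqs requests in flight until the whole media is covered.
 */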
2280
2281 /*
2282  * Stop the synchronization process.
2283  * type: 0 - synchronization finished
2284  *       1 - synchronization stopped
2285  */
2286 static void
2287 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
2288 {
2289         struct g_raid3_disk *disk;
2290         struct g_consumer *cp;
2291
2292         g_topology_assert_not();
2293         sx_assert(&sc->sc_lock, SX_LOCKED);
2294
2295         KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
2296             ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
2297             sc->sc_state));
2298         disk = sc->sc_syncdisk;
2299         sc->sc_syncdisk = NULL;
2300         KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
2301         KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2302             ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2303             g_raid3_disk_state2str(disk->d_state)));
2304         if (disk->d_sync.ds_consumer == NULL)
2305                 return;
2306
2307         if (type == 0) {
2308                 G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
2309                     sc->sc_name, g_raid3_get_diskname(disk));
2310         } else /* if (type == 1) */ {
2311                 G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
2312                     sc->sc_name, g_raid3_get_diskname(disk));
2313         }
2314         free(disk->d_sync.ds_bios, M_RAID3);
2315         disk->d_sync.ds_bios = NULL;
2316         cp = disk->d_sync.ds_consumer;
2317         disk->d_sync.ds_consumer = NULL;
2318         disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2319         sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
2320         g_topology_lock();
2321         g_raid3_kill_consumer(sc, cp);
2322         g_topology_unlock();
2323         sx_xlock(&sc->sc_lock);
2324 }
2325
2326 static void
2327 g_raid3_launch_provider(struct g_raid3_softc *sc)
2328 {
2329         struct g_provider *pp;
2330         struct g_raid3_disk *disk;
2331         int n;
2332
2333         sx_assert(&sc->sc_lock, SX_LOCKED);
2334
2335         g_topology_lock();
2336         pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2337         pp->mediasize = sc->sc_mediasize;
2338         pp->sectorsize = sc->sc_sectorsize;
2339         pp->stripesize = 0;
2340         pp->stripeoffset = 0;
2341         for (n = 0; n < sc->sc_ndisks; n++) {
2342                 disk = &sc->sc_disks[n];
2343                 if (disk->d_consumer && disk->d_consumer->provider &&
2344                     disk->d_consumer->provider->stripesize > pp->stripesize) {
2345                         pp->stripesize = disk->d_consumer->provider->stripesize;
2346                         pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
2347                 }
2348         }
2349         pp->stripesize *= sc->sc_ndisks - 1;
2350         pp->stripeoffset *= sc->sc_ndisks - 1;
2351         sc->sc_provider = pp;
2352         g_error_provider(pp, 0);
2353         g_topology_unlock();
2354         G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
2355             g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);
2356
2357         if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2358                 g_raid3_sync_start(sc);
2359 }
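
/*
 * Example of the stripesize scaling above: with three components (two data
 * plus parity) whose providers advertise a 64kB stripe, the raid3 provider
 * advertises a 128kB stripe, since one logical stripe spans a stripe on
 * each of the two data components.
 */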
2360
2361 static void
2362 g_raid3_destroy_provider(struct g_raid3_softc *sc)
2363 {
2364         struct bio *bp;
2365
2366         g_topology_assert_not();
2367         KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2368             sc->sc_name));
2369
2370         g_topology_lock();
2371         g_error_provider(sc->sc_provider, ENXIO);
2372         mtx_lock(&sc->sc_queue_mtx);
2373         while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2374                 bioq_remove(&sc->sc_queue, bp);
2375                 g_io_deliver(bp, ENXIO);
2376         }
2377         mtx_unlock(&sc->sc_queue_mtx);
2378         G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2379             sc->sc_provider->name);
2380         sc->sc_provider->flags |= G_PF_WITHER;
2381         g_orphan_provider(sc->sc_provider, ENXIO);
2382         g_topology_unlock();
2383         sc->sc_provider = NULL;
2384         if (sc->sc_syncdisk != NULL)
2385                 g_raid3_sync_stop(sc, 1);
2386 }
2387
2388 static void
2389 g_raid3_go(void *arg)
2390 {
2391         struct g_raid3_softc *sc;
2392
2393         sc = arg;
2394         G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2395         g_raid3_event_send(sc, 0,
2396             G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2397 }
2398
2399 static u_int
2400 g_raid3_determine_state(struct g_raid3_disk *disk)
2401 {
2402         struct g_raid3_softc *sc;
2403         u_int state;
2404
2405         sc = disk->d_softc;
2406         if (sc->sc_syncid == disk->d_sync.ds_syncid) {
2407                 if ((disk->d_flags &
2408                     G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
2409                         /* Disk does not need synchronization. */
2410                         state = G_RAID3_DISK_STATE_ACTIVE;
2411                 } else {
2412                         if ((sc->sc_flags &
2413                              G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2414                             (disk->d_flags &
2415                              G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2416                                 /*
2417                                  * We can start synchronization from
2418                                  * the stored offset.
2419                                  */
2420                                 state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2421                         } else {
2422                                 state = G_RAID3_DISK_STATE_STALE;
2423                         }
2424                 }
2425         } else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
2426                 /*
2427                  * Reset all synchronization data for this disk,
2428                  * because even if it was synchronized, it was
2429                  * synchronized against disks with a different syncid.
2430                  */
2431                 disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2432                 disk->d_sync.ds_offset = 0;
2433                 disk->d_sync.ds_offset_done = 0;
2434                 disk->d_sync.ds_syncid = sc->sc_syncid;
2435                 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2436                     (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2437                         state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2438                 } else {
2439                         state = G_RAID3_DISK_STATE_STALE;
2440                 }
2441         } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
2442                 /*
2443                  * Not good, NOT GOOD!
2444                  * It means that the device was started on stale disks
2445                  * and a fresher disk has just arrived.
2446                  * If there were writes, the device is broken, sorry.
2447                  * I think the best choice here is not to touch
2448                  * this disk and to inform the user loudly.
2449                  */
2450                 G_RAID3_DEBUG(0, "Device %s was started before the freshest "
2451                     "disk (%s) arrives!! It will not be connected to the "
2452                     "running device.", sc->sc_name,
2453                     g_raid3_get_diskname(disk));
2454                 g_raid3_destroy_disk(disk);
2455                 state = G_RAID3_DISK_STATE_NONE;
2456                 /* Return immediately, because disk was destroyed. */
2457                 return (state);
2458         }
2459         G_RAID3_DEBUG(3, "State for %s disk: %s.",
2460             g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
2461         return (state);
2462 }
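
/*
 * Summary of the decision above:
 *
 *   ds_syncid == sc_syncid, not SYNCHRONIZING              -> ACTIVE
 *   ds_syncid == sc_syncid, SYNCHRONIZING,
 *       autosync or FORCE_SYNC                             -> SYNCHRONIZING
 *   ds_syncid == sc_syncid, SYNCHRONIZING, noautosync      -> STALE
 *   ds_syncid <  sc_syncid, autosync or FORCE_SYNC         -> SYNCHRONIZING
 *                                                             (from offset 0)
 *   ds_syncid <  sc_syncid, noautosync                     -> STALE
 *   ds_syncid >  sc_syncid                                 -> disk destroyed
 */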
2463
2464 /*
2465  * Update device state.
2466  */
2467 static void
2468 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
2469 {
2470         struct g_raid3_disk *disk;
2471         u_int state;
2472
2473         sx_assert(&sc->sc_lock, SX_XLOCKED);
2474
2475         switch (sc->sc_state) {
2476         case G_RAID3_DEVICE_STATE_STARTING:
2477             {
2478                 u_int n, ndirty, ndisks, genid, syncid;
2479
2480                 KASSERT(sc->sc_provider == NULL,
2481                     ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
2482                 /*
2483                  * Are we ready? We are, if all disks are connected or
2484                  * one disk is missing and 'force' is true.
2485                  */
2486                 if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
2487                         if (!force)
2488                                 callout_drain(&sc->sc_callout);
2489                 } else {
2490                         if (force) {
2491                                 /*
2492                                  * Timeout expired, so destroy device.
2493                                  */
2494                                 sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2495                                 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
2496                                     __LINE__, sc->sc_rootmount);
2497                                 root_mount_rel(sc->sc_rootmount);
2498                                 sc->sc_rootmount = NULL;
2499                         }
2500                         return;
2501                 }
2502
2503                 /*
2504                  * Find the biggest genid.
2505                  */
2506                 genid = 0;
2507                 for (n = 0; n < sc->sc_ndisks; n++) {
2508                         disk = &sc->sc_disks[n];
2509                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2510                                 continue;
2511                         if (disk->d_genid > genid)
2512                                 genid = disk->d_genid;
2513                 }
2514                 sc->sc_genid = genid;
2515                 /*
2516                  * Remove all disks without the biggest genid.
2517                  */
2518                 for (n = 0; n < sc->sc_ndisks; n++) {
2519                         disk = &sc->sc_disks[n];
2520                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2521                                 continue;
2522                         if (disk->d_genid < genid) {
2523                                 G_RAID3_DEBUG(0,
2524                                     "Component %s (device %s) broken, skipping.",
2525                                     g_raid3_get_diskname(disk), sc->sc_name);
2526                                 g_raid3_destroy_disk(disk);
2527                         }
2528                 }
2529
2530                 /*
2531                  * There must be at least 'sc->sc_ndisks - 1' components
2532                  * with the same syncid and without the SYNCHRONIZING flag.
2533                  */
2534
2535                 /*
2536                  * Find the biggest syncid, number of valid components and
2537                  * number of dirty components.
2538                  */
2539                 ndirty = ndisks = syncid = 0;
2540                 for (n = 0; n < sc->sc_ndisks; n++) {
2541                         disk = &sc->sc_disks[n];
2542                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2543                                 continue;
2544                         if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2545                                 ndirty++;
2546                         if (disk->d_sync.ds_syncid > syncid) {
2547                                 syncid = disk->d_sync.ds_syncid;
2548                                 ndisks = 0;
2549                         } else if (disk->d_sync.ds_syncid < syncid) {
2550                                 continue;
2551                         }
2552                         if ((disk->d_flags &
2553                             G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2554                                 continue;
2555                         }
2556                         ndisks++;
2557                 }
2558                 /*
2559                  * Do we have enough valid components?
2560                  */
2561                 if (ndisks + 1 < sc->sc_ndisks) {
2562                         G_RAID3_DEBUG(0,
2563                             "Device %s is broken, too few valid components.",
2564                             sc->sc_name);
2565                         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2566                         return;
2567                 }
2568                 /*
2569                  * If there is one DIRTY component and all disks are present,
2570                  * mark it for synchronization. If there is more than one DIRTY
2571                  * component, mark the parity component for synchronization.
2572                  */
2573                 if (ndisks == sc->sc_ndisks && ndirty == 1) {
2574                         for (n = 0; n < sc->sc_ndisks; n++) {
2575                                 disk = &sc->sc_disks[n];
2576                                 if ((disk->d_flags &
2577                                     G_RAID3_DISK_FLAG_DIRTY) == 0) {
2578                                         continue;
2579                                 }
2580                                 disk->d_flags |=
2581                                     G_RAID3_DISK_FLAG_SYNCHRONIZING;
2582                         }
2583                 } else if (ndisks == sc->sc_ndisks && ndirty > 1) {
2584                         disk = &sc->sc_disks[sc->sc_ndisks - 1];
2585                         disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2586                 }
2587
2588                 sc->sc_syncid = syncid;
2589                 if (force) {
2590                         /* Remember to bump syncid on first write. */
2591                         sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2592                 }
2593                 if (ndisks == sc->sc_ndisks)
2594                         state = G_RAID3_DEVICE_STATE_COMPLETE;
2595                 else /* if (ndisks == sc->sc_ndisks - 1) */
2596                         state = G_RAID3_DEVICE_STATE_DEGRADED;
2597                 G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2598                     sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2599                     g_raid3_device_state2str(state));
2600                 sc->sc_state = state;
2601                 for (n = 0; n < sc->sc_ndisks; n++) {
2602                         disk = &sc->sc_disks[n];
2603                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2604                                 continue;
2605                         state = g_raid3_determine_state(disk);
2606                         g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2607                         if (state == G_RAID3_DISK_STATE_STALE)
2608                                 sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2609                 }
2610                 break;
2611             }
2612         case G_RAID3_DEVICE_STATE_DEGRADED:
2613                 /*
2614                  * Genid needs to be bumped immediately, so do it here.
2615                  */
2616                 if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2617                         sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2618                         g_raid3_bump_genid(sc);
2619                 }
2620
2621                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2622                         return;
2623                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2624                     sc->sc_ndisks - 1) {
2625                         if (sc->sc_provider != NULL)
2626                                 g_raid3_destroy_provider(sc);
2627                         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2628                         return;
2629                 }
2630                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2631                     sc->sc_ndisks) {
2632                         state = G_RAID3_DEVICE_STATE_COMPLETE;
2633                         G_RAID3_DEBUG(1,
2634                             "Device %s state changed from %s to %s.",
2635                             sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2636                             g_raid3_device_state2str(state));
2637                         sc->sc_state = state;
2638                 }
2639                 if (sc->sc_provider == NULL)
2640                         g_raid3_launch_provider(sc);
2641                 if (sc->sc_rootmount != NULL) {
2642                         G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2643                             sc->sc_rootmount);
2644                         root_mount_rel(sc->sc_rootmount);
2645                         sc->sc_rootmount = NULL;
2646                 }
2647                 break;
2648         case G_RAID3_DEVICE_STATE_COMPLETE:
2649                 /*
2650                  * Genid needs to be bumped immediately, so do it here.
2651                  */
2652                 if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2653                         sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2654                         g_raid3_bump_genid(sc);
2655                 }
2656
2657                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2658                         return;
2659                 KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2660                     sc->sc_ndisks - 1,
2661                     ("Too few ACTIVE components in COMPLETE state (device %s).",
2662                     sc->sc_name));
2663                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2664                     sc->sc_ndisks - 1) {
2665                         state = G_RAID3_DEVICE_STATE_DEGRADED;
2666                         G_RAID3_DEBUG(1,
2667                             "Device %s state changed from %s to %s.",
2668                             sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2669                             g_raid3_device_state2str(state));
2670                         sc->sc_state = state;
2671                 }
2672                 if (sc->sc_provider == NULL)
2673                         g_raid3_launch_provider(sc);
2674                 if (sc->sc_rootmount != NULL) {
2675                         G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2676                             sc->sc_rootmount);
2677                         root_mount_rel(sc->sc_rootmount);
2678                         sc->sc_rootmount = NULL;
2679                 }
2680                 break;
2681         default:
2682                 KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2683                     g_raid3_device_state2str(sc->sc_state)));
2684                 break;
2685         }
2686 }
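
/*
 * Worked example for the STARTING case above: three components with syncids
 * {5, 5, 4} elect syncid 5 with two valid components, so the device starts
 * DEGRADED; g_raid3_determine_state() then resets the synchronization data
 * of the syncid-4 component and marks it SYNCHRONIZING (or STALE with
 * noautosync).
 */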
2687
2688 /*
2689  * Update disk state and device state if needed.
2690  */
2691 #define DISK_STATE_CHANGED()    G_RAID3_DEBUG(1,                        \
2692         "Disk %s state changed from %s to %s (device %s).",             \
2693         g_raid3_get_diskname(disk),                                     \
2694         g_raid3_disk_state2str(disk->d_state),                          \
2695         g_raid3_disk_state2str(state), sc->sc_name)
2696 static int
2697 g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2698 {
2699         struct g_raid3_softc *sc;
2700
2701         sc = disk->d_softc;
2702         sx_assert(&sc->sc_lock, SX_XLOCKED);
2703
2704 again:
2705         G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2706             g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2707             g_raid3_disk_state2str(state));
2708         switch (state) {
2709         case G_RAID3_DISK_STATE_NEW:
2710                 /*
2711                  * Possible scenarios:
2712                  * 1. New disk arrives.
2713                  */
2714                 /* Previous state should be NONE. */
2715                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2716                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2717                     g_raid3_disk_state2str(disk->d_state)));
2718                 DISK_STATE_CHANGED();
2719
2720                 disk->d_state = state;
2721                 G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
2722                     sc->sc_name, g_raid3_get_diskname(disk));
2723                 if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2724                         break;
2725                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2726                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2727                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2728                     g_raid3_device_state2str(sc->sc_state),
2729                     g_raid3_get_diskname(disk),
2730                     g_raid3_disk_state2str(disk->d_state)));
2731                 state = g_raid3_determine_state(disk);
2732                 if (state != G_RAID3_DISK_STATE_NONE)
2733                         goto again;
2734                 break;
2735         case G_RAID3_DISK_STATE_ACTIVE:
2736                 /*
2737                  * Possible scenarios:
2738                  * 1. New disk does not need synchronization.
2739                  * 2. Synchronization process finished successfully.
2740                  */
2741                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2742                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2743                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2744                     g_raid3_device_state2str(sc->sc_state),
2745                     g_raid3_get_diskname(disk),
2746                     g_raid3_disk_state2str(disk->d_state)));
2747                 /* Previous state should be NEW or SYNCHRONIZING. */
2748                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2749                     disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2750                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2751                     g_raid3_disk_state2str(disk->d_state)));
2752                 DISK_STATE_CHANGED();
2753
2754                 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2755                         disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2756                         disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2757                         g_raid3_sync_stop(sc, 0);
2758                 }
2759                 disk->d_state = state;
2760                 disk->d_sync.ds_offset = 0;
2761                 disk->d_sync.ds_offset_done = 0;
2762                 g_raid3_update_idle(sc, disk);
2763                 g_raid3_update_metadata(disk);
2764                 G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
2765                     sc->sc_name, g_raid3_get_diskname(disk));
2766                 break;
2767         case G_RAID3_DISK_STATE_STALE:
2768                 /*
2769                  * Possible scenarios:
2770                  * 1. Stale disk was connected.
2771                  */
2772                 /* Previous state should be NEW. */
2773                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2774                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2775                     g_raid3_disk_state2str(disk->d_state)));
2776                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2777                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2778                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2779                     g_raid3_device_state2str(sc->sc_state),
2780                     g_raid3_get_diskname(disk),
2781                     g_raid3_disk_state2str(disk->d_state)));
2782                 /*
2783                  * STALE state is only possible if the device is marked
2784                  * NOAUTOSYNC.
2785                  */
2786                 KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2787                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2788                     g_raid3_device_state2str(sc->sc_state),
2789                     g_raid3_get_diskname(disk),
2790                     g_raid3_disk_state2str(disk->d_state)));
2791                 DISK_STATE_CHANGED();
2792
2793                 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2794                 disk->d_state = state;
2795                 g_raid3_update_metadata(disk);
2796                 G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2797                     sc->sc_name, g_raid3_get_diskname(disk));
2798                 break;
2799         case G_RAID3_DISK_STATE_SYNCHRONIZING:
2800                 /*
2801                  * Possible scenarios:
2802                  * 1. Disk which needs synchronization was connected.
2803                  */
2804                 /* Previous state should be NEW. */
2805                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2806                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2807                     g_raid3_disk_state2str(disk->d_state)));
2808                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2809                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2810                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2811                     g_raid3_device_state2str(sc->sc_state),
2812                     g_raid3_get_diskname(disk),
2813                     g_raid3_disk_state2str(disk->d_state)));
2814                 DISK_STATE_CHANGED();
2815
2816                 if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2817                         disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2818                 disk->d_state = state;
2819                 if (sc->sc_provider != NULL) {
2820                         g_raid3_sync_start(sc);
2821                         g_raid3_update_metadata(disk);
2822                 }
2823                 break;
2824         case G_RAID3_DISK_STATE_DISCONNECTED:
2825                 /*
2826                  * Possible scenarios:
2827                  * 1. Device wasn't running yet, but disk disappeared.
2828                  * 2. Disk was active and disappeared.
2829                  * 3. Disk disappeared during synchronization process.
2830                  */
2831                 if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2832                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2833                         /*
2834                          * Previous state should be ACTIVE, STALE or
2835                          * SYNCHRONIZING.
2836                          */
2837                         KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2838                             disk->d_state == G_RAID3_DISK_STATE_STALE ||
2839                             disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2840                             ("Wrong disk state (%s, %s).",
2841                             g_raid3_get_diskname(disk),
2842                             g_raid3_disk_state2str(disk->d_state)));
2843                 } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2844                         /* Previous state should be NEW. */
2845                         KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2846                             ("Wrong disk state (%s, %s).",
2847                             g_raid3_get_diskname(disk),
2848                             g_raid3_disk_state2str(disk->d_state)));
2849                         /*
2850                          * Reset the pending syncid bump if the disk
2851                          * disappeared while in STARTING state.
2852                          */
2853                         if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
2854                                 sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2855 #ifdef  INVARIANTS
2856                 } else {
2857                         KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2858                             sc->sc_name,
2859                             g_raid3_device_state2str(sc->sc_state),
2860                             g_raid3_get_diskname(disk),
2861                             g_raid3_disk_state2str(disk->d_state)));
2862 #endif
2863                 }
2864                 DISK_STATE_CHANGED();
2865                 G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2866                     sc->sc_name, g_raid3_get_diskname(disk));
2867
2868                 g_raid3_destroy_disk(disk);
2869                 break;
2870         default:
2871                 KASSERT(1 == 0, ("Unknown state (%u).", state));
2872                 break;
2873         }
2874         return (0);
2875 }
2876 #undef  DISK_STATE_CHANGED
2877
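     /*
      * Read and sanity-check the on-disk metadata of a component.  The
      * consumer is opened read-only just for the duration of the call and
      * the metadata block occupies the last sector of the provider, i.e.
      * it is read from offset pp->mediasize - pp->sectorsize.  The magic
      * string, version and MD5 hash are verified before the decoded
      * structure is returned to the caller.
      */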
2878 int
2879 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
2880 {
2881         struct g_provider *pp;
2882         u_char *buf;
2883         int error;
2884
2885         g_topology_assert();
2886
2887         error = g_access(cp, 1, 0, 0);
2888         if (error != 0)
2889                 return (error);
2890         pp = cp->provider;
2891         g_topology_unlock();
2892         /* Metadata is stored in the last sector. */
2893         buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2894             &error);
2895         g_topology_lock();
2896         g_access(cp, -1, 0, 0);
2897         if (buf == NULL) {
2898                 G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
2899                     cp->provider->name, error);
2900                 return (error);
2901         }
2902
2903         /* Decode metadata. */
2904         error = raid3_metadata_decode(buf, md);
2905         g_free(buf);
2906         if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
2907                 return (EINVAL);
2908         if (md->md_version > G_RAID3_VERSION) {
2909                 G_RAID3_DEBUG(0,
2910                     "Kernel module is too old to handle metadata from %s.",
2911                     cp->provider->name);
2912                 return (EINVAL);
2913         }
2914         if (error != 0) {
2915                 G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2916                     cp->provider->name);
2917                 return (error);
2918         }
2919         if (md->md_sectorsize > MAXPHYS) {
2920                 G_RAID3_DEBUG(0, "The blocksize is too big.");
2921                 return (EINVAL);
2922         }
2923
2924         return (0);
2925 }
2926
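     /*
      * Check that metadata read from a component is consistent with the
      * device it is about to join: the disk number must be in range and
      * the slot unoccupied, the disk count, media size and sector size
      * must match, and only known (and mutually compatible) device and
      * disk flags may be set.
      */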
2927 static int
2928 g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
2929     struct g_raid3_metadata *md)
2930 {
2931
2932         if (md->md_no >= sc->sc_ndisks) {
2933                 G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
2934                     pp->name, md->md_no);
2935                 return (EINVAL);
2936         }
2937         if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
2938                 G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
2939                     pp->name, md->md_no);
2940                 return (EEXIST);
2941         }
2942         if (md->md_all != sc->sc_ndisks) {
2943                 G_RAID3_DEBUG(1,
2944                     "Invalid '%s' field on disk %s (device %s), skipping.",
2945                     "md_all", pp->name, sc->sc_name);
2946                 return (EINVAL);
2947         }
2948         if ((md->md_mediasize % md->md_sectorsize) != 0) {
2949                 G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != "
2950                     "0) on disk %s (device %s), skipping.", pp->name,
2951                     sc->sc_name);
2952                 return (EINVAL);
2953         }
2954         if (md->md_mediasize != sc->sc_mediasize) {
2955                 G_RAID3_DEBUG(1,
2956                     "Invalid '%s' field on disk %s (device %s), skipping.",
2957                     "md_mediasize", pp->name, sc->sc_name);
2958                 return (EINVAL);
2959         }
2960         if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
2961                 G_RAID3_DEBUG(1,
2962                     "Invalid '%s' field on disk %s (device %s), skipping.",
2963                     "md_mediasize", pp->name, sc->sc_name);
2964                 return (EINVAL);
2965         }
2966         if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
2967                 G_RAID3_DEBUG(1,
2968                     "Invalid size of disk %s (device %s), skipping.", pp->name,
2969                     sc->sc_name);
2970                 return (EINVAL);
2971         }
2972         if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
2973                 G_RAID3_DEBUG(1,
2974                     "Invalid '%s' field on disk %s (device %s), skipping.",
2975                     "md_sectorsize", pp->name, sc->sc_name);
2976                 return (EINVAL);
2977         }
2978         if (md->md_sectorsize != sc->sc_sectorsize) {
2979                 G_RAID3_DEBUG(1,
2980                     "Invalid '%s' field on disk %s (device %s), skipping.",
2981                     "md_sectorsize", pp->name, sc->sc_name);
2982                 return (EINVAL);
2983         }
2984         if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
2985                 G_RAID3_DEBUG(1,
2986                     "Invalid sector size of disk %s (device %s), skipping.",
2987                     pp->name, sc->sc_name);
2988                 return (EINVAL);
2989         }
2990         if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
2991                 G_RAID3_DEBUG(1,
2992                     "Invalid device flags on disk %s (device %s), skipping.",
2993                     pp->name, sc->sc_name);
2994                 return (EINVAL);
2995         }
2996         if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
2997             (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
2998                 /*
2999                  * VERIFY and ROUND-ROBIN options are mutually exclusive.
3000                  */
3001                 G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
3002                     "disk %s (device %s), skipping.", pp->name, sc->sc_name);
3003                 return (EINVAL);
3004         }
3005         if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
3006                 G_RAID3_DEBUG(1,
3007                     "Invalid disk flags on disk %s (device %s), skipping.",
3008                     pp->name, sc->sc_name);
3009                 return (EINVAL);
3010         }
3011         return (0);
3012 }
3013
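     /*
      * Attach a component to a device.  A component whose genid is older
      * than the device's is considered broken and rejected, unless the
      * device is still STARTING.  Otherwise the disk is initialized, a
      * NEW state event is posted and waited for, and metadata written by
      * an older version of the module is upgraded in place.
      */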
3014 int
3015 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
3016     struct g_raid3_metadata *md)
3017 {
3018         struct g_raid3_disk *disk;
3019         int error;
3020
3021         g_topology_assert_not();
3022         G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
3023
3024         error = g_raid3_check_metadata(sc, pp, md);
3025         if (error != 0)
3026                 return (error);
3027         if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
3028             md->md_genid < sc->sc_genid) {
3029                 G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
3030                     pp->name, sc->sc_name);
3031                 return (EINVAL);
3032         }
3033         disk = g_raid3_init_disk(sc, pp, md, &error);
3034         if (disk == NULL)
3035                 return (error);
3036         error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
3037             G_RAID3_EVENT_WAIT);
3038         if (error != 0)
3039                 return (error);
3040         if (md->md_version < G_RAID3_VERSION) {
3041                 G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
3042                     pp->name, md->md_version, G_RAID3_VERSION);
3043                 g_raid3_update_metadata(disk);
3044         }
3045         return (0);
3046 }
3047
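     /*
      * GEOM event callback for delayed destruction: it runs after the
      * last close of a device marked DESTROYING and retries the soft
      * destroy with the softc lock held.
      */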
3048 static void
3049 g_raid3_destroy_delayed(void *arg, int flag)
3050 {
3051         struct g_raid3_softc *sc;
3052         int error;
3053
3054         if (flag == EV_CANCEL) {
3055                 G_RAID3_DEBUG(1, "Destroying canceled.");
3056                 return;
3057         }
3058         sc = arg;
3059         g_topology_unlock();
3060         sx_xlock(&sc->sc_lock);
3061         KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0,
3062             ("DESTROY flag set on %s.", sc->sc_name));
3063         KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0,
3064             ("DESTROYING flag not set on %s.", sc->sc_name));
3065         G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name);
3066         error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT);
3067         if (error != 0) {
3068                 G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name);
3069                 sx_xunlock(&sc->sc_lock);
3070         }
3071         g_topology_lock();
3072 }
3073
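     /*
      * Access method of the provider.  The acr/acw/ace arguments are
      * deltas, so dcr/dcw/dce computed below are the access counts the
      * provider will have once the request is granted.  Opening fails
      * with ENXIO while the device is being destroyed or is missing more
      * than one ACTIVE component; when the write count drops to zero the
      * device is idled and its components marked clean, and the last
      * close of a DESTROYING device schedules its delayed destruction.
      */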
3074 static int
3075 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
3076 {
3077         struct g_raid3_softc *sc;
3078         int dcr, dcw, dce, error = 0;
3079
3080         g_topology_assert();
3081         G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
3082             acw, ace);
3083
3084         sc = pp->geom->softc;
3085         if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
3086                 return (0);
3087         KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
3088
3089         dcr = pp->acr + acr;
3090         dcw = pp->acw + acw;
3091         dce = pp->ace + ace;
3092
3093         g_topology_unlock();
3094         sx_xlock(&sc->sc_lock);
3095         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 ||
3096             g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
3097                 if (acr > 0 || acw > 0 || ace > 0)
3098                         error = ENXIO;
3099                 goto end;
3100         }
3101         if (dcw == 0)
3102                 g_raid3_idle(sc, dcw);
3103         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) {
3104                 if (acr > 0 || acw > 0 || ace > 0) {
3105                         error = ENXIO;
3106                         goto end;
3107                 }
3108                 if (dcr == 0 && dcw == 0 && dce == 0) {
3109                         g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK,
3110                             sc, NULL);
3111                 }
3112         }
3113 end:
3114         sx_xunlock(&sc->sc_lock);
3115         g_topology_lock();
3116         return (error);
3117 }
3118
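     /*
      * Create the kernel state for a device described by the metadata:
      * the main ("action") geom, a separate synchronization geom, the
      * preallocated 4k/16k/64k UMA zones (unless plain malloc is
      * configured), the worker thread, a root mount hold and the startup
      * timeout after which the device stops waiting for missing
      * components.  The metadata itself typically originates in
      * userland, e.g. (illustration only):
      *
      *     graid3 label data da0 da1 da2
      */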
3119 static struct g_geom *
3120 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
3121 {
3122         struct g_raid3_softc *sc;
3123         struct g_geom *gp;
3124         int error, timeout;
3125         u_int n;
3126
3127         g_topology_assert();
3128         G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
3129
3130         /* At least one disk is required. */
3131         if (md->md_all < 1)
3132                 return (NULL);
3133         /*
3134          * Action geom.
3135          */
3136         gp = g_new_geomf(mp, "%s", md->md_name);
3137         sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
3138         sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
3139             M_WAITOK | M_ZERO);
3140         gp->start = g_raid3_start;
3141         gp->orphan = g_raid3_orphan;
3142         gp->access = g_raid3_access;
3143         gp->dumpconf = g_raid3_dumpconf;
3144
3145         sc->sc_id = md->md_id;
3146         sc->sc_mediasize = md->md_mediasize;
3147         sc->sc_sectorsize = md->md_sectorsize;
3148         sc->sc_ndisks = md->md_all;
3149         sc->sc_round_robin = 0;
3150         sc->sc_flags = md->md_mflags;
3151         sc->sc_bump_id = 0;
3152         sc->sc_idle = 1;
3153         sc->sc_last_write = time_uptime;
3154         sc->sc_writes = 0;
3155         for (n = 0; n < sc->sc_ndisks; n++) {
3156                 sc->sc_disks[n].d_softc = sc;
3157                 sc->sc_disks[n].d_no = n;
3158                 sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
3159         }
3160         sx_init(&sc->sc_lock, "graid3:lock");
3161         bioq_init(&sc->sc_queue);
3162         mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
3163         bioq_init(&sc->sc_regular_delayed);
3164         bioq_init(&sc->sc_inflight);
3165         bioq_init(&sc->sc_sync_delayed);
3166         TAILQ_INIT(&sc->sc_events);
3167         mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
3168         callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
3169         sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
3170         gp->softc = sc;
3171         sc->sc_geom = gp;
3172         sc->sc_provider = NULL;
3173         /*
3174          * Synchronization geom.
3175          */
3176         gp = g_new_geomf(mp, "%s.sync", md->md_name);
3177         gp->softc = sc;
3178         gp->orphan = g_raid3_orphan;
3179         sc->sc_sync.ds_geom = gp;
3180
3181         if (!g_raid3_use_malloc) {
3182                 sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k",
3183                     65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
3184                     UMA_ALIGN_PTR, 0);
3185                 sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
3186                 sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
3187                 sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
3188                     sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
3189                 sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k",
3190                     16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
3191                     UMA_ALIGN_PTR, 0);
3192                 sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
3193                 sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
3194                 sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
3195                     sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
3196                 sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k",
3197                     4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
3198                     UMA_ALIGN_PTR, 0);
3199                 sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
3200                 sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
3201                 sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
3202                     sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
3203         }
3204
3205         error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
3206             "g_raid3 %s", md->md_name);
3207         if (error != 0) {
3208                 G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
3209                     sc->sc_name);
3210                 if (!g_raid3_use_malloc) {
3211                         uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
3212                         uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
3213                         uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
3214                 }
3215                 g_destroy_geom(sc->sc_sync.ds_geom);
3216                 mtx_destroy(&sc->sc_events_mtx);
3217                 mtx_destroy(&sc->sc_queue_mtx);
3218                 sx_destroy(&sc->sc_lock);
3219                 g_destroy_geom(sc->sc_geom);
3220                 free(sc->sc_disks, M_RAID3);
3221                 free(sc, M_RAID3);
3222                 return (NULL);
3223         }
3224
3225         G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).",
3226             sc->sc_name, sc->sc_ndisks, sc->sc_id);
3227
3228         sc->sc_rootmount = root_mount_hold("GRAID3");
3229         G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
3230
3231         /*
3232          * Run timeout.
3233          */
3234         timeout = atomic_load_acq_int(&g_raid3_timeout);
3235         callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
3236         return (sc->sc_geom);
3237 }
3238
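     /*
      * Tear a device down.  The "how" argument selects the policy when
      * the provider is still open: SOFT fails with EBUSY, DELAYED
      * arranges destruction on last close (also returning EBUSY), and
      * HARD proceeds regardless.  The softc is then detached from both
      * geoms, the worker thread is woken and waited for, and the softc
      * is freed.
      */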
3239 int
3240 g_raid3_destroy(struct g_raid3_softc *sc, int how)
3241 {
3242         struct g_provider *pp;
3243
3244         g_topology_assert_not();
3245         if (sc == NULL)
3246                 return (ENXIO);
3247         sx_assert(&sc->sc_lock, SX_XLOCKED);
3248
3249         pp = sc->sc_provider;
3250         if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
3251                 switch (how) {
3252                 case G_RAID3_DESTROY_SOFT:
3253                         G_RAID3_DEBUG(1,
3254                             "Device %s is still open (r%dw%de%d).", pp->name,
3255                             pp->acr, pp->acw, pp->ace);
3256                         return (EBUSY);
3257                 case G_RAID3_DESTROY_DELAYED:
3258                         G_RAID3_DEBUG(1,
3259                             "Device %s will be destroyed on last close.",
3260                             pp->name);
3261                         if (sc->sc_syncdisk != NULL)
3262                                 g_raid3_sync_stop(sc, 1);
3263                         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING;
3264                         return (EBUSY);
3265                 case G_RAID3_DESTROY_HARD:
3266                         G_RAID3_DEBUG(1, "Device %s is still open, so it "
3267                             "can't be removed cleanly.", pp->name);
3268                         break;
3269                 }
3270         }
3271
3272         g_topology_lock();
3273         if (sc->sc_geom->softc == NULL) {
3274                 g_topology_unlock();
3275                 return (0);
3276         }
3277         sc->sc_geom->softc = NULL;
3278         sc->sc_sync.ds_geom->softc = NULL;
3279         g_topology_unlock();
3280
3281         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
3282         sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
3283         G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
3284         sx_xunlock(&sc->sc_lock);
3285         mtx_lock(&sc->sc_queue_mtx);
3286         wakeup(sc);
3287         wakeup(&sc->sc_queue);
3288         mtx_unlock(&sc->sc_queue_mtx);
3289         G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
3290         while (sc->sc_worker != NULL)
3291                 tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
3292         G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
3293         sx_xlock(&sc->sc_lock);
3294         g_raid3_destroy_device(sc);
3295         free(sc->sc_disks, M_RAID3);
3296         free(sc, M_RAID3);
3297         return (0);
3298 }
3299
3300 static void
3301 g_raid3_taste_orphan(struct g_consumer *cp)
3302 {
3303
3304         KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
3305             cp->provider->name));
3306 }
3307
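     /*
      * Taste method: probe the provider for RAID3 metadata using a
      * throw-away geom and consumer, then either attach the component to
      * an existing device with a matching name and id or create the
      * device first.  If adding the disk fails and the device is left
      * with no components at all, the freshly created device is torn
      * down again.
      */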
3308 static struct g_geom *
3309 g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
3310 {
3311         struct g_raid3_metadata md;
3312         struct g_raid3_softc *sc;
3313         struct g_consumer *cp;
3314         struct g_geom *gp;
3315         int error;
3316
3317         g_topology_assert();
3318         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
3319         G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
3320
3321         gp = g_new_geomf(mp, "raid3:taste");
3322         /* This orphan function should never be called. */
3323         gp->orphan = g_raid3_taste_orphan;
3324         cp = g_new_consumer(gp);
3325         g_attach(cp, pp);
3326         error = g_raid3_read_metadata(cp, &md);
3327         g_detach(cp);
3328         g_destroy_consumer(cp);
3329         g_destroy_geom(gp);
3330         if (error != 0)
3331                 return (NULL);
3332         gp = NULL;
3333
3334         if (md.md_provider[0] != '\0' &&
3335             !g_compare_names(md.md_provider, pp->name))
3336                 return (NULL);
3337         if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
3338                 return (NULL);
3339         if (g_raid3_debug >= 2)
3340                 raid3_metadata_dump(&md);
3341
3342         /*
3343          * Let's check if the device already exists.
3344          */
3345         sc = NULL;
3346         LIST_FOREACH(gp, &mp->geom, geom) {
3347                 sc = gp->softc;
3348                 if (sc == NULL)
3349                         continue;
3350                 if (sc->sc_sync.ds_geom == gp)
3351                         continue;
3352                 if (strcmp(md.md_name, sc->sc_name) != 0)
3353                         continue;
3354                 if (md.md_id != sc->sc_id) {
3355                         G_RAID3_DEBUG(0, "Device %s already configured.",
3356                             sc->sc_name);
3357                         return (NULL);
3358                 }
3359                 break;
3360         }
3361         if (gp == NULL) {
3362                 gp = g_raid3_create(mp, &md);
3363                 if (gp == NULL) {
3364                         G_RAID3_DEBUG(0, "Cannot create device %s.",
3365                             md.md_name);
3366                         return (NULL);
3367                 }
3368                 sc = gp->softc;
3369         }
3370         G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
3371         g_topology_unlock();
3372         sx_xlock(&sc->sc_lock);
3373         error = g_raid3_add_disk(sc, pp, &md);
3374         if (error != 0) {
3375                 G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
3376                     pp->name, gp->name, error);
3377                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
3378                     sc->sc_ndisks) {
3379                         g_cancel_event(sc);
3380                         g_raid3_destroy(sc, G_RAID3_DESTROY_HARD);
3381                         g_topology_lock();
3382                         return (NULL);
3383                 }
3384                 gp = NULL;
3385         }
3386         sx_xunlock(&sc->sc_lock);
3387         g_topology_lock();
3388         return (gp);
3389 }
3390
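     /*
      * Class destroy_geom method: cancel pending events and attempt a
      * soft destroy, which fails with EBUSY while the provider is still
      * open.
      */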
3391 static int
3392 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
3393     struct g_geom *gp)
3394 {
3395         struct g_raid3_softc *sc;
3396         int error;
3397
3398         g_topology_unlock();
3399         sc = gp->softc;
3400         sx_xlock(&sc->sc_lock);
3401         g_cancel_event(sc);
3402         error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT);
3403         if (error != 0)
3404                 sx_xunlock(&sc->sc_lock);
3405         g_topology_lock();
3406         return (error);
3407 }
3408
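     /*
      * dumpconf method: emit the XML fragments visible through
      * kern.geom.confxml (and rendered by userland tools such as
      * "graid3 list").  Per-consumer output describes one component
      * (type, number, synchronization progress, flags, state); per-geom
      * output describes the device, including UMA zone statistics when
      * preallocated buffers are in use.
      */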
3409 static void
3410 g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
3411     struct g_consumer *cp, struct g_provider *pp)
3412 {
3413         struct g_raid3_softc *sc;
3414
3415         g_topology_assert();
3416
3417         sc = gp->softc;
3418         if (sc == NULL)
3419                 return;
3420         /* Skip synchronization geom. */
3421         if (gp == sc->sc_sync.ds_geom)
3422                 return;
3423         if (pp != NULL) {
3424                 /* Nothing here. */
3425         } else if (cp != NULL) {
3426                 struct g_raid3_disk *disk;
3427
3428                 disk = cp->private;
3429                 if (disk == NULL)
3430                         return;
3431                 g_topology_unlock();
3432                 sx_xlock(&sc->sc_lock);
3433                 sbuf_printf(sb, "%s<Type>", indent);
3434                 if (disk->d_no == sc->sc_ndisks - 1)
3435                         sbuf_printf(sb, "PARITY");
3436                 else
3437                         sbuf_printf(sb, "DATA");
3438                 sbuf_printf(sb, "</Type>\n");
3439                 sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
3440                     (u_int)disk->d_no);
3441                 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
3442                         sbuf_printf(sb, "%s<Synchronized>", indent);
3443                         if (disk->d_sync.ds_offset == 0)
3444                                 sbuf_printf(sb, "0%%");
3445                         else {
3446                                 sbuf_printf(sb, "%u%%",
3447                                     (u_int)((disk->d_sync.ds_offset * 100) /
3448                                     (sc->sc_mediasize / (sc->sc_ndisks - 1))));
3449                         }
3450                         sbuf_printf(sb, "</Synchronized>\n");
3451                         if (disk->d_sync.ds_offset > 0) {
3452                                 sbuf_printf(sb, "%s<BytesSynced>%jd"
3453                                     "</BytesSynced>\n", indent,
3454                                     (intmax_t)disk->d_sync.ds_offset);
3455                         }
3456                 }
3457                 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
3458                     disk->d_sync.ds_syncid);
3459                 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
3460                 sbuf_printf(sb, "%s<Flags>", indent);
3461                 if (disk->d_flags == 0)
3462                         sbuf_printf(sb, "NONE");
3463                 else {
3464                         int first = 1;
3465
3466 #define ADD_FLAG(flag, name)    do {                                    \
3467         if ((disk->d_flags & (flag)) != 0) {                            \
3468                 if (!first)                                             \
3469                         sbuf_printf(sb, ", ");                          \
3470                 else                                                    \
3471                         first = 0;                                      \
3472                 sbuf_printf(sb, name);                                  \
3473         }                                                               \
3474 } while (0)
3475                         ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
3476                         ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
3477                         ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
3478                             "SYNCHRONIZING");
3479                         ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
3480                         ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
3481 #undef  ADD_FLAG
3482                 }
3483                 sbuf_printf(sb, "</Flags>\n");
3484                 sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3485                     g_raid3_disk_state2str(disk->d_state));
3486                 sx_xunlock(&sc->sc_lock);
3487                 g_topology_lock();
3488         } else {
3489                 g_topology_unlock();
3490                 sx_xlock(&sc->sc_lock);
3491                 if (!g_raid3_use_malloc) {
3492                         sbuf_printf(sb,
3493                             "%s<Zone4kRequested>%u</Zone4kRequested>\n", indent,
3494                             sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
3495                         sbuf_printf(sb,
3496                             "%s<Zone4kFailed>%u</Zone4kFailed>\n", indent,
3497                             sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
3498                         sbuf_printf(sb,
3499                             "%s<Zone16kRequested>%u</Zone16kRequested>\n", indent,
3500                             sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
3501                         sbuf_printf(sb,
3502                             "%s<Zone16kFailed>%u</Zone16kFailed>\n", indent,
3503                             sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
3504                         sbuf_printf(sb,
3505                             "%s<Zone64kRequested>%u</Zone64kRequested>\n", indent,
3506                             sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
3507                         sbuf_printf(sb,
3508                             "%s<Zone64kFailed>%u</Zone64kFailed>\n", indent,
3509                             sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
3510                 }
3511                 sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
3512                 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
3513                 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
3514                 sbuf_printf(sb, "%s<Flags>", indent);
3515                 if (sc->sc_flags == 0)
3516                         sbuf_printf(sb, "NONE");
3517                 else {
3518                         int first = 1;
3519
3520 #define ADD_FLAG(flag, name)    do {                                    \
3521         if ((sc->sc_flags & (flag)) != 0) {                             \
3522                 if (!first)                                             \
3523                         sbuf_printf(sb, ", ");                          \
3524                 else                                                    \
3525                         first = 0;                                      \
3526                 sbuf_printf(sb, name);                                  \
3527         }                                                               \
3528 } while (0)
3529                         ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
3530                         ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
3531                         ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
3532                             "ROUND-ROBIN");
3533                         ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
3534 #undef  ADD_FLAG
3535                 }
3536                 sbuf_printf(sb, "</Flags>\n");
3537                 sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
3538                     sc->sc_ndisks);
3539                 sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3540                     g_raid3_device_state2str(sc->sc_state));
3541                 sx_xunlock(&sc->sc_lock);
3542                 g_topology_lock();
3543         }
3544 }
3545
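     /*
      * shutdown_post_sync event handler: once filesystems are synced,
      * mark every device idle (so its components are recorded clean) and
      * request delayed destruction, leaving still-open providers to be
      * destroyed on their last close.
      */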
3546 static void
3547 g_raid3_shutdown_post_sync(void *arg, int howto)
3548 {
3549         struct g_class *mp;
3550         struct g_geom *gp, *gp2;
3551         struct g_raid3_softc *sc;
3552         int error;
3553
3554         mp = arg;
3555         DROP_GIANT();
3556         g_topology_lock();
3557         g_raid3_shutdown = 1;
3558         LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3559                 if ((sc = gp->softc) == NULL)
3560                         continue;
3561                 /* Skip synchronization geom. */
3562                 if (gp == sc->sc_sync.ds_geom)
3563                         continue;
3564                 g_topology_unlock();
3565                 sx_xlock(&sc->sc_lock);
3566                 g_raid3_idle(sc, -1);
3567                 g_cancel_event(sc);
3568                 error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED);
3569                 if (error != 0)
3570                         sx_xunlock(&sc->sc_lock);
3571                 g_topology_lock();
3572         }
3573         g_topology_unlock();
3574         PICKUP_GIANT();
3575 }
3576
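     /*
      * Class init/fini: register and deregister the shutdown handler
      * above.
      */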
3577 static void
3578 g_raid3_init(struct g_class *mp)
3579 {
3580
3581         g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
3582             g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
3583         if (g_raid3_post_sync == NULL)
3584                 G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3585 }
3586
3587 static void
3588 g_raid3_fini(struct g_class *mp)
3589 {
3590
3591         if (g_raid3_post_sync != NULL)
3592                 EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync);
3593 }
3594
3595 DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);