]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/geom/raid3/g_raid3.c
MFV: r362286
[FreeBSD/FreeBSD.git] / sys / geom / raid3 / g_raid3.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/module.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/mutex.h>
39 #include <sys/bio.h>
40 #include <sys/sbuf.h>
41 #include <sys/sysctl.h>
42 #include <sys/malloc.h>
43 #include <sys/eventhandler.h>
44 #include <vm/uma.h>
45 #include <geom/geom.h>
46 #include <geom/geom_dbg.h>
47 #include <sys/proc.h>
48 #include <sys/kthread.h>
49 #include <sys/sched.h>
50 #include <geom/raid3/g_raid3.h>
51
52 FEATURE(geom_raid3, "GEOM RAID-3 functionality");
53
54 static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");
55
/*
 * Sysctl knobs and statistics for the class.  Every variable below is
 * exported under the "raid3" node created beneath _kern_geom.
 */
SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "GEOM_RAID3 stuff");
/* Debug verbosity; not static, so it is visible outside this file. */
u_int g_raid3_debug = 0;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0,
    "Debug level");
/* NOTE(review): units are not visible in this part of the file. */
static u_int g_raid3_timeout = 4;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
/* Compared against time_uptime deltas in g_raid3_idle(). */
static u_int g_raid3_idletime = 5;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN,
    &g_raid3_idletime, 0, "Mark components as clean when idling");
/* Consulted by g_raid3_write_metadata() when a metadata write fails. */
static u_int g_raid3_disconnect_on_failure = 1;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
    &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_raid3_syncreqs = 2;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
    &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
/* When non-zero, g_raid3_alloc()/g_raid3_free() bypass the UMA zones. */
static u_int g_raid3_use_malloc = 0;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN,
    &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9).");

/*
 * Allocation limits per buffer size class.
 * NOTE(review): their consumers are outside the visible part of the file.
 */
static u_int g_raid3_n64k = 50;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

/* Read-only counters live under the "stat" sub-node. */
static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
94
/*
 * msleep() wrapper that logs a level-4 debug message right before sleeping
 * on 'ident' and right after returning.  The result of msleep() is
 * discarded, and 'ident' is expanded three times, so it must not have side
 * effects.
 */
#define MSLEEP(ident, mtx, priority, wmesg, timeout)    do {            \
        G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));        \
        msleep((ident), (mtx), (priority), (wmesg), (timeout));         \
        G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));        \
} while (0)
100
/* Event handler tag; NOTE(review): registration is outside the visible code. */
static eventhandler_tag g_raid3_post_sync = NULL;
/* When non-zero, g_raid3_idle() stops honouring the idle timeout. */
static int g_raid3_shutdown = 0;

/* Class methods referenced by g_raid3_class below. */
static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);
109
/* GEOM class descriptor wiring the class methods of this file together. */
struct g_class g_raid3_class = {
        .name = G_RAID3_CLASS_NAME,
        .version = G_VERSION,
        .ctlreq = g_raid3_config,
        .taste = g_raid3_taste,
        .destroy_geom = g_raid3_destroy_geom,
        .init = g_raid3_init,
        .fini = g_raid3_fini
};
119
120
/* Forward declarations of helpers that are used before their definitions. */
static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
static int g_raid3_register_request(struct bio *pbp);
static void g_raid3_sync_release(struct g_raid3_softc *sc);
129
130
131 static const char *
132 g_raid3_disk_state2str(int state)
133 {
134
135         switch (state) {
136         case G_RAID3_DISK_STATE_NODISK:
137                 return ("NODISK");
138         case G_RAID3_DISK_STATE_NONE:
139                 return ("NONE");
140         case G_RAID3_DISK_STATE_NEW:
141                 return ("NEW");
142         case G_RAID3_DISK_STATE_ACTIVE:
143                 return ("ACTIVE");
144         case G_RAID3_DISK_STATE_STALE:
145                 return ("STALE");
146         case G_RAID3_DISK_STATE_SYNCHRONIZING:
147                 return ("SYNCHRONIZING");
148         case G_RAID3_DISK_STATE_DISCONNECTED:
149                 return ("DISCONNECTED");
150         default:
151                 return ("INVALID");
152         }
153 }
154
155 static const char *
156 g_raid3_device_state2str(int state)
157 {
158
159         switch (state) {
160         case G_RAID3_DEVICE_STATE_STARTING:
161                 return ("STARTING");
162         case G_RAID3_DEVICE_STATE_DEGRADED:
163                 return ("DEGRADED");
164         case G_RAID3_DEVICE_STATE_COMPLETE:
165                 return ("COMPLETE");
166         default:
167                 return ("INVALID");
168         }
169 }
170
171 const char *
172 g_raid3_get_diskname(struct g_raid3_disk *disk)
173 {
174
175         if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
176                 return ("[unknown]");
177         return (disk->d_name);
178 }
179
180 static void *
181 g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags)
182 {
183         void *ptr;
184         enum g_raid3_zones zone;
185
186         if (g_raid3_use_malloc ||
187             (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
188                 ptr = malloc(size, M_RAID3, flags);
189         else {
190                 ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone,
191                    &sc->sc_zones[zone], flags);
192                 sc->sc_zones[zone].sz_requested++;
193                 if (ptr == NULL)
194                         sc->sc_zones[zone].sz_failed++;
195         }
196         return (ptr);
197 }
198
199 static void
200 g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size)
201 {
202         enum g_raid3_zones zone;
203
204         if (g_raid3_use_malloc ||
205             (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
206                 free(ptr, M_RAID3);
207         else {
208                 uma_zfree_arg(sc->sc_zones[zone].sz_zone,
209                     ptr, &sc->sc_zones[zone]);
210         }
211 }
212
213 static int
214 g_raid3_uma_ctor(void *mem, int size, void *arg, int flags)
215 {
216         struct g_raid3_zone *sz = arg;
217
218         if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max)
219                 return (ENOMEM);
220         sz->sz_inuse++;
221         return (0);
222 }
223
224 static void
225 g_raid3_uma_dtor(void *mem, int size, void *arg)
226 {
227         struct g_raid3_zone *sz = arg;
228
229         sz->sz_inuse--;
230 }
231
/*
 * XOR 'size' bytes of 'src' into 'dst' (dst ^= src).
 *
 * The macro only casts its arguments for the worker below.  The size
 * argument is fully parenthesized so that an expression such as "a + b"
 * is cast as a whole.
 */
#define g_raid3_xor(src, dst, size)                                     \
        _g_raid3_xor((uint64_t *)(src),                                 \
            (uint64_t *)(dst), (size_t)(size))
/*
 * Worker for g_raid3_xor().  Each loop iteration processes sixteen 64-bit
 * words, i.e. 128 bytes, which is why 'size' must be a multiple of 128
 * (asserted with KASSERT).  A size of 0 is a no-op.
 */
static void
_g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size)
{

        KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
        for (; size > 0; size -= 128) {
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
                *dst++ ^= (*src++);
        }
}
259
260 static int
261 g_raid3_is_zero(struct bio *bp)
262 {
263         static const uint64_t zeros[] = {
264             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
265         };
266         u_char *addr;
267         ssize_t size;
268
269         size = bp->bio_length;
270         addr = (u_char *)bp->bio_data;
271         for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
272                 if (bcmp(addr, zeros, sizeof(zeros)) != 0)
273                         return (0);
274         }
275         return (1);
276 }
277
/*
 * --- Event handling functions ---
 * Events in geom_raid3 are used to maintain disk and device status
 * from a single thread, which simplifies locking.
 */
283 static void
284 g_raid3_event_free(struct g_raid3_event *ep)
285 {
286
287         free(ep, M_RAID3);
288 }
289
/*
 * Queue an event and, unless G_RAID3_EVENT_DONTWAIT is set, wait until it
 * is marked G_RAID3_EVENT_DONE.
 *
 * arg   - the softc when G_RAID3_EVENT_DEVICE is set in 'flags', otherwise
 *         the disk the event refers to (its softc is taken from d_softc).
 * state - stored in the event as e_state.
 * flags - stored in the event as e_flags.
 *
 * Returns 0 immediately for DONTWAIT events; the event is then freed by
 * whoever dequeues it.  Otherwise the caller must hold sc_lock exclusively;
 * the lock is dropped while sleeping and re-taken before returning the
 * event's e_error.
 */
int
g_raid3_event_send(void *arg, int state, int flags)
{
        struct g_raid3_softc *sc;
        struct g_raid3_disk *disk;
        struct g_raid3_event *ep;
        int error;

        ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
        G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
        if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
                disk = NULL;
                sc = arg;
        } else {
                disk = arg;
                sc = disk->d_softc;
        }
        ep->e_disk = disk;
        ep->e_state = state;
        ep->e_flags = flags;
        ep->e_error = 0;
        mtx_lock(&sc->sc_events_mtx);
        TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
        mtx_unlock(&sc->sc_events_mtx);
        G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
        /* Wake anything sleeping on the softc or on its request queue. */
        mtx_lock(&sc->sc_queue_mtx);
        wakeup(sc);
        wakeup(&sc->sc_queue);
        mtx_unlock(&sc->sc_queue_mtx);
        if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
                return (0);
        sx_assert(&sc->sc_lock, SX_XLOCKED);
        G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
        sx_xunlock(&sc->sc_lock);
        /*
         * The DONE flag is tested without the mutex held; the hz * 5
         * timeout bounds the delay if a wakeup slips in between the test
         * and the sleep.  The mutex is taken anew on every iteration
         * because the sleep is requested with PDROP.
         */
        while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
                mtx_lock(&sc->sc_events_mtx);
                MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
                    hz * 5);
        }
        error = ep->e_error;
        g_raid3_event_free(ep);
        sx_xlock(&sc->sc_lock);
        return (error);
}
334
335 static struct g_raid3_event *
336 g_raid3_event_get(struct g_raid3_softc *sc)
337 {
338         struct g_raid3_event *ep;
339
340         mtx_lock(&sc->sc_events_mtx);
341         ep = TAILQ_FIRST(&sc->sc_events);
342         mtx_unlock(&sc->sc_events_mtx);
343         return (ep);
344 }
345
346 static void
347 g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
348 {
349
350         mtx_lock(&sc->sc_events_mtx);
351         TAILQ_REMOVE(&sc->sc_events, ep, e_next);
352         mtx_unlock(&sc->sc_events_mtx);
353 }
354
355 static void
356 g_raid3_event_cancel(struct g_raid3_disk *disk)
357 {
358         struct g_raid3_softc *sc;
359         struct g_raid3_event *ep, *tmpep;
360
361         sc = disk->d_softc;
362         sx_assert(&sc->sc_lock, SX_XLOCKED);
363
364         mtx_lock(&sc->sc_events_mtx);
365         TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
366                 if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
367                         continue;
368                 if (ep->e_disk != disk)
369                         continue;
370                 TAILQ_REMOVE(&sc->sc_events, ep, e_next);
371                 if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
372                         g_raid3_event_free(ep);
373                 else {
374                         ep->e_error = ECANCELED;
375                         wakeup(ep);
376                 }
377         }
378         mtx_unlock(&sc->sc_events_mtx);
379 }
380
381 /*
382  * Return the number of disks in the given state.
383  * If state is equal to -1, count all connected disks.
384  */
385 u_int
386 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
387 {
388         struct g_raid3_disk *disk;
389         u_int n, ndisks;
390
391         sx_assert(&sc->sc_lock, SX_LOCKED);
392
393         for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
394                 disk = &sc->sc_disks[n];
395                 if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
396                         continue;
397                 if (state == -1 || disk->d_state == state)
398                         ndisks++;
399         }
400         return (ndisks);
401 }
402
403 static u_int
404 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
405 {
406         struct bio *bp;
407         u_int nreqs = 0;
408
409         mtx_lock(&sc->sc_queue_mtx);
410         TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
411                 if (bp->bio_from == cp)
412                         nreqs++;
413         }
414         mtx_unlock(&sc->sc_queue_mtx);
415         return (nreqs);
416 }
417
418 static int
419 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
420 {
421
422         if (cp->index > 0) {
423                 G_RAID3_DEBUG(2,
424                     "I/O requests for %s exist, can't destroy it now.",
425                     cp->provider->name);
426                 return (1);
427         }
428         if (g_raid3_nrequests(sc, cp) > 0) {
429                 G_RAID3_DEBUG(2,
430                     "I/O requests for %s in queue, can't destroy it now.",
431                     cp->provider->name);
432                 return (1);
433         }
434         return (0);
435 }
436
437 static void
438 g_raid3_destroy_consumer(void *arg, int flags __unused)
439 {
440         struct g_consumer *cp;
441
442         g_topology_assert();
443
444         cp = arg;
445         G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
446         g_detach(cp);
447         g_destroy_consumer(cp);
448 }
449
450 static void
451 g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
452 {
453         struct g_provider *pp;
454         int retaste_wait;
455
456         g_topology_assert();
457
458         cp->private = NULL;
459         if (g_raid3_is_busy(sc, cp))
460                 return;
461         G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
462         pp = cp->provider;
463         retaste_wait = 0;
464         if (cp->acw == 1) {
465                 if ((pp->geom->flags & G_GEOM_WITHER) == 0)
466                         retaste_wait = 1;
467         }
468         G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
469             -cp->acw, -cp->ace, 0);
470         if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
471                 g_access(cp, -cp->acr, -cp->acw, -cp->ace);
472         if (retaste_wait) {
473                 /*
474                  * After retaste event was send (inside g_access()), we can send
475                  * event to detach and destroy consumer.
476                  * A class, which has consumer to the given provider connected
477                  * will not receive retaste event for the provider.
478                  * This is the way how I ignore retaste events when I close
479                  * consumers opened for write: I detach and destroy consumer
480                  * after retaste event is sent.
481                  */
482                 g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
483                 return;
484         }
485         G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
486         g_detach(cp);
487         g_destroy_consumer(cp);
488 }
489
490 static int
491 g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
492 {
493         struct g_consumer *cp;
494         int error;
495
496         g_topology_assert_not();
497         KASSERT(disk->d_consumer == NULL,
498             ("Disk already connected (device %s).", disk->d_softc->sc_name));
499
500         g_topology_lock();
501         cp = g_new_consumer(disk->d_softc->sc_geom);
502         error = g_attach(cp, pp);
503         if (error != 0) {
504                 g_destroy_consumer(cp);
505                 g_topology_unlock();
506                 return (error);
507         }
508         error = g_access(cp, 1, 1, 1);
509                 g_topology_unlock();
510         if (error != 0) {
511                 g_detach(cp);
512                 g_destroy_consumer(cp);
513                 G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
514                     pp->name, error);
515                 return (error);
516         }
517         disk->d_consumer = cp;
518         disk->d_consumer->private = disk;
519         disk->d_consumer->index = 0;
520         G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
521         return (0);
522 }
523
524 static void
525 g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
526 {
527
528         g_topology_assert();
529
530         if (cp == NULL)
531                 return;
532         if (cp->provider != NULL)
533                 g_raid3_kill_consumer(sc, cp);
534         else
535                 g_destroy_consumer(cp);
536 }
537
538 /*
539  * Initialize disk. This means allocate memory, create consumer, attach it
540  * to the provider and open access (r1w1e1) to it.
541  */
542 static struct g_raid3_disk *
543 g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
544     struct g_raid3_metadata *md, int *errorp)
545 {
546         struct g_raid3_disk *disk;
547         int error;
548
549         disk = &sc->sc_disks[md->md_no];
550         error = g_raid3_connect_disk(disk, pp);
551         if (error != 0) {
552                 if (errorp != NULL)
553                         *errorp = error;
554                 return (NULL);
555         }
556         disk->d_state = G_RAID3_DISK_STATE_NONE;
557         disk->d_flags = md->md_dflags;
558         if (md->md_provider[0] != '\0')
559                 disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
560         disk->d_sync.ds_consumer = NULL;
561         disk->d_sync.ds_offset = md->md_sync_offset;
562         disk->d_sync.ds_offset_done = md->md_sync_offset;
563         disk->d_genid = md->md_genid;
564         disk->d_sync.ds_syncid = md->md_syncid;
565         if (errorp != NULL)
566                 *errorp = 0;
567         return (disk);
568 }
569
/*
 * Disconnect a disk from the device and put its slot into the NODISK
 * state.  A slot that is already NODISK is left untouched.
 *
 * Must be called without the topology lock and with sc_lock held
 * exclusively.  Pending events for the disk are cancelled first.
 */
static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
        struct g_raid3_softc *sc;

        g_topology_assert_not();
        sc = disk->d_softc;
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
                return;
        g_raid3_event_cancel(disk);
        switch (disk->d_state) {
        case G_RAID3_DISK_STATE_SYNCHRONIZING:
                /* Stop a running synchronization before disconnecting. */
                if (sc->sc_syncdisk != NULL)
                        g_raid3_sync_stop(sc, 1);
                /* FALLTHROUGH */
        case G_RAID3_DISK_STATE_NEW:
        case G_RAID3_DISK_STATE_STALE:
        case G_RAID3_DISK_STATE_ACTIVE:
                g_topology_lock();
                g_raid3_disconnect_consumer(sc, disk->d_consumer);
                g_topology_unlock();
                disk->d_consumer = NULL;
                break;
        default:
                /* Any other state is a programming error. */
                KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
                    g_raid3_get_diskname(disk),
                    g_raid3_disk_state2str(disk->d_state)));
        }
        disk->d_state = G_RAID3_DISK_STATE_NODISK;
}
602
/*
 * Tear the whole device down: provider, disks, pending events, the
 * synchronization geom, the device geom, the UMA zones and the locks.
 *
 * Must be called without the topology lock and with sc_lock held
 * exclusively.  sc_lock is released and destroyed before returning, so the
 * caller must not touch it afterwards.
 */
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
        struct g_raid3_event *ep;
        struct g_raid3_disk *disk;
        struct g_geom *gp;
        struct g_consumer *cp;
        u_int n;

        g_topology_assert_not();
        sx_assert(&sc->sc_lock, SX_XLOCKED);

        gp = sc->sc_geom;
        if (sc->sc_provider != NULL)
                g_raid3_destroy_provider(sc);
        /* Write out clean metadata, then disconnect every present disk. */
        for (n = 0; n < sc->sc_ndisks; n++) {
                disk = &sc->sc_disks[n];
                if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
                        disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
                        g_raid3_update_metadata(disk);
                        g_raid3_destroy_disk(disk);
                }
        }
        /*
         * Flush the event queue: events nobody waits for are freed, the
         * others are completed with ECANCELED so that their waiters in
         * g_raid3_event_send() wake up and free them.
         */
        while ((ep = g_raid3_event_get(sc)) != NULL) {
                g_raid3_event_remove(sc, ep);
                if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
                        g_raid3_event_free(ep);
                else {
                        ep->e_error = ECANCELED;
                        ep->e_flags |= G_RAID3_EVENT_DONE;
                        G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
                        mtx_lock(&sc->sc_events_mtx);
                        wakeup(ep);
                        mtx_unlock(&sc->sc_events_mtx);
                }
        }
        callout_drain(&sc->sc_callout);
        /* Only the first consumer of the synchronization geom is handled. */
        cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
        g_topology_lock();
        if (cp != NULL)
                g_raid3_disconnect_consumer(sc, cp);
        g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
        G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
        g_wither_geom(gp, ENXIO);
        g_topology_unlock();
        /* The zones are only destroyed when malloc(9) is not in use. */
        if (!g_raid3_use_malloc) {
                uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
                uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
                uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
        }
        mtx_destroy(&sc->sc_queue_mtx);
        mtx_destroy(&sc->sc_events_mtx);
        sx_xunlock(&sc->sc_lock);
        sx_destroy(&sc->sc_lock);
}
658
659 static void
660 g_raid3_orphan(struct g_consumer *cp)
661 {
662         struct g_raid3_disk *disk;
663
664         g_topology_assert();
665
666         disk = cp->private;
667         if (disk == NULL)
668                 return;
669         disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
670         g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
671             G_RAID3_EVENT_DONTWAIT);
672 }
673
674 static int
675 g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
676 {
677         struct g_raid3_softc *sc;
678         struct g_consumer *cp;
679         off_t offset, length;
680         u_char *sector;
681         int error = 0;
682
683         g_topology_assert_not();
684         sc = disk->d_softc;
685         sx_assert(&sc->sc_lock, SX_LOCKED);
686
687         cp = disk->d_consumer;
688         KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
689         KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
690         KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
691             ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
692             cp->acw, cp->ace));
693         length = cp->provider->sectorsize;
694         offset = cp->provider->mediasize - length;
695         sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
696         if (md != NULL)
697                 raid3_metadata_encode(md, sector);
698         error = g_write_data(cp, offset, sector, length);
699         free(sector, M_RAID3);
700         if (error != 0) {
701                 if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
702                         G_RAID3_DEBUG(0, "Cannot write metadata on %s "
703                             "(device=%s, error=%d).",
704                             g_raid3_get_diskname(disk), sc->sc_name, error);
705                         disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
706                 } else {
707                         G_RAID3_DEBUG(1, "Cannot write metadata on %s "
708                             "(device=%s, error=%d).",
709                             g_raid3_get_diskname(disk), sc->sc_name, error);
710                 }
711                 if (g_raid3_disconnect_on_failure &&
712                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
713                         sc->sc_bump_id |= G_RAID3_BUMP_GENID;
714                         g_raid3_event_send(disk,
715                             G_RAID3_DISK_STATE_DISCONNECTED,
716                             G_RAID3_EVENT_DONTWAIT);
717                 }
718         }
719         return (error);
720 }
721
722 int
723 g_raid3_clear_metadata(struct g_raid3_disk *disk)
724 {
725         int error;
726
727         g_topology_assert_not();
728         sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
729
730         error = g_raid3_write_metadata(disk, NULL);
731         if (error == 0) {
732                 G_RAID3_DEBUG(2, "Metadata on %s cleared.",
733                     g_raid3_get_diskname(disk));
734         } else {
735                 G_RAID3_DEBUG(0,
736                     "Cannot clear metadata on disk %s (error=%d).",
737                     g_raid3_get_diskname(disk), error);
738         }
739         return (error);
740 }
741
742 void
743 g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
744 {
745         struct g_raid3_softc *sc;
746         struct g_provider *pp;
747
748         sc = disk->d_softc;
749         strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
750         md->md_version = G_RAID3_VERSION;
751         strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
752         md->md_id = sc->sc_id;
753         md->md_all = sc->sc_ndisks;
754         md->md_genid = sc->sc_genid;
755         md->md_mediasize = sc->sc_mediasize;
756         md->md_sectorsize = sc->sc_sectorsize;
757         md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
758         md->md_no = disk->d_no;
759         md->md_syncid = disk->d_sync.ds_syncid;
760         md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
761         if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
762                 md->md_sync_offset = 0;
763         else {
764                 md->md_sync_offset =
765                     disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1);
766         }
767         if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
768                 pp = disk->d_consumer->provider;
769         else
770                 pp = NULL;
771         if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
772                 strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
773         else
774                 bzero(md->md_provider, sizeof(md->md_provider));
775         if (pp != NULL)
776                 md->md_provsize = pp->mediasize;
777         else
778                 md->md_provsize = 0;
779 }
780
781 void
782 g_raid3_update_metadata(struct g_raid3_disk *disk)
783 {
784         struct g_raid3_softc *sc;
785         struct g_raid3_metadata md;
786         int error;
787
788         g_topology_assert_not();
789         sc = disk->d_softc;
790         sx_assert(&sc->sc_lock, SX_LOCKED);
791
792         g_raid3_fill_metadata(disk, &md);
793         error = g_raid3_write_metadata(disk, &md);
794         if (error == 0) {
795                 G_RAID3_DEBUG(2, "Metadata on %s updated.",
796                     g_raid3_get_diskname(disk));
797         } else {
798                 G_RAID3_DEBUG(0,
799                     "Cannot update metadata on disk %s (error=%d).",
800                     g_raid3_get_diskname(disk), error);
801         }
802 }
803
804 static void
805 g_raid3_bump_syncid(struct g_raid3_softc *sc)
806 {
807         struct g_raid3_disk *disk;
808         u_int n;
809
810         g_topology_assert_not();
811         sx_assert(&sc->sc_lock, SX_XLOCKED);
812         KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
813             ("%s called with no active disks (device=%s).", __func__,
814             sc->sc_name));
815
816         sc->sc_syncid++;
817         G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
818             sc->sc_syncid);
819         for (n = 0; n < sc->sc_ndisks; n++) {
820                 disk = &sc->sc_disks[n];
821                 if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
822                     disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
823                         disk->d_sync.ds_syncid = sc->sc_syncid;
824                         g_raid3_update_metadata(disk);
825                 }
826         }
827 }
828
829 static void
830 g_raid3_bump_genid(struct g_raid3_softc *sc)
831 {
832         struct g_raid3_disk *disk;
833         u_int n;
834
835         g_topology_assert_not();
836         sx_assert(&sc->sc_lock, SX_XLOCKED);
837         KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
838             ("%s called with no active disks (device=%s).", __func__,
839             sc->sc_name));
840
841         sc->sc_genid++;
842         G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
843             sc->sc_genid);
844         for (n = 0; n < sc->sc_ndisks; n++) {
845                 disk = &sc->sc_disks[n];
846                 if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
847                     disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
848                         disk->d_genid = sc->sc_genid;
849                         g_raid3_update_metadata(disk);
850                 }
851         }
852 }
853
854 static int
855 g_raid3_idle(struct g_raid3_softc *sc, int acw)
856 {
857         struct g_raid3_disk *disk;
858         u_int i;
859         int timeout;
860
861         g_topology_assert_not();
862         sx_assert(&sc->sc_lock, SX_XLOCKED);
863
864         if (sc->sc_provider == NULL)
865                 return (0);
866         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
867                 return (0);
868         if (sc->sc_idle)
869                 return (0);
870         if (sc->sc_writes > 0)
871                 return (0);
872         if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
873                 timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
874                 if (!g_raid3_shutdown && timeout > 0)
875                         return (timeout);
876         }
877         sc->sc_idle = 1;
878         for (i = 0; i < sc->sc_ndisks; i++) {
879                 disk = &sc->sc_disks[i];
880                 if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
881                         continue;
882                 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
883                     g_raid3_get_diskname(disk), sc->sc_name);
884                 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
885                 g_raid3_update_metadata(disk);
886         }
887         return (0);
888 }
889
890 static void
891 g_raid3_unidle(struct g_raid3_softc *sc)
892 {
893         struct g_raid3_disk *disk;
894         u_int i;
895
896         g_topology_assert_not();
897         sx_assert(&sc->sc_lock, SX_XLOCKED);
898
899         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
900                 return;
901         sc->sc_idle = 0;
902         sc->sc_last_write = time_uptime;
903         for (i = 0; i < sc->sc_ndisks; i++) {
904                 disk = &sc->sc_disks[i];
905                 if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
906                         continue;
907                 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
908                     g_raid3_get_diskname(disk), sc->sc_name);
909                 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
910                 g_raid3_update_metadata(disk);
911         }
912 }
913
/*
 * Child bios of a parent request are kept on a singly-linked list:
 * the bio_driver1 field of the parent bio is the list head and the
 * bio_caller1 field of each child bio points to the next element.
 *
 * Both accessors expand to parenthesized lvalues, so they can be read
 * and assigned to.
 */
#define G_RAID3_HEAD_BIO(pbp)   ((pbp)->bio_driver1)

#define G_RAID3_NEXT_BIO(cbp)   ((cbp)->bio_caller1)

/*
 * Visit every child of pbp.  The current element must not be unlinked
 * inside the loop body, because its next pointer is read afterwards.
 */
#define G_RAID3_FOREACH_BIO(pbp, bp)                                    \
        for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;                \
            (bp) = G_RAID3_NEXT_BIO(bp))

/*
 * Same as G_RAID3_FOREACH_BIO(), but the successor is saved in tmpbp
 * before the body runs, so the current element may be unlinked.
 */
#define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)                        \
        for ((bp) = G_RAID3_HEAD_BIO(pbp);                              \
            (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);        \
            (bp) = (tmpbp))
930
931 static void
932 g_raid3_init_bio(struct bio *pbp)
933 {
934
935         G_RAID3_HEAD_BIO(pbp) = NULL;
936 }
937
938 static void
939 g_raid3_remove_bio(struct bio *cbp)
940 {
941         struct bio *pbp, *bp;
942
943         pbp = cbp->bio_parent;
944         if (G_RAID3_HEAD_BIO(pbp) == cbp)
945                 G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
946         else {
947                 G_RAID3_FOREACH_BIO(pbp, bp) {
948                         if (G_RAID3_NEXT_BIO(bp) == cbp) {
949                                 G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
950                                 break;
951                         }
952                 }
953         }
954         G_RAID3_NEXT_BIO(cbp) = NULL;
955 }
956
957 static void
958 g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
959 {
960         struct bio *pbp, *bp;
961
962         g_raid3_remove_bio(sbp);
963         pbp = dbp->bio_parent;
964         G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
965         if (G_RAID3_HEAD_BIO(pbp) == dbp)
966                 G_RAID3_HEAD_BIO(pbp) = sbp;
967         else {
968                 G_RAID3_FOREACH_BIO(pbp, bp) {
969                         if (G_RAID3_NEXT_BIO(bp) == dbp) {
970                                 G_RAID3_NEXT_BIO(bp) = sbp;
971                                 break;
972                         }
973                 }
974         }
975         G_RAID3_NEXT_BIO(dbp) = NULL;
976 }
977
978 static void
979 g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
980 {
981         struct bio *bp, *pbp;
982         size_t size;
983
984         pbp = cbp->bio_parent;
985         pbp->bio_children--;
986         KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
987         size = pbp->bio_length / (sc->sc_ndisks - 1);
988         g_raid3_free(sc, cbp->bio_data, size);
989         if (G_RAID3_HEAD_BIO(pbp) == cbp) {
990                 G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
991                 G_RAID3_NEXT_BIO(cbp) = NULL;
992                 g_destroy_bio(cbp);
993         } else {
994                 G_RAID3_FOREACH_BIO(pbp, bp) {
995                         if (G_RAID3_NEXT_BIO(bp) == cbp)
996                                 break;
997                 }
998                 if (bp != NULL) {
999                         KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
1000                             ("NULL bp->bio_driver1"));
1001                         G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
1002                         G_RAID3_NEXT_BIO(cbp) = NULL;
1003                 }
1004                 g_destroy_bio(cbp);
1005         }
1006 }
1007
1008 static struct bio *
1009 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
1010 {
1011         struct bio *bp, *cbp;
1012         size_t size;
1013         int memflag;
1014
1015         cbp = g_clone_bio(pbp);
1016         if (cbp == NULL)
1017                 return (NULL);
1018         size = pbp->bio_length / (sc->sc_ndisks - 1);
1019         if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
1020                 memflag = M_WAITOK;
1021         else
1022                 memflag = M_NOWAIT;
1023         cbp->bio_data = g_raid3_alloc(sc, size, memflag);
1024         if (cbp->bio_data == NULL) {
1025                 pbp->bio_children--;
1026                 g_destroy_bio(cbp);
1027                 return (NULL);
1028         }
1029         G_RAID3_NEXT_BIO(cbp) = NULL;
1030         if (G_RAID3_HEAD_BIO(pbp) == NULL)
1031                 G_RAID3_HEAD_BIO(pbp) = cbp;
1032         else {
1033                 G_RAID3_FOREACH_BIO(pbp, bp) {
1034                         if (G_RAID3_NEXT_BIO(bp) == NULL) {
1035                                 G_RAID3_NEXT_BIO(bp) = cbp;
1036                                 break;
1037                         }
1038                 }
1039         }
1040         return (cbp);
1041 }
1042
1043 static void
1044 g_raid3_scatter(struct bio *pbp)
1045 {
1046         struct g_raid3_softc *sc;
1047         struct g_raid3_disk *disk;
1048         struct bio *bp, *cbp, *tmpbp;
1049         off_t atom, cadd, padd, left;
1050         int first;
1051
1052         sc = pbp->bio_to->geom->softc;
1053         bp = NULL;
1054         if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1055                 /*
1056                  * Find bio for which we should calculate data.
1057                  */
1058                 G_RAID3_FOREACH_BIO(pbp, cbp) {
1059                         if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1060                                 bp = cbp;
1061                                 break;
1062                         }
1063                 }
1064                 KASSERT(bp != NULL, ("NULL parity bio."));
1065         }
1066         atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1067         cadd = padd = 0;
1068         for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1069                 G_RAID3_FOREACH_BIO(pbp, cbp) {
1070                         if (cbp == bp)
1071                                 continue;
1072                         bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
1073                         padd += atom;
1074                 }
1075                 cadd += atom;
1076         }
1077         if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1078                 /*
1079                  * Calculate parity.
1080                  */
1081                 first = 1;
1082                 G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1083                         if (cbp == bp)
1084                                 continue;
1085                         if (first) {
1086                                 bcopy(cbp->bio_data, bp->bio_data,
1087                                     bp->bio_length);
1088                                 first = 0;
1089                         } else {
1090                                 g_raid3_xor(cbp->bio_data, bp->bio_data,
1091                                     bp->bio_length);
1092                         }
1093                         if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
1094                                 g_raid3_destroy_bio(sc, cbp);
1095                 }
1096         }
1097         G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1098                 struct g_consumer *cp;
1099
1100                 disk = cbp->bio_caller2;
1101                 cp = disk->d_consumer;
1102                 cbp->bio_to = cp->provider;
1103                 G_RAID3_LOGREQ(3, cbp, "Sending request.");
1104                 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1105                     ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1106                     cp->acr, cp->acw, cp->ace));
1107                 cp->index++;
1108                 sc->sc_writes++;
1109                 g_io_request(cbp, cp);
1110         }
1111 }
1112
1113 static void
1114 g_raid3_gather(struct bio *pbp)
1115 {
1116         struct g_raid3_softc *sc;
1117         struct g_raid3_disk *disk;
1118         struct bio *xbp, *fbp, *cbp;
1119         off_t atom, cadd, padd, left;
1120
1121         sc = pbp->bio_to->geom->softc;
1122         /*
1123          * Find bio for which we have to calculate data.
1124          * While going through this path, check if all requests
1125          * succeeded, if not, deny whole request.
1126          * If we're in COMPLETE mode, we allow one request to fail,
1127          * so if we find one, we're sending it to the parity consumer.
1128          * If there are more failed requests, we deny whole request.
1129          */
1130         xbp = fbp = NULL;
1131         G_RAID3_FOREACH_BIO(pbp, cbp) {
1132                 if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1133                         KASSERT(xbp == NULL, ("More than one parity bio."));
1134                         xbp = cbp;
1135                 }
1136                 if (cbp->bio_error == 0)
1137                         continue;
1138                 /*
1139                  * Found failed request.
1140                  */
1141                 if (fbp == NULL) {
1142                         if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1143                                 /*
1144                                  * We are already in degraded mode, so we can't
1145                                  * accept any failures.
1146                                  */
1147                                 if (pbp->bio_error == 0)
1148                                         pbp->bio_error = cbp->bio_error;
1149                         } else {
1150                                 fbp = cbp;
1151                         }
1152                 } else {
1153                         /*
1154                          * Next failed request, that's too many.
1155                          */
1156                         if (pbp->bio_error == 0)
1157                                 pbp->bio_error = fbp->bio_error;
1158                 }
1159                 disk = cbp->bio_caller2;
1160                 if (disk == NULL)
1161                         continue;
1162                 if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
1163                         disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
1164                         G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
1165                             cbp->bio_error);
1166                 } else {
1167                         G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
1168                             cbp->bio_error);
1169                 }
1170                 if (g_raid3_disconnect_on_failure &&
1171                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1172                         sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1173                         g_raid3_event_send(disk,
1174                             G_RAID3_DISK_STATE_DISCONNECTED,
1175                             G_RAID3_EVENT_DONTWAIT);
1176                 }
1177         }
1178         if (pbp->bio_error != 0)
1179                 goto finish;
1180         if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1181                 pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1182                 if (xbp != fbp)
1183                         g_raid3_replace_bio(xbp, fbp);
1184                 g_raid3_destroy_bio(sc, fbp);
1185         } else if (fbp != NULL) {
1186                 struct g_consumer *cp;
1187
1188                 /*
1189                  * One request failed, so send the same request to
1190                  * the parity consumer.
1191                  */
1192                 disk = pbp->bio_driver2;
1193                 if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1194                         pbp->bio_error = fbp->bio_error;
1195                         goto finish;
1196                 }
1197                 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1198                 pbp->bio_inbed--;
1199                 fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1200                 if (disk->d_no == sc->sc_ndisks - 1)
1201                         fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1202                 fbp->bio_error = 0;
1203                 fbp->bio_completed = 0;
1204                 fbp->bio_children = 0;
1205                 fbp->bio_inbed = 0;
1206                 cp = disk->d_consumer;
1207                 fbp->bio_caller2 = disk;
1208                 fbp->bio_to = cp->provider;
1209                 G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1210                 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1211                     ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1212                     cp->acr, cp->acw, cp->ace));
1213                 cp->index++;
1214                 g_io_request(fbp, cp);
1215                 return;
1216         }
1217         if (xbp != NULL) {
1218                 /*
1219                  * Calculate parity.
1220                  */
1221                 G_RAID3_FOREACH_BIO(pbp, cbp) {
1222                         if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1223                                 continue;
1224                         g_raid3_xor(cbp->bio_data, xbp->bio_data,
1225                             xbp->bio_length);
1226                 }
1227                 xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1228                 if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1229                         if (!g_raid3_is_zero(xbp)) {
1230                                 g_raid3_parity_mismatch++;
1231                                 pbp->bio_error = EIO;
1232                                 goto finish;
1233                         }
1234                         g_raid3_destroy_bio(sc, xbp);
1235                 }
1236         }
1237         atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1238         cadd = padd = 0;
1239         for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1240                 G_RAID3_FOREACH_BIO(pbp, cbp) {
1241                         bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1242                         pbp->bio_completed += atom;
1243                         padd += atom;
1244                 }
1245                 cadd += atom;
1246         }
1247 finish:
1248         if (pbp->bio_error == 0)
1249                 G_RAID3_LOGREQ(3, pbp, "Request finished.");
1250         else {
1251                 if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1252                         G_RAID3_LOGREQ(1, pbp, "Verification error.");
1253                 else
1254                         G_RAID3_LOGREQ(0, pbp, "Request failed.");
1255         }
1256         pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1257         while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1258                 g_raid3_destroy_bio(sc, cbp);
1259         g_io_deliver(pbp, pbp->bio_error);
1260 }
1261
1262 static void
1263 g_raid3_done(struct bio *bp)
1264 {
1265         struct g_raid3_softc *sc;
1266
1267         sc = bp->bio_from->geom->softc;
1268         bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1269         G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1270         mtx_lock(&sc->sc_queue_mtx);
1271         bioq_insert_head(&sc->sc_queue, bp);
1272         mtx_unlock(&sc->sc_queue_mtx);
1273         wakeup(sc);
1274         wakeup(&sc->sc_queue);
1275 }
1276
1277 static void
1278 g_raid3_regular_request(struct bio *cbp)
1279 {
1280         struct g_raid3_softc *sc;
1281         struct g_raid3_disk *disk;
1282         struct bio *pbp;
1283
1284         g_topology_assert_not();
1285
1286         pbp = cbp->bio_parent;
1287         sc = pbp->bio_to->geom->softc;
1288         cbp->bio_from->index--;
1289         if (cbp->bio_cmd == BIO_WRITE)
1290                 sc->sc_writes--;
1291         disk = cbp->bio_from->private;
1292         if (disk == NULL) {
1293                 g_topology_lock();
1294                 g_raid3_kill_consumer(sc, cbp->bio_from);
1295                 g_topology_unlock();
1296         }
1297
1298         G_RAID3_LOGREQ(3, cbp, "Request finished.");
1299         pbp->bio_inbed++;
1300         KASSERT(pbp->bio_inbed <= pbp->bio_children,
1301             ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
1302             pbp->bio_children));
1303         if (pbp->bio_inbed != pbp->bio_children)
1304                 return;
1305         switch (pbp->bio_cmd) {
1306         case BIO_READ:
1307                 g_raid3_gather(pbp);
1308                 break;
1309         case BIO_WRITE:
1310         case BIO_DELETE:
1311             {
1312                 int error = 0;
1313
1314                 pbp->bio_completed = pbp->bio_length;
1315                 while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
1316                         if (cbp->bio_error == 0) {
1317                                 g_raid3_destroy_bio(sc, cbp);
1318                                 continue;
1319                         }
1320
1321                         if (error == 0)
1322                                 error = cbp->bio_error;
1323                         else if (pbp->bio_error == 0) {
1324                                 /*
1325                                  * Next failed request, that's too many.
1326                                  */
1327                                 pbp->bio_error = error;
1328                         }
1329
1330                         disk = cbp->bio_caller2;
1331                         if (disk == NULL) {
1332                                 g_raid3_destroy_bio(sc, cbp);
1333                                 continue;
1334                         }
1335
1336                         if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
1337                                 disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
1338                                 G_RAID3_LOGREQ(0, cbp,
1339                                     "Request failed (error=%d).",
1340                                     cbp->bio_error);
1341                         } else {
1342                                 G_RAID3_LOGREQ(1, cbp,
1343                                     "Request failed (error=%d).",
1344                                     cbp->bio_error);
1345                         }
1346                         if (g_raid3_disconnect_on_failure &&
1347                             sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1348                                 sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1349                                 g_raid3_event_send(disk,
1350                                     G_RAID3_DISK_STATE_DISCONNECTED,
1351                                     G_RAID3_EVENT_DONTWAIT);
1352                         }
1353                         g_raid3_destroy_bio(sc, cbp);
1354                 }
1355                 if (pbp->bio_error == 0)
1356                         G_RAID3_LOGREQ(3, pbp, "Request finished.");
1357                 else
1358                         G_RAID3_LOGREQ(0, pbp, "Request failed.");
1359                 pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
1360                 pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
1361                 bioq_remove(&sc->sc_inflight, pbp);
1362                 /* Release delayed sync requests if possible. */
1363                 g_raid3_sync_release(sc);
1364                 g_io_deliver(pbp, pbp->bio_error);
1365                 break;
1366             }
1367         }
1368 }
1369
1370 static void
1371 g_raid3_sync_done(struct bio *bp)
1372 {
1373         struct g_raid3_softc *sc;
1374
1375         G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1376         sc = bp->bio_from->geom->softc;
1377         bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1378         mtx_lock(&sc->sc_queue_mtx);
1379         bioq_insert_head(&sc->sc_queue, bp);
1380         mtx_unlock(&sc->sc_queue_mtx);
1381         wakeup(sc);
1382         wakeup(&sc->sc_queue);
1383 }
1384
1385 static void
1386 g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
1387 {
1388         struct bio_queue_head queue;
1389         struct g_raid3_disk *disk;
1390         struct g_consumer *cp;
1391         struct bio *cbp;
1392         u_int i;
1393
1394         bioq_init(&queue);
1395         for (i = 0; i < sc->sc_ndisks; i++) {
1396                 disk = &sc->sc_disks[i];
1397                 if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
1398                         continue;
1399                 cbp = g_clone_bio(bp);
1400                 if (cbp == NULL) {
1401                         for (cbp = bioq_first(&queue); cbp != NULL;
1402                             cbp = bioq_first(&queue)) {
1403                                 bioq_remove(&queue, cbp);
1404                                 g_destroy_bio(cbp);
1405                         }
1406                         if (bp->bio_error == 0)
1407                                 bp->bio_error = ENOMEM;
1408                         g_io_deliver(bp, bp->bio_error);
1409                         return;
1410                 }
1411                 bioq_insert_tail(&queue, cbp);
1412                 cbp->bio_done = g_std_done;
1413                 cbp->bio_caller1 = disk;
1414                 cbp->bio_to = disk->d_consumer->provider;
1415         }
1416         for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
1417                 bioq_remove(&queue, cbp);
1418                 G_RAID3_LOGREQ(3, cbp, "Sending request.");
1419                 disk = cbp->bio_caller1;
1420                 cbp->bio_caller1 = NULL;
1421                 cp = disk->d_consumer;
1422                 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1423                     ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1424                     cp->acr, cp->acw, cp->ace));
1425                 g_io_request(cbp, disk->d_consumer);
1426         }
1427 }
1428
1429 static void
1430 g_raid3_start(struct bio *bp)
1431 {
1432         struct g_raid3_softc *sc;
1433
1434         sc = bp->bio_to->geom->softc;
1435         /*
1436          * If sc == NULL or there are no valid disks, provider's error
1437          * should be set and g_raid3_start() should not be called at all.
1438          */
1439         KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1440             sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1441             ("Provider's error should be set (error=%d)(device=%s).",
1442             bp->bio_to->error, bp->bio_to->name));
1443         G_RAID3_LOGREQ(3, bp, "Request received.");
1444
1445         switch (bp->bio_cmd) {
1446         case BIO_READ:
1447         case BIO_WRITE:
1448         case BIO_DELETE:
1449                 break;
1450         case BIO_SPEEDUP:
1451         case BIO_FLUSH:
1452                 g_raid3_flush(sc, bp);
1453                 return;
1454         case BIO_GETATTR:
1455         default:
1456                 g_io_deliver(bp, EOPNOTSUPP);
1457                 return;
1458         }
1459         mtx_lock(&sc->sc_queue_mtx);
1460         bioq_insert_tail(&sc->sc_queue, bp);
1461         mtx_unlock(&sc->sc_queue_mtx);
1462         G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1463         wakeup(sc);
1464 }
1465
1466 /*
1467  * Return TRUE if the given request is colliding with a in-progress
1468  * synchronization request.
1469  */
1470 static int
1471 g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp)
1472 {
1473         struct g_raid3_disk *disk;
1474         struct bio *sbp;
1475         off_t rstart, rend, sstart, send;
1476         int i;
1477
1478         disk = sc->sc_syncdisk;
1479         if (disk == NULL)
1480                 return (0);
1481         rstart = bp->bio_offset;
1482         rend = bp->bio_offset + bp->bio_length;
1483         for (i = 0; i < g_raid3_syncreqs; i++) {
1484                 sbp = disk->d_sync.ds_bios[i];
1485                 if (sbp == NULL)
1486                         continue;
1487                 sstart = sbp->bio_offset;
1488                 send = sbp->bio_length;
1489                 if (sbp->bio_cmd == BIO_WRITE) {
1490                         sstart *= sc->sc_ndisks - 1;
1491                         send *= sc->sc_ndisks - 1;
1492                 }
1493                 send += sstart;
1494                 if (rend > sstart && rstart < send)
1495                         return (1);
1496         }
1497         return (0);
1498 }
1499
1500 /*
1501  * Return TRUE if the given sync request is colliding with a in-progress regular
1502  * request.
1503  */
1504 static int
1505 g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp)
1506 {
1507         off_t rstart, rend, sstart, send;
1508         struct bio *bp;
1509
1510         if (sc->sc_syncdisk == NULL)
1511                 return (0);
1512         sstart = sbp->bio_offset;
1513         send = sstart + sbp->bio_length;
1514         TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
1515                 rstart = bp->bio_offset;
1516                 rend = bp->bio_offset + bp->bio_length;
1517                 if (rend > sstart && rstart < send)
1518                         return (1);
1519         }
1520         return (0);
1521 }
1522
1523 /*
1524  * Puts request onto delayed queue.
1525  */
1526 static void
1527 g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp)
1528 {
1529
1530         G_RAID3_LOGREQ(2, bp, "Delaying request.");
1531         bioq_insert_head(&sc->sc_regular_delayed, bp);
1532 }
1533
1534 /*
1535  * Puts synchronization request onto delayed queue.
1536  */
1537 static void
1538 g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp)
1539 {
1540
1541         G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
1542         bioq_insert_tail(&sc->sc_sync_delayed, bp);
1543 }
1544
1545 /*
1546  * Releases delayed regular requests which don't collide anymore with sync
1547  * requests.
1548  */
1549 static void
1550 g_raid3_regular_release(struct g_raid3_softc *sc)
1551 {
1552         struct bio *bp, *bp2;
1553
1554         TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
1555                 if (g_raid3_sync_collision(sc, bp))
1556                         continue;
1557                 bioq_remove(&sc->sc_regular_delayed, bp);
1558                 G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
1559                 mtx_lock(&sc->sc_queue_mtx);
1560                 bioq_insert_head(&sc->sc_queue, bp);
1561 #if 0
1562                 /*
1563                  * wakeup() is not needed, because this function is called from
1564                  * the worker thread.
1565                  */
1566                 wakeup(&sc->sc_queue);
1567 #endif
1568                 mtx_unlock(&sc->sc_queue_mtx);
1569         }
1570 }
1571
1572 /*
1573  * Releases delayed sync requests which don't collide anymore with regular
1574  * requests.
1575  */
1576 static void
1577 g_raid3_sync_release(struct g_raid3_softc *sc)
1578 {
1579         struct bio *bp, *bp2;
1580
1581         TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
1582                 if (g_raid3_regular_collision(sc, bp))
1583                         continue;
1584                 bioq_remove(&sc->sc_sync_delayed, bp);
1585                 G_RAID3_LOGREQ(2, bp,
1586                     "Releasing delayed synchronization request.");
1587                 g_io_request(bp, bp->bio_from);
1588         }
1589 }
1590
/*
 * Handle synchronization requests.
 * Every synchronization request is a two-step process: first, a READ request
 * is sent to the active provider and then a WRITE request (with the read data)
 * is sent to the provider being synchronized. When the WRITE is finished, a
 * new synchronization request is sent.
 */
1598 static void
1599 g_raid3_sync_request(struct bio *bp)
1600 {
1601         struct g_raid3_softc *sc;
1602         struct g_raid3_disk *disk;
1603
1604         bp->bio_from->index--;
1605         sc = bp->bio_from->geom->softc;
1606         disk = bp->bio_from->private;
1607         if (disk == NULL) {
1608                 sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
1609                 g_topology_lock();
1610                 g_raid3_kill_consumer(sc, bp->bio_from);
1611                 g_topology_unlock();
1612                 free(bp->bio_data, M_RAID3);
1613                 g_destroy_bio(bp);
1614                 sx_xlock(&sc->sc_lock);
1615                 return;
1616         }
1617
1618         /*
1619          * Synchronization request.
1620          */
1621         switch (bp->bio_cmd) {
1622         case BIO_READ:
1623             {
1624                 struct g_consumer *cp;
1625                 u_char *dst, *src;
1626                 off_t left;
1627                 u_int atom;
1628
1629                 if (bp->bio_error != 0) {
1630                         G_RAID3_LOGREQ(0, bp,
1631                             "Synchronization request failed (error=%d).",
1632                             bp->bio_error);
1633                         g_destroy_bio(bp);
1634                         return;
1635                 }
1636                 G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1637                 atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1638                 dst = src = bp->bio_data;
1639                 if (disk->d_no == sc->sc_ndisks - 1) {
1640                         u_int n;
1641
1642                         /* Parity component. */
1643                         for (left = bp->bio_length; left > 0;
1644                             left -= sc->sc_sectorsize) {
1645                                 bcopy(src, dst, atom);
1646                                 src += atom;
1647                                 for (n = 1; n < sc->sc_ndisks - 1; n++) {
1648                                         g_raid3_xor(src, dst, atom);
1649                                         src += atom;
1650                                 }
1651                                 dst += atom;
1652                         }
1653                 } else {
1654                         /* Regular component. */
1655                         src += atom * disk->d_no;
1656                         for (left = bp->bio_length; left > 0;
1657                             left -= sc->sc_sectorsize) {
1658                                 bcopy(src, dst, atom);
1659                                 src += sc->sc_sectorsize;
1660                                 dst += atom;
1661                         }
1662                 }
1663                 bp->bio_driver1 = bp->bio_driver2 = NULL;
1664                 bp->bio_pflags = 0;
1665                 bp->bio_offset /= sc->sc_ndisks - 1;
1666                 bp->bio_length /= sc->sc_ndisks - 1;
1667                 bp->bio_cmd = BIO_WRITE;
1668                 bp->bio_cflags = 0;
1669                 bp->bio_children = bp->bio_inbed = 0;
1670                 cp = disk->d_consumer;
1671                 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1672                     ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1673                     cp->acr, cp->acw, cp->ace));
1674                 cp->index++;
1675                 g_io_request(bp, cp);
1676                 return;
1677             }
1678         case BIO_WRITE:
1679             {
1680                 struct g_raid3_disk_sync *sync;
1681                 off_t boffset, moffset;
1682                 void *data;
1683                 int i;
1684
1685                 if (bp->bio_error != 0) {
1686                         G_RAID3_LOGREQ(0, bp,
1687                             "Synchronization request failed (error=%d).",
1688                             bp->bio_error);
1689                         g_destroy_bio(bp);
1690                         sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1691                         g_raid3_event_send(disk,
1692                             G_RAID3_DISK_STATE_DISCONNECTED,
1693                             G_RAID3_EVENT_DONTWAIT);
1694                         return;
1695                 }
1696                 G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1697                 sync = &disk->d_sync;
1698                 if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
1699                     sync->ds_consumer == NULL ||
1700                     (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1701                         /* Don't send more synchronization requests. */
1702                         sync->ds_inflight--;
1703                         if (sync->ds_bios != NULL) {
1704                                 i = (int)(uintptr_t)bp->bio_caller1;
1705                                 sync->ds_bios[i] = NULL;
1706                         }
1707                         free(bp->bio_data, M_RAID3);
1708                         g_destroy_bio(bp);
1709                         if (sync->ds_inflight > 0)
1710                                 return;
1711                         if (sync->ds_consumer == NULL ||
1712                             (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1713                                 return;
1714                         }
1715                         /*
1716                          * Disk up-to-date, activate it.
1717                          */
1718                         g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
1719                             G_RAID3_EVENT_DONTWAIT);
1720                         return;
1721                 }
1722
1723                 /* Send next synchronization request. */
1724                 data = bp->bio_data;
1725                 g_reset_bio(bp);
1726                 bp->bio_cmd = BIO_READ;
1727                 bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
1728                 bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1729                 sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1730                 bp->bio_done = g_raid3_sync_done;
1731                 bp->bio_data = data;
1732                 bp->bio_from = sync->ds_consumer;
1733                 bp->bio_to = sc->sc_provider;
1734                 G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1735                 sync->ds_consumer->index++;
1736                 /*
1737                  * Delay the request if it is colliding with a regular request.
1738                  */
1739                 if (g_raid3_regular_collision(sc, bp))
1740                         g_raid3_sync_delay(sc, bp);
1741                 else
1742                         g_io_request(bp, sync->ds_consumer);
1743
1744                 /* Release delayed requests if possible. */
1745                 g_raid3_regular_release(sc);
1746
1747                 /* Find the smallest offset. */
1748                 moffset = sc->sc_mediasize;
1749                 for (i = 0; i < g_raid3_syncreqs; i++) {
1750                         bp = sync->ds_bios[i];
1751                         boffset = bp->bio_offset;
1752                         if (bp->bio_cmd == BIO_WRITE)
1753                                 boffset *= sc->sc_ndisks - 1;
1754                         if (boffset < moffset)
1755                                 moffset = boffset;
1756                 }
1757                 if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
1758                         /* Update offset_done on every 100 blocks. */
1759                         sync->ds_offset_done = moffset;
1760                         g_raid3_update_metadata(disk);
1761                 }
1762                 return;
1763             }
1764         default:
1765                 KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
1766                     bp->bio_cmd, sc->sc_name));
1767                 break;
1768         }
1769 }
1770
1771 static int
1772 g_raid3_register_request(struct bio *pbp)
1773 {
1774         struct g_raid3_softc *sc;
1775         struct g_raid3_disk *disk;
1776         struct g_consumer *cp;
1777         struct bio *cbp, *tmpbp;
1778         off_t offset, length;
1779         u_int n, ndisks;
1780         int round_robin, verify;
1781
1782         ndisks = 0;
1783         sc = pbp->bio_to->geom->softc;
1784         if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
1785             sc->sc_syncdisk == NULL) {
1786                 g_io_deliver(pbp, EIO);
1787                 return (0);
1788         }
1789         g_raid3_init_bio(pbp);
1790         length = pbp->bio_length / (sc->sc_ndisks - 1);
1791         offset = pbp->bio_offset / (sc->sc_ndisks - 1);
1792         round_robin = verify = 0;
1793         switch (pbp->bio_cmd) {
1794         case BIO_READ:
1795                 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
1796                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1797                         pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
1798                         verify = 1;
1799                         ndisks = sc->sc_ndisks;
1800                 } else {
1801                         verify = 0;
1802                         ndisks = sc->sc_ndisks - 1;
1803                 }
1804                 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
1805                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1806                         round_robin = 1;
1807                 } else {
1808                         round_robin = 0;
1809                 }
1810                 KASSERT(!round_robin || !verify,
1811                     ("ROUND-ROBIN and VERIFY are mutually exclusive."));
1812                 pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
1813                 break;
1814         case BIO_WRITE:
1815         case BIO_DELETE:
1816                 /*
1817                  * Delay the request if it is colliding with a synchronization
1818                  * request.
1819                  */
1820                 if (g_raid3_sync_collision(sc, pbp)) {
1821                         g_raid3_regular_delay(sc, pbp);
1822                         return (0);
1823                 }
1824
1825                 if (sc->sc_idle)
1826                         g_raid3_unidle(sc);
1827                 else
1828                         sc->sc_last_write = time_uptime;
1829
1830                 ndisks = sc->sc_ndisks;
1831                 break;
1832         }
1833         for (n = 0; n < ndisks; n++) {
1834                 disk = &sc->sc_disks[n];
1835                 cbp = g_raid3_clone_bio(sc, pbp);
1836                 if (cbp == NULL) {
1837                         while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1838                                 g_raid3_destroy_bio(sc, cbp);
1839                         /*
1840                          * To prevent deadlock, we must run back up
1841                          * with the ENOMEM for failed requests of any
1842                          * of our consumers.  Our own sync requests
1843                          * can stick around, as they are finite.
1844                          */
1845                         if ((pbp->bio_cflags &
1846                             G_RAID3_BIO_CFLAG_REGULAR) != 0) {
1847                                 g_io_deliver(pbp, ENOMEM);
1848                                 return (0);
1849                         }
1850                         return (ENOMEM);
1851                 }
1852                 cbp->bio_offset = offset;
1853                 cbp->bio_length = length;
1854                 cbp->bio_done = g_raid3_done;
1855                 switch (pbp->bio_cmd) {
1856                 case BIO_READ:
1857                         if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1858                                 /*
1859                                  * Replace invalid component with the parity
1860                                  * component.
1861                                  */
1862                                 disk = &sc->sc_disks[sc->sc_ndisks - 1];
1863                                 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1864                                 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1865                         } else if (round_robin &&
1866                             disk->d_no == sc->sc_round_robin) {
1867                                 /*
1868                                  * In round-robin mode skip one data component
1869                                  * and use parity component when reading.
1870                                  */
1871                                 pbp->bio_driver2 = disk;
1872                                 disk = &sc->sc_disks[sc->sc_ndisks - 1];
1873                                 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1874                                 sc->sc_round_robin++;
1875                                 round_robin = 0;
1876                         } else if (verify && disk->d_no == sc->sc_ndisks - 1) {
1877                                 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1878                         }
1879                         break;
1880                 case BIO_WRITE:
1881                 case BIO_DELETE:
1882                         if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
1883                             disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
1884                                 if (n == ndisks - 1) {
1885                                         /*
1886                                          * Active parity component, mark it as such.
1887                                          */
1888                                         cbp->bio_cflags |=
1889                                             G_RAID3_BIO_CFLAG_PARITY;
1890                                 }
1891                         } else {
1892                                 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1893                                 if (n == ndisks - 1) {
1894                                         /*
1895                                          * Parity component is not connected,
1896                                          * so destroy its request.
1897                                          */
1898                                         pbp->bio_pflags |=
1899                                             G_RAID3_BIO_PFLAG_NOPARITY;
1900                                         g_raid3_destroy_bio(sc, cbp);
1901                                         cbp = NULL;
1902                                 } else {
1903                                         cbp->bio_cflags |=
1904                                             G_RAID3_BIO_CFLAG_NODISK;
1905                                         disk = NULL;
1906                                 }
1907                         }
1908                         break;
1909                 }
1910                 if (cbp != NULL)
1911                         cbp->bio_caller2 = disk;
1912         }
1913         switch (pbp->bio_cmd) {
1914         case BIO_READ:
1915                 if (round_robin) {
1916                         /*
1917                          * If we are in round-robin mode and 'round_robin' is
1918                          * still 1, it means, that we skipped parity component
1919                          * for this read and must reset sc_round_robin field.
1920                          */
1921                         sc->sc_round_robin = 0;
1922                 }
1923                 G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1924                         disk = cbp->bio_caller2;
1925                         cp = disk->d_consumer;
1926                         cbp->bio_to = cp->provider;
1927                         G_RAID3_LOGREQ(3, cbp, "Sending request.");
1928                         KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1929                             ("Consumer %s not opened (r%dw%de%d).",
1930                             cp->provider->name, cp->acr, cp->acw, cp->ace));
1931                         cp->index++;
1932                         g_io_request(cbp, cp);
1933                 }
1934                 break;
1935         case BIO_WRITE:
1936         case BIO_DELETE:
1937                 /*
1938                  * Put request onto inflight queue, so we can check if new
1939                  * synchronization requests don't collide with it.
1940                  */
1941                 bioq_insert_tail(&sc->sc_inflight, pbp);
1942
1943                 /*
1944                  * Bump syncid on first write.
1945                  */
1946                 if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
1947                         sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
1948                         g_raid3_bump_syncid(sc);
1949                 }
1950                 g_raid3_scatter(pbp);
1951                 break;
1952         }
1953         return (0);
1954 }
1955
1956 static int
1957 g_raid3_can_destroy(struct g_raid3_softc *sc)
1958 {
1959         struct g_geom *gp;
1960         struct g_consumer *cp;
1961
1962         g_topology_assert();
1963         gp = sc->sc_geom;
1964         if (gp->softc == NULL)
1965                 return (1);
1966         LIST_FOREACH(cp, &gp->consumer, consumer) {
1967                 if (g_raid3_is_busy(sc, cp))
1968                         return (0);
1969         }
1970         gp = sc->sc_sync.ds_geom;
1971         LIST_FOREACH(cp, &gp->consumer, consumer) {
1972                 if (g_raid3_is_busy(sc, cp))
1973                         return (0);
1974         }
1975         G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1976             sc->sc_name);
1977         return (1);
1978 }
1979
1980 static int
1981 g_raid3_try_destroy(struct g_raid3_softc *sc)
1982 {
1983
1984         g_topology_assert_not();
1985         sx_assert(&sc->sc_lock, SX_XLOCKED);
1986
1987         if (sc->sc_rootmount != NULL) {
1988                 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
1989                     sc->sc_rootmount);
1990                 root_mount_rel(sc->sc_rootmount);
1991                 sc->sc_rootmount = NULL;
1992         }
1993
1994         g_topology_lock();
1995         if (!g_raid3_can_destroy(sc)) {
1996                 g_topology_unlock();
1997                 return (0);
1998         }
1999         sc->sc_geom->softc = NULL;
2000         sc->sc_sync.ds_geom->softc = NULL;
2001         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
2002                 g_topology_unlock();
2003                 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
2004                     &sc->sc_worker);
2005                 /* Unlock sc_lock here, as it can be destroyed after wakeup. */
2006                 sx_xunlock(&sc->sc_lock);
2007                 wakeup(&sc->sc_worker);
2008                 sc->sc_worker = NULL;
2009         } else {
2010                 g_topology_unlock();
2011                 g_raid3_destroy_device(sc);
2012                 free(sc->sc_disks, M_RAID3);
2013                 free(sc, M_RAID3);
2014         }
2015         return (1);
2016 }
2017
2018 /*
2019  * Worker thread.
2020  */
2021 static void
2022 g_raid3_worker(void *arg)
2023 {
2024         struct g_raid3_softc *sc;
2025         struct g_raid3_event *ep;
2026         struct bio *bp;
2027         int timeout;
2028
2029         sc = arg;
2030         thread_lock(curthread);
2031         sched_prio(curthread, PRIBIO);
2032         thread_unlock(curthread);
2033
2034         sx_xlock(&sc->sc_lock);
2035         for (;;) {
2036                 G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
2037                 /*
2038                  * First take a look at events.
2039                  * This is important to handle events before any I/O requests.
2040                  */
2041                 ep = g_raid3_event_get(sc);
2042                 if (ep != NULL) {
2043                         g_raid3_event_remove(sc, ep);
2044                         if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
2045                                 /* Update only device status. */
2046                                 G_RAID3_DEBUG(3,
2047                                     "Running event for device %s.",
2048                                     sc->sc_name);
2049                                 ep->e_error = 0;
2050                                 g_raid3_update_device(sc, 1);
2051                         } else {
2052                                 /* Update disk status. */
2053                                 G_RAID3_DEBUG(3, "Running event for disk %s.",
2054                                      g_raid3_get_diskname(ep->e_disk));
2055                                 ep->e_error = g_raid3_update_disk(ep->e_disk,
2056                                     ep->e_state);
2057                                 if (ep->e_error == 0)
2058                                         g_raid3_update_device(sc, 0);
2059                         }
2060                         if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
2061                                 KASSERT(ep->e_error == 0,
2062                                     ("Error cannot be handled."));
2063                                 g_raid3_event_free(ep);
2064                         } else {
2065                                 ep->e_flags |= G_RAID3_EVENT_DONE;
2066                                 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
2067                                     ep);
2068                                 mtx_lock(&sc->sc_events_mtx);
2069                                 wakeup(ep);
2070                                 mtx_unlock(&sc->sc_events_mtx);
2071                         }
2072                         if ((sc->sc_flags &
2073                             G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2074                                 if (g_raid3_try_destroy(sc)) {
2075                                         curthread->td_pflags &= ~TDP_GEOM;
2076                                         G_RAID3_DEBUG(1, "Thread exiting.");
2077                                         kproc_exit(0);
2078                                 }
2079                         }
2080                         G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
2081                         continue;
2082                 }
2083                 /*
2084                  * Check if we can mark array as CLEAN and if we can't take
2085                  * how much seconds should we wait.
2086                  */
2087                 timeout = g_raid3_idle(sc, -1);
2088                 /*
2089                  * Now I/O requests.
2090                  */
2091                 /* Get first request from the queue. */
2092                 mtx_lock(&sc->sc_queue_mtx);
2093                 bp = bioq_first(&sc->sc_queue);
2094                 if (bp == NULL) {
2095                         if ((sc->sc_flags &
2096                             G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2097                                 mtx_unlock(&sc->sc_queue_mtx);
2098                                 if (g_raid3_try_destroy(sc)) {
2099                                         curthread->td_pflags &= ~TDP_GEOM;
2100                                         G_RAID3_DEBUG(1, "Thread exiting.");
2101                                         kproc_exit(0);
2102                                 }
2103                                 mtx_lock(&sc->sc_queue_mtx);
2104                         }
2105                         sx_xunlock(&sc->sc_lock);
2106                         /*
2107                          * XXX: We can miss an event here, because an event
2108                          *      can be added without sx-device-lock and without
2109                          *      mtx-queue-lock. Maybe I should just stop using
2110                          *      dedicated mutex for events synchronization and
2111                          *      stick with the queue lock?
2112                          *      The event will hang here until next I/O request
2113                          *      or next event is received.
2114                          */
2115                         MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
2116                             timeout * hz);
2117                         sx_xlock(&sc->sc_lock);
2118                         G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
2119                         continue;
2120                 }
2121 process:
2122                 bioq_remove(&sc->sc_queue, bp);
2123                 mtx_unlock(&sc->sc_queue_mtx);
2124
2125                 if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
2126                     (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
2127                         g_raid3_sync_request(bp);       /* READ */
2128                 } else if (bp->bio_to != sc->sc_provider) {
2129                         if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
2130                                 g_raid3_regular_request(bp);
2131                         else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
2132                                 g_raid3_sync_request(bp);       /* WRITE */
2133                         else {
2134                                 KASSERT(0,
2135                                     ("Invalid request cflags=0x%hx to=%s.",
2136                                     bp->bio_cflags, bp->bio_to->name));
2137                         }
2138                 } else if (g_raid3_register_request(bp) != 0) {
2139                         mtx_lock(&sc->sc_queue_mtx);
2140                         bioq_insert_head(&sc->sc_queue, bp);
2141                         /*
2142                          * We are short in memory, let see if there are finished
2143                          * request we can free.
2144                          */
2145                         TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
2146                                 if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
2147                                         goto process;
2148                         }
2149                         /*
2150                          * No finished regular request, so at least keep
2151                          * synchronization running.
2152                          */
2153                         TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
2154                                 if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
2155                                         goto process;
2156                         }
2157                         sx_xunlock(&sc->sc_lock);
2158                         MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
2159                             "r3:lowmem", hz / 10);
2160                         sx_xlock(&sc->sc_lock);
2161                 }
2162                 G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
2163         }
2164 }
2165
2166 static void
2167 g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
2168 {
2169
2170         sx_assert(&sc->sc_lock, SX_LOCKED);
2171         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
2172                 return;
2173         if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
2174                 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2175                     g_raid3_get_diskname(disk), sc->sc_name);
2176                 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2177         } else if (sc->sc_idle &&
2178             (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
2179                 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2180                     g_raid3_get_diskname(disk), sc->sc_name);
2181                 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2182         }
2183 }
2184
2185 static void
2186 g_raid3_sync_start(struct g_raid3_softc *sc)
2187 {
2188         struct g_raid3_disk *disk;
2189         struct g_consumer *cp;
2190         struct bio *bp;
2191         int error;
2192         u_int n;
2193
2194         g_topology_assert_not();
2195         sx_assert(&sc->sc_lock, SX_XLOCKED);
2196
2197         KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
2198             ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
2199             sc->sc_state));
2200         KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
2201             sc->sc_name, sc->sc_state));
2202         disk = NULL;
2203         for (n = 0; n < sc->sc_ndisks; n++) {
2204                 if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
2205                         continue;
2206                 disk = &sc->sc_disks[n];
2207                 break;
2208         }
2209         if (disk == NULL)
2210                 return;
2211
2212         sx_xunlock(&sc->sc_lock);
2213         g_topology_lock();
2214         cp = g_new_consumer(sc->sc_sync.ds_geom);
2215         error = g_attach(cp, sc->sc_provider);
2216         KASSERT(error == 0,
2217             ("Cannot attach to %s (error=%d).", sc->sc_name, error));
2218         error = g_access(cp, 1, 0, 0);
2219         KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
2220         g_topology_unlock();
2221         sx_xlock(&sc->sc_lock);
2222
2223         G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
2224             g_raid3_get_diskname(disk));
2225         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
2226                 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2227         KASSERT(disk->d_sync.ds_consumer == NULL,
2228             ("Sync consumer already exists (device=%s, disk=%s).",
2229             sc->sc_name, g_raid3_get_diskname(disk)));
2230
2231         disk->d_sync.ds_consumer = cp;
2232         disk->d_sync.ds_consumer->private = disk;
2233         disk->d_sync.ds_consumer->index = 0;
2234         sc->sc_syncdisk = disk;
2235
2236         /*
2237          * Allocate memory for synchronization bios and initialize them.
2238          */
2239         disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
2240             M_RAID3, M_WAITOK);
2241         for (n = 0; n < g_raid3_syncreqs; n++) {
2242                 bp = g_alloc_bio();
2243                 disk->d_sync.ds_bios[n] = bp;
2244                 bp->bio_parent = NULL;
2245                 bp->bio_cmd = BIO_READ;
2246                 bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
2247                 bp->bio_cflags = 0;
2248                 bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
2249                 bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
2250                 disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
2251                 bp->bio_done = g_raid3_sync_done;
2252                 bp->bio_from = disk->d_sync.ds_consumer;
2253                 bp->bio_to = sc->sc_provider;
2254                 bp->bio_caller1 = (void *)(uintptr_t)n;
2255         }
2256
2257         /* Set the number of in-flight synchronization requests. */
2258         disk->d_sync.ds_inflight = g_raid3_syncreqs;
2259
2260         /*
2261          * Fire off first synchronization requests.
2262          */
2263         for (n = 0; n < g_raid3_syncreqs; n++) {
2264                 bp = disk->d_sync.ds_bios[n];
2265                 G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
2266                 disk->d_sync.ds_consumer->index++;
2267                 /*
2268                  * Delay the request if it is colliding with a regular request.
2269                  */
2270                 if (g_raid3_regular_collision(sc, bp))
2271                         g_raid3_sync_delay(sc, bp);
2272                 else
2273                         g_io_request(bp, disk->d_sync.ds_consumer);
2274         }
2275 }
2276
2277 /*
2278  * Stop synchronization process.
2279  * type: 0 - synchronization finished
2280  *       1 - synchronization stopped
2281  */
2282 static void
2283 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
2284 {
2285         struct g_raid3_disk *disk;
2286         struct g_consumer *cp;
2287
2288         g_topology_assert_not();
2289         sx_assert(&sc->sc_lock, SX_LOCKED);
2290
2291         KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
2292             ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
2293             sc->sc_state));
2294         disk = sc->sc_syncdisk;
2295         sc->sc_syncdisk = NULL;
2296         KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
2297         KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2298             ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2299             g_raid3_disk_state2str(disk->d_state)));
2300         if (disk->d_sync.ds_consumer == NULL)
2301                 return;
2302
2303         if (type == 0) {
2304                 G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
2305                     sc->sc_name, g_raid3_get_diskname(disk));
2306         } else /* if (type == 1) */ {
2307                 G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
2308                     sc->sc_name, g_raid3_get_diskname(disk));
2309         }
2310         free(disk->d_sync.ds_bios, M_RAID3);
2311         disk->d_sync.ds_bios = NULL;
2312         cp = disk->d_sync.ds_consumer;
2313         disk->d_sync.ds_consumer = NULL;
2314         disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2315         sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
2316         g_topology_lock();
2317         g_raid3_kill_consumer(sc, cp);
2318         g_topology_unlock();
2319         sx_xlock(&sc->sc_lock);
2320 }
2321
2322 static void
2323 g_raid3_launch_provider(struct g_raid3_softc *sc)
2324 {
2325         struct g_provider *pp;
2326         struct g_raid3_disk *disk;
2327         int n;
2328
2329         sx_assert(&sc->sc_lock, SX_LOCKED);
2330
2331         g_topology_lock();
2332         pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2333         pp->mediasize = sc->sc_mediasize;
2334         pp->sectorsize = sc->sc_sectorsize;
2335         pp->stripesize = 0;
2336         pp->stripeoffset = 0;
2337         for (n = 0; n < sc->sc_ndisks; n++) {
2338                 disk = &sc->sc_disks[n];
2339                 if (disk->d_consumer && disk->d_consumer->provider &&
2340                     disk->d_consumer->provider->stripesize > pp->stripesize) {
2341                         pp->stripesize = disk->d_consumer->provider->stripesize;
2342                         pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
2343                 }
2344         }
2345         pp->stripesize *= sc->sc_ndisks - 1;
2346         pp->stripeoffset *= sc->sc_ndisks - 1;
2347         sc->sc_provider = pp;
2348         g_error_provider(pp, 0);
2349         g_topology_unlock();
2350         G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
2351             g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);
2352
2353         if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2354                 g_raid3_sync_start(sc);
2355 }
2356
2357 static void
2358 g_raid3_destroy_provider(struct g_raid3_softc *sc)
2359 {
2360         struct bio *bp;
2361
2362         g_topology_assert_not();
2363         KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2364             sc->sc_name));
2365
2366         g_topology_lock();
2367         g_error_provider(sc->sc_provider, ENXIO);
2368         mtx_lock(&sc->sc_queue_mtx);
2369         while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2370                 bioq_remove(&sc->sc_queue, bp);
2371                 g_io_deliver(bp, ENXIO);
2372         }
2373         mtx_unlock(&sc->sc_queue_mtx);
2374         G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2375             sc->sc_provider->name);
2376         g_wither_provider(sc->sc_provider, ENXIO);
2377         g_topology_unlock();
2378         sc->sc_provider = NULL;
2379         if (sc->sc_syncdisk != NULL)
2380                 g_raid3_sync_stop(sc, 1);
2381 }
2382
2383 static void
2384 g_raid3_go(void *arg)
2385 {
2386         struct g_raid3_softc *sc;
2387
2388         sc = arg;
2389         G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2390         g_raid3_event_send(sc, 0,
2391             G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2392 }
2393
2394 static u_int
2395 g_raid3_determine_state(struct g_raid3_disk *disk)
2396 {
2397         struct g_raid3_softc *sc;
2398         u_int state;
2399
2400         sc = disk->d_softc;
2401         if (sc->sc_syncid == disk->d_sync.ds_syncid) {
2402                 if ((disk->d_flags &
2403                     G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
2404                         /* Disk does not need synchronization. */
2405                         state = G_RAID3_DISK_STATE_ACTIVE;
2406                 } else {
2407                         if ((sc->sc_flags &
2408                              G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2409                             (disk->d_flags &
2410                              G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2411                                 /*
2412                                  * We can start synchronization from
2413                                  * the stored offset.
2414                                  */
2415                                 state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2416                         } else {
2417                                 state = G_RAID3_DISK_STATE_STALE;
2418                         }
2419                 }
2420         } else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
2421                 /*
2422                  * Reset all synchronization data for this disk,
2423                  * because if it even was synchronized, it was
2424                  * synchronized to disks with different syncid.
2425                  */
2426                 disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2427                 disk->d_sync.ds_offset = 0;
2428                 disk->d_sync.ds_offset_done = 0;
2429                 disk->d_sync.ds_syncid = sc->sc_syncid;
2430                 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2431                     (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2432                         state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2433                 } else {
2434                         state = G_RAID3_DISK_STATE_STALE;
2435                 }
2436         } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
2437                 /*
2438                  * Not good, NOT GOOD!
2439                  * It means that device was started on stale disks
2440                  * and more fresh disk just arrive.
2441                  * If there were writes, device is broken, sorry.
2442                  * I think the best choice here is don't touch
2443                  * this disk and inform the user loudly.
2444                  */
2445                 G_RAID3_DEBUG(0, "Device %s was started before the freshest "
2446                     "disk (%s) arrives!! It will not be connected to the "
2447                     "running device.", sc->sc_name,
2448                     g_raid3_get_diskname(disk));
2449                 g_raid3_destroy_disk(disk);
2450                 state = G_RAID3_DISK_STATE_NONE;
2451                 /* Return immediately, because disk was destroyed. */
2452                 return (state);
2453         }
2454         G_RAID3_DEBUG(3, "State for %s disk: %s.",
2455             g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
2456         return (state);
2457 }
2458
2459 /*
2460  * Update device state.
      *
      * Runs one step of the device-level state machine for 'sc', selected by
      * sc->sc_state.  The caller must hold sc->sc_lock exclusively.
      *
      * STARTING: proceeds only when the number of connected components plus
      * 'force' equals sc_ndisks.  Otherwise it returns, and when 'force' is
      * set it first flags the device for destruction and releases the root
      * mount hold.  When it proceeds, it adopts the highest genid and syncid
      * found, destroys components with an older genid, decides which
      * component (if any) must be resynchronized, switches the device to
      * COMPLETE or DEGRADED and queues a state event for every present disk.
      * The device is flagged for destruction if too few components are valid.
      *
      * DEGRADED and COMPLETE: apply a pending genid bump, do nothing further
      * while any disk is still NEW, move between DEGRADED and COMPLETE
      * according to the number of ACTIVE components, launch the provider if
      * it does not exist yet and release the root mount hold.  In DEGRADED
      * state the device is flagged for destruction when fewer than
      * sc_ndisks - 1 components are ACTIVE.
2461  */
2462 static void
2463 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
2464 {
2465         struct g_raid3_disk *disk;
2466         u_int state;
2467
2468         sx_assert(&sc->sc_lock, SX_XLOCKED);
2469
2470         switch (sc->sc_state) {
2471         case G_RAID3_DEVICE_STATE_STARTING:
2472             {
2473                 u_int n, ndirty, ndisks, genid, syncid;
2474
2475                 KASSERT(sc->sc_provider == NULL,
2476                     ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
2477                 /*
2478                  * Are we ready? We are, if all disks are connected or
2479                  * one disk is missing and 'force' is true.
2480                  */
2481                 if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
                             /*
                              * NOTE(review): sc_callout is presumably the
                              * forced-start timeout (see g_raid3_go) - confirm.
                              * It is only drained when the start was not forced.
                              */
2482                         if (!force)
2483                                 callout_drain(&sc->sc_callout);
2484                 } else {
2485                         if (force) {
2486                                 /*
2487                                  * Timeout expired, so destroy device.
2488                                  */
2489                                 sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2490                                 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
2491                                     __LINE__, sc->sc_rootmount);
2492                                 root_mount_rel(sc->sc_rootmount);
2493                                 sc->sc_rootmount = NULL;
2494                         }
2495                         return;
2496                 }
2497
2498                 /*
2499                  * Find the biggest genid.
2500                  */
2501                 genid = 0;
2502                 for (n = 0; n < sc->sc_ndisks; n++) {
2503                         disk = &sc->sc_disks[n];
2504                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2505                                 continue;
2506                         if (disk->d_genid > genid)
2507                                 genid = disk->d_genid;
2508                 }
2509                 sc->sc_genid = genid;
2510                 /*
2511                  * Remove all disks without the biggest genid.
2512                  */
2513                 for (n = 0; n < sc->sc_ndisks; n++) {
2514                         disk = &sc->sc_disks[n];
2515                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2516                                 continue;
2517                         if (disk->d_genid < genid) {
2518                                 G_RAID3_DEBUG(0,
2519                                     "Component %s (device %s) broken, skipping.",
2520                                     g_raid3_get_diskname(disk), sc->sc_name);
2521                                 g_raid3_destroy_disk(disk);
2522                         }
2523                 }
2524
2525                 /*
2526                  * There must be at least 'sc->sc_ndisks - 1' components
2527                  * with the same syncid and without SYNCHRONIZING flag.
2528                  */
2529
2530                 /*
2531                  * Find the biggest syncid, number of valid components and
2532                  * number of dirty components.
2533                  */
2534                 ndirty = ndisks = syncid = 0;
2535                 for (n = 0; n < sc->sc_ndisks; n++) {
2536                         disk = &sc->sc_disks[n];
2537                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2538                                 continue;
2539                         if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2540                                 ndirty++;
2541                         if (disk->d_sync.ds_syncid > syncid) {
                                     /*
                                      * Newer syncid found: the components
                                      * counted so far are out of date, so the
                                      * count of valid components restarts.
                                      */
2542                                 syncid = disk->d_sync.ds_syncid;
2543                                 ndisks = 0;
2544                         } else if (disk->d_sync.ds_syncid < syncid) {
2545                                 continue;
2546                         }
2547                         if ((disk->d_flags &
2548                             G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2549                                 continue;
2550                         }
2551                         ndisks++;
2552                 }
2553                 /*
2554                  * Do we have enough valid components?
2555                  */
2556                 if (ndisks + 1 < sc->sc_ndisks) {
2557                         G_RAID3_DEBUG(0,
2558                             "Device %s is broken, too few valid components.",
2559                             sc->sc_name);
2560                         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2561                         return;
2562                 }
2563                 /*
2564                  * If there is one DIRTY component and all disks are present,
2565                  * mark it for synchronization. If there is more than one DIRTY
2566                  * component, mark parity component for synchronization.
2567                  */
2568                 if (ndisks == sc->sc_ndisks && ndirty == 1) {
2569                         for (n = 0; n < sc->sc_ndisks; n++) {
2570                                 disk = &sc->sc_disks[n];
2571                                 if ((disk->d_flags &
2572                                     G_RAID3_DISK_FLAG_DIRTY) == 0) {
2573                                         continue;
2574                                 }
2575                                 disk->d_flags |=
2576                                     G_RAID3_DISK_FLAG_SYNCHRONIZING;
2577                         }
2578                 } else if (ndisks == sc->sc_ndisks && ndirty > 1) {
                             /* The last component is the one marked here. */
2579                         disk = &sc->sc_disks[sc->sc_ndisks - 1];
2580                         disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2581                 }
2582
2583                 sc->sc_syncid = syncid;
2584                 if (force) {
2585                         /* Remember to bump syncid on first write. */
2586                         sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2587                 }
2588                 if (ndisks == sc->sc_ndisks)
2589                         state = G_RAID3_DEVICE_STATE_COMPLETE;
2590                 else /* if (ndisks == sc->sc_ndisks - 1) */
2591                         state = G_RAID3_DEVICE_STATE_DEGRADED;
2592                 G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2593                     sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2594                     g_raid3_device_state2str(state));
2595                 sc->sc_state = state;
                     /*
                      * From here on 'state' holds per-disk states; the device
                      * state has already been stored in sc->sc_state.
                      */
2596                 for (n = 0; n < sc->sc_ndisks; n++) {
2597                         disk = &sc->sc_disks[n];
2598                         if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2599                                 continue;
2600                         state = g_raid3_determine_state(disk);
2601                         g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2602                         if (state == G_RAID3_DISK_STATE_STALE)
2603                                 sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2604                 }
2605                 break;
2606             }
2607         case G_RAID3_DEVICE_STATE_DEGRADED:
2608                 /*
2609                  * Genid needs to be bumped immediately, so do it here.
2610                  */
2611                 if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2612                         sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2613                         g_raid3_bump_genid(sc);
2614                 }
2615
                     /* Do nothing more while any disk is still in NEW state. */
2616                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2617                         return;
                     /*
                      * Fewer than sc_ndisks - 1 ACTIVE components: take the
                      * provider down (if any) and flag the device for
                      * destruction.
                      */
2618                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2619                     sc->sc_ndisks - 1) {
2620                         if (sc->sc_provider != NULL)
2621                                 g_raid3_destroy_provider(sc);
2622                         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2623                         return;
2624                 }
                     /* Every component is ACTIVE again: promote to COMPLETE. */
2625                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2626                     sc->sc_ndisks) {
2627                         state = G_RAID3_DEVICE_STATE_COMPLETE;
2628                         G_RAID3_DEBUG(1,
2629                             "Device %s state changed from %s to %s.",
2630                             sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2631                             g_raid3_device_state2str(state));
2632                         sc->sc_state = state;
2633                 }
2634                 if (sc->sc_provider == NULL)
2635                         g_raid3_launch_provider(sc);
2636                 if (sc->sc_rootmount != NULL) {
2637                         G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2638                             sc->sc_rootmount);
2639                         root_mount_rel(sc->sc_rootmount);
2640                         sc->sc_rootmount = NULL;
2641                 }
2642                 break;
2643         case G_RAID3_DEVICE_STATE_COMPLETE:
2644                 /*
2645                  * Genid needs to be bumped immediately, so do it here.
2646                  */
2647                 if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2648                         sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2649                         g_raid3_bump_genid(sc);
2650                 }
2651
                     /* Do nothing more while any disk is still in NEW state. */
2652                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2653                         return;
2654                 KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2655                     sc->sc_ndisks - 1,
2656                     ("Too few ACTIVE components in COMPLETE state (device %s).",
2657                     sc->sc_name));
                     /* One component is no longer ACTIVE: demote to DEGRADED. */
2658                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2659                     sc->sc_ndisks - 1) {
2660                         state = G_RAID3_DEVICE_STATE_DEGRADED;
2661                         G_RAID3_DEBUG(1,
2662                             "Device %s state changed from %s to %s.",
2663                             sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2664                             g_raid3_device_state2str(state));
2665                         sc->sc_state = state;
2666                 }
2667                 if (sc->sc_provider == NULL)
2668                         g_raid3_launch_provider(sc);
2669                 if (sc->sc_rootmount != NULL) {
2670                         G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2671                             sc->sc_rootmount);
2672                         root_mount_rel(sc->sc_rootmount);
2673                         sc->sc_rootmount = NULL;
2674                 }
2675                 break;
2676         default:
2677                 KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2678                     g_raid3_device_state2str(sc->sc_state)));
2679                 break;
2680         }
2681 }
2682
2683 /*
2684  * Update disk state and device state if needed.
      *
      * Moves 'disk' to the requested 'state' and performs the work tied to
      * that transition (flag and offset bookkeeping, starting or stopping
      * synchronization, metadata update, disk destruction).  The allowed
      * previous disk states and device states are checked with KASSERT.
      * The caller must hold the softc's sc_lock exclusively.
      *
      * A request for NEW on a device that is already DEGRADED or COMPLETE is
      * followed immediately, via 'goto again', by the state computed by
      * g_raid3_determine_state().
      *
      * Always returns 0.
      *
      * DISK_STATE_CHANGED() is a logging helper local to this function; it
      * relies on the 'disk', 'state' and 'sc' variables of the enclosing
      * scope and must be invoked before disk->d_state is overwritten.
2685  */
2686 #define DISK_STATE_CHANGED()    G_RAID3_DEBUG(1,                        \
2687         "Disk %s state changed from %s to %s (device %s).",             \
2688         g_raid3_get_diskname(disk),                                     \
2689         g_raid3_disk_state2str(disk->d_state),                          \
2690         g_raid3_disk_state2str(state), sc->sc_name)
2691 static int
2692 g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2693 {
2694         struct g_raid3_softc *sc;
2695
2696         sc = disk->d_softc;
2697         sx_assert(&sc->sc_lock, SX_XLOCKED);
2698
2699 again:
2700         G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2701             g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2702             g_raid3_disk_state2str(state));
2703         switch (state) {
2704         case G_RAID3_DISK_STATE_NEW:
2705                 /*
2706                  * Possible scenarios:
2707                  * 1. A new disk arrives.
2708                  */
2709                 /* Previous state should be NONE. */
2710                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2711                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2712                     g_raid3_disk_state2str(disk->d_state)));
2713                 DISK_STATE_CHANGED();
2714
2715                 disk->d_state = state;
2716                 G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
2717                     sc->sc_name, g_raid3_get_diskname(disk));
                     /* While the device is STARTING the disk simply stays NEW. */
2718                 if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2719                         break;
2720                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2721                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2722                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2723                     g_raid3_device_state2str(sc->sc_state),
2724                     g_raid3_get_diskname(disk),
2725                     g_raid3_disk_state2str(disk->d_state)));
2726                 state = g_raid3_determine_state(disk);
                     /*
                      * NONE means g_raid3_determine_state() destroyed the disk;
                      * any other result is applied right away.
                      */
2727                 if (state != G_RAID3_DISK_STATE_NONE)
2728                         goto again;
2729                 break;
2730         case G_RAID3_DISK_STATE_ACTIVE:
2731                 /*
2732                  * Possible scenarios:
2733                  * 1. New disk does not need synchronization.
2734                  * 2. Synchronization process finished successfully.
2735                  */
2736                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2737                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2738                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2739                     g_raid3_device_state2str(sc->sc_state),
2740                     g_raid3_get_diskname(disk),
2741                     g_raid3_disk_state2str(disk->d_state)));
2742                 /* Previous state should be NEW or SYNCHRONIZING. */
2743                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2744                     disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2745                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2746                     g_raid3_disk_state2str(disk->d_state)));
2747                 DISK_STATE_CHANGED();
2748
                     /*
                      * Coming from SYNCHRONIZING: clear the synchronization
                      * flags and stop the synchronization before the state is
                      * switched.
                      */
2749                 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2750                         disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2751                         disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2752                         g_raid3_sync_stop(sc, 0);
2753                 }
2754                 disk->d_state = state;
2755                 disk->d_sync.ds_offset = 0;
2756                 disk->d_sync.ds_offset_done = 0;
2757                 g_raid3_update_idle(sc, disk);
2758                 g_raid3_update_metadata(disk);
2759                 G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
2760                     sc->sc_name, g_raid3_get_diskname(disk));
2761                 break;
2762         case G_RAID3_DISK_STATE_STALE:
2763                 /*
2764                  * Possible scenarios:
2765                  * 1. Stale disk was connected.
2766                  */
2767                 /* Previous state should be NEW. */
2768                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2769                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2770                     g_raid3_disk_state2str(disk->d_state)));
2771                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2772                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2773                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2774                     g_raid3_device_state2str(sc->sc_state),
2775                     g_raid3_get_diskname(disk),
2776                     g_raid3_disk_state2str(disk->d_state)));
2777                 /*
2778                  * STALE state is only possible if device is marked
2779                  * NOAUTOSYNC.
2780                  */
2781                 KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2782                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2783                     g_raid3_device_state2str(sc->sc_state),
2784                     g_raid3_get_diskname(disk),
2785                     g_raid3_disk_state2str(disk->d_state)));
2786                 DISK_STATE_CHANGED();
2787
2788                 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2789                 disk->d_state = state;
2790                 g_raid3_update_metadata(disk);
2791                 G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2792                     sc->sc_name, g_raid3_get_diskname(disk));
2793                 break;
2794         case G_RAID3_DISK_STATE_SYNCHRONIZING:
2795                 /*
2796                  * Possible scenarios:
2797                  * 1. Disk which needs synchronization was connected.
2798                  */
2799                 /* Previous state should be NEW. */
2800                 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2801                     ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2802                     g_raid3_disk_state2str(disk->d_state)));
2803                 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2804                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2805                     ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2806                     g_raid3_device_state2str(sc->sc_state),
2807                     g_raid3_get_diskname(disk),
2808                     g_raid3_disk_state2str(disk->d_state)));
2809                 DISK_STATE_CHANGED();
2810
                     /*
                      * The assertion above already requires NEW, so this test
                      * only makes a difference when assertions are compiled
                      * out.
                      */
2811                 if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2812                         disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2813                 disk->d_state = state;
                     /*
                      * Without a provider nothing is started here;
                      * g_raid3_launch_provider() starts synchronization itself
                      * when it launches a DEGRADED device.
                      */
2814                 if (sc->sc_provider != NULL) {
2815                         g_raid3_sync_start(sc);
2816                         g_raid3_update_metadata(disk);
2817                 }
2818                 break;
2819         case G_RAID3_DISK_STATE_DISCONNECTED:
2820                 /*
2821                  * Possible scenarios:
2822                  * 1. Device wasn't running yet, but the disk disappeared.
2823                  * 2. Disk was active and disappeared.
2824                  * 3. Disk disappeared during the synchronization process.
2825                  */
2826                 if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2827                     sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2828                         /*
2829                          * Previous state should be ACTIVE, STALE or
2830                          * SYNCHRONIZING.
2831                          */
2832                         KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2833                             disk->d_state == G_RAID3_DISK_STATE_STALE ||
2834                             disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2835                             ("Wrong disk state (%s, %s).",
2836                             g_raid3_get_diskname(disk),
2837                             g_raid3_disk_state2str(disk->d_state)));
2838                 } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2839                         /* Previous state should be NEW. */
2840                         KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2841                             ("Wrong disk state (%s, %s).",
2842                             g_raid3_get_diskname(disk),
2843                             g_raid3_disk_state2str(disk->d_state)));
2844                         /*
2845                          * Reset bumping syncid if disk disappeared in STARTING
2846                          * state.
2847                          */
2848                         if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
2849                                 sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
                     /*
                      * The trailing else branch only exists in INVARIANTS
                      * kernels; it asserts that no other device state reaches
                      * this point.
                      */
2850 #ifdef  INVARIANTS
2851                 } else {
2852                         KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2853                             sc->sc_name,
2854                             g_raid3_device_state2str(sc->sc_state),
2855                             g_raid3_get_diskname(disk),
2856                             g_raid3_disk_state2str(disk->d_state)));
2857 #endif
2858                 }
2859                 DISK_STATE_CHANGED();
2860                 G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2861                     sc->sc_name, g_raid3_get_diskname(disk));
2862
2863                 g_raid3_destroy_disk(disk);
2864                 break;
2865         default:
2866                 KASSERT(1 == 0, ("Unknown state (%u).", state));
2867                 break;
2868         }
2869         return (0);
2870 }
2871 #undef  DISK_STATE_CHANGED
2872
2873 int
2874 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
2875 {
2876         struct g_provider *pp;
2877         u_char *buf;
2878         int error;
2879
2880         g_topology_assert();
2881
2882         error = g_access(cp, 1, 0, 0);
2883         if (error != 0)
2884                 return (error);
2885         pp = cp->provider;
2886         g_topology_unlock();
2887         /* Metadata are stored on last sector. */
2888         buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2889             &error);
2890         g_topology_lock();
2891         g_access(cp, -1, 0, 0);
2892         if (buf == NULL) {
2893                 G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
2894                     cp->provider->name, error);
2895                 return (error);
2896         }
2897
2898         /* Decode metadata. */
2899         error = raid3_metadata_decode(buf, md);
2900         g_free(buf);
2901         if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
2902                 return (EINVAL);
2903         if (md->md_version > G_RAID3_VERSION) {
2904                 G_RAID3_DEBUG(0,
2905                     "Kernel module is too old to handle metadata from %s.",
2906                     cp->provider->name);
2907                 return (EINVAL);
2908         }
2909         if (error != 0) {
2910                 G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2911                     cp->provider->name);
2912                 return (error);
2913         }
2914         if (md->md_sectorsize > MAXPHYS) {
2915                 G_RAID3_DEBUG(0, "The blocksize is too big.");
2916                 return (EINVAL);
2917         }
2918
2919         return (0);
2920 }
2921
2922 static int
2923 g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
2924     struct g_raid3_metadata *md)
2925 {
2926
2927         if (md->md_no >= sc->sc_ndisks) {
2928                 G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
2929                     pp->name, md->md_no);
2930                 return (EINVAL);
2931         }
2932         if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
2933                 G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
2934                     pp->name, md->md_no);
2935                 return (EEXIST);
2936         }
2937         if (md->md_all != sc->sc_ndisks) {
2938                 G_RAID3_DEBUG(1,
2939                     "Invalid '%s' field on disk %s (device %s), skipping.",
2940                     "md_all", pp->name, sc->sc_name);
2941                 return (EINVAL);
2942         }
2943         if ((md->md_mediasize % md->md_sectorsize) != 0) {
2944                 G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != "
2945                     "0) on disk %s (device %s), skipping.", pp->name,
2946                     sc->sc_name);
2947                 return (EINVAL);
2948         }
2949         if (md->md_mediasize != sc->sc_mediasize) {
2950                 G_RAID3_DEBUG(1,
2951                     "Invalid '%s' field on disk %s (device %s), skipping.",
2952                     "md_mediasize", pp->name, sc->sc_name);
2953                 return (EINVAL);
2954         }
2955         if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
2956                 G_RAID3_DEBUG(1,
2957                     "Invalid '%s' field on disk %s (device %s), skipping.",
2958                     "md_mediasize", pp->name, sc->sc_name);
2959                 return (EINVAL);
2960         }
2961         if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
2962                 G_RAID3_DEBUG(1,
2963                     "Invalid size of disk %s (device %s), skipping.", pp->name,
2964                     sc->sc_name);
2965                 return (EINVAL);
2966         }
2967         if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
2968                 G_RAID3_DEBUG(1,
2969                     "Invalid '%s' field on disk %s (device %s), skipping.",
2970                     "md_sectorsize", pp->name, sc->sc_name);
2971                 return (EINVAL);
2972         }
2973         if (md->md_sectorsize != sc->sc_sectorsize) {
2974                 G_RAID3_DEBUG(1,
2975                     "Invalid '%s' field on disk %s (device %s), skipping.",
2976                     "md_sectorsize", pp->name, sc->sc_name);
2977                 return (EINVAL);
2978         }
2979         if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
2980                 G_RAID3_DEBUG(1,
2981                     "Invalid sector size of disk %s (device %s), skipping.",
2982                     pp->name, sc->sc_name);
2983                 return (EINVAL);
2984         }
2985         if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
2986                 G_RAID3_DEBUG(1,
2987                     "Invalid device flags on disk %s (device %s), skipping.",
2988                     pp->name, sc->sc_name);
2989                 return (EINVAL);
2990         }
2991         if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
2992             (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
2993                 /*
2994                  * VERIFY and ROUND-ROBIN options are mutually exclusive.
2995                  */
2996                 G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
2997                     "disk %s (device %s), skipping.", pp->name, sc->sc_name);
2998                 return (EINVAL);
2999         }
3000         if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
3001                 G_RAID3_DEBUG(1,
3002                     "Invalid disk flags on disk %s (device %s), skipping.",
3003                     pp->name, sc->sc_name);
3004                 return (EINVAL);
3005         }
3006         return (0);
3007 }
3008
3009 int
3010 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
3011     struct g_raid3_metadata *md)
3012 {
3013         struct g_raid3_disk *disk;
3014         int error;
3015
3016         g_topology_assert_not();
3017         G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
3018
3019         error = g_raid3_check_metadata(sc, pp, md);
3020         if (error != 0)
3021                 return (error);
3022         if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
3023             md->md_genid < sc->sc_genid) {
3024                 G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
3025                     pp->name, sc->sc_name);
3026                 return (EINVAL);
3027         }
3028         disk = g_raid3_init_disk(sc, pp, md, &error);
3029         if (disk == NULL)
3030                 return (error);
3031         error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
3032             G_RAID3_EVENT_WAIT);
3033         if (error != 0)
3034                 return (error);
3035         if (md->md_version < G_RAID3_VERSION) {
3036                 G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
3037                     pp->name, md->md_version, G_RAID3_VERSION);
3038                 g_raid3_update_metadata(disk);
3039         }
3040         return (0);
3041 }
3042
3043 static void
3044 g_raid3_destroy_delayed(void *arg, int flag)
3045 {
3046         struct g_raid3_softc *sc;
3047         int error;
3048
3049         if (flag == EV_CANCEL) {
3050                 G_RAID3_DEBUG(1, "Destroying canceled.");
3051                 return;
3052         }
3053         sc = arg;
3054         g_topology_unlock();
3055         sx_xlock(&sc->sc_lock);
3056         KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0,
3057             ("DESTROY flag set on %s.", sc->sc_name));
3058         KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0,
3059             ("DESTROYING flag not set on %s.", sc->sc_name));
3060         G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name);
3061         error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT);
3062         if (error != 0) {
3063                 G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name);
3064                 sx_xunlock(&sc->sc_lock);
3065         }
3066         g_topology_lock();
3067 }
3068
3069 static int
3070 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
3071 {
3072         struct g_raid3_softc *sc;
3073         int dcr, dcw, dce, error = 0;
3074
3075         g_topology_assert();
3076         G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
3077             acw, ace);
3078
3079         sc = pp->geom->softc;
3080         if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
3081                 return (0);
3082         KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
3083
3084         dcr = pp->acr + acr;
3085         dcw = pp->acw + acw;
3086         dce = pp->ace + ace;
3087
3088         g_topology_unlock();
3089         sx_xlock(&sc->sc_lock);
3090         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 ||
3091             g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
3092                 if (acr > 0 || acw > 0 || ace > 0)
3093                         error = ENXIO;
3094                 goto end;
3095         }
3096         if (dcw == 0)
3097                 g_raid3_idle(sc, dcw);
3098         if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) {
3099                 if (acr > 0 || acw > 0 || ace > 0) {
3100                         error = ENXIO;
3101                         goto end;
3102                 }
3103                 if (dcr == 0 && dcw == 0 && dce == 0) {
3104                         g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK,
3105                             sc, NULL);
3106                 }
3107         }
3108 end:
3109         sx_xunlock(&sc->sc_lock);
3110         g_topology_lock();
3111         return (error);
3112 }
3113
3114 static struct g_geom *
3115 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
3116 {
3117         struct g_raid3_softc *sc;
3118         struct g_geom *gp;
3119         int error, timeout;
3120         u_int n;
3121
3122         g_topology_assert();
3123         G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
3124
3125         /* One disk is minimum. */
3126         if (md->md_all < 1)
3127                 return (NULL);
3128         /*
3129          * Action geom.
3130          */
3131         gp = g_new_geomf(mp, "%s", md->md_name);
3132         sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
3133         sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
3134             M_WAITOK | M_ZERO);
3135         gp->start = g_raid3_start;
3136         gp->orphan = g_raid3_orphan;
3137         gp->access = g_raid3_access;
3138         gp->dumpconf = g_raid3_dumpconf;
3139
3140         sc->sc_id = md->md_id;
3141         sc->sc_mediasize = md->md_mediasize;
3142         sc->sc_sectorsize = md->md_sectorsize;
3143         sc->sc_ndisks = md->md_all;
3144         sc->sc_round_robin = 0;
3145         sc->sc_flags = md->md_mflags;
3146         sc->sc_bump_id = 0;
3147         sc->sc_idle = 1;
3148         sc->sc_last_write = time_uptime;
3149         sc->sc_writes = 0;
3150         for (n = 0; n < sc->sc_ndisks; n++) {
3151                 sc->sc_disks[n].d_softc = sc;
3152                 sc->sc_disks[n].d_no = n;
3153                 sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
3154         }
3155         sx_init(&sc->sc_lock, "graid3:lock");
3156         bioq_init(&sc->sc_queue);
3157         mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
3158         bioq_init(&sc->sc_regular_delayed);
3159         bioq_init(&sc->sc_inflight);
3160         bioq_init(&sc->sc_sync_delayed);
3161         TAILQ_INIT(&sc->sc_events);
3162         mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
3163         callout_init(&sc->sc_callout, 1);
3164         sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
3165         gp->softc = sc;
3166         sc->sc_geom = gp;
3167         sc->sc_provider = NULL;
3168         /*
3169          * Synchronization geom.
3170          */
3171         gp = g_new_geomf(mp, "%s.sync", md->md_name);
3172         gp->softc = sc;
3173         gp->orphan = g_raid3_orphan;
3174         sc->sc_sync.ds_geom = gp;
3175
3176         if (!g_raid3_use_malloc) {
3177                 sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k",
3178                     65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
3179                     UMA_ALIGN_PTR, 0);
3180                 sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
3181                 sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
3182                 sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
3183                     sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
3184                 sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k",
3185                     16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
3186                     UMA_ALIGN_PTR, 0);
3187                 sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
3188                 sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
3189                 sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
3190                     sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
3191                 sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k",
3192                     4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
3193                     UMA_ALIGN_PTR, 0);
3194                 sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
3195                 sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
3196                 sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
3197                     sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
3198         }
3199
3200         error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
3201             "g_raid3 %s", md->md_name);
3202         if (error != 0) {
3203                 G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
3204                     sc->sc_name);
3205                 if (!g_raid3_use_malloc) {
3206                         uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
3207                         uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
3208                         uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
3209                 }
3210                 g_destroy_geom(sc->sc_sync.ds_geom);
3211                 mtx_destroy(&sc->sc_events_mtx);
3212                 mtx_destroy(&sc->sc_queue_mtx);
3213                 sx_destroy(&sc->sc_lock);
3214                 g_destroy_geom(sc->sc_geom);
3215                 free(sc->sc_disks, M_RAID3);
3216                 free(sc, M_RAID3);
3217                 return (NULL);
3218         }
3219
3220         G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).",
3221             sc->sc_name, sc->sc_ndisks, sc->sc_id);
3222
3223         sc->sc_rootmount = root_mount_hold("GRAID3");
3224         G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
3225
3226         /*
3227          * Run timeout.
3228          */
3229         timeout = atomic_load_acq_int(&g_raid3_timeout);
3230         callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
3231         return (sc->sc_geom);
3232 }
3233
3234 int
3235 g_raid3_destroy(struct g_raid3_softc *sc, int how)
3236 {
3237         struct g_provider *pp;
3238
3239         g_topology_assert_not();
3240         if (sc == NULL)
3241                 return (ENXIO);
3242         sx_assert(&sc->sc_lock, SX_XLOCKED);
3243
3244         pp = sc->sc_provider;
3245         if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
3246                 switch (how) {
3247                 case G_RAID3_DESTROY_SOFT:
3248                         G_RAID3_DEBUG(1,
3249                             "Device %s is still open (r%dw%de%d).", pp->name,
3250                             pp->acr, pp->acw, pp->ace);
3251                         return (EBUSY);
3252                 case G_RAID3_DESTROY_DELAYED:
3253                         G_RAID3_DEBUG(1,
3254                             "Device %s will be destroyed on last close.",
3255                             pp->name);
3256                         if (sc->sc_syncdisk != NULL)
3257                                 g_raid3_sync_stop(sc, 1);
3258                         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING;
3259                         return (EBUSY);
3260                 case G_RAID3_DESTROY_HARD:
3261                         G_RAID3_DEBUG(1, "Device %s is still open, so it "
3262                             "can't be definitely removed.", pp->name);
3263                         break;
3264                 }
3265         }
3266
3267         g_topology_lock();
3268         if (sc->sc_geom->softc == NULL) {
3269                 g_topology_unlock();
3270                 return (0);
3271         }
3272         sc->sc_geom->softc = NULL;
3273         sc->sc_sync.ds_geom->softc = NULL;
3274         g_topology_unlock();
3275
3276         sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
3277         sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
3278         G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
3279         sx_xunlock(&sc->sc_lock);
3280         mtx_lock(&sc->sc_queue_mtx);
3281         wakeup(sc);
3282         wakeup(&sc->sc_queue);
3283         mtx_unlock(&sc->sc_queue_mtx);
3284         G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
3285         while (sc->sc_worker != NULL)
3286                 tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
3287         G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
3288         sx_xlock(&sc->sc_lock);
3289         g_raid3_destroy_device(sc);
3290         free(sc->sc_disks, M_RAID3);
3291         free(sc, M_RAID3);
3292         return (0);
3293 }
3294
3295 static void
3296 g_raid3_taste_orphan(struct g_consumer *cp)
3297 {
3298
3299         KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
3300             cp->provider->name));
3301 }
3302
3303 static struct g_geom *
3304 g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
3305 {
3306         struct g_raid3_metadata md;
3307         struct g_raid3_softc *sc;
3308         struct g_consumer *cp;
3309         struct g_geom *gp;
3310         int error;
3311
3312         g_topology_assert();
3313         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
3314         G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
3315
3316         gp = g_new_geomf(mp, "raid3:taste");
3317         /* This orphan function should be never called. */
3318         gp->orphan = g_raid3_taste_orphan;
3319         cp = g_new_consumer(gp);
3320         g_attach(cp, pp);
3321         error = g_raid3_read_metadata(cp, &md);
3322         g_detach(cp);
3323         g_destroy_consumer(cp);
3324         g_destroy_geom(gp);
3325         if (error != 0)
3326                 return (NULL);
3327         gp = NULL;
3328
3329         if (md.md_provider[0] != '\0' &&
3330             !g_compare_names(md.md_provider, pp->name))
3331                 return (NULL);
3332         if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
3333                 return (NULL);
3334         if (g_raid3_debug >= 2)
3335                 raid3_metadata_dump(&md);
3336
3337         /*
3338          * Let's check if device already exists.
3339          */
3340         sc = NULL;
3341         LIST_FOREACH(gp, &mp->geom, geom) {
3342                 sc = gp->softc;
3343                 if (sc == NULL)
3344                         continue;
3345                 if (sc->sc_sync.ds_geom == gp)
3346                         continue;
3347                 if (strcmp(md.md_name, sc->sc_name) != 0)
3348                         continue;
3349                 if (md.md_id != sc->sc_id) {
3350                         G_RAID3_DEBUG(0, "Device %s already configured.",
3351                             sc->sc_name);
3352                         return (NULL);
3353                 }
3354                 break;
3355         }
3356         if (gp == NULL) {
3357                 gp = g_raid3_create(mp, &md);
3358                 if (gp == NULL) {
3359                         G_RAID3_DEBUG(0, "Cannot create device %s.",
3360                             md.md_name);
3361                         return (NULL);
3362                 }
3363                 sc = gp->softc;
3364         }
3365         G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
3366         g_topology_unlock();
3367         sx_xlock(&sc->sc_lock);
3368         error = g_raid3_add_disk(sc, pp, &md);
3369         if (error != 0) {
3370                 G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
3371                     pp->name, gp->name, error);
3372                 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
3373                     sc->sc_ndisks) {
3374                         g_cancel_event(sc);
3375                         g_raid3_destroy(sc, G_RAID3_DESTROY_HARD);
3376                         g_topology_lock();
3377                         return (NULL);
3378                 }
3379                 gp = NULL;
3380         }
3381         sx_xunlock(&sc->sc_lock);
3382         g_topology_lock();
3383         return (gp);
3384 }
3385
3386 static int
3387 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
3388     struct g_geom *gp)
3389 {
3390         struct g_raid3_softc *sc;
3391         int error;
3392
3393         g_topology_unlock();
3394         sc = gp->softc;
3395         sx_xlock(&sc->sc_lock);
3396         g_cancel_event(sc);
3397         error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT);
3398         if (error != 0)
3399                 sx_xunlock(&sc->sc_lock);
3400         g_topology_lock();
3401         return (error);
3402 }
3403
3404 static void
3405 g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
3406     struct g_consumer *cp, struct g_provider *pp)
3407 {
3408         struct g_raid3_softc *sc;
3409
3410         g_topology_assert();
3411
3412         sc = gp->softc;
3413         if (sc == NULL)
3414                 return;
3415         /* Skip synchronization geom. */
3416         if (gp == sc->sc_sync.ds_geom)
3417                 return;
3418         if (pp != NULL) {
3419                 /* Nothing here. */
3420         } else if (cp != NULL) {
3421                 struct g_raid3_disk *disk;
3422
3423                 disk = cp->private;
3424                 if (disk == NULL)
3425                         return;
3426                 g_topology_unlock();
3427                 sx_xlock(&sc->sc_lock);
3428                 sbuf_printf(sb, "%s<Type>", indent);
3429                 if (disk->d_no == sc->sc_ndisks - 1)
3430                         sbuf_cat(sb, "PARITY");
3431                 else
3432                         sbuf_cat(sb, "DATA");
3433                 sbuf_cat(sb, "</Type>\n");
3434                 sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
3435                     (u_int)disk->d_no);
3436                 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
3437                         sbuf_printf(sb, "%s<Synchronized>", indent);
3438                         if (disk->d_sync.ds_offset == 0)
3439                                 sbuf_cat(sb, "0%");
3440                         else {
3441                                 sbuf_printf(sb, "%u%%",
3442                                     (u_int)((disk->d_sync.ds_offset * 100) /
3443                                     (sc->sc_mediasize / (sc->sc_ndisks - 1))));
3444                         }
3445                         sbuf_cat(sb, "</Synchronized>\n");
3446                         if (disk->d_sync.ds_offset > 0) {
3447                                 sbuf_printf(sb, "%s<BytesSynced>%jd"
3448                                     "</BytesSynced>\n", indent,
3449                                     (intmax_t)disk->d_sync.ds_offset);
3450                         }
3451                 }
3452                 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
3453                     disk->d_sync.ds_syncid);
3454                 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
3455                 sbuf_printf(sb, "%s<Flags>", indent);
3456                 if (disk->d_flags == 0)
3457                         sbuf_cat(sb, "NONE");
3458                 else {
3459                         int first = 1;
3460
3461 #define ADD_FLAG(flag, name)    do {                                    \
3462         if ((disk->d_flags & (flag)) != 0) {                            \
3463                 if (!first)                                             \
3464                         sbuf_cat(sb, ", ");                             \
3465                 else                                                    \
3466                         first = 0;                                      \
3467                 sbuf_cat(sb, name);                                     \
3468         }                                                               \
3469 } while (0)
3470                         ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
3471                         ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
3472                         ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
3473                             "SYNCHRONIZING");
3474                         ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
3475                         ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
3476 #undef  ADD_FLAG
3477                 }
3478                 sbuf_cat(sb, "</Flags>\n");
3479                 sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3480                     g_raid3_disk_state2str(disk->d_state));
3481                 sx_xunlock(&sc->sc_lock);
3482                 g_topology_lock();
3483         } else {
3484                 g_topology_unlock();
3485                 sx_xlock(&sc->sc_lock);
3486                 if (!g_raid3_use_malloc) {
3487                         sbuf_printf(sb,
3488                             "%s<Zone4kRequested>%u</Zone4kRequested>\n", indent,
3489                             sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
3490                         sbuf_printf(sb,
3491                             "%s<Zone4kFailed>%u</Zone4kFailed>\n", indent,
3492                             sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
3493                         sbuf_printf(sb,
3494                             "%s<Zone16kRequested>%u</Zone16kRequested>\n", indent,
3495                             sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
3496                         sbuf_printf(sb,
3497                             "%s<Zone16kFailed>%u</Zone16kFailed>\n", indent,
3498                             sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
3499                         sbuf_printf(sb,
3500                             "%s<Zone64kRequested>%u</Zone64kRequested>\n", indent,
3501                             sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
3502                         sbuf_printf(sb,
3503                             "%s<Zone64kFailed>%u</Zone64kFailed>\n", indent,
3504                             sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
3505                 }
3506                 sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
3507                 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
3508                 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
3509                 sbuf_printf(sb, "%s<Flags>", indent);
3510                 if (sc->sc_flags == 0)
3511                         sbuf_cat(sb, "NONE");
3512                 else {
3513                         int first = 1;
3514
3515 #define ADD_FLAG(flag, name)    do {                                    \
3516         if ((sc->sc_flags & (flag)) != 0) {                             \
3517                 if (!first)                                             \
3518                         sbuf_cat(sb, ", ");                             \
3519                 else                                                    \
3520                         first = 0;                                      \
3521                 sbuf_cat(sb, name);                                     \
3522         }                                                               \
3523 } while (0)
3524                         ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
3525                         ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
3526                         ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
3527                             "ROUND-ROBIN");
3528                         ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
3529 #undef  ADD_FLAG
3530                 }
3531                 sbuf_cat(sb, "</Flags>\n");
3532                 sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
3533                     sc->sc_ndisks);
3534                 sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3535                     g_raid3_device_state2str(sc->sc_state));
3536                 sx_xunlock(&sc->sc_lock);
3537                 g_topology_lock();
3538         }
3539 }
3540
3541 static void
3542 g_raid3_shutdown_post_sync(void *arg, int howto)
3543 {
3544         struct g_class *mp;
3545         struct g_geom *gp, *gp2;
3546         struct g_raid3_softc *sc;
3547         int error;
3548
3549         mp = arg;
3550         g_topology_lock();
3551         g_raid3_shutdown = 1;
3552         LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3553                 if ((sc = gp->softc) == NULL)
3554                         continue;
3555                 /* Skip synchronization geom. */
3556                 if (gp == sc->sc_sync.ds_geom)
3557                         continue;
3558                 g_topology_unlock();
3559                 sx_xlock(&sc->sc_lock);
3560                 g_raid3_idle(sc, -1);
3561                 g_cancel_event(sc);
3562                 error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED);
3563                 if (error != 0)
3564                         sx_xunlock(&sc->sc_lock);
3565                 g_topology_lock();
3566         }
3567         g_topology_unlock();
3568 }
3569
3570 static void
3571 g_raid3_init(struct g_class *mp)
3572 {
3573
3574         g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
3575             g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
3576         if (g_raid3_post_sync == NULL)
3577                 G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3578 }
3579
3580 static void
3581 g_raid3_fini(struct g_class *mp)
3582 {
3583
3584         if (g_raid3_post_sync != NULL)
3585                 EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync);
3586 }
3587
/*
 * Module glue: declare g_raid3_class as a GEOM class under the name g_raid3
 * and record version 0 for the geom_raid3 module.
 */
3588 DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3589 MODULE_VERSION(geom_raid3, 0);