]> CyberLeo.Net >> Repos - FreeBSD/releng/9.0.git/blob - sys/geom/journal/g_journal.c
Copy stable/9 to releng/9.0 as part of the FreeBSD 9.0-RELEASE release
[FreeBSD/releng/9.0.git] / sys / geom / journal / g_journal.c
1 /*-
2  * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/bio.h>
38 #include <sys/sysctl.h>
39 #include <sys/malloc.h>
40 #include <sys/mount.h>
41 #include <sys/eventhandler.h>
42 #include <sys/proc.h>
43 #include <sys/kthread.h>
44 #include <sys/sched.h>
45 #include <sys/taskqueue.h>
46 #include <sys/vnode.h>
47 #include <sys/sbuf.h>
48 #ifdef GJ_MEMDEBUG
49 #include <sys/stack.h>
50 #include <sys/kdb.h>
51 #endif
52 #include <vm/vm.h>
53 #include <vm/vm_kern.h>
54 #include <geom/geom.h>
55
56 #include <geom/journal/g_journal.h>
57
/* Register the geom_journal feature together with its description string. */
FEATURE(geom_journal, "GEOM journaling support");

/*
 * On-disk journal format:
 *
 * JH - Journal header
 * RH - Record header
 *
 * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
 * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
 * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
 *
 */

/* Compile-time checks: neither header structure may exceed 512 bytes. */
CTASSERT(sizeof(struct g_journal_header) <= 512);
CTASSERT(sizeof(struct g_journal_record_header) <= 512);

/* Malloc type passed to malloc/realloc/free by the gj_* wrappers. */
static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
/* Mutex held while the cache usage counter is updated. */
static struct mtx g_journal_cache_mtx;
MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);

/* NULL-terminated table of file system descriptors. */
const struct g_journal_desc *g_journal_filesystems[] = {
        &g_journal_ufs,
        NULL
};
83
/* The parent node (kern.geom) is only declared here. */
SYSCTL_DECL(_kern_geom);

/* Debug level; non-static and also settable through a tunable. */
int g_journal_debug = 0;
TUNABLE_INT("kern.geom.journal.debug", &g_journal_debug);
/* Default values of the knobs exported under kern.geom.journal below. */
static u_int g_journal_switch_time = 10;
static u_int g_journal_force_switch = 70;
static u_int g_journal_parallel_flushes = 16;
static u_int g_journal_parallel_copies = 16;
static u_int g_journal_accept_immediately = 64;
static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
static u_int g_journal_do_optimize = 1;

/* The kern.geom.journal node and its plain read-write integer knobs. */
SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0, "GEOM_JOURNAL stuff");
SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RW, &g_journal_debug, 0,
    "Debug level");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
    &g_journal_switch_time, 0, "Switch journals every N seconds");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
    &g_journal_force_switch, 0, "Force switch when journal is N% full");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
    &g_journal_parallel_flushes, 0,
    "Number of flush I/O requests to send in parallel");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
    &g_journal_accept_immediately, 0,
    "Number of I/O requests accepted immediately");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
    &g_journal_parallel_copies, 0,
    "Number of copy I/O requests to send in parallel");
112 static int
113 g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
114 {
115         u_int entries;
116         int error;
117
118         entries = g_journal_record_entries;
119         error = sysctl_handle_int(oidp, &entries, 0, req);
120         if (error != 0 || req->newptr == NULL)
121                 return (error);
122         if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
123                 return (EINVAL);
124         g_journal_record_entries = entries;
125         return (0);
126 }
127 SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
128     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I",
129     "Maximum number of entires in one journal record");
130 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
131     &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
132
/*
 * Cache accounting state.  g_journal_cache_used is adjusted by the gj_*
 * allocation wrappers; g_journal_cache_low is recomputed by the limit and
 * switch sysctl handlers.  The limit and the divisor are also tunables.
 */
static u_int g_journal_cache_used = 0;
static u_int g_journal_cache_limit = 64 * 1024 * 1024;
TUNABLE_INT("kern.geom.journal.cache.limit", &g_journal_cache_limit);
static u_int g_journal_cache_divisor = 2;
TUNABLE_INT("kern.geom.journal.cache.divisor", &g_journal_cache_divisor);
static u_int g_journal_cache_switch = 90;
static u_int g_journal_cache_misses = 0;
static u_int g_journal_cache_alloc_failures = 0;
static u_int g_journal_cache_low = 0;

/* The kern.geom.journal.cache node; "used" is exported read-only. */
SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0,
    "GEOM_JOURNAL cache");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
    &g_journal_cache_used, 0, "Number of allocated bytes");
147 static int
148 g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
149 {
150         u_int limit;
151         int error;
152
153         limit = g_journal_cache_limit;
154         error = sysctl_handle_int(oidp, &limit, 0, req);
155         if (error != 0 || req->newptr == NULL)
156                 return (error);
157         g_journal_cache_limit = limit;
158         g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
159         return (0);
160 }
161 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
162     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_limit_sysctl, "I",
163     "Maximum number of allocated bytes");
164 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
165     &g_journal_cache_divisor, 0,
166     "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
167 static int
168 g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
169 {
170         u_int cswitch;
171         int error;
172
173         cswitch = g_journal_cache_switch;
174         error = sysctl_handle_int(oidp, &cswitch, 0, req);
175         if (error != 0 || req->newptr == NULL)
176                 return (error);
177         if (cswitch < 0 || cswitch > 100)
178                 return (EINVAL);
179         g_journal_cache_switch = cswitch;
180         g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
181         return (0);
182 }
183 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
184     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I",
185     "Force switch when we hit this percent of cache use");
186 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
187     &g_journal_cache_misses, 0, "Number of cache misses");
188 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
189     &g_journal_cache_alloc_failures, 0, "Memory allocation failures");
190
/* Event counters exported (read-write) under kern.geom.journal.stats. */
static u_long g_journal_stats_bytes_skipped = 0;
static u_long g_journal_stats_combined_ios = 0;
static u_long g_journal_stats_switches = 0;
static u_long g_journal_stats_wait_for_copy = 0;
static u_long g_journal_stats_journal_full = 0;
static u_long g_journal_stats_low_mem = 0;

SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
    "GEOM_JOURNAL statistics");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
    &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
    &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
    &g_journal_stats_switches, 0, "Number of journal switches");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
    &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
    &g_journal_stats_journal_full, 0,
    "Number of times journal was almost full.");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
    &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
213
/* Class methods referenced by the g_journal_class initializer below. */
static g_taste_t g_journal_taste;
static g_ctl_req_t g_journal_config;
static g_dumpconf_t g_journal_dumpconf;
static g_init_t g_journal_init;
static g_fini_t g_journal_fini;

/* GEOM class descriptor wiring the methods declared above. */
struct g_class g_journal_class = {
        .name = G_JOURNAL_CLASS_NAME,
        .version = G_VERSION,
        .taste = g_journal_taste,
        .ctlreq = g_journal_config,
        .dumpconf = g_journal_dumpconf,
        .init = g_journal_init,
        .fini = g_journal_fini
};
229
/* Forward declarations. */
static int g_journal_destroy(struct g_journal_softc *sc);
static void g_journal_metadata_update(struct g_journal_softc *sc);
static void g_journal_switch_wait(struct g_journal_softc *sc);

/* Values stored in g_journal_switcher_state. */
#define GJ_SWITCHER_WORKING     0
#define GJ_SWITCHER_DIE         1
#define GJ_SWITCHER_DIED        2
/*
 * The address of g_journal_switcher_state is the wakeup(9) channel used by
 * gj_malloc() and g_journal_check_overflow(); g_journal_switcher_wokenup is
 * set to 1 by both just before that wakeup.
 */
static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
static int g_journal_switcher_wokenup = 0;
static int g_journal_sync_requested = 0;

#ifdef GJ_MEMDEBUG
/*
 * Debug header stored in front of every gj_malloc() buffer: the requested
 * size and the stack saved at allocation time.
 */
struct meminfo {
        size_t          mi_size;
        struct stack    mi_stack;
};
#endif
247
248 /*
249  * We use our own malloc/realloc/free funtions, so we can collect statistics
250  * and force journal switch when we're running out of cache.
251  */
252 static void *
253 gj_malloc(size_t size, int flags)
254 {
255         void *p;
256 #ifdef GJ_MEMDEBUG
257         struct meminfo *mi;
258 #endif
259
260         mtx_lock(&g_journal_cache_mtx);
261         if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
262             g_journal_cache_used + size > g_journal_cache_low) {
263                 GJ_DEBUG(1, "No cache, waking up the switcher.");
264                 g_journal_switcher_wokenup = 1;
265                 wakeup(&g_journal_switcher_state);
266         }
267         if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
268             g_journal_cache_used + size > g_journal_cache_limit) {
269                 mtx_unlock(&g_journal_cache_mtx);
270                 g_journal_cache_alloc_failures++;
271                 return (NULL);
272         }
273         g_journal_cache_used += size;
274         mtx_unlock(&g_journal_cache_mtx);
275         flags &= ~M_NOWAIT;
276 #ifndef GJ_MEMDEBUG
277         p = malloc(size, M_JOURNAL, flags | M_WAITOK);
278 #else
279         mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
280         p = (u_char *)mi + sizeof(*mi);
281         mi->mi_size = size;
282         stack_save(&mi->mi_stack);
283 #endif
284         return (p);
285 }
286
287 static void
288 gj_free(void *p, size_t size)
289 {
290 #ifdef GJ_MEMDEBUG
291         struct meminfo *mi;
292 #endif
293
294         KASSERT(p != NULL, ("p=NULL"));
295         KASSERT(size > 0, ("size=0"));
296         mtx_lock(&g_journal_cache_mtx);
297         KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
298         g_journal_cache_used -= size;
299         mtx_unlock(&g_journal_cache_mtx);
300 #ifdef GJ_MEMDEBUG
301         mi = p = (void *)((u_char *)p - sizeof(*mi));
302         if (mi->mi_size != size) {
303                 printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
304                     mi->mi_size);
305                 printf("GJOURNAL: Alloc backtrace:\n");
306                 stack_print(&mi->mi_stack);
307                 printf("GJOURNAL: Free backtrace:\n");
308                 kdb_backtrace();
309         }
310 #endif
311         free(p, M_JOURNAL);
312 }
313
314 static void *
315 gj_realloc(void *p, size_t size, size_t oldsize)
316 {
317         void *np;
318
319 #ifndef GJ_MEMDEBUG
320         mtx_lock(&g_journal_cache_mtx);
321         g_journal_cache_used -= oldsize;
322         g_journal_cache_used += size;
323         mtx_unlock(&g_journal_cache_mtx);
324         np = realloc(p, size, M_JOURNAL, M_WAITOK);
325 #else
326         np = gj_malloc(size, M_WAITOK);
327         bcopy(p, np, MIN(oldsize, size));
328         gj_free(p, oldsize);
329 #endif
330         return (np);
331 }
332
333 static void
334 g_journal_check_overflow(struct g_journal_softc *sc)
335 {
336         off_t length, used;
337
338         if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
339              sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
340             (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
341              sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
342              sc->sc_journal_offset < sc->sc_active.jj_offset)) {
343                 panic("Journal overflow (joffset=%jd active=%jd inactive=%jd)",
344                     (intmax_t)sc->sc_journal_offset,
345                     (intmax_t)sc->sc_active.jj_offset,
346                     (intmax_t)sc->sc_inactive.jj_offset);
347         }
348         if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
349                 length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
350                 used = sc->sc_journal_offset - sc->sc_active.jj_offset;
351         } else {
352                 length = sc->sc_jend - sc->sc_active.jj_offset;
353                 length += sc->sc_inactive.jj_offset - sc->sc_jstart;
354                 if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
355                         used = sc->sc_journal_offset - sc->sc_active.jj_offset;
356                 else {
357                         used = sc->sc_jend - sc->sc_active.jj_offset;
358                         used += sc->sc_journal_offset - sc->sc_jstart;
359                 }
360         }
361         /* Already woken up? */
362         if (g_journal_switcher_wokenup)
363                 return;
364         /*
365          * If the active journal takes more than g_journal_force_switch precent
366          * of free journal space, we force journal switch.
367          */
368         KASSERT(length > 0,
369             ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
370             (intmax_t)length, (intmax_t)used,
371             (intmax_t)sc->sc_active.jj_offset,
372             (intmax_t)sc->sc_inactive.jj_offset,
373             (intmax_t)sc->sc_journal_offset));
374         if ((used * 100) / length > g_journal_force_switch) {
375                 g_journal_stats_journal_full++;
376                 GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
377                     sc->sc_name, (used * 100) / length);
378                 mtx_lock(&g_journal_cache_mtx);
379                 g_journal_switcher_wokenup = 1;
380                 wakeup(&g_journal_switcher_state);
381                 mtx_unlock(&g_journal_cache_mtx);
382         }
383 }
384
385 static void
386 g_journal_orphan(struct g_consumer *cp)
387 {
388         struct g_journal_softc *sc;
389         char name[256];
390         int error;
391
392         g_topology_assert();
393         sc = cp->geom->softc;
394         strlcpy(name, cp->provider->name, sizeof(name));
395         GJ_DEBUG(0, "Lost provider %s.", name);
396         if (sc == NULL)
397                 return;
398         error = g_journal_destroy(sc);
399         if (error == 0)
400                 GJ_DEBUG(0, "Journal %s destroyed.", name);
401         else {
402                 GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
403                     "Destroy it manually after last close.", sc->sc_name,
404                     error);
405         }
406 }
407
408 static int
409 g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
410 {
411         struct g_journal_softc *sc;
412         int dcr, dcw, dce;
413
414         g_topology_assert();
415         GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
416             acr, acw, ace);
417
418         dcr = pp->acr + acr;
419         dcw = pp->acw + acw;
420         dce = pp->ace + ace;
421
422         sc = pp->geom->softc;
423         if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
424                 if (acr <= 0 && acw <= 0 && ace <= 0)
425                         return (0);
426                 else
427                         return (ENXIO);
428         }
429         if (pp->acw == 0 && dcw > 0) {
430                 GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
431                 sc->sc_flags &= ~GJF_DEVICE_CLEAN;
432                 g_topology_unlock();
433                 g_journal_metadata_update(sc);
434                 g_topology_lock();
435         } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
436                 GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
437                 sc->sc_flags |= GJF_DEVICE_CLEAN;
438                 g_topology_unlock();
439                 g_journal_metadata_update(sc);
440                 g_topology_lock();
441         } */
442         return (0);
443 }
444
445 static void
446 g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
447 {
448
449         bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
450         data += sizeof(GJ_HEADER_MAGIC);
451         le32enc(data, hdr->jh_journal_id);
452         data += 4;
453         le32enc(data, hdr->jh_journal_next_id);
454 }
455
456 static int
457 g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
458 {
459
460         bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
461         data += sizeof(hdr->jh_magic);
462         if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
463                 return (EINVAL);
464         hdr->jh_journal_id = le32dec(data);
465         data += 4;
466         hdr->jh_journal_next_id = le32dec(data);
467         return (0);
468 }
469
470 static void
471 g_journal_flush_cache(struct g_journal_softc *sc)
472 {
473         struct bintime bt;
474         int error;
475
476         if (sc->sc_bio_flush == 0)
477                 return;
478         GJ_TIMER_START(1, &bt);
479         if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
480                 error = g_io_flush(sc->sc_jconsumer);
481                 GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
482                     sc->sc_jconsumer->provider->name, error);
483         }
484         if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
485                 /*
486                  * TODO: This could be called in parallel with the
487                  *       previous call.
488                  */
489                 error = g_io_flush(sc->sc_dconsumer);
490                 GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
491                     sc->sc_dconsumer->provider->name, error);
492         }
493         GJ_TIMER_STOP(1, &bt, "Cache flush time");
494 }
495
496 static int
497 g_journal_write_header(struct g_journal_softc *sc)
498 {
499         struct g_journal_header hdr;
500         struct g_consumer *cp;
501         u_char *buf;
502         int error;
503
504         cp = sc->sc_jconsumer;
505         buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
506
507         strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
508         hdr.jh_journal_id = sc->sc_journal_id;
509         hdr.jh_journal_next_id = sc->sc_journal_next_id;
510         g_journal_header_encode(&hdr, buf);
511         error = g_write_data(cp, sc->sc_journal_offset, buf,
512             cp->provider->sectorsize);
513         /* if (error == 0) */
514         sc->sc_journal_offset += cp->provider->sectorsize;
515
516         gj_free(buf, cp->provider->sectorsize);
517         return (error);
518 }
519
520 /*
521  * Every journal record has a header and data following it.
522  * Functions below are used to decode the header before storing it to
523  * little endian and to encode it after reading to system endianess.
524  */
525 static void
526 g_journal_record_header_encode(struct g_journal_record_header *hdr,
527     u_char *data)
528 {
529         struct g_journal_entry *ent;
530         u_int i;
531
532         bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
533         data += sizeof(GJ_RECORD_HEADER_MAGIC);
534         le32enc(data, hdr->jrh_journal_id);
535         data += 8;
536         le16enc(data, hdr->jrh_nentries);
537         data += 2;
538         bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
539         data += 8;
540         for (i = 0; i < hdr->jrh_nentries; i++) {
541                 ent = &hdr->jrh_entries[i];
542                 le64enc(data, ent->je_joffset);
543                 data += 8;
544                 le64enc(data, ent->je_offset);
545                 data += 8;
546                 le64enc(data, ent->je_length);
547                 data += 8;
548         }
549 }
550
551 static int
552 g_journal_record_header_decode(const u_char *data,
553     struct g_journal_record_header *hdr)
554 {
555         struct g_journal_entry *ent;
556         u_int i;
557
558         bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
559         data += sizeof(hdr->jrh_magic);
560         if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
561                 return (EINVAL);
562         hdr->jrh_journal_id = le32dec(data);
563         data += 8;
564         hdr->jrh_nentries = le16dec(data);
565         data += 2;
566         if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
567                 return (EINVAL);
568         bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
569         data += 8;
570         for (i = 0; i < hdr->jrh_nentries; i++) {
571                 ent = &hdr->jrh_entries[i];
572                 ent->je_joffset = le64dec(data);
573                 data += 8;
574                 ent->je_offset = le64dec(data);
575                 data += 8;
576                 ent->je_length = le64dec(data);
577                 data += 8;
578         }
579         return (0);
580 }
581
582 /*
583  * Function reads metadata from a provider (via the given consumer), decodes
584  * it to system endianess and verifies its correctness.
585  */
586 static int
587 g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
588 {
589         struct g_provider *pp;
590         u_char *buf;
591         int error;
592
593         g_topology_assert();
594
595         error = g_access(cp, 1, 0, 0);
596         if (error != 0)
597                 return (error);
598         pp = cp->provider;
599         g_topology_unlock();
600         /* Metadata is stored in last sector. */
601         buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
602             &error);
603         g_topology_lock();
604         g_access(cp, -1, 0, 0);
605         if (buf == NULL) {
606                 GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
607                     cp->provider->name, error);
608                 return (error);
609         }
610
611         /* Decode metadata. */
612         error = journal_metadata_decode(buf, md);
613         g_free(buf);
614         /* Is this is gjournal provider at all? */
615         if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
616                 return (EINVAL);
617         /*
618          * Are we able to handle this version of metadata?
619          * We only maintain backward compatibility.
620          */
621         if (md->md_version > G_JOURNAL_VERSION) {
622                 GJ_DEBUG(0,
623                     "Kernel module is too old to handle metadata from %s.",
624                     cp->provider->name);
625                 return (EINVAL);
626         }
627         /* Is checksum correct? */
628         if (error != 0) {
629                 GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
630                     cp->provider->name);
631                 return (error);
632         }
633         return (0);
634 }
635
636 /*
637  * Two functions below are responsible for updating metadata.
638  * Only metadata on the data provider is updated (we need to update
639  * information about active journal in there).
640  */
641 static void
642 g_journal_metadata_done(struct bio *bp)
643 {
644
645         /*
646          * There is not much we can do on error except informing about it.
647          */
648         if (bp->bio_error != 0) {
649                 GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
650                     bp->bio_error);
651         } else {
652                 GJ_LOGREQ(2, bp, "Metadata updated.");
653         }
654         gj_free(bp->bio_data, bp->bio_length);
655         g_destroy_bio(bp);
656 }
657
658 static void
659 g_journal_metadata_update(struct g_journal_softc *sc)
660 {
661         struct g_journal_metadata md;
662         struct g_consumer *cp;
663         struct bio *bp;
664         u_char *sector;
665
666         cp = sc->sc_dconsumer;
667         sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
668         strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
669         md.md_version = G_JOURNAL_VERSION;
670         md.md_id = sc->sc_id;
671         md.md_type = sc->sc_orig_type;
672         md.md_jstart = sc->sc_jstart;
673         md.md_jend = sc->sc_jend;
674         md.md_joffset = sc->sc_inactive.jj_offset;
675         md.md_jid = sc->sc_journal_previous_id;
676         md.md_flags = 0;
677         if (sc->sc_flags & GJF_DEVICE_CLEAN)
678                 md.md_flags |= GJ_FLAG_CLEAN;
679
680         if (sc->sc_flags & GJF_DEVICE_HARDCODED)
681                 strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
682         else
683                 bzero(md.md_provider, sizeof(md.md_provider));
684         md.md_provsize = cp->provider->mediasize;
685         journal_metadata_encode(&md, sector);
686
687         /*
688          * Flush the cache, so we know all data are on disk.
689          * We write here informations like "journal is consistent", so we need
690          * to be sure it is. Without BIO_FLUSH here, we can end up in situation
691          * where metadata is stored on disk, but not all data.
692          */
693         g_journal_flush_cache(sc);
694
695         bp = g_alloc_bio();
696         bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
697         bp->bio_length = cp->provider->sectorsize;
698         bp->bio_data = sector;
699         bp->bio_cmd = BIO_WRITE;
700         if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
701                 bp->bio_done = g_journal_metadata_done;
702                 g_io_request(bp, cp);
703         } else {
704                 bp->bio_done = NULL;
705                 g_io_request(bp, cp);
706                 biowait(bp, "gjmdu");
707                 g_journal_metadata_done(bp);
708         }
709
710         /*
711          * Be sure metadata reached the disk.
712          */
713         g_journal_flush_cache(sc);
714 }
715
716 /*
717  * This is where the I/O request comes from the GEOM.
718  */
719 static void
720 g_journal_start(struct bio *bp)
721 {
722         struct g_journal_softc *sc;
723
724         sc = bp->bio_to->geom->softc;
725         GJ_LOGREQ(3, bp, "Request received.");
726
727         switch (bp->bio_cmd) {
728         case BIO_READ:
729         case BIO_WRITE:
730                 mtx_lock(&sc->sc_mtx);
731                 bioq_insert_tail(&sc->sc_regular_queue, bp);
732                 wakeup(sc);
733                 mtx_unlock(&sc->sc_mtx);
734                 return;
735         case BIO_GETATTR:
736                 if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
737                         strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
738                         bp->bio_completed = strlen(bp->bio_to->name) + 1;
739                         g_io_deliver(bp, 0);
740                         return;
741                 }
742                 /* FALLTHROUGH */
743         case BIO_DELETE:
744         default:
745                 g_io_deliver(bp, EOPNOTSUPP);
746                 return;
747         }
748 }
749
750 static void
751 g_journal_std_done(struct bio *bp)
752 {
753         struct g_journal_softc *sc;
754
755         sc = bp->bio_from->geom->softc;
756         mtx_lock(&sc->sc_mtx);
757         bioq_insert_tail(&sc->sc_back_queue, bp);
758         wakeup(sc);
759         mtx_unlock(&sc->sc_mtx);
760 }
761
762 static struct bio *
763 g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
764     int flags)
765 {
766         struct bio *bp;
767
768         bp = g_alloc_bio();
769         bp->bio_offset = start;
770         bp->bio_joffset = joffset;
771         bp->bio_length = end - start;
772         bp->bio_cmd = BIO_WRITE;
773         bp->bio_done = g_journal_std_done;
774         if (data == NULL)
775                 bp->bio_data = NULL;
776         else {
777                 bp->bio_data = gj_malloc(bp->bio_length, flags);
778                 if (bp->bio_data != NULL)
779                         bcopy(data, bp->bio_data, bp->bio_length);
780         }
781         return (bp);
782 }
783
/*
 * Convenience wrapper around g_journal_insert(): passes the range
 * [bio_offset, bio_offset + bio_length), the journal offset and the data
 * pointer of the given bio.  Note that 'bp' is evaluated several times.
 */
#define g_journal_insert_bio(head, bp, flags)                           \
        g_journal_insert((head), (bp)->bio_offset,                      \
                (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \
                (bp)->bio_data, flags)
/*
 * The function below does a lot more than just inserting a bio into the queue.
 * It keeps the queue sorted by offset and ensures that no data is duplicated
 * (it combines bios whose ranges overlap).
 *
 * The function returns the number of bios inserted (as a bio can be split).
 */
795 static int
796 g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
797     u_char *data, int flags)
798 {
799         struct bio *nbp, *cbp, *pbp;
800         off_t cstart, cend;
801         u_char *tmpdata;
802         int n;
803
804         GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
805             joffset);
806         n = 0;
807         pbp = NULL;
808         GJQ_FOREACH(*head, cbp) {
809                 cstart = cbp->bio_offset;
810                 cend = cbp->bio_offset + cbp->bio_length;
811
812                 if (nstart >= cend) {
813                         /*
814                          *  +-------------+
815                          *  |             |
816                          *  |   current   |  +-------------+
817                          *  |     bio     |  |             |
818                          *  |             |  |     new     |
819                          *  +-------------+  |     bio     |
820                          *                   |             |
821                          *                   +-------------+
822                          */
823                         GJ_DEBUG(3, "INSERT(%p): 1", *head);
824                 } else if (nend <= cstart) {
825                         /*
826                          *                   +-------------+
827                          *                   |             |
828                          *  +-------------+  |   current   |
829                          *  |             |  |     bio     |
830                          *  |     new     |  |             |
831                          *  |     bio     |  +-------------+
832                          *  |             |
833                          *  +-------------+
834                          */
835                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
836                             flags);
837                         if (pbp == NULL)
838                                 *head = nbp;
839                         else
840                                 pbp->bio_next = nbp;
841                         nbp->bio_next = cbp;
842                         n++;
843                         GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
844                             pbp);
845                         goto end;
846                 } else if (nstart <= cstart && nend >= cend) {
847                         /*
848                          *      +-------------+      +-------------+
849                          *      | current bio |      | current bio |
850                          *  +---+-------------+---+  +-------------+---+
851                          *  |   |             |   |  |             |   |
852                          *  |   |             |   |  |             |   |
853                          *  |   +-------------+   |  +-------------+   |
854                          *  |       new bio       |  |     new bio     |
855                          *  +---------------------+  +-----------------+
856                          *
857                          *      +-------------+  +-------------+
858                          *      | current bio |  | current bio |
859                          *  +---+-------------+  +-------------+
860                          *  |   |             |  |             |
861                          *  |   |             |  |             |
862                          *  |   +-------------+  +-------------+
863                          *  |     new bio     |  |   new bio   |
864                          *  +-----------------+  +-------------+
865                          */
866                         g_journal_stats_bytes_skipped += cbp->bio_length;
867                         cbp->bio_offset = nstart;
868                         cbp->bio_joffset = joffset;
869                         cbp->bio_length = cend - nstart;
870                         if (cbp->bio_data != NULL) {
871                                 gj_free(cbp->bio_data, cend - cstart);
872                                 cbp->bio_data = NULL;
873                         }
874                         if (data != NULL) {
875                                 cbp->bio_data = gj_malloc(cbp->bio_length,
876                                     flags);
877                                 if (cbp->bio_data != NULL) {
878                                         bcopy(data, cbp->bio_data,
879                                             cbp->bio_length);
880                                 }
881                                 data += cend - nstart;
882                         }
883                         joffset += cend - nstart;
884                         nstart = cend;
885                         GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
886                 } else if (nstart > cstart && nend >= cend) {
887                         /*
888                          *  +-----------------+  +-------------+
889                          *  |   current bio   |  | current bio |
890                          *  |   +-------------+  |   +---------+---+
891                          *  |   |             |  |   |         |   |
892                          *  |   |             |  |   |         |   |
893                          *  +---+-------------+  +---+---------+   |
894                          *      |   new bio   |      |   new bio   |
895                          *      +-------------+      +-------------+
896                          */
897                         g_journal_stats_bytes_skipped += cend - nstart;
898                         nbp = g_journal_new_bio(nstart, cend, joffset, data,
899                             flags);
900                         nbp->bio_next = cbp->bio_next;
901                         cbp->bio_next = nbp;
902                         cbp->bio_length = nstart - cstart;
903                         if (cbp->bio_data != NULL) {
904                                 cbp->bio_data = gj_realloc(cbp->bio_data,
905                                     cbp->bio_length, cend - cstart);
906                         }
907                         if (data != NULL)
908                                 data += cend - nstart;
909                         joffset += cend - nstart;
910                         nstart = cend;
911                         n++;
912                         GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
913                 } else if (nstart > cstart && nend < cend) {
914                         /*
915                          *  +---------------------+
916                          *  |     current bio     |
917                          *  |   +-------------+   |
918                          *  |   |             |   |
919                          *  |   |             |   |
920                          *  +---+-------------+---+
921                          *      |   new bio   |
922                          *      +-------------+
923                          */
924                         g_journal_stats_bytes_skipped += nend - nstart;
925                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
926                             flags);
927                         nbp->bio_next = cbp->bio_next;
928                         cbp->bio_next = nbp;
929                         if (cbp->bio_data == NULL)
930                                 tmpdata = NULL;
931                         else
932                                 tmpdata = cbp->bio_data + nend - cstart;
933                         nbp = g_journal_new_bio(nend, cend,
934                             cbp->bio_joffset + nend - cstart, tmpdata, flags);
935                         nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
936                         ((struct bio *)cbp->bio_next)->bio_next = nbp;
937                         cbp->bio_length = nstart - cstart;
938                         if (cbp->bio_data != NULL) {
939                                 cbp->bio_data = gj_realloc(cbp->bio_data,
940                                     cbp->bio_length, cend - cstart);
941                         }
942                         n += 2;
943                         GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
944                         goto end;
945                 } else if (nstart <= cstart && nend < cend) {
946                         /*
947                          *  +-----------------+      +-------------+
948                          *  |   current bio   |      | current bio |
949                          *  +-------------+   |  +---+---------+   |
950                          *  |             |   |  |   |         |   |
951                          *  |             |   |  |   |         |   |
952                          *  +-------------+---+  |   +---------+---+
953                          *  |   new bio   |      |   new bio   |
954                          *  +-------------+      +-------------+
955                          */
956                         g_journal_stats_bytes_skipped += nend - nstart;
957                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
958                             flags);
959                         if (pbp == NULL)
960                                 *head = nbp;
961                         else
962                                 pbp->bio_next = nbp;
963                         nbp->bio_next = cbp;
964                         cbp->bio_offset = nend;
965                         cbp->bio_length = cend - nend;
966                         cbp->bio_joffset += nend - cstart;
967                         tmpdata = cbp->bio_data;
968                         if (tmpdata != NULL) {
969                                 cbp->bio_data = gj_malloc(cbp->bio_length,
970                                     flags);
971                                 if (cbp->bio_data != NULL) {
972                                         bcopy(tmpdata + nend - cstart,
973                                             cbp->bio_data, cbp->bio_length);
974                                 }
975                                 gj_free(tmpdata, cend - cstart);
976                         }
977                         n++;
978                         GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
979                         goto end;
980                 }
981                 if (nstart == nend)
982                         goto end;
983                 pbp = cbp;
984         }
985         nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
986         if (pbp == NULL)
987                 *head = nbp;
988         else
989                 pbp->bio_next = nbp;
990         nbp->bio_next = NULL;
991         n++;
992         GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
993 end:
994         if (g_journal_debug >= 3) {
995                 GJQ_FOREACH(*head, cbp) {
996                         GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
997                             (intmax_t)cbp->bio_offset,
998                             (intmax_t)cbp->bio_length,
999                             (intmax_t)cbp->bio_joffset, cbp->bio_data);
1000                 }
1001                 GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
1002         }
1003         return (n);
1004 }
1005
1006 /*
1007  * The function combines neighbour bios trying to squeeze as much data as
1008  * possible into one bio.
1009  *
1010  * The function returns the number of bios combined (negative value).
1011  */
1012 static int
1013 g_journal_optimize(struct bio *head)
1014 {
1015         struct bio *cbp, *pbp;
1016         int n;
1017
1018         n = 0;
1019         pbp = NULL;
1020         GJQ_FOREACH(head, cbp) {
1021                 /* Skip bios which has to be read first. */
1022                 if (cbp->bio_data == NULL) {
1023                         pbp = NULL;
1024                         continue;
1025                 }
1026                 /* There is no previous bio yet. */
1027                 if (pbp == NULL) {
1028                         pbp = cbp;
1029                         continue;
1030                 }
1031                 /* Is this a neighbour bio? */
1032                 if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
1033                         /* Be sure that bios queue is sorted. */
1034                         KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
1035                             ("poffset=%jd plength=%jd coffset=%jd",
1036                             (intmax_t)pbp->bio_offset,
1037                             (intmax_t)pbp->bio_length,
1038                             (intmax_t)cbp->bio_offset));
1039                         pbp = cbp;
1040                         continue;
1041                 }
1042                 /* Be sure we don't end up with too big bio. */
1043                 if (pbp->bio_length + cbp->bio_length > MAXPHYS) {
1044                         pbp = cbp;
1045                         continue;
1046                 }
1047                 /* Ok, we can join bios. */
1048                 GJ_LOGREQ(4, pbp, "Join: ");
1049                 GJ_LOGREQ(4, cbp, "and: ");
1050                 pbp->bio_data = gj_realloc(pbp->bio_data,
1051                     pbp->bio_length + cbp->bio_length, pbp->bio_length);
1052                 bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
1053                     cbp->bio_length);
1054                 gj_free(cbp->bio_data, cbp->bio_length);
1055                 pbp->bio_length += cbp->bio_length;
1056                 pbp->bio_next = cbp->bio_next;
1057                 g_destroy_bio(cbp);
1058                 cbp = pbp;
1059                 g_journal_stats_combined_ios++;
1060                 n--;
1061                 GJ_LOGREQ(4, pbp, "Got: ");
1062         }
1063         return (n);
1064 }
1065
1066 /*
1067  * TODO: Update comment.
1068  * These are functions responsible for copying one portion of data from journal
1069  * to the destination provider.
1070  * The order goes like this:
1071  * 1. Read the header, which contains information about data blocks
1072  *    following it.
1073  * 2. Read the data blocks from the journal.
1074  * 3. Write the data blocks on the data provider.
1075  *
1076  * g_journal_copy_start()
1077  * g_journal_copy_done() - got finished write request, logs potential errors.
1078  */
1079
1080 /*
1081  * When there is no data in cache, this function is used to read it.
1082  */
1083 static void
1084 g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
1085 {
1086         struct bio *cbp;
1087
1088         /*
1089          * We were short in memory, so data was freed.
1090          * In that case we need to read it back from journal.
1091          */
1092         cbp = g_alloc_bio();
1093         cbp->bio_cflags = bp->bio_cflags;
1094         cbp->bio_parent = bp;
1095         cbp->bio_offset = bp->bio_joffset;
1096         cbp->bio_length = bp->bio_length;
1097         cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
1098         cbp->bio_cmd = BIO_READ;
1099         cbp->bio_done = g_journal_std_done;
1100         GJ_LOGREQ(4, cbp, "READ FIRST");
1101         g_io_request(cbp, sc->sc_jconsumer);
1102         g_journal_cache_misses++;
1103 }
1104
1105 static void
1106 g_journal_copy_send(struct g_journal_softc *sc)
1107 {
1108         struct bio *bioq, *bp, *lbp;
1109
1110         bioq = lbp = NULL;
1111         mtx_lock(&sc->sc_mtx);
1112         for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
1113                 bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
1114                 if (bp == NULL)
1115                         break;
1116                 GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
1117                 sc->sc_copy_in_progress++;
1118                 GJQ_INSERT_AFTER(bioq, bp, lbp);
1119                 lbp = bp;
1120         }
1121         mtx_unlock(&sc->sc_mtx);
1122         if (g_journal_do_optimize)
1123                 sc->sc_copy_in_progress += g_journal_optimize(bioq);
1124         while ((bp = GJQ_FIRST(bioq)) != NULL) {
1125                 GJQ_REMOVE(bioq, bp);
1126                 GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
1127                 bp->bio_cflags = GJ_BIO_COPY;
1128                 if (bp->bio_data == NULL)
1129                         g_journal_read_first(sc, bp);
1130                 else {
1131                         bp->bio_joffset = 0;
1132                         GJ_LOGREQ(4, bp, "SEND");
1133                         g_io_request(bp, sc->sc_dconsumer);
1134                 }
1135         }
1136 }
1137
1138 static void
1139 g_journal_copy_start(struct g_journal_softc *sc)
1140 {
1141
1142         /*
1143          * Remember in metadata that we're starting to copy journaled data
1144          * to the data provider.
1145          * In case of power failure, we will copy these data once again on boot.
1146          */
1147         if (!sc->sc_journal_copying) {
1148                 sc->sc_journal_copying = 1;
1149                 GJ_DEBUG(1, "Starting copy of journal.");
1150                 g_journal_metadata_update(sc);
1151         }
1152         g_journal_copy_send(sc);
1153 }
1154
1155 /*
1156  * Data block has been read from the journal provider.
1157  */
1158 static int
1159 g_journal_copy_read_done(struct bio *bp)
1160 {
1161         struct g_journal_softc *sc;
1162         struct g_consumer *cp;
1163         struct bio *pbp;
1164
1165         KASSERT(bp->bio_cflags == GJ_BIO_COPY,
1166             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
1167
1168         sc = bp->bio_from->geom->softc;
1169         pbp = bp->bio_parent;
1170
1171         if (bp->bio_error != 0) {
1172                 GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
1173                     bp->bio_to->name, bp->bio_error);
1174                 /*
1175                  * We will not be able to deliver WRITE request as well.
1176                  */
1177                 gj_free(bp->bio_data, bp->bio_length);
1178                 g_destroy_bio(pbp);
1179                 g_destroy_bio(bp);
1180                 sc->sc_copy_in_progress--;
1181                 return (1);
1182         }
1183         pbp->bio_data = bp->bio_data;
1184         cp = sc->sc_dconsumer;
1185         g_io_request(pbp, cp);
1186         GJ_LOGREQ(4, bp, "READ DONE");
1187         g_destroy_bio(bp);
1188         return (0);
1189 }
1190
1191 /*
1192  * Data block has been written to the data provider.
1193  */
1194 static void
1195 g_journal_copy_write_done(struct bio *bp)
1196 {
1197         struct g_journal_softc *sc;
1198
1199         KASSERT(bp->bio_cflags == GJ_BIO_COPY,
1200             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
1201
1202         sc = bp->bio_from->geom->softc;
1203         sc->sc_copy_in_progress--;
1204
1205         if (bp->bio_error != 0) {
1206                 GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
1207                     bp->bio_error);
1208         }
1209         GJQ_REMOVE(sc->sc_copy_queue, bp);
1210         gj_free(bp->bio_data, bp->bio_length);
1211         GJ_LOGREQ(4, bp, "DONE");
1212         g_destroy_bio(bp);
1213
1214         if (sc->sc_copy_in_progress == 0) {
1215                 /*
1216                  * This was the last write request for this journal.
1217                  */
1218                 GJ_DEBUG(1, "Data has been copied.");
1219                 sc->sc_journal_copying = 0;
1220         }
1221 }
1222
1223 static void g_journal_flush_done(struct bio *bp);
1224
1225 /*
1226  * Flush one record onto active journal provider.
1227  */
1228 static void
1229 g_journal_flush(struct g_journal_softc *sc)
1230 {
1231         struct g_journal_record_header hdr;
1232         struct g_journal_entry *ent;
1233         struct g_provider *pp;
1234         struct bio **bioq;
1235         struct bio *bp, *fbp, *pbp;
1236         off_t joffset, size;
1237         u_char *data, hash[16];
1238         MD5_CTX ctx;
1239         u_int i;
1240
1241         if (sc->sc_current_count == 0)
1242                 return;
1243
1244         size = 0;
1245         pp = sc->sc_jprovider;
1246         GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
1247         joffset = sc->sc_journal_offset;
1248
1249         GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
1250             sc->sc_current_count, pp->name, (intmax_t)joffset);
1251
1252         /*
1253          * Store 'journal id', so we know to which journal this record belongs.
1254          */
1255         hdr.jrh_journal_id = sc->sc_journal_id;
1256         /* Could be less than g_journal_record_entries if called due timeout. */
1257         hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
1258         strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
1259
1260         bioq = &sc->sc_active.jj_queue;
1261         pbp = sc->sc_flush_queue;
1262
1263         fbp = g_alloc_bio();
1264         fbp->bio_parent = NULL;
1265         fbp->bio_cflags = GJ_BIO_JOURNAL;
1266         fbp->bio_offset = -1;
1267         fbp->bio_joffset = joffset;
1268         fbp->bio_length = pp->sectorsize;
1269         fbp->bio_cmd = BIO_WRITE;
1270         fbp->bio_done = g_journal_std_done;
1271         GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
1272         pbp = fbp;
1273         fbp->bio_to = pp;
1274         GJ_LOGREQ(4, fbp, "FLUSH_OUT");
1275         joffset += pp->sectorsize;
1276         sc->sc_flush_count++;
1277         if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1278                 MD5Init(&ctx);
1279
1280         for (i = 0; i < hdr.jrh_nentries; i++) {
1281                 bp = sc->sc_current_queue;
1282                 KASSERT(bp != NULL, ("NULL bp"));
1283                 bp->bio_to = pp;
1284                 GJ_LOGREQ(4, bp, "FLUSHED");
1285                 sc->sc_current_queue = bp->bio_next;
1286                 bp->bio_next = NULL;
1287                 sc->sc_current_count--;
1288
1289                 /* Add to the header. */
1290                 ent = &hdr.jrh_entries[i];
1291                 ent->je_offset = bp->bio_offset;
1292                 ent->je_joffset = joffset;
1293                 ent->je_length = bp->bio_length;
1294                 size += ent->je_length;
1295
1296                 data = bp->bio_data;
1297                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1298                         MD5Update(&ctx, data, ent->je_length);
1299                 bzero(bp, sizeof(*bp));
1300                 bp->bio_cflags = GJ_BIO_JOURNAL;
1301                 bp->bio_offset = ent->je_offset;
1302                 bp->bio_joffset = ent->je_joffset;
1303                 bp->bio_length = ent->je_length;
1304                 bp->bio_data = data;
1305                 bp->bio_cmd = BIO_WRITE;
1306                 bp->bio_done = g_journal_std_done;
1307                 GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
1308                 pbp = bp;
1309                 bp->bio_to = pp;
1310                 GJ_LOGREQ(4, bp, "FLUSH_OUT");
1311                 joffset += bp->bio_length;
1312                 sc->sc_flush_count++;
1313
1314                 /*
1315                  * Add request to the active sc_journal_queue queue.
1316                  * This is our cache. After journal switch we don't have to
1317                  * read the data from the inactive journal, because we keep
1318                  * it in memory.
1319                  */
1320                 g_journal_insert(bioq, ent->je_offset,
1321                     ent->je_offset + ent->je_length, ent->je_joffset, data,
1322                     M_NOWAIT);
1323         }
1324
1325         /*
1326          * After all requests, store valid header.
1327          */
1328         data = gj_malloc(pp->sectorsize, M_WAITOK);
1329         if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1330                 MD5Final(hash, &ctx);
1331                 bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
1332         }
1333         g_journal_record_header_encode(&hdr, data);
1334         fbp->bio_data = data;
1335
1336         sc->sc_journal_offset = joffset;
1337
1338         g_journal_check_overflow(sc);
1339 }
1340
1341 /*
1342  * Flush request finished.
1343  */
1344 static void
1345 g_journal_flush_done(struct bio *bp)
1346 {
1347         struct g_journal_softc *sc;
1348         struct g_consumer *cp;
1349
1350         KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
1351             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
1352
1353         cp = bp->bio_from;
1354         sc = cp->geom->softc;
1355         sc->sc_flush_in_progress--;
1356
1357         if (bp->bio_error != 0) {
1358                 GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
1359                     bp->bio_error);
1360         }
1361         gj_free(bp->bio_data, bp->bio_length);
1362         GJ_LOGREQ(4, bp, "DONE");
1363         g_destroy_bio(bp);
1364 }
1365
1366 static void g_journal_release_delayed(struct g_journal_softc *sc);
1367
1368 static void
1369 g_journal_flush_send(struct g_journal_softc *sc)
1370 {
1371         struct g_consumer *cp;
1372         struct bio *bioq, *bp, *lbp;
1373
1374         cp = sc->sc_jconsumer;
1375         bioq = lbp = NULL;
1376         while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
1377                 /* Send one flush requests to the active journal. */
1378                 bp = GJQ_FIRST(sc->sc_flush_queue);
1379                 if (bp != NULL) {
1380                         GJQ_REMOVE(sc->sc_flush_queue, bp);
1381                         sc->sc_flush_count--;
1382                         bp->bio_offset = bp->bio_joffset;
1383                         bp->bio_joffset = 0;
1384                         sc->sc_flush_in_progress++;
1385                         GJQ_INSERT_AFTER(bioq, bp, lbp);
1386                         lbp = bp;
1387                 }
1388                 /* Try to release delayed requests. */
1389                 g_journal_release_delayed(sc);
1390                 /* If there are no requests to flush, leave. */
1391                 if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
1392                         break;
1393         }
1394         if (g_journal_do_optimize)
1395                 sc->sc_flush_in_progress += g_journal_optimize(bioq);
1396         while ((bp = GJQ_FIRST(bioq)) != NULL) {
1397                 GJQ_REMOVE(bioq, bp);
1398                 GJ_LOGREQ(3, bp, "Flush request send");
1399                 g_io_request(bp, cp);
1400         }
1401 }
1402
1403 static void
1404 g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
1405 {
1406         int n;
1407
1408         GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
1409         n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
1410         sc->sc_current_count += n;
1411         n = g_journal_optimize(sc->sc_current_queue);
1412         sc->sc_current_count += n;
1413         /*
1414          * For requests which are added to the current queue we deliver
1415          * response immediately.
1416          */
1417         bp->bio_completed = bp->bio_length;
1418         g_io_deliver(bp, 0);
1419         if (sc->sc_current_count >= g_journal_record_entries) {
1420                 /*
1421                  * Let's flush one record onto active journal provider.
1422                  */
1423                 g_journal_flush(sc);
1424         }
1425 }
1426
1427 static void
1428 g_journal_release_delayed(struct g_journal_softc *sc)
1429 {
1430         struct bio *bp;
1431
1432         for (;;) {
1433                 /* The flush queue is full, exit. */
1434                 if (sc->sc_flush_count >= g_journal_accept_immediately)
1435                         return;
1436                 bp = bioq_takefirst(&sc->sc_delayed_queue);
1437                 if (bp == NULL)
1438                         return;
1439                 sc->sc_delayed_count--;
1440                 g_journal_add_current(sc, bp);
1441         }
1442 }
1443
1444 /*
1445  * Add I/O request to the current queue. If we have enough requests for one
1446  * journal record we flush them onto active journal provider.
1447  */
1448 static void
1449 g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
1450 {
1451
1452         /*
1453          * The flush queue is full, we need to delay the request.
1454          */
1455         if (sc->sc_delayed_count > 0 ||
1456             sc->sc_flush_count >= g_journal_accept_immediately) {
1457                 GJ_LOGREQ(4, bp, "DELAYED");
1458                 bioq_insert_tail(&sc->sc_delayed_queue, bp);
1459                 sc->sc_delayed_count++;
1460                 return;
1461         }
1462
1463         KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
1464             ("DELAYED queue not empty."));
1465         g_journal_add_current(sc, bp);
1466 }
1467
1468 static void g_journal_read_done(struct bio *bp);
1469
1470 /*
1471  * Try to find requested data in cache.
1472  */
1473 static struct bio *
1474 g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
1475     off_t oend)
1476 {
1477         off_t cstart, cend;
1478         struct bio *bp;
1479
1480         GJQ_FOREACH(head, bp) {
1481                 if (bp->bio_offset == -1)
1482                         continue;
1483                 cstart = MAX(ostart, bp->bio_offset);
1484                 cend = MIN(oend, bp->bio_offset + bp->bio_length);
1485                 if (cend <= ostart)
1486                         continue;
1487                 else if (cstart >= oend) {
1488                         if (!sorted)
1489                                 continue;
1490                         else {
1491                                 bp = NULL;
1492                                 break;
1493                         }
1494                 }
1495                 if (bp->bio_data == NULL)
1496                         break;
1497                 GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
1498                     bp);
1499                 bcopy(bp->bio_data + cstart - bp->bio_offset,
1500                     pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
1501                 pbp->bio_completed += cend - cstart;
1502                 if (pbp->bio_completed == pbp->bio_length) {
1503                         /*
1504                          * Cool, the whole request was in cache, deliver happy
1505                          * message.
1506                          */
1507                         g_io_deliver(pbp, 0);
1508                         return (pbp);
1509                 }
1510                 break;
1511         }
1512         return (bp);
1513 }
1514
1515 /*
1516  * Try to find requested data in cache.
1517  */
1518 static struct bio *
1519 g_journal_read_queue_find(struct bio_queue *head, struct bio *pbp, off_t ostart,
1520     off_t oend)
1521 {
1522         off_t cstart, cend;
1523         struct bio *bp;
1524
1525         TAILQ_FOREACH(bp, head, bio_queue) {
1526                 cstart = MAX(ostart, bp->bio_offset);
1527                 cend = MIN(oend, bp->bio_offset + bp->bio_length);
1528                 if (cend <= ostart)
1529                         continue;
1530                 else if (cstart >= oend)
1531                         continue;
1532                 KASSERT(bp->bio_data != NULL,
1533                     ("%s: bio_data == NULL", __func__));
1534                 GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
1535                     bp);
1536                 bcopy(bp->bio_data + cstart - bp->bio_offset,
1537                     pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
1538                 pbp->bio_completed += cend - cstart;
1539                 if (pbp->bio_completed == pbp->bio_length) {
1540                         /*
1541                          * Cool, the whole request was in cache, deliver happy
1542                          * message.
1543                          */
1544                         g_io_deliver(pbp, 0);
1545                         return (pbp);
1546                 }
1547                 break;
1548         }
1549         return (bp);
1550 }
1551
1552 /*
1553  * This function is used for collecting data on read.
1554  * The complexity is because parts of the data can be stored in six different
1555  * places:
1556  * - in delayed requests
1557  * - in memory - the data not yet sent to the active journal provider
1558  * - in requests which are going to be sent to the active journal
1559  * - in the active journal
1560  * - in the inactive journal
1561  * - in the data provider
1562  */
1563 static void
1564 g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
1565     off_t oend)
1566 {
1567         struct bio *bp, *nbp, *head;
1568         off_t cstart, cend;
1569         u_int i, sorted = 0;
1570
1571         GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
1572
1573         cstart = cend = -1;
1574         bp = NULL;
1575         head = NULL;
1576         for (i = 0; i <= 5; i++) {
1577                 switch (i) {
1578                 case 0: /* Delayed requests. */
1579                         head = NULL;
1580                         sorted = 0;
1581                         break;
1582                 case 1: /* Not-yet-send data. */
1583                         head = sc->sc_current_queue;
1584                         sorted = 1;
1585                         break;
1586                 case 2: /* In-flight to the active journal. */
1587                         head = sc->sc_flush_queue;
1588                         sorted = 0;
1589                         break;
1590                 case 3: /* Active journal. */
1591                         head = sc->sc_active.jj_queue;
1592                         sorted = 1;
1593                         break;
1594                 case 4: /* Inactive journal. */
1595                         /*
1596                          * XXX: Here could be a race with g_journal_lowmem().
1597                          */
1598                         head = sc->sc_inactive.jj_queue;
1599                         sorted = 1;
1600                         break;
1601                 case 5: /* In-flight to the data provider. */
1602                         head = sc->sc_copy_queue;
1603                         sorted = 0;
1604                         break;
1605                 default:
1606                         panic("gjournal %s: i=%d", __func__, i);
1607                 }
1608                 if (i == 0)
1609                         bp = g_journal_read_queue_find(&sc->sc_delayed_queue.queue, pbp, ostart, oend);
1610                 else
1611                         bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
1612                 if (bp == pbp) { /* Got the whole request. */
1613                         GJ_DEBUG(2, "Got the whole request from %u.", i);
1614                         return;
1615                 } else if (bp != NULL) {
1616                         cstart = MAX(ostart, bp->bio_offset);
1617                         cend = MIN(oend, bp->bio_offset + bp->bio_length);
1618                         GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
1619                             i, (intmax_t)cstart, (intmax_t)cend);
1620                         break;
1621                 }
1622         }
1623         if (bp != NULL) {
1624                 if (bp->bio_data == NULL) {
1625                         nbp = g_duplicate_bio(pbp);
1626                         nbp->bio_cflags = GJ_BIO_READ;
1627                         nbp->bio_data =
1628                             pbp->bio_data + cstart - pbp->bio_offset;
1629                         nbp->bio_offset =
1630                             bp->bio_joffset + cstart - bp->bio_offset;
1631                         nbp->bio_length = cend - cstart;
1632                         nbp->bio_done = g_journal_read_done;
1633                         g_io_request(nbp, sc->sc_jconsumer);
1634                 }
1635                 /*
1636                  * If we don't have the whole request yet, call g_journal_read()
1637                  * recursively.
1638                  */
1639                 if (ostart < cstart)
1640                         g_journal_read(sc, pbp, ostart, cstart);
1641                 if (oend > cend)
1642                         g_journal_read(sc, pbp, cend, oend);
1643         } else {
1644                 /*
1645                  * No data in memory, no data in journal.
1646                  * Its time for asking data provider.
1647                  */
1648                 GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
1649                 nbp = g_duplicate_bio(pbp);
1650                 nbp->bio_cflags = GJ_BIO_READ;
1651                 nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
1652                 nbp->bio_offset = ostart;
1653                 nbp->bio_length = oend - ostart;
1654                 nbp->bio_done = g_journal_read_done;
1655                 g_io_request(nbp, sc->sc_dconsumer);
1656                 /* We have the whole request, return here. */
1657                 return;
1658         }
1659 }
1660
1661 /*
1662  * Function responsible for handling finished READ requests.
1663  * Actually, g_std_done() could be used here, the only difference is that we
1664  * log error.
1665  */
1666 static void
1667 g_journal_read_done(struct bio *bp)
1668 {
1669         struct bio *pbp;
1670
1671         KASSERT(bp->bio_cflags == GJ_BIO_READ,
1672             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
1673
1674         pbp = bp->bio_parent;
1675         pbp->bio_inbed++;
1676         pbp->bio_completed += bp->bio_length;
1677
1678         if (bp->bio_error != 0) {
1679                 if (pbp->bio_error == 0)
1680                         pbp->bio_error = bp->bio_error;
1681                 GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
1682                     bp->bio_to->name, bp->bio_error);
1683         }
1684         g_destroy_bio(bp);
1685         if (pbp->bio_children == pbp->bio_inbed &&
1686             pbp->bio_completed == pbp->bio_length) {
1687                 /* We're done. */
1688                 g_io_deliver(pbp, 0);
1689         }
1690 }
1691
1692 /*
1693  * Deactivate the current journal and activate the next one.
1694  */
1695 static void
1696 g_journal_switch(struct g_journal_softc *sc)
1697 {
1698         struct g_provider *pp;
1699
1700         if (JEMPTY(sc)) {
1701                 GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
1702                 pp = LIST_FIRST(&sc->sc_geom->provider);
1703                 if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
1704                         sc->sc_flags |= GJF_DEVICE_CLEAN;
1705                         GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
1706                         g_journal_metadata_update(sc);
1707                 }
1708         } else {
1709                 GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
1710
1711                 pp = sc->sc_jprovider;
1712
1713                 sc->sc_journal_previous_id = sc->sc_journal_id;
1714
1715                 sc->sc_journal_id = sc->sc_journal_next_id;
1716                 sc->sc_journal_next_id = arc4random();
1717
1718                 GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
1719
1720                 g_journal_write_header(sc);
1721
1722                 sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
1723                 sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
1724
1725                 sc->sc_active.jj_offset =
1726                     sc->sc_journal_offset - pp->sectorsize;
1727                 sc->sc_active.jj_queue = NULL;
1728
1729                 /*
1730                  * Switch is done, start copying data from the (now) inactive
1731                  * journal to the data provider.
1732                  */
1733                 g_journal_copy_start(sc);
1734         }
1735         mtx_lock(&sc->sc_mtx);
1736         sc->sc_flags &= ~GJF_DEVICE_SWITCH;
1737         mtx_unlock(&sc->sc_mtx);
1738 }
1739
1740 static void
1741 g_journal_initialize(struct g_journal_softc *sc)
1742 {
1743
1744         sc->sc_journal_id = arc4random();
1745         sc->sc_journal_next_id = arc4random();
1746         sc->sc_journal_previous_id = sc->sc_journal_id;
1747         sc->sc_journal_offset = sc->sc_jstart;
1748         sc->sc_inactive.jj_offset = sc->sc_jstart;
1749         g_journal_write_header(sc);
1750         sc->sc_active.jj_offset = sc->sc_jstart;
1751 }
1752
1753 static void
1754 g_journal_mark_as_dirty(struct g_journal_softc *sc)
1755 {
1756         const struct g_journal_desc *desc;
1757         int i;
1758
1759         GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
1760         for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
1761                 desc->jd_dirty(sc->sc_dconsumer);
1762 }
1763
1764 /*
1765  * This function reads a header (one sector) from the given journal.
1766  * It is very similar to g_read_data(9), but it doesn't allocate memory for bio
1767  * and data on every call.
1768  */
1769 static int
1770 g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
1771     void *data)
1772 {
1773         int error;
1774
1775         bzero(bp, sizeof(*bp));
1776         bp->bio_cmd = BIO_READ;
1777         bp->bio_done = NULL;
1778         bp->bio_offset = offset;
1779         bp->bio_length = cp->provider->sectorsize;
1780         bp->bio_data = data;
1781         g_io_request(bp, cp);
1782         error = biowait(bp, "gjs_read");
1783         return (error);
1784 }
1785
1786 #if 0
1787 /*
1788  * Function is called when we start the journal device and we detect that
1789  * one of the journals was not fully copied.
1790  * The purpose of this function is to read all records headers from journal
1791  * and placed them in the inactive queue, so we can start journal
1792  * synchronization process and the journal provider itself.
1793  * Design decision was taken to not synchronize the whole journal here as it
1794  * can take too much time. Reading headers only and delaying synchronization
1795  * process until after journal provider is started should be the best choice.
1796  */
1797 #endif
1798
/*
 * Scan the journal starting at sc_journal_offset and rebuild the inactive
 * queue from it.
 *
 * The first sector must decode as a journal header carrying the current
 * journal ID.  After that, record headers are read one by one; the entries
 * they describe are collected on a temporary list (fbp).  When a sector
 * decodes as the header of the expected next journal ("termination"), the
 * collected entries are moved to sc_inactive.jj_queue and scanning goes on
 * with the following journal.  Scanning stops on the first read error or on
 * a sector that is neither a record header nor the expected journal header.
 *
 * If no termination was found and the starting offset is greater than 0, the
 * journal is reinitialized and the file systems are marked dirty; otherwise
 * copying of the inactive journal is started.
 */
1799 static void
1800 g_journal_sync(struct g_journal_softc *sc)
1801 {
1802         struct g_journal_record_header rhdr;
1803         struct g_journal_entry *ent;
1804         struct g_journal_header jhdr;
1805         struct g_consumer *cp;
1806         struct bio *bp, *fbp, *tbp;
1807         off_t joffset, offset;
1808         u_char *buf, sum[16];
1809         uint64_t id;
1810         MD5_CTX ctx;
1811         int error, found, i;
1812
             /* found: number of terminations seen; fbp: entries not yet terminated. */
1813         found = 0;
1814         fbp = NULL;
1815         cp = sc->sc_jconsumer;
             /* One bio and one sector-sized buffer are reused for every header read. */
1816         bp = g_alloc_bio();
1817         buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
1818         offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
1819
1820         GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
1821
1822         /*
1823          * Read and decode first journal header.
1824          */
1825         error = g_journal_sync_read(cp, bp, offset, buf);
1826         if (error != 0) {
1827                 GJ_DEBUG(0, "Error while reading journal header from %s.",
1828                     cp->provider->name);
1829                 goto end;
1830         }
1831         error = g_journal_header_decode(buf, &jhdr);
1832         if (error != 0) {
1833                 GJ_DEBUG(0, "Cannot decode journal header from %s.",
1834                     cp->provider->name);
1835                 goto end;
1836         }
1837         id = sc->sc_journal_id;
1838         if (jhdr.jh_journal_id != sc->sc_journal_id) {
1839                 GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
1840                     (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
1841                 goto end;
1842         }
             /* Step over the journal header; from now on 'id' is the ID expected next. */
1843         offset += cp->provider->sectorsize;
1844         id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
1845
1846         for (;;) {
1847                 /*
1848                  * If the biggest record won't fit, look for a record header or
1849                  * journal header from the beginning.
1850                  */
1851                 GJ_VALIDATE_OFFSET(offset, sc);
1852                 error = g_journal_sync_read(cp, bp, offset, buf);
1853                 if (error != 0) {
1854                         /*
1855                          * Not good. Having an error while reading header
1856                          * means, that we cannot read next headers and in
1857                          * consequence we cannot find termination.
1858                          */
1859                         GJ_DEBUG(0,
1860                             "Error while reading record header from %s.",
1861                             cp->provider->name);
1862                         break;
1863                 }
1864
1865                 error = g_journal_record_header_decode(buf, &rhdr);
1866                 if (error != 0) {
1867                         GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
1868                             (intmax_t)offset, error);
1869                         /*
1870                          * This is not a record header.
1871                          * If we are lucky, this is next journal header.
1872                          */
1873                         error = g_journal_header_decode(buf, &jhdr);
1874                         if (error != 0) {
1875                                 GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
1876                                     (intmax_t)offset, error);
1877                                 /*
1878                                  * Nope, this is not journal header, which
1879                                  * basically means that journal is not
1880                                  * terminated properly.
1881                                  */
1882                                 error = ENOENT;
1883                                 break;
1884                         }
1885                         /*
1886                          * Ok. This is header of _some_ journal. Now we need to
1887                          * verify if this is header of the _next_ journal.
1888                          */
1889                         if (jhdr.jh_journal_id != id) {
1890                                 GJ_DEBUG(1, "Journal ID mismatch at %jd "
1891                                     "(0x%08x != 0x%08x).", (intmax_t)offset,
1892                                     (u_int)jhdr.jh_journal_id, (u_int)id);
1893                                 error = ENOENT;
1894                                 break;
1895                         }
1896
1897                         /* Found termination. */
1898                         found++;
1899                         GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
1900                             (intmax_t)offset, (u_int)id);
                             /* The journal that starts here becomes the active one. */
1901                         sc->sc_active.jj_offset = offset;
1902                         sc->sc_journal_offset =
1903                             offset + cp->provider->sectorsize;
1904                         sc->sc_journal_id = id;
1905                         id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
1906
                             /*
                              * The journal scanned so far is terminated, so its
                              * entries can go to the inactive queue.
                              */
1907                         while ((tbp = fbp) != NULL) {
1908                                 fbp = tbp->bio_next;
1909                                 GJ_LOGREQ(3, tbp, "Adding request.");
1910                                 g_journal_insert_bio(&sc->sc_inactive.jj_queue,
1911                                     tbp, M_WAITOK);
1912                         }
1913
1914                         /* Skip journal's header. */
1915                         offset += cp->provider->sectorsize;
1916                         continue;
1917                 }
1918
1919                 /* Skip record's header. */
1920                 offset += cp->provider->sectorsize;
1921
1922                 /*
1923                  * Add information about every record entry to the inactive
1924                  * queue.
1925                  */
1926                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1927                         MD5Init(&ctx);
1928                 for (i = 0; i < rhdr.jrh_nentries; i++) {
1929                         ent = &rhdr.jrh_entries[i];
1930                         GJ_DEBUG(3, "Insert entry: %jd %jd.",
1931                             (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
                             /* Only the location is recorded (data pointer is NULL). */
1932                         g_journal_insert(&fbp, ent->je_offset,
1933                             ent->je_offset + ent->je_length, ent->je_joffset,
1934                             NULL, M_WAITOK);
1935                         if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1936                                 u_char *buf2;
1937
1938                                 /*
1939                                  * TODO: Should use faster function (like
1940                                  *       g_journal_sync_read()).
1941                                  */
1942                                 buf2 = g_read_data(cp, offset, ent->je_length,
1943                                     NULL);
                                     /* A failed read is only logged; the hash then skips this entry. */
1944                                 if (buf2 == NULL)
1945                                         GJ_DEBUG(0, "Cannot read data at %jd.",
1946                                             (intmax_t)offset);
1947                                 else {
1948                                         MD5Update(&ctx, buf2, ent->je_length);
1949                                         g_free(buf2);
1950                                 }
1951                         }
1952                         /* Skip entry's data. */
1953                         offset += ent->je_length;
1954                 }
1955                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1956                         MD5Final(sum, &ctx);
                             /* A mismatch is reported but does not stop the scan. */
1957                         if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
1958                                 GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
1959                                     (intmax_t)offset);
1960                         }
1961                 }
1962         }
1963 end:
             /* bp->bio_data is 'buf': g_journal_sync_read() stored it there. */
1964         gj_free(bp->bio_data, cp->provider->sectorsize);
1965         g_destroy_bio(bp);
1966
1967         /* Remove bios from unterminated journal. */
1968         while ((tbp = fbp) != NULL) {
1969                 fbp = tbp->bio_next;
1970                 g_destroy_bio(tbp);
1971         }
1972
1973         if (found < 1 && joffset > 0) {
1974                 GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
1975                     sc->sc_name);
1976                 while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
1977                         sc->sc_inactive.jj_queue = tbp->bio_next;
1978                         g_destroy_bio(tbp);
1979                 }
1980                 g_journal_initialize(sc);
1981                 g_journal_mark_as_dirty(sc);
1982         } else {
1983                 GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
1984                 g_journal_copy_start(sc);
1985         }
1986 }
1987
1988 /*
1989  * Wait for requests.
1990  * If we have requests in the current queue, flush them after 3 seconds from the
1991  * last flush. In this way we don't wait forever (or for journal switch) with
1992  * storing not full records on journal.
1993  */
1994 static void
1995 g_journal_wait(struct g_journal_softc *sc, time_t last_write)
1996 {
1997         int error, timeout;
1998
1999         GJ_DEBUG(3, "%s: enter", __func__);
2000         if (sc->sc_current_count == 0) {
2001                 if (g_journal_debug < 2)
2002                         msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
2003                 else {
2004                         /*
2005                          * If we have debug turned on, show number of elements
2006                          * in various queues.
2007                          */
2008                         for (;;) {
2009                                 error = msleep(sc, &sc->sc_mtx, PRIBIO,
2010                                     "gj:work", hz * 3);
2011                                 if (error == 0) {
2012                                         mtx_unlock(&sc->sc_mtx);
2013                                         break;
2014                                 }
2015                                 GJ_DEBUG(3, "Report: current count=%d",
2016                                     sc->sc_current_count);
2017                                 GJ_DEBUG(3, "Report: flush count=%d",
2018                                     sc->sc_flush_count);
2019                                 GJ_DEBUG(3, "Report: flush in progress=%d",
2020                                     sc->sc_flush_in_progress);
2021                                 GJ_DEBUG(3, "Report: copy in progress=%d",
2022                                     sc->sc_copy_in_progress);
2023                                 GJ_DEBUG(3, "Report: delayed=%d",
2024                                     sc->sc_delayed_count);
2025                         }
2026                 }
2027                 GJ_DEBUG(3, "%s: exit 1", __func__);
2028                 return;
2029         }
2030
2031         /*
2032          * Flush even not full records every 3 seconds.
2033          */
2034         timeout = (last_write + 3 - time_second) * hz;
2035         if (timeout <= 0) {
2036                 mtx_unlock(&sc->sc_mtx);
2037                 g_journal_flush(sc);
2038                 g_journal_flush_send(sc);
2039                 GJ_DEBUG(3, "%s: exit 2", __func__);
2040                 return;
2041         }
2042         error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
2043         if (error == EWOULDBLOCK)
2044                 g_journal_flush_send(sc);
2045         GJ_DEBUG(3, "%s: exit 3", __func__);
2046 }
2047
2048 /*
2049  * Worker thread.
2050  */
2051 static void
2052 g_journal_worker(void *arg)
2053 {
2054         struct g_journal_softc *sc;
2055         struct g_geom *gp;
2056         struct g_provider *pp;
2057         struct bio *bp;
2058         time_t last_write;
2059         int type;
2060
             /*
              * Per-device worker: 'arg' is the softc.  After one-time setup
              * (journal initialization or synchronization, BIO_FLUSH probing,
              * creation of the "<name>.journal" provider) it loops, serving
              * requests from the back and regular queues and handling journal
              * switch and device destruction requests.
              */
2061         thread_lock(curthread);
2062         sched_prio(curthread, PRIBIO);
2063         thread_unlock(curthread);
2064
2065         sc = arg;
2066         type = 0;       /* gcc */
2067
             /* A clean device gets a fresh journal; otherwise scan the existing one. */
2068         if (sc->sc_flags & GJF_DEVICE_CLEAN) {
2069                 GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
2070                 g_journal_initialize(sc);
2071         } else {
2072                 g_journal_sync(sc);
2073         }
2074         /*
2075          * Check if we can use BIO_FLUSH.
2076          */
2077         sc->sc_bio_flush = 0;
2078         if (g_io_flush(sc->sc_jconsumer) == 0) {
2079                 sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
2080                 GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
2081                     sc->sc_jconsumer->provider->name);
2082         } else {
2083                 GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
2084                     sc->sc_jconsumer->provider->name);
2085         }
             /* Probe the data consumer separately only when it is a different consumer. */
2086         if (sc->sc_jconsumer != sc->sc_dconsumer) {
2087                 if (g_io_flush(sc->sc_dconsumer) == 0) {
2088                         sc->sc_bio_flush |= GJ_FLUSH_DATA;
2089                         GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
2090                             sc->sc_dconsumer->provider->name);
2091                 } else {
2092                         GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
2093                             sc->sc_dconsumer->provider->name);
2094                 }
2095         }
2096
             /* Create the "<name>.journal" provider under the topology lock. */
2097         gp = sc->sc_geom;
2098         g_topology_lock();
2099         pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
2100         pp->mediasize = sc->sc_mediasize;
2101         /*
2102          * There could be a problem when data provider and journal providers
2103          * have different sectorsize, but such scenario is prevented on journal
2104          * creation.
2105          */
2106         pp->sectorsize = sc->sc_sectorsize;
2107         g_error_provider(pp, 0);
2108         g_topology_unlock();
2109         last_write = time_second;
2110
             /* Release the root mount hold, if one is recorded in the softc. */
2111         if (sc->sc_rootmount != NULL) {
2112                 GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
2113                 root_mount_rel(sc->sc_rootmount);
2114                 sc->sc_rootmount = NULL;
2115         }
2116
2117         for (;;) {
2118                 /* Get first request from the queue. */
2119                 mtx_lock(&sc->sc_mtx);
                     /* The back queue is checked first, then the regular queue. */
2120                 bp = bioq_first(&sc->sc_back_queue);
2121                 if (bp != NULL)
2122                         type = (bp->bio_cflags & GJ_BIO_MASK);
2123                 if (bp == NULL) {
2124                         bp = bioq_first(&sc->sc_regular_queue);
2125                         if (bp != NULL)
2126                                 type = GJ_BIO_REGULAR;
2127                 }
2128                 if (bp == NULL) {
                             /*
                              * sc_mtx is held here, both when falling in from
                              * above and when arriving through the goto below.
                              */
2129 try_switch:
2130                         if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
2131                             (sc->sc_flags & GJF_DEVICE_DESTROY)) {
                                     /*
                                      * Before switching or destroying, push out
                                      * the current record and wait for flushes
                                      * and copies in progress to finish.
                                      */
2132                                 if (sc->sc_current_count > 0) {
2133                                         mtx_unlock(&sc->sc_mtx);
2134                                         g_journal_flush(sc);
2135                                         g_journal_flush_send(sc);
2136                                         continue;
2137                                 }
2138                                 if (sc->sc_flush_in_progress > 0)
2139                                         goto sleep;
2140                                 if (sc->sc_copy_in_progress > 0)
2141                                         goto sleep;
2142                         }
2143                         if (sc->sc_flags & GJF_DEVICE_SWITCH) {
2144                                 mtx_unlock(&sc->sc_mtx);
2145                                 g_journal_switch(sc);
2146                                 wakeup(&sc->sc_journal_copying);
2147                                 continue;
2148                         }
2149                         if (sc->sc_flags & GJF_DEVICE_DESTROY) {
2150                                 GJ_DEBUG(1, "Shutting down worker "
2151                                     "thread for %s.", gp->name);
                                     /* Clear sc_worker and wake up anyone sleeping on it. */
2152                                 sc->sc_worker = NULL;
2153                                 wakeup(&sc->sc_worker);
2154                                 mtx_unlock(&sc->sc_mtx);
                                     /* NOTE(review): kproc_exit() is not expected to return - confirm. */
2155                                 kproc_exit(0);
2156                         }
2157 sleep:
                             /* g_journal_wait() releases sc_mtx on every path. */
2158                         g_journal_wait(sc, last_write);
2159                         continue;
2160                 }
2161                 /*
2162                  * If we're in switch process, we need to delay all new
2163                  * write requests until its done.
2164                  */
2165                 if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
2166                     type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
                             /* The request stays on its queue; it has not been removed yet. */
2167                         GJ_LOGREQ(2, bp, "WRITE on SWITCH");
2168                         goto try_switch;
2169                 }
                     /* Dequeue from whichever queue the request came from. */
2170                 if (type == GJ_BIO_REGULAR)
2171                         bioq_remove(&sc->sc_regular_queue, bp);
2172                 else
2173                         bioq_remove(&sc->sc_back_queue, bp);
2174                 mtx_unlock(&sc->sc_mtx);
2175                 switch (type) {
2176                 case GJ_BIO_REGULAR:
2177                         /* Regular request. */
2178                         switch (bp->bio_cmd) {
2179                         case BIO_READ:
2180                                 g_journal_read(sc, bp, bp->bio_offset,
2181                                     bp->bio_offset + bp->bio_length);
2182                                 break;
2183                         case BIO_WRITE:
                                     /* last_write drives the 3 second flush timer in g_journal_wait(). */
2184                                 last_write = time_second;
2185                                 g_journal_add_request(sc, bp);
2186                                 g_journal_flush_send(sc);
2187                                 break;
2188                         default:
2189                                 panic("Invalid bio_cmd (%d).", bp->bio_cmd);
2190                         }
2191                         break;
2192                 case GJ_BIO_COPY:
                             /* A finished copy request; READ and WRITE have separate handlers. */
2193                         switch (bp->bio_cmd) {
2194                         case BIO_READ:
2195                                 if (g_journal_copy_read_done(bp))
2196                                         g_journal_copy_send(sc);
2197                                 break;
2198                         case BIO_WRITE:
2199                                 g_journal_copy_write_done(bp);
2200                                 g_journal_copy_send(sc);
2201                                 break;
2202                         default:
2203                                 panic("Invalid bio_cmd (%d).", bp->bio_cmd);
2204                         }
2205                         break;
2206                 case GJ_BIO_JOURNAL:
                             /* A request flagged as journal I/O; finish it and send more. */
2207                         g_journal_flush_done(bp);
2208                         g_journal_flush_send(sc);
2209                         break;
                     /* GJ_BIO_READ is not expected on these queues and panics like any unknown type. */
2210                 case GJ_BIO_READ:
2211                 default:
2212                         panic("Invalid bio (%d).", type);
2213                 }
2214         }
2215 }
2216
2217 static void
2218 g_journal_destroy_event(void *arg, int flags __unused)
2219 {
2220         struct g_journal_softc *sc;
2221
2222         g_topology_assert();
2223         sc = arg;
2224         g_journal_destroy(sc);
2225 }
2226
2227 static void
2228 g_journal_timeout(void *arg)
2229 {
2230         struct g_journal_softc *sc;
2231
2232         sc = arg;
2233         GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
2234             sc->sc_geom->name);
2235         g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
2236 }
2237
2238 static struct g_geom *
2239 g_journal_create(struct g_class *mp, struct g_provider *pp,
2240     const struct g_journal_metadata *md)
2241 {
2242         struct g_journal_softc *sc;
2243         struct g_geom *gp;
2244         struct g_consumer *cp;
2245         int error;
2246
2247         sc = NULL;      /* gcc */
2248
2249         g_topology_assert();
2250         /*
2251          * There are two possibilities:
2252          * 1. Data and both journals are on the same provider.
2253          * 2. Data and journals are all on separated providers.
2254          */
2255         /* Look for journal device with the same ID. */
2256         LIST_FOREACH(gp, &mp->geom, geom) {
2257                 sc = gp->softc;
2258                 if (sc == NULL)
2259                         continue;
2260                 if (sc->sc_id == md->md_id)
2261                         break;
2262         }
2263         if (gp == NULL)
2264                 sc = NULL;
2265         else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
2266                 GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
2267                 return (NULL);
2268         }
2269         if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
2270                 GJ_DEBUG(0, "Invalid type on %s.", pp->name);
2271                 return (NULL);
2272         }
2273         if (md->md_type & GJ_TYPE_DATA) {
2274                 GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
2275                     pp->name);
2276         }
2277         if (md->md_type & GJ_TYPE_JOURNAL) {
2278                 GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
2279                     pp->name);
2280         }
2281
2282         if (sc == NULL) {
2283                 /* Action geom. */
2284                 sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
2285                 sc->sc_id = md->md_id;
2286                 sc->sc_type = 0;
2287                 sc->sc_flags = 0;
2288                 sc->sc_worker = NULL;
2289
2290                 gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
2291                 gp->start = g_journal_start;
2292                 gp->orphan = g_journal_orphan;
2293                 gp->access = g_journal_access;
2294                 gp->softc = sc;
2295                 gp->flags |= G_GEOM_VOLATILE_BIO;
2296                 sc->sc_geom = gp;
2297
2298                 mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
2299
2300                 bioq_init(&sc->sc_back_queue);
2301                 bioq_init(&sc->sc_regular_queue);
2302                 bioq_init(&sc->sc_delayed_queue);
2303                 sc->sc_delayed_count = 0;
2304                 sc->sc_current_queue = NULL;
2305                 sc->sc_current_count = 0;
2306                 sc->sc_flush_queue = NULL;
2307                 sc->sc_flush_count = 0;
2308                 sc->sc_flush_in_progress = 0;
2309                 sc->sc_copy_queue = NULL;
2310                 sc->sc_copy_in_progress = 0;
2311                 sc->sc_inactive.jj_queue = NULL;
2312                 sc->sc_active.jj_queue = NULL;
2313
2314                 sc->sc_rootmount = root_mount_hold("GJOURNAL");
2315                 GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
2316
2317                 callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2318                 if (md->md_type != GJ_TYPE_COMPLETE) {
2319                         /*
2320                          * Journal and data are on separate providers.
2321                          * At this point we have only one of them.
2322                          * We setup a timeout in case the other part will not
2323                          * appear, so we won't wait forever.
2324                          */
2325                         callout_reset(&sc->sc_callout, 5 * hz,
2326                             g_journal_timeout, sc);
2327                 }
2328         }
2329
2330         /* Remember type of the data provider. */
2331         if (md->md_type & GJ_TYPE_DATA)
2332                 sc->sc_orig_type = md->md_type;
2333         sc->sc_type |= md->md_type;
2334         cp = NULL;
2335
2336         if (md->md_type & GJ_TYPE_DATA) {
2337                 if (md->md_flags & GJ_FLAG_CLEAN)
2338                         sc->sc_flags |= GJF_DEVICE_CLEAN;
2339                 if (md->md_flags & GJ_FLAG_CHECKSUM)
2340                         sc->sc_flags |= GJF_DEVICE_CHECKSUM;
2341                 cp = g_new_consumer(gp);
2342                 error = g_attach(cp, pp);
2343                 KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
2344                     pp->name, error));
2345                 error = g_access(cp, 1, 1, 1);
2346                 if (error != 0) {
2347                         GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
2348                             error);
2349                         g_journal_destroy(sc);
2350                         return (NULL);
2351                 }
2352                 sc->sc_dconsumer = cp;
2353                 sc->sc_mediasize = pp->mediasize - pp->sectorsize;
2354                 sc->sc_sectorsize = pp->sectorsize;
2355                 sc->sc_jstart = md->md_jstart;
2356                 sc->sc_jend = md->md_jend;
2357                 if (md->md_provider[0] != '\0')
2358                         sc->sc_flags |= GJF_DEVICE_HARDCODED;
2359                 sc->sc_journal_offset = md->md_joffset;
2360                 sc->sc_journal_id = md->md_jid;
2361                 sc->sc_journal_previous_id = md->md_jid;
2362         }
2363         if (md->md_type & GJ_TYPE_JOURNAL) {
2364                 if (cp == NULL) {
2365                         cp = g_new_consumer(gp);
2366                         error = g_attach(cp, pp);
2367                         KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
2368                             pp->name, error));
2369                         error = g_access(cp, 1, 1, 1);
2370                         if (error != 0) {
2371                                 GJ_DEBUG(0, "Cannot access %s (error=%d).",
2372                                     pp->name, error);
2373                                 g_journal_destroy(sc);
2374                                 return (NULL);
2375                         }
2376                 } else {
2377                         /*
2378                          * Journal is on the same provider as data, which means
2379                          * that data provider ends where journal starts.
2380                          */
2381                         sc->sc_mediasize = md->md_jstart;
2382                 }
2383                 sc->sc_jconsumer = cp;
2384         }
2385
2386         if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
2387                 /* Journal is not complete yet. */
2388                 return (gp);
2389         } else {
2390                 /* Journal complete, cancel timeout. */
2391                 callout_drain(&sc->sc_callout);
2392         }
2393
2394         error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
2395             "g_journal %s", sc->sc_name);
2396         if (error != 0) {
2397                 GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
2398                     sc->sc_name);
2399                 g_journal_destroy(sc);
2400                 return (NULL);
2401         }
2402
2403         return (gp);
2404 }
2405
2406 static void
2407 g_journal_destroy_consumer(void *arg, int flags __unused)
2408 {
2409         struct g_consumer *cp;
2410
2411         g_topology_assert();
2412         cp = arg;
2413         g_detach(cp);
2414         g_destroy_consumer(cp);
2415 }
2416
2417 static int
2418 g_journal_destroy(struct g_journal_softc *sc)
2419 {
2420         struct g_geom *gp;
2421         struct g_provider *pp;
2422         struct g_consumer *cp;
2423
2424         g_topology_assert();
2425
2426         if (sc == NULL)
2427                 return (ENXIO);
2428
2429         gp = sc->sc_geom;
2430         pp = LIST_FIRST(&gp->provider);
2431         if (pp != NULL) {
2432                 if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
2433                         GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
2434                             pp->name, pp->acr, pp->acw, pp->ace);
2435                         return (EBUSY);
2436                 }
2437                 g_error_provider(pp, ENXIO);
2438
2439                 g_journal_flush(sc);
2440                 g_journal_flush_send(sc);
2441                 g_journal_switch(sc);
2442         }
2443
2444         sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
2445
2446         g_topology_unlock();
2447
2448         if (sc->sc_rootmount != NULL) {
2449                 GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
2450                 root_mount_rel(sc->sc_rootmount);
2451                 sc->sc_rootmount = NULL;
2452         }
2453
2454         callout_drain(&sc->sc_callout);
2455         mtx_lock(&sc->sc_mtx);
2456         wakeup(sc);
2457         while (sc->sc_worker != NULL)
2458                 msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
2459         mtx_unlock(&sc->sc_mtx);
2460
2461         if (pp != NULL) {
2462                 GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
2463                 g_journal_metadata_update(sc);
2464                 g_topology_lock();
2465                 pp->flags |= G_PF_WITHER;
2466                 g_orphan_provider(pp, ENXIO);
2467         } else {
2468                 g_topology_lock();
2469         }
2470         mtx_destroy(&sc->sc_mtx);
2471
2472         if (sc->sc_current_count != 0) {
2473                 GJ_DEBUG(0, "Warning! Number of current requests %d.",
2474                     sc->sc_current_count);
2475         }
2476
2477         LIST_FOREACH(cp, &gp->consumer, consumer) {
2478                 if (cp->acr + cp->acw + cp->ace > 0)
2479                         g_access(cp, -1, -1, -1);
2480                 /*
2481                  * We keep all consumers open for writting, so if I'll detach
2482                  * and destroy consumer here, I'll get providers for taste, so
2483                  * journal will be started again.
2484                  * Sending an event here, prevents this from happening.
2485                  */
2486                 g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
2487         }
2488         gp->softc = NULL;
2489         g_wither_geom(gp, ENXIO);
2490         free(sc, M_JOURNAL);
2491         return (0);
2492 }
2493
2494 static void
2495 g_journal_taste_orphan(struct g_consumer *cp)
2496 {
2497
2498         KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2499             cp->provider->name));
2500 }
2501
2502 static struct g_geom *
2503 g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2504 {
2505         struct g_journal_metadata md;
2506         struct g_consumer *cp;
2507         struct g_geom *gp;
2508         int error;
2509
2510         g_topology_assert();
2511         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2512         GJ_DEBUG(2, "Tasting %s.", pp->name);
2513         if (pp->geom->class == mp)
2514                 return (NULL);
2515
2516         gp = g_new_geomf(mp, "journal:taste");
2517         /* This orphan function should be never called. */
2518         gp->orphan = g_journal_taste_orphan;
2519         cp = g_new_consumer(gp);
2520         g_attach(cp, pp);
2521         error = g_journal_metadata_read(cp, &md);
2522         g_detach(cp);
2523         g_destroy_consumer(cp);
2524         g_destroy_geom(gp);
2525         if (error != 0)
2526                 return (NULL);
2527         gp = NULL;
2528
2529         if (md.md_provider[0] != '\0' &&
2530             !g_compare_names(md.md_provider, pp->name))
2531                 return (NULL);
2532         if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
2533                 return (NULL);
2534         if (g_journal_debug >= 2)
2535                 journal_metadata_dump(&md);
2536
2537         gp = g_journal_create(mp, pp, &md);
2538         return (gp);
2539 }
2540
2541 static struct g_journal_softc *
2542 g_journal_find_device(struct g_class *mp, const char *name)
2543 {
2544         struct g_journal_softc *sc;
2545         struct g_geom *gp;
2546         struct g_provider *pp;
2547
2548         if (strncmp(name, "/dev/", 5) == 0)
2549                 name += 5;
2550         LIST_FOREACH(gp, &mp->geom, geom) {
2551                 sc = gp->softc;
2552                 if (sc == NULL)
2553                         continue;
2554                 if (sc->sc_flags & GJF_DEVICE_DESTROY)
2555                         continue;
2556                 if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
2557                         continue;
2558                 pp = LIST_FIRST(&gp->provider);
2559                 if (strcmp(sc->sc_name, name) == 0)
2560                         return (sc);
2561                 if (pp != NULL && strcmp(pp->name, name) == 0)
2562                         return (sc);
2563         }
2564         return (NULL);
2565 }
2566
2567 static void
2568 g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
2569 {
2570         struct g_journal_softc *sc;
2571         const char *name;
2572         char param[16];
2573         int *nargs;
2574         int error, i;
2575
2576         g_topology_assert();
2577
2578         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
2579         if (nargs == NULL) {
2580                 gctl_error(req, "No '%s' argument.", "nargs");
2581                 return;
2582         }
2583         if (*nargs <= 0) {
2584                 gctl_error(req, "Missing device(s).");
2585                 return;
2586         }
2587
2588         for (i = 0; i < *nargs; i++) {
2589                 snprintf(param, sizeof(param), "arg%d", i);
2590                 name = gctl_get_asciiparam(req, param);
2591                 if (name == NULL) {
2592                         gctl_error(req, "No 'arg%d' argument.", i);
2593                         return;
2594                 }
2595                 sc = g_journal_find_device(mp, name);
2596                 if (sc == NULL) {
2597                         gctl_error(req, "No such device: %s.", name);
2598                         return;
2599                 }
2600                 error = g_journal_destroy(sc);
2601                 if (error != 0) {
2602                         gctl_error(req, "Cannot destroy device %s (error=%d).",
2603                             LIST_FIRST(&sc->sc_geom->provider)->name, error);
2604                         return;
2605                 }
2606         }
2607 }
2608
2609 static void
2610 g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
2611 {
2612
2613         g_topology_assert();
2614         g_topology_unlock();
2615         g_journal_sync_requested++;
2616         wakeup(&g_journal_switcher_state);
2617         while (g_journal_sync_requested > 0)
2618                 tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
2619         g_topology_lock();
2620 }
2621
2622 static void
2623 g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
2624 {
2625         uint32_t *version;
2626
2627         g_topology_assert();
2628
2629         version = gctl_get_paraml(req, "version", sizeof(*version));
2630         if (version == NULL) {
2631                 gctl_error(req, "No '%s' argument.", "version");
2632                 return;
2633         }
2634         if (*version != G_JOURNAL_VERSION) {
2635                 gctl_error(req, "Userland and kernel parts are out of sync.");
2636                 return;
2637         }
2638
2639         if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
2640                 g_journal_ctl_destroy(req, mp);
2641                 return;
2642         } else if (strcmp(verb, "sync") == 0) {
2643                 g_journal_ctl_sync(req, mp);
2644                 return;
2645         }
2646
2647         gctl_error(req, "Unknown verb.");
2648 }
2649
2650 static void
2651 g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2652     struct g_consumer *cp, struct g_provider *pp)
2653 {
2654         struct g_journal_softc *sc;
2655
2656         g_topology_assert();
2657
2658         sc = gp->softc;
2659         if (sc == NULL)
2660                 return;
2661         if (pp != NULL) {
2662                 /* Nothing here. */
2663         } else if (cp != NULL) {
2664                 int first = 1;
2665
2666                 sbuf_printf(sb, "%s<Role>", indent);
2667                 if (cp == sc->sc_dconsumer) {
2668                         sbuf_printf(sb, "Data");
2669                         first = 0;
2670                 }
2671                 if (cp == sc->sc_jconsumer) {
2672                         if (!first)
2673                                 sbuf_printf(sb, ",");
2674                         sbuf_printf(sb, "Journal");
2675                 }
2676                 sbuf_printf(sb, "</Role>\n");
2677                 if (cp == sc->sc_jconsumer) {
2678                         sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
2679                             (intmax_t)sc->sc_jstart);
2680                         sbuf_printf(sb, "<Jend>%jd</Jend>\n",
2681                             (intmax_t)sc->sc_jend);
2682                 }
2683         } else {
2684                 sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
2685         }
2686 }
2687
/*
 * Tags returned by EVENTHANDLER_REGISTER() in g_journal_init().  They are
 * kept so that g_journal_fini() can deregister the handlers; NULL means the
 * corresponding handler is not registered.
 */
static eventhandler_tag g_journal_event_shutdown = NULL;
static eventhandler_tag g_journal_event_lowmem = NULL;
2690
2691 static void
2692 g_journal_shutdown(void *arg, int howto __unused)
2693 {
2694         struct g_class *mp;
2695         struct g_geom *gp, *gp2;
2696
2697         if (panicstr != NULL)
2698                 return;
2699         mp = arg;
2700         DROP_GIANT();
2701         g_topology_lock();
2702         LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2703                 if (gp->softc == NULL)
2704                         continue;
2705                 GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
2706                 g_journal_destroy(gp->softc);
2707         }
2708         g_topology_unlock();
2709         PICKUP_GIANT();
2710 }
2711
2712 /*
2713  * Free cached requests from inactive queue in case of low memory.
2714  * We free GJ_FREE_AT_ONCE elements at once.
2715  */
2716 #define GJ_FREE_AT_ONCE 4
2717 static void
2718 g_journal_lowmem(void *arg, int howto __unused)
2719 {
2720         struct g_journal_softc *sc;
2721         struct g_class *mp;
2722         struct g_geom *gp;
2723         struct bio *bp;
2724         u_int nfree = GJ_FREE_AT_ONCE;
2725
2726         g_journal_stats_low_mem++;
2727         mp = arg;
2728         DROP_GIANT();
2729         g_topology_lock();
2730         LIST_FOREACH(gp, &mp->geom, geom) {
2731                 sc = gp->softc;
2732                 if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
2733                         continue;
2734                 mtx_lock(&sc->sc_mtx);
2735                 for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
2736                     nfree--, bp = bp->bio_next) {
2737                         /*
2738                          * This is safe to free the bio_data, because:
2739                          * 1. If bio_data is NULL it will be read from the
2740                          *    inactive journal.
2741                          * 2. If bp is sent down, it is first removed from the
2742                          *    inactive queue, so it's impossible to free the
2743                          *    data from under in-flight bio.
2744                          * On the other hand, freeing elements from the active
2745                          * queue, is not safe.
2746                          */
2747                         if (bp->bio_data != NULL) {
2748                                 GJ_DEBUG(2, "Freeing data from %s.",
2749                                     sc->sc_name);
2750                                 gj_free(bp->bio_data, bp->bio_length);
2751                                 bp->bio_data = NULL;
2752                         }
2753                 }
2754                 mtx_unlock(&sc->sc_mtx);
2755                 if (nfree == 0)
2756                         break;
2757         }
2758         g_topology_unlock();
2759         PICKUP_GIANT();
2760 }
2761
2762 static void g_journal_switcher(void *arg);
2763
2764 static void
2765 g_journal_init(struct g_class *mp)
2766 {
2767         int error;
2768
2769         /* Pick a conservative value if provided value sucks. */
2770         if (g_journal_cache_divisor <= 0 ||
2771             (vm_kmem_size / g_journal_cache_divisor == 0)) {
2772                 g_journal_cache_divisor = 5;
2773         }
2774         if (g_journal_cache_limit > 0) {
2775                 g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
2776                 g_journal_cache_low =
2777                     (g_journal_cache_limit / 100) * g_journal_cache_switch;
2778         }
2779         g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
2780             g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
2781         if (g_journal_event_shutdown == NULL)
2782                 GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
2783         g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
2784             g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
2785         if (g_journal_event_lowmem == NULL)
2786                 GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
2787         error = kproc_create(g_journal_switcher, mp, NULL, 0, 0,
2788             "g_journal switcher");
2789         KASSERT(error == 0, ("Cannot create switcher thread."));
2790 }
2791
2792 static void
2793 g_journal_fini(struct g_class *mp)
2794 {
2795
2796         if (g_journal_event_shutdown != NULL) {
2797                 EVENTHANDLER_DEREGISTER(shutdown_post_sync,
2798                     g_journal_event_shutdown);
2799         }
2800         if (g_journal_event_lowmem != NULL)
2801                 EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
2802         g_journal_switcher_state = GJ_SWITCHER_DIE;
2803         wakeup(&g_journal_switcher_state);
2804         while (g_journal_switcher_state != GJ_SWITCHER_DIED)
2805                 tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
2806         GJ_DEBUG(1, "Switcher died.");
2807 }
2808
/*
 * Declare the GEOM class object g_journal_class under the name g_journal.
 * NOTE(review): the macro is defined outside this file; presumably it also
 * hooks the class into module load/unload -- confirm against the GEOM headers.
 */
DECLARE_GEOM_CLASS(g_journal_class, g_journal);
2810
2811 static const struct g_journal_desc *
2812 g_journal_find_desc(const char *fstype)
2813 {
2814         const struct g_journal_desc *desc;
2815         int i;
2816
2817         for (desc = g_journal_filesystems[i = 0]; desc != NULL;
2818              desc = g_journal_filesystems[++i]) {
2819                 if (strcmp(desc->jd_fstype, fstype) == 0)
2820                         break;
2821         }
2822         return (desc);
2823 }
2824
2825 static void
2826 g_journal_switch_wait(struct g_journal_softc *sc)
2827 {
2828         struct bintime bt;
2829
2830         mtx_assert(&sc->sc_mtx, MA_OWNED);
2831         if (g_journal_debug >= 2) {
2832                 if (sc->sc_flush_in_progress > 0) {
2833                         GJ_DEBUG(2, "%d requests flushing.",
2834                             sc->sc_flush_in_progress);
2835                 }
2836                 if (sc->sc_copy_in_progress > 0) {
2837                         GJ_DEBUG(2, "%d requests copying.",
2838                             sc->sc_copy_in_progress);
2839                 }
2840                 if (sc->sc_flush_count > 0) {
2841                         GJ_DEBUG(2, "%d requests to flush.",
2842                             sc->sc_flush_count);
2843                 }
2844                 if (sc->sc_delayed_count > 0) {
2845                         GJ_DEBUG(2, "%d requests delayed.",
2846                             sc->sc_delayed_count);
2847                 }
2848         }
2849         g_journal_stats_switches++;
2850         if (sc->sc_copy_in_progress > 0)
2851                 g_journal_stats_wait_for_copy++;
2852         GJ_TIMER_START(1, &bt);
2853         sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
2854         sc->sc_flags |= GJF_DEVICE_SWITCH;
2855         wakeup(sc);
2856         while (sc->sc_flags & GJF_DEVICE_SWITCH) {
2857                 msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
2858                     "gj:switch", 0);
2859         }
2860         GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
2861 }
2862
2863 static void
2864 g_journal_do_switch(struct g_class *classp)
2865 {
2866         struct g_journal_softc *sc;
2867         const struct g_journal_desc *desc;
2868         struct g_geom *gp;
2869         struct mount *mp;
2870         struct bintime bt;
2871         char *mountpoint;
2872         int error, vfslocked;
2873
2874         DROP_GIANT();
2875         g_topology_lock();
2876         LIST_FOREACH(gp, &classp->geom, geom) {
2877                 sc = gp->softc;
2878                 if (sc == NULL)
2879                         continue;
2880                 if (sc->sc_flags & GJF_DEVICE_DESTROY)
2881                         continue;
2882                 if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
2883                         continue;
2884                 mtx_lock(&sc->sc_mtx);
2885                 sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
2886                 mtx_unlock(&sc->sc_mtx);
2887         }
2888         g_topology_unlock();
2889         PICKUP_GIANT();
2890
2891         mtx_lock(&mountlist_mtx);
2892         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2893                 if (mp->mnt_gjprovider == NULL)
2894                         continue;
2895                 if (mp->mnt_flag & MNT_RDONLY)
2896                         continue;
2897                 desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
2898                 if (desc == NULL)
2899                         continue;
2900                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
2901                         continue;
2902                 /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
2903
2904                 DROP_GIANT();
2905                 g_topology_lock();
2906                 sc = g_journal_find_device(classp, mp->mnt_gjprovider);
2907                 g_topology_unlock();
2908                 PICKUP_GIANT();
2909
2910                 if (sc == NULL) {
2911                         GJ_DEBUG(0, "Cannot find journal geom for %s.",
2912                             mp->mnt_gjprovider);
2913                         goto next;
2914                 } else if (JEMPTY(sc)) {
2915                         mtx_lock(&sc->sc_mtx);
2916                         sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
2917                         mtx_unlock(&sc->sc_mtx);
2918                         GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
2919                         goto next;
2920                 }
2921
2922                 mountpoint = mp->mnt_stat.f_mntonname;
2923
2924                 vfslocked = VFS_LOCK_GIANT(mp);
2925
2926                 error = vn_start_write(NULL, &mp, V_WAIT);
2927                 if (error != 0) {
2928                         VFS_UNLOCK_GIANT(vfslocked);
2929                         GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
2930                             mountpoint, error);
2931                         goto next;
2932                 }
2933
2934                 MNT_ILOCK(mp);
2935                 mp->mnt_noasync++;
2936                 mp->mnt_kern_flag &= ~MNTK_ASYNC;
2937                 MNT_IUNLOCK(mp);
2938
2939                 GJ_TIMER_START(1, &bt);
2940                 vfs_msync(mp, MNT_NOWAIT);
2941                 GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
2942
2943                 GJ_TIMER_START(1, &bt);
2944                 error = VFS_SYNC(mp, MNT_NOWAIT);
2945                 if (error == 0)
2946                         GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
2947                 else {
2948                         GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
2949                             mountpoint, error);
2950                 }
2951
2952                 MNT_ILOCK(mp);
2953                 mp->mnt_noasync--;
2954                 if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
2955                         mp->mnt_kern_flag |= MNTK_ASYNC;
2956                 MNT_IUNLOCK(mp);
2957
2958                 vn_finished_write(mp);
2959
2960                 if (error != 0) {
2961                         VFS_UNLOCK_GIANT(vfslocked);
2962                         goto next;
2963                 }
2964
2965                 /*
2966                  * Send BIO_FLUSH before freezing the file system, so it can be
2967                  * faster after the freeze.
2968                  */
2969                 GJ_TIMER_START(1, &bt);
2970                 g_journal_flush_cache(sc);
2971                 GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
2972
2973                 GJ_TIMER_START(1, &bt);
2974                 error = vfs_write_suspend(mp);
2975                 VFS_UNLOCK_GIANT(vfslocked);
2976                 GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
2977                 if (error != 0) {
2978                         GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
2979                             mountpoint, error);
2980                         goto next;
2981                 }
2982
2983                 error = desc->jd_clean(mp);
2984                 if (error != 0)
2985                         goto next;
2986
2987                 mtx_lock(&sc->sc_mtx);
2988                 g_journal_switch_wait(sc);
2989                 mtx_unlock(&sc->sc_mtx);
2990
2991                 vfs_write_resume(mp);
2992 next:
2993                 mtx_lock(&mountlist_mtx);
2994                 vfs_unbusy(mp);
2995         }
2996         mtx_unlock(&mountlist_mtx);
2997
2998         sc = NULL;
2999         for (;;) {
3000                 DROP_GIANT();
3001                 g_topology_lock();
3002                 LIST_FOREACH(gp, &g_journal_class.geom, geom) {
3003                         sc = gp->softc;
3004                         if (sc == NULL)
3005                                 continue;
3006                         mtx_lock(&sc->sc_mtx);
3007                         if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
3008                             !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
3009                             (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
3010                                 break;
3011                         }
3012                         mtx_unlock(&sc->sc_mtx);
3013                         sc = NULL;
3014                 }
3015                 g_topology_unlock();
3016                 PICKUP_GIANT();
3017                 if (sc == NULL)
3018                         break;
3019                 mtx_assert(&sc->sc_mtx, MA_OWNED);
3020                 g_journal_switch_wait(sc);
3021                 mtx_unlock(&sc->sc_mtx);
3022         }
3023 }
3024
3025 /*
3026  * TODO: Switcher thread should be started on first geom creation and killed on
3027  * last geom destruction.
3028  */
3029 static void
3030 g_journal_switcher(void *arg)
3031 {
3032         struct g_class *mp;
3033         struct bintime bt;
3034         int error;
3035
3036         mp = arg;
3037         curthread->td_pflags |= TDP_NORUNNINGBUF;
3038         for (;;) {
3039                 g_journal_switcher_wokenup = 0;
3040                 error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
3041                     g_journal_switch_time * hz);
3042                 if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
3043                         g_journal_switcher_state = GJ_SWITCHER_DIED;
3044                         GJ_DEBUG(1, "Switcher exiting.");
3045                         wakeup(&g_journal_switcher_state);
3046                         kproc_exit(0);
3047                 }
3048                 if (error == 0 && g_journal_sync_requested == 0) {
3049                         GJ_DEBUG(1, "Out of cache, force switch (used=%u "
3050                             "limit=%u).", g_journal_cache_used,
3051                             g_journal_cache_limit);
3052                 }
3053                 GJ_TIMER_START(1, &bt);
3054                 g_journal_do_switch(mp);
3055                 GJ_TIMER_STOP(1, &bt, "Entire switch time");
3056                 if (g_journal_sync_requested > 0) {
3057                         g_journal_sync_requested = 0;
3058                         wakeup(&g_journal_sync_requested);
3059                 }
3060         }
3061 }