sys/geom/journal/g_journal.c

   1 /*-
   2  * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 #include <sys/cdefs.h>
  28 __FBSDID("$FreeBSD$");
  29
  30 #include <sys/param.h>
  31 #include <sys/systm.h>
  32 #include <sys/kernel.h>
  33 #include <sys/module.h>
  34 #include <sys/limits.h>
  35 #include <sys/lock.h>
  36 #include <sys/mutex.h>
  37 #include <sys/bio.h>
  38 #include <sys/sysctl.h>
  39 #include <sys/malloc.h>
  40 #include <sys/mount.h>
  41 #include <sys/eventhandler.h>
  42 #include <sys/proc.h>
  43 #include <sys/kthread.h>
  44 #include <sys/sched.h>
  45 #include <sys/taskqueue.h>
  46 #include <sys/vnode.h>
  47 #include <sys/sbuf.h>
  48 #ifdef GJ_MEMDEBUG
  49 #include <sys/stack.h>
  50 #include <sys/kdb.h>
  51 #endif
  52 #include <vm/vm.h>
  53 #include <vm/vm_kern.h>
  54 #include <geom/geom.h>
  55
  56 #include <geom/journal/g_journal.h>
  57
  58 FEATURE(geom_journal, "GEOM journaling support");
  59
  60 /*
  61  * On-disk journal format:
  62  *
  63  * JH - Journal header
  64  * RH - Record header
  65  *
  66  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
  67  * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
  68  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
  69  *
  70  */
  71
/*
 * Compile-time checks: neither on-disk header structure may be larger than
 * 512 bytes.
 * NOTE(review): 512 is presumably the smallest sector size we support, as
 * the journal header is written out as a single sector -- confirm.
 */
CTASSERT(sizeof(struct g_journal_header) <= 512);
CTASSERT(sizeof(struct g_journal_record_header) <= 512);

/* Malloc type used for every allocation done through gj_malloc(). */
static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
/*
 * Mutex held while the cache usage counter is updated and while the
 * switcher thread is being woken up.
 */
static struct mtx g_journal_cache_mtx;
MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);

/* NULL-terminated list of file system descriptors known to gjournal. */
const struct g_journal_desc *g_journal_filesystems[] = {
        &g_journal_ufs,
        NULL
};
  83
/* The kern.geom sysctl node is defined elsewhere; we only hang off it. */
SYSCTL_DECL(_kern_geom);

/* Debug level; also settable as a loader tunable. */
int g_journal_debug = 0;
TUNABLE_INT("kern.geom.journal.debug", &g_journal_debug);
/* Switch journals every N seconds. */
static u_int g_journal_switch_time = 10;
/* Force a journal switch once the active journal is N% full. */
static u_int g_journal_force_switch = 70;
/* Number of flush I/O requests to send in parallel. */
static u_int g_journal_parallel_flushes = 16;
/* Number of copy I/O requests to send in parallel. */
static u_int g_journal_parallel_copies = 16;
/* Number of I/O requests accepted immediately. */
static u_int g_journal_accept_immediately = 64;
/* Maximum number of entries in one journal record. */
static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
/* Non-zero: try to combine bios on flush and copy. */
static u_int g_journal_do_optimize = 1;

/* kern.geom.journal.* -- run-time knobs backed by the variables above. */
SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0, "GEOM_JOURNAL stuff");
SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RW, &g_journal_debug, 0,
    "Debug level");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
    &g_journal_switch_time, 0, "Switch journals every N seconds");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
    &g_journal_force_switch, 0, "Force switch when journal is N% full");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
    &g_journal_parallel_flushes, 0,
    "Number of flush I/O requests to send in parallel");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
    &g_journal_accept_immediately, 0,
    "Number of I/O requests accepted immediately");
SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
    &g_journal_parallel_copies, 0,
    "Number of copy I/O requests to send in parallel");
 112 static int
 113 g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
 114 {
 115         u_int entries;
 116         int error;
 117
 118         entries = g_journal_record_entries;
 119         error = sysctl_handle_int(oidp, &entries, 0, req);
 120         if (error != 0 || req->newptr == NULL)
 121                 return (error);
 122         if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
 123                 return (EINVAL);
 124         g_journal_record_entries = entries;
 125         return (0);
 126 }
 127 SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
 128     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I",
 129     "Maximum number of entires in one journal record");
 130 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
 131     &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
 132
/* Number of bytes currently accounted for by gj_malloc()/gj_free(). */
static u_int g_journal_cache_used = 0;
/* Maximum number of allocated bytes; 0 disables the limit checks. */
static u_int g_journal_cache_limit = 64 * 1024 * 1024;
TUNABLE_INT("kern.geom.journal.cache.limit", &g_journal_cache_limit);
/* (kmem_size / divisor) == cache size; boot-time tunable only. */
static u_int g_journal_cache_divisor = 2;
TUNABLE_INT("kern.geom.journal.cache.divisor", &g_journal_cache_divisor);
/* Percent of cache use at which a journal switch is forced. */
static u_int g_journal_cache_switch = 90;
static u_int g_journal_cache_misses = 0;
/* Counts M_NOWAIT allocations refused because of the cache limit. */
static u_int g_journal_cache_alloc_failures = 0;
/*
 * Usage threshold (in bytes) above which gj_malloc() wakes up the switcher.
 * Recomputed by the limit and switch sysctl handlers.
 * NOTE(review): starts at 0 here; presumably initialized when the class is
 * loaded -- confirm.
 */
static u_int g_journal_cache_low = 0;

SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0,
    "GEOM_JOURNAL cache");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
    &g_journal_cache_used, 0, "Number of allocated bytes");
 147 static int
 148 g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
 149 {
 150         u_int limit;
 151         int error;
 152
 153         limit = g_journal_cache_limit;
 154         error = sysctl_handle_int(oidp, &limit, 0, req);
 155         if (error != 0 || req->newptr == NULL)
 156                 return (error);
 157         g_journal_cache_limit = limit;
 158         g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
 159         return (0);
 160 }
/*
 * The limit is writable through a handler, which also refreshes the
 * low-water mark.  The divisor is read-only at run time (CTLFLAG_RDTUN).
 */
SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
    CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_limit_sysctl, "I",
    "Maximum number of allocated bytes");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
    &g_journal_cache_divisor, 0,
    "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
 167 static int
 168 g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
 169 {
 170         u_int cswitch;
 171         int error;
 172
 173         cswitch = g_journal_cache_switch;
 174         error = sysctl_handle_int(oidp, &cswitch, 0, req);
 175         if (error != 0 || req->newptr == NULL)
 176                 return (error);
 177         if (cswitch < 0 || cswitch > 100)
 178                 return (EINVAL);
 179         g_journal_cache_switch = cswitch;
 180         g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
 181         return (0);
 182 }
/*
 * The switch percentage is validated by its handler; the two counters are
 * exported read-write.
 */
SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
    CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I",
    "Force switch when we hit this percent of cache use");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
    &g_journal_cache_misses, 0, "Number of cache misses");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
    &g_journal_cache_alloc_failures, 0, "Memory allocation failures");
 190
/*
 * Statistics counters, exported read-write under kern.geom.journal.stats.
 * The meaning of each counter is given by its sysctl description below.
 */
static u_long g_journal_stats_bytes_skipped = 0;
static u_long g_journal_stats_combined_ios = 0;
static u_long g_journal_stats_switches = 0;
static u_long g_journal_stats_wait_for_copy = 0;
/* Bumped each time the overflow check forces a switch. */
static u_long g_journal_stats_journal_full = 0;
static u_long g_journal_stats_low_mem = 0;

SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
    "GEOM_JOURNAL statistics");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
    &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
    &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
    &g_journal_stats_switches, 0, "Number of journal switches");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
    &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
    &g_journal_stats_journal_full, 0,
    "Number of times journal was almost full.");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
    &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
 213
/* Forward declarations of the class methods referenced just below. */
static g_taste_t g_journal_taste;
static g_ctl_req_t g_journal_config;
static g_dumpconf_t g_journal_dumpconf;
static g_init_t g_journal_init;
static g_fini_t g_journal_fini;

/* GEOM class descriptor: wires the methods above into the class. */
struct g_class g_journal_class = {
        .name = G_JOURNAL_CLASS_NAME,
        .version = G_VERSION,
        .taste = g_journal_taste,
        .ctlreq = g_journal_config,
        .dumpconf = g_journal_dumpconf,
        .init = g_journal_init,
        .fini = g_journal_fini
};
 229
/* Prototypes for functions used before they are defined. */
static int g_journal_destroy(struct g_journal_softc *sc);
static void g_journal_metadata_update(struct g_journal_softc *sc);
static void g_journal_switch_wait(struct g_journal_softc *sc);

/* Possible values of g_journal_switcher_state. */
#define GJ_SWITCHER_WORKING     0
#define GJ_SWITCHER_DIE         1
#define GJ_SWITCHER_DIED        2
/* The address of this variable doubles as the switcher's wakeup channel. */
static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
/* Set to 1 right before the switcher is woken, to avoid repeated wakeups. */
static int g_journal_switcher_wokenup = 0;
/* NOTE(review): presumably set when a sync is requested -- confirm. */
static int g_journal_sync_requested = 0;
 240
#ifdef GJ_MEMDEBUG
/*
 * Debug header placed in front of every gj_malloc() allocation, so that
 * gj_free() can verify the size it is given.
 */
struct meminfo {
        size_t          mi_size;        /* size requested by the caller */
        struct stack    mi_stack;       /* stack saved at allocation time */
};
#endif
 247
/*
 * We use our own malloc/realloc/free functions, so we can collect statistics
 * and force journal switch when we're running out of cache.
 */
 252 static void *
 253 gj_malloc(size_t size, int flags)
 254 {
 255         void *p;
 256 #ifdef GJ_MEMDEBUG
 257         struct meminfo *mi;
 258 #endif
 259
 260         mtx_lock(&g_journal_cache_mtx);
 261         if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
 262             g_journal_cache_used + size > g_journal_cache_low) {
 263                 GJ_DEBUG(1, "No cache, waking up the switcher.");
 264                 g_journal_switcher_wokenup = 1;
 265                 wakeup(&g_journal_switcher_state);
 266         }
 267         if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
 268             g_journal_cache_used + size > g_journal_cache_limit) {
 269                 mtx_unlock(&g_journal_cache_mtx);
 270                 g_journal_cache_alloc_failures++;
 271                 return (NULL);
 272         }
 273         g_journal_cache_used += size;
 274         mtx_unlock(&g_journal_cache_mtx);
 275         flags &= ~M_NOWAIT;
 276 #ifndef GJ_MEMDEBUG
 277         p = malloc(size, M_JOURNAL, flags | M_WAITOK);
 278 #else
 279         mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
 280         p = (u_char *)mi + sizeof(*mi);
 281         mi->mi_size = size;
 282         stack_save(&mi->mi_stack);
 283 #endif
 284         return (p);
 285 }
 286
 287 static void
 288 gj_free(void *p, size_t size)
 289 {
 290 #ifdef GJ_MEMDEBUG
 291         struct meminfo *mi;
 292 #endif
 293
 294         KASSERT(p != NULL, ("p=NULL"));
 295         KASSERT(size > 0, ("size=0"));
 296         mtx_lock(&g_journal_cache_mtx);
 297         KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
 298         g_journal_cache_used -= size;
 299         mtx_unlock(&g_journal_cache_mtx);
 300 #ifdef GJ_MEMDEBUG
 301         mi = p = (void *)((u_char *)p - sizeof(*mi));
 302         if (mi->mi_size != size) {
 303                 printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
 304                     mi->mi_size);
 305                 printf("GJOURNAL: Alloc backtrace:\n");
 306                 stack_print(&mi->mi_stack);
 307                 printf("GJOURNAL: Free backtrace:\n");
 308                 kdb_backtrace();
 309         }
 310 #endif
 311         free(p, M_JOURNAL);
 312 }
 313
 314 static void *
 315 gj_realloc(void *p, size_t size, size_t oldsize)
 316 {
 317         void *np;
 318
 319 #ifndef GJ_MEMDEBUG
 320         mtx_lock(&g_journal_cache_mtx);
 321         g_journal_cache_used -= oldsize;
 322         g_journal_cache_used += size;
 323         mtx_unlock(&g_journal_cache_mtx);
 324         np = realloc(p, size, M_JOURNAL, M_WAITOK);
 325 #else
 326         np = gj_malloc(size, M_WAITOK);
 327         bcopy(p, np, MIN(oldsize, size));
 328         gj_free(p, oldsize);
 329 #endif
 330         return (np);
 331 }
 332
 333 static void
 334 g_journal_check_overflow(struct g_journal_softc *sc)
 335 {
 336         off_t length, used;
 337
 338         if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
 339              sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
 340             (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
 341              sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
 342              sc->sc_journal_offset < sc->sc_active.jj_offset)) {
 343                 panic("Journal overflow (joffset=%jd active=%jd inactive=%jd)",
 344                     (intmax_t)sc->sc_journal_offset,
 345                     (intmax_t)sc->sc_active.jj_offset,
 346                     (intmax_t)sc->sc_inactive.jj_offset);
 347         }
 348         if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
 349                 length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
 350                 used = sc->sc_journal_offset - sc->sc_active.jj_offset;
 351         } else {
 352                 length = sc->sc_jend - sc->sc_active.jj_offset;
 353                 length += sc->sc_inactive.jj_offset - sc->sc_jstart;
 354                 if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
 355                         used = sc->sc_journal_offset - sc->sc_active.jj_offset;
 356                 else {
 357                         used = sc->sc_jend - sc->sc_active.jj_offset;
 358                         used += sc->sc_journal_offset - sc->sc_jstart;
 359                 }
 360         }
 361         /* Already woken up? */
 362         if (g_journal_switcher_wokenup)
 363                 return;
 364         /*
 365          * If the active journal takes more than g_journal_force_switch precent
 366          * of free journal space, we force journal switch.
 367          */
 368         KASSERT(length > 0,
 369             ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
 370             (intmax_t)length, (intmax_t)used,
 371             (intmax_t)sc->sc_active.jj_offset,
 372             (intmax_t)sc->sc_inactive.jj_offset,
 373             (intmax_t)sc->sc_journal_offset));
 374         if ((used * 100) / length > g_journal_force_switch) {
 375                 g_journal_stats_journal_full++;
 376                 GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
 377                     sc->sc_name, (used * 100) / length);
 378                 mtx_lock(&g_journal_cache_mtx);
 379                 g_journal_switcher_wokenup = 1;
 380                 wakeup(&g_journal_switcher_state);
 381                 mtx_unlock(&g_journal_cache_mtx);
 382         }
 383 }
 384
 385 static void
 386 g_journal_orphan(struct g_consumer *cp)
 387 {
 388         struct g_journal_softc *sc;
 389         char name[256];
 390         int error;
 391
 392         g_topology_assert();
 393         sc = cp->geom->softc;
 394         strlcpy(name, cp->provider->name, sizeof(name));
 395         GJ_DEBUG(0, "Lost provider %s.", name);
 396         if (sc == NULL)
 397                 return;
 398         error = g_journal_destroy(sc);
 399         if (error == 0)
 400                 GJ_DEBUG(0, "Journal %s destroyed.", name);
 401         else {
 402                 GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
 403                     "Destroy it manually after last close.", sc->sc_name,
 404                     error);
 405         }
 406 }
 407
 408 static int
 409 g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
 410 {
 411         struct g_journal_softc *sc;
 412         int dcr, dcw, dce;
 413
 414         g_topology_assert();
 415         GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
 416             acr, acw, ace);
 417
 418         dcr = pp->acr + acr;
 419         dcw = pp->acw + acw;
 420         dce = pp->ace + ace;
 421
 422         sc = pp->geom->softc;
 423         if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
 424                 if (acr <= 0 && acw <= 0 && ace <= 0)
 425                         return (0);
 426                 else
 427                         return (ENXIO);
 428         }
 429         if (pp->acw == 0 && dcw > 0) {
 430                 GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
 431                 sc->sc_flags &= ~GJF_DEVICE_CLEAN;
 432                 g_topology_unlock();
 433                 g_journal_metadata_update(sc);
 434                 g_topology_lock();
 435         } /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
 436                 GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 437                 sc->sc_flags |= GJF_DEVICE_CLEAN;
 438                 g_topology_unlock();
 439                 g_journal_metadata_update(sc);
 440                 g_topology_lock();
 441         } */
 442         return (0);
 443 }
 444
 445 static void
 446 g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
 447 {
 448
 449         bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
 450         data += sizeof(GJ_HEADER_MAGIC);
 451         le32enc(data, hdr->jh_journal_id);
 452         data += 4;
 453         le32enc(data, hdr->jh_journal_next_id);
 454 }
 455
 456 static int
 457 g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
 458 {
 459
 460         bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
 461         data += sizeof(hdr->jh_magic);
 462         if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
 463                 return (EINVAL);
 464         hdr->jh_journal_id = le32dec(data);
 465         data += 4;
 466         hdr->jh_journal_next_id = le32dec(data);
 467         return (0);
 468 }
 469
 470 static void
 471 g_journal_flush_cache(struct g_journal_softc *sc)
 472 {
 473         struct bintime bt;
 474         int error;
 475
 476         if (sc->sc_bio_flush == 0)
 477                 return;
 478         GJ_TIMER_START(1, &bt);
 479         if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
 480                 error = g_io_flush(sc->sc_jconsumer);
 481                 GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
 482                     sc->sc_jconsumer->provider->name, error);
 483         }
 484         if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
 485                 /*
 486                  * TODO: This could be called in parallel with the
 487                  *       previous call.
 488                  */
 489                 error = g_io_flush(sc->sc_dconsumer);
 490                 GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
 491                     sc->sc_dconsumer->provider->name, error);
 492         }
 493         GJ_TIMER_STOP(1, &bt, "Cache flush time");
 494 }
 495
 496 static int
 497 g_journal_write_header(struct g_journal_softc *sc)
 498 {
 499         struct g_journal_header hdr;
 500         struct g_consumer *cp;
 501         u_char *buf;
 502         int error;
 503
 504         cp = sc->sc_jconsumer;
 505         buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 506
 507         strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
 508         hdr.jh_journal_id = sc->sc_journal_id;
 509         hdr.jh_journal_next_id = sc->sc_journal_next_id;
 510         g_journal_header_encode(&hdr, buf);
 511         error = g_write_data(cp, sc->sc_journal_offset, buf,
 512             cp->provider->sectorsize);
 513         /* if (error == 0) */
 514         sc->sc_journal_offset += cp->provider->sectorsize;
 515
 516         gj_free(buf, cp->provider->sectorsize);
 517         return (error);
 518 }
 519
/*
 * Every journal record has a header and data following it.
 * Functions below are used to encode the header to little endian before
 * storing it and to decode it to system endianness after reading.
 */
 525 static void
 526 g_journal_record_header_encode(struct g_journal_record_header *hdr,
 527     u_char *data)
 528 {
 529         struct g_journal_entry *ent;
 530         u_int i;
 531
 532         bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
 533         data += sizeof(GJ_RECORD_HEADER_MAGIC);
 534         le32enc(data, hdr->jrh_journal_id);
 535         data += 8;
 536         le16enc(data, hdr->jrh_nentries);
 537         data += 2;
 538         bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
 539         data += 8;
 540         for (i = 0; i < hdr->jrh_nentries; i++) {
 541                 ent = &hdr->jrh_entries[i];
 542                 le64enc(data, ent->je_joffset);
 543                 data += 8;
 544                 le64enc(data, ent->je_offset);
 545                 data += 8;
 546                 le64enc(data, ent->je_length);
 547                 data += 8;
 548         }
 549 }
 550
 551 static int
 552 g_journal_record_header_decode(const u_char *data,
 553     struct g_journal_record_header *hdr)
 554 {
 555         struct g_journal_entry *ent;
 556         u_int i;
 557
 558         bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
 559         data += sizeof(hdr->jrh_magic);
 560         if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
 561                 return (EINVAL);
 562         hdr->jrh_journal_id = le32dec(data);
 563         data += 8;
 564         hdr->jrh_nentries = le16dec(data);
 565         data += 2;
 566         if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
 567                 return (EINVAL);
 568         bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
 569         data += 8;
 570         for (i = 0; i < hdr->jrh_nentries; i++) {
 571                 ent = &hdr->jrh_entries[i];
 572                 ent->je_joffset = le64dec(data);
 573                 data += 8;
 574                 ent->je_offset = le64dec(data);
 575                 data += 8;
 576                 ent->je_length = le64dec(data);
 577                 data += 8;
 578         }
 579         return (0);
 580 }
 581
 582 /*
 583  * Function reads metadata from a provider (via the given consumer), decodes
 584  * it to system endianess and verifies its correctness.
 585  */
 586 static int
 587 g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
 588 {
 589         struct g_provider *pp;
 590         u_char *buf;
 591         int error;
 592
 593         g_topology_assert();
 594
 595         error = g_access(cp, 1, 0, 0);
 596         if (error != 0)
 597                 return (error);
 598         pp = cp->provider;
 599         g_topology_unlock();
 600         /* Metadata is stored in last sector. */
 601         buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 602             &error);
 603         g_topology_lock();
 604         g_access(cp, -1, 0, 0);
 605         if (buf == NULL) {
 606                 GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 607                     cp->provider->name, error);
 608                 return (error);
 609         }
 610
 611         /* Decode metadata. */
 612         error = journal_metadata_decode(buf, md);
 613         g_free(buf);
 614         /* Is this is gjournal provider at all? */
 615         if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
 616                 return (EINVAL);
 617         /*
 618          * Are we able to handle this version of metadata?
 619          * We only maintain backward compatibility.
 620          */
 621         if (md->md_version > G_JOURNAL_VERSION) {
 622                 GJ_DEBUG(0,
 623                     "Kernel module is too old to handle metadata from %s.",
 624                     cp->provider->name);
 625                 return (EINVAL);
 626         }
 627         /* Is checksum correct? */
 628         if (error != 0) {
 629                 GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
 630                     cp->provider->name);
 631                 return (error);
 632         }
 633         return (0);
 634 }
 635
 636 /*
 637  * Two functions below are responsible for updating metadata.
 638  * Only metadata on the data provider is updated (we need to update
 639  * information about active journal in there).
 640  */
 641 static void
 642 g_journal_metadata_done(struct bio *bp)
 643 {
 644
 645         /*
 646          * There is not much we can do on error except informing about it.
 647          */
 648         if (bp->bio_error != 0) {
 649                 GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
 650                     bp->bio_error);
 651         } else {
 652                 GJ_LOGREQ(2, bp, "Metadata updated.");
 653         }
 654         gj_free(bp->bio_data, bp->bio_length);
 655         g_destroy_bio(bp);
 656 }
 657
 658 static void
 659 g_journal_metadata_update(struct g_journal_softc *sc)
 660 {
 661         struct g_journal_metadata md;
 662         struct g_consumer *cp;
 663         struct bio *bp;
 664         u_char *sector;
 665
 666         cp = sc->sc_dconsumer;
 667         sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 668         strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
 669         md.md_version = G_JOURNAL_VERSION;
 670         md.md_id = sc->sc_id;
 671         md.md_type = sc->sc_orig_type;
 672         md.md_jstart = sc->sc_jstart;
 673         md.md_jend = sc->sc_jend;
 674         md.md_joffset = sc->sc_inactive.jj_offset;
 675         md.md_jid = sc->sc_journal_previous_id;
 676         md.md_flags = 0;
 677         if (sc->sc_flags & GJF_DEVICE_CLEAN)
 678                 md.md_flags |= GJ_FLAG_CLEAN;
 679
 680         if (sc->sc_flags & GJF_DEVICE_HARDCODED)
 681                 strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
 682         else
 683                 bzero(md.md_provider, sizeof(md.md_provider));
 684         md.md_provsize = cp->provider->mediasize;
 685         journal_metadata_encode(&md, sector);
 686
 687         /*
 688          * Flush the cache, so we know all data are on disk.
 689          * We write here informations like "journal is consistent", so we need
 690          * to be sure it is. Without BIO_FLUSH here, we can end up in situation
 691          * where metadata is stored on disk, but not all data.
 692          */
 693         g_journal_flush_cache(sc);
 694
 695         bp = g_alloc_bio();
 696         bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
 697         bp->bio_length = cp->provider->sectorsize;
 698         bp->bio_data = sector;
 699         bp->bio_cmd = BIO_WRITE;
 700         if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
 701                 bp->bio_done = g_journal_metadata_done;
 702                 g_io_request(bp, cp);
 703         } else {
 704                 bp->bio_done = NULL;
 705                 g_io_request(bp, cp);
 706                 biowait(bp, "gjmdu");
 707                 g_journal_metadata_done(bp);
 708         }
 709
 710         /*
 711          * Be sure metadata reached the disk.
 712          */
 713         g_journal_flush_cache(sc);
 714 }
 715
 716 /*
 717  * This is where the I/O request comes from the GEOM.
 718  */
 719 static void
 720 g_journal_start(struct bio *bp)
 721 {
 722         struct g_journal_softc *sc;
 723
 724         sc = bp->bio_to->geom->softc;
 725         GJ_LOGREQ(3, bp, "Request received.");
 726
 727         switch (bp->bio_cmd) {
 728         case BIO_READ:
 729         case BIO_WRITE:
 730                 mtx_lock(&sc->sc_mtx);
 731                 bioq_insert_tail(&sc->sc_regular_queue, bp);
 732                 wakeup(sc);
 733                 mtx_unlock(&sc->sc_mtx);
 734                 return;
 735         case BIO_GETATTR:
 736                 if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
 737                         strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
 738                         bp->bio_completed = strlen(bp->bio_to->name) + 1;
 739                         g_io_deliver(bp, 0);
 740                         return;
 741                 }
 742                 /* FALLTHROUGH */
 743         case BIO_DELETE:
 744         default:
 745                 g_io_deliver(bp, EOPNOTSUPP);
 746                 return;
 747         }
 748 }
 749
 750 static void
 751 g_journal_std_done(struct bio *bp)
 752 {
 753         struct g_journal_softc *sc;
 754
 755         sc = bp->bio_from->geom->softc;
 756         mtx_lock(&sc->sc_mtx);
 757         bioq_insert_tail(&sc->sc_back_queue, bp);
 758         wakeup(sc);
 759         mtx_unlock(&sc->sc_mtx);
 760 }
 761
 762 static struct bio *
 763 g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
 764     int flags)
 765 {
 766         struct bio *bp;
 767
 768         bp = g_alloc_bio();
 769         bp->bio_offset = start;
 770         bp->bio_joffset = joffset;
 771         bp->bio_length = end - start;
 772         bp->bio_cmd = BIO_WRITE;
 773         bp->bio_done = g_journal_std_done;
 774         if (data == NULL)
 775                 bp->bio_data = NULL;
 776         else {
 777                 bp->bio_data = gj_malloc(bp->bio_length, flags);
 778                 if (bp->bio_data != NULL)
 779                         bcopy(data, bp->bio_data, bp->bio_length);
 780         }
 781         return (bp);
 782 }
 783
 784 #define g_journal_insert_bio(head, bp, flags)                           \
 785         g_journal_insert((head), (bp)->bio_offset,                      \
 786                 (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \
 787                 (bp)->bio_data, flags)
/*
 * The function below does a lot more than just inserting a bio into the queue.
 * It keeps the queue sorted by offset and ensures that there is no duplicated
 * data (it combines bios where ranges overlap).
 *
 * The function returns the number of bios inserted (as a bio can be split).
 */
 795 static int
 796 g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
 797     u_char *data, int flags)
 798 {
 799         struct bio *nbp, *cbp, *pbp;
 800         off_t cstart, cend;
 801         u_char *tmpdata;
 802         int n;
 803
 804         GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
 805             joffset);
 806         n = 0;
 807         pbp = NULL;
 808         GJQ_FOREACH(*head, cbp) {
 809                 cstart = cbp->bio_offset;
 810                 cend = cbp->bio_offset + cbp->bio_length;
 811
 812                 if (nstart >= cend) {
 813                         /*
 814                          *  +-------------+
 815                          *  |             |
 816                          *  |   current   |  +-------------+
 817                          *  |     bio     |  |             |
 818                          *  |             |  |     new     |
 819                          *  +-------------+  |     bio     |
 820                          *                   |             |
 821                          *                   +-------------+
 822                          */
 823                         GJ_DEBUG(3, "INSERT(%p): 1", *head);
 824                 } else if (nend <= cstart) {
 825                         /*
 826                          *                   +-------------+
 827                          *                   |             |
 828                          *  +-------------+  |   current   |
 829                          *  |             |  |     bio     |
 830                          *  |     new     |  |             |
 831                          *  |     bio     |  +-------------+
 832                          *  |             |
 833                          *  +-------------+
 834                          */
 835                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
 836                             flags);
 837                         if (pbp == NULL)
 838                                 *head = nbp;
 839                         else
 840                                 pbp->bio_next = nbp;
 841                         nbp->bio_next = cbp;
 842                         n++;
 843                         GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
 844                             pbp);
 845                         goto end;
 846                 } else if (nstart <= cstart && nend >= cend) {
 847                         /*
 848                          *      +-------------+      +-------------+
 849                          *      | current bio |      | current bio |
 850                          *  +---+-------------+---+  +-------------+---+
 851                          *  |   |             |   |  |             |   |
 852                          *  |   |             |   |  |             |   |
 853                          *  |   +-------------+   |  +-------------+   |
 854                          *  |       new bio       |  |     new bio     |
 855                          *  +---------------------+  +-----------------+
 856                          *
 857                          *      +-------------+  +-------------+
 858                          *      | current bio |  | current bio |
 859                          *  +---+-------------+  +-------------+
 860                          *  |   |             |  |             |
 861                          *  |   |             |  |             |
 862                          *  |   +-------------+  +-------------+
 863                          *  |     new bio     |  |   new bio   |
 864                          *  +-----------------+  +-------------+
 865                          */
 866                         g_journal_stats_bytes_skipped += cbp->bio_length;
 867                         cbp->bio_offset = nstart;
 868                         cbp->bio_joffset = joffset;
 869                         cbp->bio_length = cend - nstart;
 870                         if (cbp->bio_data != NULL) {
 871                                 gj_free(cbp->bio_data, cend - cstart);
 872                                 cbp->bio_data = NULL;
 873                         }
 874                         if (data != NULL) {
 875                                 cbp->bio_data = gj_malloc(cbp->bio_length,
 876                                     flags);
 877                                 if (cbp->bio_data != NULL) {
 878                                         bcopy(data, cbp->bio_data,
 879                                             cbp->bio_length);
 880                                 }
 881                                 data += cend - nstart;
 882                         }
 883                         joffset += cend - nstart;
 884                         nstart = cend;
 885                         GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
 886                 } else if (nstart > cstart && nend >= cend) {
 887                         /*
 888                          *  +-----------------+  +-------------+
 889                          *  |   current bio   |  | current bio |
 890                          *  |   +-------------+  |   +---------+---+
 891                          *  |   |             |  |   |         |   |
 892                          *  |   |             |  |   |         |   |
 893                          *  +---+-------------+  +---+---------+   |
 894                          *      |   new bio   |      |   new bio   |
 895                          *      +-------------+      +-------------+
 896                          */
 897                         g_journal_stats_bytes_skipped += cend - nstart;
 898                         nbp = g_journal_new_bio(nstart, cend, joffset, data,
 899                             flags);
 900                         nbp->bio_next = cbp->bio_next;
 901                         cbp->bio_next = nbp;
 902                         cbp->bio_length = nstart - cstart;
 903                         if (cbp->bio_data != NULL) {
 904                                 cbp->bio_data = gj_realloc(cbp->bio_data,
 905                                     cbp->bio_length, cend - cstart);
 906                         }
 907                         if (data != NULL)
 908                                 data += cend - nstart;
 909                         joffset += cend - nstart;
 910                         nstart = cend;
 911                         n++;
 912                         GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
 913                 } else if (nstart > cstart && nend < cend) {
 914                         /*
 915                          *  +---------------------+
 916                          *  |     current bio     |
 917                          *  |   +-------------+   |
 918                          *  |   |             |   |
 919                          *  |   |             |   |
 920                          *  +---+-------------+---+
 921                          *      |   new bio   |
 922                          *      +-------------+
 923                          */
 924                         g_journal_stats_bytes_skipped += nend - nstart;
 925                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
 926                             flags);
 927                         nbp->bio_next = cbp->bio_next;
 928                         cbp->bio_next = nbp;
 929                         if (cbp->bio_data == NULL)
 930                                 tmpdata = NULL;
 931                         else
 932                                 tmpdata = cbp->bio_data + nend - cstart;
 933                         nbp = g_journal_new_bio(nend, cend,
 934                             cbp->bio_joffset + nend - cstart, tmpdata, flags);
 935                         nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
 936                         ((struct bio *)cbp->bio_next)->bio_next = nbp;
 937                         cbp->bio_length = nstart - cstart;
 938                         if (cbp->bio_data != NULL) {
 939                                 cbp->bio_data = gj_realloc(cbp->bio_data,
 940                                     cbp->bio_length, cend - cstart);
 941                         }
 942                         n += 2;
 943                         GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
 944                         goto end;
 945                 } else if (nstart <= cstart && nend < cend) {
 946                         /*
 947                          *  +-----------------+      +-------------+
 948                          *  |   current bio   |      | current bio |
 949                          *  +-------------+   |  +---+---------+   |
 950                          *  |             |   |  |   |         |   |
 951                          *  |             |   |  |   |         |   |
 952                          *  +-------------+---+  |   +---------+---+
 953                          *  |   new bio   |      |   new bio   |
 954                          *  +-------------+      +-------------+
 955                          */
 956                         g_journal_stats_bytes_skipped += nend - nstart;
 957                         nbp = g_journal_new_bio(nstart, nend, joffset, data,
 958                             flags);
 959                         if (pbp == NULL)
 960                                 *head = nbp;
 961                         else
 962                                 pbp->bio_next = nbp;
 963                         nbp->bio_next = cbp;
 964                         cbp->bio_offset = nend;
 965                         cbp->bio_length = cend - nend;
 966                         cbp->bio_joffset += nend - cstart;
 967                         tmpdata = cbp->bio_data;
 968                         if (tmpdata != NULL) {
 969                                 cbp->bio_data = gj_malloc(cbp->bio_length,
 970                                     flags);
 971                                 if (cbp->bio_data != NULL) {
 972                                         bcopy(tmpdata + nend - cstart,
 973                                             cbp->bio_data, cbp->bio_length);
 974                                 }
 975                                 gj_free(tmpdata, cend - cstart);
 976                         }
 977                         n++;
 978                         GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
 979                         goto end;
 980                 }
 981                 if (nstart == nend)
 982                         goto end;
 983                 pbp = cbp;
 984         }
 985         nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
 986         if (pbp == NULL)
 987                 *head = nbp;
 988         else
 989                 pbp->bio_next = nbp;
 990         nbp->bio_next = NULL;
 991         n++;
 992         GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
 993 end:
 994         if (g_journal_debug >= 3) {
 995                 GJQ_FOREACH(*head, cbp) {
 996                         GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
 997                             (intmax_t)cbp->bio_offset,
 998                             (intmax_t)cbp->bio_length,
 999                             (intmax_t)cbp->bio_joffset, cbp->bio_data);
1000                 }
1001                 GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
1002         }
1003         return (n);
1004 }
1005
1006 /*
1007  * The function combines neighbour bios trying to squeeze as much data as
1008  * possible into one bio.
1009  *
1010  * The function returns the number of bios combined (negative value).
1011  */
1012 static int
1013 g_journal_optimize(struct bio *head)
1014 {
1015         struct bio *cbp, *pbp;
1016         int n;
1017
1018         n = 0;
1019         pbp = NULL;
1020         GJQ_FOREACH(head, cbp) {
1021                 /* Skip bios which has to be read first. */
1022                 if (cbp->bio_data == NULL) {
1023                         pbp = NULL;
1024                         continue;
1025                 }
1026                 /* There is no previous bio yet. */
1027                 if (pbp == NULL) {
1028                         pbp = cbp;
1029                         continue;
1030                 }
1031                 /* Is this a neighbour bio? */
1032                 if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
1033                         /* Be sure that bios queue is sorted. */
1034                         KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
1035                             ("poffset=%jd plength=%jd coffset=%jd",
1036                             (intmax_t)pbp->bio_offset,
1037                             (intmax_t)pbp->bio_length,
1038                             (intmax_t)cbp->bio_offset));
1039                         pbp = cbp;
1040                         continue;
1041                 }
1042                 /* Be sure we don't end up with too big bio. */
1043                 if (pbp->bio_length + cbp->bio_length > MAXPHYS) {
1044                         pbp = cbp;
1045                         continue;
1046                 }
1047                 /* Ok, we can join bios. */
1048                 GJ_LOGREQ(4, pbp, "Join: ");
1049                 GJ_LOGREQ(4, cbp, "and: ");
1050                 pbp->bio_data = gj_realloc(pbp->bio_data,
1051                     pbp->bio_length + cbp->bio_length, pbp->bio_length);
1052                 bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
1053                     cbp->bio_length);
1054                 gj_free(cbp->bio_data, cbp->bio_length);
1055                 pbp->bio_length += cbp->bio_length;
1056                 pbp->bio_next = cbp->bio_next;
1057                 g_destroy_bio(cbp);
1058                 cbp = pbp;
1059                 g_journal_stats_combined_ios++;
1060                 n--;
1061                 GJ_LOGREQ(4, pbp, "Got: ");
1062         }
1063         return (n);
1064 }
1065
1066 /*
1067  * TODO: Update comment.
1068  * These are functions responsible for copying one portion of data from journal
1069  * to the destination provider.
1070  * The order goes like this:
 * 1. Read the header, which contains information about the data blocks
1072  *    following it.
1073  * 2. Read the data blocks from the journal.
1074  * 3. Write the data blocks on the data provider.
1075  *
1076  * g_journal_copy_start()
1077  * g_journal_copy_done() - got finished write request, logs potential errors.
1078  */
1079
1080 /*
1081  * When there is no data in cache, this function is used to read it.
1082  */
1083 static void
1084 g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
1085 {
1086         struct bio *cbp;
1087
1088         /*
1089          * We were short in memory, so data was freed.
1090          * In that case we need to read it back from journal.
1091          */
1092         cbp = g_alloc_bio();
1093         cbp->bio_cflags = bp->bio_cflags;
1094         cbp->bio_parent = bp;
1095         cbp->bio_offset = bp->bio_joffset;
1096         cbp->bio_length = bp->bio_length;
1097         cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
1098         cbp->bio_cmd = BIO_READ;
1099         cbp->bio_done = g_journal_std_done;
1100         GJ_LOGREQ(4, cbp, "READ FIRST");
1101         g_io_request(cbp, sc->sc_jconsumer);
1102         g_journal_cache_misses++;
1103 }
1104
1105 static void
1106 g_journal_copy_send(struct g_journal_softc *sc)
1107 {
1108         struct bio *bioq, *bp, *lbp;
1109
1110         bioq = lbp = NULL;
1111         mtx_lock(&sc->sc_mtx);
1112         for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
1113                 bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
1114                 if (bp == NULL)
1115                         break;
1116                 GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
1117                 sc->sc_copy_in_progress++;
1118                 GJQ_INSERT_AFTER(bioq, bp, lbp);
1119                 lbp = bp;
1120         }
1121         mtx_unlock(&sc->sc_mtx);
1122         if (g_journal_do_optimize)
1123                 sc->sc_copy_in_progress += g_journal_optimize(bioq);
1124         while ((bp = GJQ_FIRST(bioq)) != NULL) {
1125                 GJQ_REMOVE(bioq, bp);
1126                 GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
1127                 bp->bio_cflags = GJ_BIO_COPY;
1128                 if (bp->bio_data == NULL)
1129                         g_journal_read_first(sc, bp);
1130                 else {
1131                         bp->bio_joffset = 0;
1132                         GJ_LOGREQ(4, bp, "SEND");
1133                         g_io_request(bp, sc->sc_dconsumer);
1134                 }
1135         }
1136 }
1137
1138 static void
1139 g_journal_copy_start(struct g_journal_softc *sc)
1140 {
1141
1142         /*
1143          * Remember in metadata that we're starting to copy journaled data
1144          * to the data provider.
1145          * In case of power failure, we will copy these data once again on boot.
1146          */
1147         if (!sc->sc_journal_copying) {
1148                 sc->sc_journal_copying = 1;
1149                 GJ_DEBUG(1, "Starting copy of journal.");
1150                 g_journal_metadata_update(sc);
1151         }
1152         g_journal_copy_send(sc);
1153 }
1154
1155 /*
1156  * Data block has been read from the journal provider.
1157  */
1158 static int
1159 g_journal_copy_read_done(struct bio *bp)
1160 {
1161         struct g_journal_softc *sc;
1162         struct g_consumer *cp;
1163         struct bio *pbp;
1164
1165         KASSERT(bp->bio_cflags == GJ_BIO_COPY,
1166             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
1167
1168         sc = bp->bio_from->geom->softc;
1169         pbp = bp->bio_parent;
1170
1171         if (bp->bio_error != 0) {
1172                 GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
1173                     bp->bio_to->name, bp->bio_error);
1174                 /*
1175                  * We will not be able to deliver WRITE request as well.
1176                  */
1177                 gj_free(bp->bio_data, bp->bio_length);
1178                 g_destroy_bio(pbp);
1179                 g_destroy_bio(bp);
1180                 sc->sc_copy_in_progress--;
1181                 return (1);
1182         }
1183         pbp->bio_data = bp->bio_data;
1184         cp = sc->sc_dconsumer;
1185         g_io_request(pbp, cp);
1186         GJ_LOGREQ(4, bp, "READ DONE");
1187         g_destroy_bio(bp);
1188         return (0);
1189 }
1190
1191 /*
1192  * Data block has been written to the data provider.
1193  */
1194 static void
1195 g_journal_copy_write_done(struct bio *bp)
1196 {
1197         struct g_journal_softc *sc;
1198
1199         KASSERT(bp->bio_cflags == GJ_BIO_COPY,
1200             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
1201
1202         sc = bp->bio_from->geom->softc;
1203         sc->sc_copy_in_progress--;
1204
1205         if (bp->bio_error != 0) {
1206                 GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
1207                     bp->bio_error);
1208         }
1209         GJQ_REMOVE(sc->sc_copy_queue, bp);
1210         gj_free(bp->bio_data, bp->bio_length);
1211         GJ_LOGREQ(4, bp, "DONE");
1212         g_destroy_bio(bp);
1213
1214         if (sc->sc_copy_in_progress == 0) {
1215                 /*
1216                  * This was the last write request for this journal.
1217                  */
1218                 GJ_DEBUG(1, "Data has been copied.");
1219                 sc->sc_journal_copying = 0;
1220         }
1221 }
1222
1223 static void g_journal_flush_done(struct bio *bp);
1224
1225 /*
1226  * Flush one record onto active journal provider.
1227  */
1228 static void
1229 g_journal_flush(struct g_journal_softc *sc)
1230 {
1231         struct g_journal_record_header hdr;
1232         struct g_journal_entry *ent;
1233         struct g_provider *pp;
1234         struct bio **bioq;
1235         struct bio *bp, *fbp, *pbp;
1236         off_t joffset, size;
1237         u_char *data, hash[16];
1238         MD5_CTX ctx;
1239         u_int i;
1240
1241         if (sc->sc_current_count == 0)
1242                 return;
1243
1244         size = 0;
1245         pp = sc->sc_jprovider;
1246         GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
1247         joffset = sc->sc_journal_offset;
1248
1249         GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
1250             sc->sc_current_count, pp->name, (intmax_t)joffset);
1251
1252         /*
1253          * Store 'journal id', so we know to which journal this record belongs.
1254          */
1255         hdr.jrh_journal_id = sc->sc_journal_id;
1256         /* Could be less than g_journal_record_entries if called due timeout. */
1257         hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
1258         strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
1259
1260         bioq = &sc->sc_active.jj_queue;
1261         pbp = sc->sc_flush_queue;
1262
1263         fbp = g_alloc_bio();
1264         fbp->bio_parent = NULL;
1265         fbp->bio_cflags = GJ_BIO_JOURNAL;
1266         fbp->bio_offset = -1;
1267         fbp->bio_joffset = joffset;
1268         fbp->bio_length = pp->sectorsize;
1269         fbp->bio_cmd = BIO_WRITE;
1270         fbp->bio_done = g_journal_std_done;
1271         GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
1272         pbp = fbp;
1273         fbp->bio_to = pp;
1274         GJ_LOGREQ(4, fbp, "FLUSH_OUT");
1275         joffset += pp->sectorsize;
1276         sc->sc_flush_count++;
1277         if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1278                 MD5Init(&ctx);
1279
1280         for (i = 0; i < hdr.jrh_nentries; i++) {
1281                 bp = sc->sc_current_queue;
1282                 KASSERT(bp != NULL, ("NULL bp"));
1283                 bp->bio_to = pp;
1284                 GJ_LOGREQ(4, bp, "FLUSHED");
1285                 sc->sc_current_queue = bp->bio_next;
1286                 bp->bio_next = NULL;
1287                 sc->sc_current_count--;
1288
1289                 /* Add to the header. */
1290                 ent = &hdr.jrh_entries[i];
1291                 ent->je_offset = bp->bio_offset;
1292                 ent->je_joffset = joffset;
1293                 ent->je_length = bp->bio_length;
1294                 size += ent->je_length;
1295
1296                 data = bp->bio_data;
1297                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1298                         MD5Update(&ctx, data, ent->je_length);
1299                 bzero(bp, sizeof(*bp));
1300                 bp->bio_cflags = GJ_BIO_JOURNAL;
1301                 bp->bio_offset = ent->je_offset;
1302                 bp->bio_joffset = ent->je_joffset;
1303                 bp->bio_length = ent->je_length;
1304                 bp->bio_data = data;
1305                 bp->bio_cmd = BIO_WRITE;
1306                 bp->bio_done = g_journal_std_done;
1307                 GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
1308                 pbp = bp;
1309                 bp->bio_to = pp;
1310                 GJ_LOGREQ(4, bp, "FLUSH_OUT");
1311                 joffset += bp->bio_length;
1312                 sc->sc_flush_count++;
1313
1314                 /*
1315                  * Add request to the active sc_journal_queue queue.
1316                  * This is our cache. After journal switch we don't have to
1317                  * read the data from the inactive journal, because we keep
1318                  * it in memory.
1319                  */
1320                 g_journal_insert(bioq, ent->je_offset,
1321                     ent->je_offset + ent->je_length, ent->je_joffset, data,
1322                     M_NOWAIT);
1323         }
1324
1325         /*
1326          * After all requests, store valid header.
1327          */
1328         data = gj_malloc(pp->sectorsize, M_WAITOK);
1329         if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1330                 MD5Final(hash, &ctx);
1331                 bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
1332         }
1333         g_journal_record_header_encode(&hdr, data);
1334         fbp->bio_data = data;
1335
1336         sc->sc_journal_offset = joffset;
1337
1338         g_journal_check_overflow(sc);
1339 }
1340
1341 /*
1342  * Flush request finished.
1343  */
1344 static void
1345 g_journal_flush_done(struct bio *bp)
1346 {
1347         struct g_journal_softc *sc;
1348         struct g_consumer *cp;
1349
1350         KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
1351             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
1352
1353         cp = bp->bio_from;
1354         sc = cp->geom->softc;
1355         sc->sc_flush_in_progress--;
1356
1357         if (bp->bio_error != 0) {
1358                 GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
1359                     bp->bio_error);
1360         }
1361         gj_free(bp->bio_data, bp->bio_length);
1362         GJ_LOGREQ(4, bp, "DONE");
1363         g_destroy_bio(bp);
1364 }
1365
1366 static void g_journal_release_delayed(struct g_journal_softc *sc);
1367
1368 static void
1369 g_journal_flush_send(struct g_journal_softc *sc)
1370 {
1371         struct g_consumer *cp;
1372         struct bio *bioq, *bp, *lbp;
1373
1374         cp = sc->sc_jconsumer;
1375         bioq = lbp = NULL;
1376         while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
1377                 /* Send one flush requests to the active journal. */
1378                 bp = GJQ_FIRST(sc->sc_flush_queue);
1379                 if (bp != NULL) {
1380                         GJQ_REMOVE(sc->sc_flush_queue, bp);
1381                         sc->sc_flush_count--;
1382                         bp->bio_offset = bp->bio_joffset;
1383                         bp->bio_joffset = 0;
1384                         sc->sc_flush_in_progress++;
1385                         GJQ_INSERT_AFTER(bioq, bp, lbp);
1386                         lbp = bp;
1387                 }
1388                 /* Try to release delayed requests. */
1389                 g_journal_release_delayed(sc);
1390                 /* If there are no requests to flush, leave. */
1391                 if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
1392                         break;
1393         }
1394         if (g_journal_do_optimize)
1395                 sc->sc_flush_in_progress += g_journal_optimize(bioq);
1396         while ((bp = GJQ_FIRST(bioq)) != NULL) {
1397                 GJQ_REMOVE(bioq, bp);
1398                 GJ_LOGREQ(3, bp, "Flush request send");
1399                 g_io_request(bp, cp);
1400         }
1401 }
1402
1403 static void
1404 g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
1405 {
1406         int n;
1407
1408         GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
1409         n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
1410         sc->sc_current_count += n;
1411         n = g_journal_optimize(sc->sc_current_queue);
1412         sc->sc_current_count += n;
1413         /*
1414          * For requests which are added to the current queue we deliver
1415          * response immediately.
1416          */
1417         bp->bio_completed = bp->bio_length;
1418         g_io_deliver(bp, 0);
1419         if (sc->sc_current_count >= g_journal_record_entries) {
1420                 /*
1421                  * Let's flush one record onto active journal provider.
1422                  */
1423                 g_journal_flush(sc);
1424         }
1425 }
1426
1427 static void
1428 g_journal_release_delayed(struct g_journal_softc *sc)
1429 {
1430         struct bio *bp;
1431
1432         for (;;) {
1433                 /* The flush queue is full, exit. */
1434                 if (sc->sc_flush_count >= g_journal_accept_immediately)
1435                         return;
1436                 bp = bioq_takefirst(&sc->sc_delayed_queue);
1437                 if (bp == NULL)
1438                         return;
1439                 sc->sc_delayed_count--;
1440                 g_journal_add_current(sc, bp);
1441         }
1442 }
1443
1444 /*
1445  * Add I/O request to the current queue. If we have enough requests for one
1446  * journal record we flush them onto active journal provider.
1447  */
1448 static void
1449 g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
1450 {
1451
1452         /*
1453          * The flush queue is full, we need to delay the request.
1454          */
1455         if (sc->sc_delayed_count > 0 ||
1456             sc->sc_flush_count >= g_journal_accept_immediately) {
1457                 GJ_LOGREQ(4, bp, "DELAYED");
1458                 bioq_insert_tail(&sc->sc_delayed_queue, bp);
1459                 sc->sc_delayed_count++;
1460                 return;
1461         }
1462
1463         KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
1464             ("DELAYED queue not empty."));
1465         g_journal_add_current(sc, bp);
1466 }
1467
1468 static void g_journal_read_done(struct bio *bp);
1469
1470 /*
1471  * Try to find requested data in cache.
1472  */
1473 static struct bio *
1474 g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
1475     off_t oend)
1476 {
1477         off_t cstart, cend;
1478         struct bio *bp;
1479
1480         GJQ_FOREACH(head, bp) {
1481                 if (bp->bio_offset == -1)
1482                         continue;
1483                 cstart = MAX(ostart, bp->bio_offset);
1484                 cend = MIN(oend, bp->bio_offset + bp->bio_length);
1485                 if (cend <= ostart)
1486                         continue;
1487                 else if (cstart >= oend) {
1488                         if (!sorted)
1489                                 continue;
1490                         else {
1491                                 bp = NULL;
1492                                 break;
1493                         }
1494                 }
1495                 if (bp->bio_data == NULL)
1496                         break;
1497                 GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
1498                     bp);
1499                 bcopy(bp->bio_data + cstart - bp->bio_offset,
1500                     pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
1501                 pbp->bio_completed += cend - cstart;
1502                 if (pbp->bio_completed == pbp->bio_length) {
1503                         /*
1504                          * Cool, the whole request was in cache, deliver happy
1505                          * message.
1506                          */
1507                         g_io_deliver(pbp, 0);
1508                         return (pbp);
1509                 }
1510                 break;
1511         }
1512         return (bp);
1513 }
1514
1515 /*
1516  * Try to find requested data in cache.
1517  */
1518 static struct bio *
1519 g_journal_read_queue_find(struct bio_queue *head, struct bio *pbp, off_t ostart,
1520     off_t oend)
1521 {
1522         off_t cstart, cend;
1523         struct bio *bp;
1524
1525         TAILQ_FOREACH(bp, head, bio_queue) {
1526                 cstart = MAX(ostart, bp->bio_offset);
1527                 cend = MIN(oend, bp->bio_offset + bp->bio_length);
1528                 if (cend <= ostart)
1529                         continue;
1530                 else if (cstart >= oend)
1531                         continue;
1532                 KASSERT(bp->bio_data != NULL,
1533                     ("%s: bio_data == NULL", __func__));
1534                 GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
1535                     bp);
1536                 bcopy(bp->bio_data + cstart - bp->bio_offset,
1537                     pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
1538                 pbp->bio_completed += cend - cstart;
1539                 if (pbp->bio_completed == pbp->bio_length) {
1540                         /*
1541                          * Cool, the whole request was in cache, deliver happy
1542                          * message.
1543                          */
1544                         g_io_deliver(pbp, 0);
1545                         return (pbp);
1546                 }
1547                 break;
1548         }
1549         return (bp);
1550 }
1551
1552 /*
1553  * This function is used for colecting data on read.
1554  * The complexity is because parts of the data can be stored in four different
1555  * places:
1556  * - in delayed requests
1557  * - in memory - the data not yet send to the active journal provider
1558  * - in requests which are going to be sent to the active journal
1559  * - in the active journal
1560  * - in the inactive journal
1561  * - in the data provider
1562  */
1563 static void
1564 g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
1565     off_t oend)
1566 {
1567         struct bio *bp, *nbp, *head;
1568         off_t cstart, cend;
1569         u_int i, sorted = 0;
1570
1571         GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
1572
1573         cstart = cend = -1;
1574         bp = NULL;
1575         head = NULL;
1576         for (i = 0; i <= 5; i++) {
1577                 switch (i) {
1578                 case 0: /* Delayed requests. */
1579                         head = NULL;
1580                         sorted = 0;
1581                         break;
1582                 case 1: /* Not-yet-send data. */
1583                         head = sc->sc_current_queue;
1584                         sorted = 1;
1585                         break;
1586                 case 2: /* In-flight to the active journal. */
1587                         head = sc->sc_flush_queue;
1588                         sorted = 0;
1589                         break;
1590                 case 3: /* Active journal. */
1591                         head = sc->sc_active.jj_queue;
1592                         sorted = 1;
1593                         break;
1594                 case 4: /* Inactive journal. */
1595                         /*
1596                          * XXX: Here could be a race with g_journal_lowmem().
1597                          */
1598                         head = sc->sc_inactive.jj_queue;
1599                         sorted = 1;
1600                         break;
1601                 case 5: /* In-flight to the data provider. */
1602                         head = sc->sc_copy_queue;
1603                         sorted = 0;
1604                         break;
1605                 default:
1606                         panic("gjournal %s: i=%d", __func__, i);
1607                 }
1608                 if (i == 0)
1609                         bp = g_journal_read_queue_find(&sc->sc_delayed_queue.queue, pbp, ostart, oend);
1610                 else
1611                         bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
1612                 if (bp == pbp) { /* Got the whole request. */
1613                         GJ_DEBUG(2, "Got the whole request from %u.", i);
1614                         return;
1615                 } else if (bp != NULL) {
1616                         cstart = MAX(ostart, bp->bio_offset);
1617                         cend = MIN(oend, bp->bio_offset + bp->bio_length);
1618                         GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
1619                             i, (intmax_t)cstart, (intmax_t)cend);
1620                         break;
1621                 }
1622         }
1623         if (bp != NULL) {
1624                 if (bp->bio_data == NULL) {
1625                         nbp = g_duplicate_bio(pbp);
1626                         nbp->bio_cflags = GJ_BIO_READ;
1627                         nbp->bio_data =
1628                             pbp->bio_data + cstart - pbp->bio_offset;
1629                         nbp->bio_offset =
1630                             bp->bio_joffset + cstart - bp->bio_offset;
1631                         nbp->bio_length = cend - cstart;
1632                         nbp->bio_done = g_journal_read_done;
1633                         g_io_request(nbp, sc->sc_jconsumer);
1634                 }
1635                 /*
1636                  * If we don't have the whole request yet, call g_journal_read()
1637                  * recursively.
1638                  */
1639                 if (ostart < cstart)
1640                         g_journal_read(sc, pbp, ostart, cstart);
1641                 if (oend > cend)
1642                         g_journal_read(sc, pbp, cend, oend);
1643         } else {
1644                 /*
1645                  * No data in memory, no data in journal.
1646                  * Its time for asking data provider.
1647                  */
1648                 GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
1649                 nbp = g_duplicate_bio(pbp);
1650                 nbp->bio_cflags = GJ_BIO_READ;
1651                 nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
1652                 nbp->bio_offset = ostart;
1653                 nbp->bio_length = oend - ostart;
1654                 nbp->bio_done = g_journal_read_done;
1655                 g_io_request(nbp, sc->sc_dconsumer);
1656                 /* We have the whole request, return here. */
1657                 return;
1658         }
1659 }
1660
1661 /*
1662  * Function responsible for handling finished READ requests.
1663  * Actually, g_std_done() could be used here, the only difference is that we
1664  * log error.
1665  */
1666 static void
1667 g_journal_read_done(struct bio *bp)
1668 {
1669         struct bio *pbp;
1670
1671         KASSERT(bp->bio_cflags == GJ_BIO_READ,
1672             ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
1673
1674         pbp = bp->bio_parent;
1675         pbp->bio_inbed++;
1676         pbp->bio_completed += bp->bio_length;
1677
1678         if (bp->bio_error != 0) {
1679                 if (pbp->bio_error == 0)
1680                         pbp->bio_error = bp->bio_error;
1681                 GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
1682                     bp->bio_to->name, bp->bio_error);
1683         }
1684         g_destroy_bio(bp);
1685         if (pbp->bio_children == pbp->bio_inbed &&
1686             pbp->bio_completed == pbp->bio_length) {
1687                 /* We're done. */
1688                 g_io_deliver(pbp, 0);
1689         }
1690 }
1691
1692 /*
1693  * Deactive current journal and active next one.
1694  */
1695 static void
1696 g_journal_switch(struct g_journal_softc *sc)
1697 {
1698         struct g_provider *pp;
1699
1700         if (JEMPTY(sc)) {
1701                 GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
1702                 pp = LIST_FIRST(&sc->sc_geom->provider);
1703                 if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
1704                         sc->sc_flags |= GJF_DEVICE_CLEAN;
1705                         GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
1706                         g_journal_metadata_update(sc);
1707                 }
1708         } else {
1709                 GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
1710
1711                 pp = sc->sc_jprovider;
1712
1713                 sc->sc_journal_previous_id = sc->sc_journal_id;
1714
1715                 sc->sc_journal_id = sc->sc_journal_next_id;
1716                 sc->sc_journal_next_id = arc4random();
1717
1718                 GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
1719
1720                 g_journal_write_header(sc);
1721
1722                 sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
1723                 sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
1724
1725                 sc->sc_active.jj_offset =
1726                     sc->sc_journal_offset - pp->sectorsize;
1727                 sc->sc_active.jj_queue = NULL;
1728
1729                 /*
1730                  * Switch is done, start copying data from the (now) inactive
1731                  * journal to the data provider.
1732                  */
1733                 g_journal_copy_start(sc);
1734         }
1735         mtx_lock(&sc->sc_mtx);
1736         sc->sc_flags &= ~GJF_DEVICE_SWITCH;
1737         mtx_unlock(&sc->sc_mtx);
1738 }
1739
1740 static void
1741 g_journal_initialize(struct g_journal_softc *sc)
1742 {
1743
1744         sc->sc_journal_id = arc4random();
1745         sc->sc_journal_next_id = arc4random();
1746         sc->sc_journal_previous_id = sc->sc_journal_id;
1747         sc->sc_journal_offset = sc->sc_jstart;
1748         sc->sc_inactive.jj_offset = sc->sc_jstart;
1749         g_journal_write_header(sc);
1750         sc->sc_active.jj_offset = sc->sc_jstart;
1751 }
1752
1753 static void
1754 g_journal_mark_as_dirty(struct g_journal_softc *sc)
1755 {
1756         const struct g_journal_desc *desc;
1757         int i;
1758
1759         GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
1760         for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
1761                 desc->jd_dirty(sc->sc_dconsumer);
1762 }
1763
1764 /*
1765  * Function read record header from the given journal.
1766  * It is very simlar to g_read_data(9), but it doesn't allocate memory for bio
1767  * and data on every call.
1768  */
1769 static int
1770 g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
1771     void *data)
1772 {
1773         int error;
1774
1775         bzero(bp, sizeof(*bp));
1776         bp->bio_cmd = BIO_READ;
1777         bp->bio_done = NULL;
1778         bp->bio_offset = offset;
1779         bp->bio_length = cp->provider->sectorsize;
1780         bp->bio_data = data;
1781         g_io_request(bp, cp);
1782         error = biowait(bp, "gjs_read");
1783         return (error);
1784 }
1785
1786 #if 0
1787 /*
1788  * Function is called when we start the journal device and we detect that
1789  * one of the journals was not fully copied.
1790  * The purpose of this function is to read all records headers from journal
1791  * and placed them in the inactive queue, so we can start journal
1792  * synchronization process and the journal provider itself.
1793  * Design decision was taken to not synchronize the whole journal here as it
1794  * can take too much time. Reading headers only and delaying synchronization
1795  * process until after journal provider is started should be the best choice.
1796  */
1797 #endif
1798
1799 static void
1800 g_journal_sync(struct g_journal_softc *sc)
1801 {
1802         struct g_journal_record_header rhdr;
1803         struct g_journal_entry *ent;
1804         struct g_journal_header jhdr;
1805         struct g_consumer *cp;
1806         struct bio *bp, *fbp, *tbp;
1807         off_t joffset, offset;
1808         u_char *buf, sum[16];
1809         uint64_t id;
1810         MD5_CTX ctx;
1811         int error, found, i;
1812
1813         found = 0;
1814         fbp = NULL;
1815         cp = sc->sc_jconsumer;
1816         bp = g_alloc_bio();
1817         buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
1818         offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
1819
1820         GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
1821
1822         /*
1823          * Read and decode first journal header.
1824          */
1825         error = g_journal_sync_read(cp, bp, offset, buf);
1826         if (error != 0) {
1827                 GJ_DEBUG(0, "Error while reading journal header from %s.",
1828                     cp->provider->name);
1829                 goto end;
1830         }
1831         error = g_journal_header_decode(buf, &jhdr);
1832         if (error != 0) {
1833                 GJ_DEBUG(0, "Cannot decode journal header from %s.",
1834                     cp->provider->name);
1835                 goto end;
1836         }
1837         id = sc->sc_journal_id;
1838         if (jhdr.jh_journal_id != sc->sc_journal_id) {
1839                 GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
1840                     (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
1841                 goto end;
1842         }
1843         offset += cp->provider->sectorsize;
1844         id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
1845
1846         for (;;) {
1847                 /*
1848                  * If the biggest record won't fit, look for a record header or
1849                  * journal header from the begining.
1850                  */
1851                 GJ_VALIDATE_OFFSET(offset, sc);
1852                 error = g_journal_sync_read(cp, bp, offset, buf);
1853                 if (error != 0) {
1854                         /*
1855                          * Not good. Having an error while reading header
1856                          * means, that we cannot read next headers and in
1857                          * consequence we cannot find termination.
1858                          */
1859                         GJ_DEBUG(0,
1860                             "Error while reading record header from %s.",
1861                             cp->provider->name);
1862                         break;
1863                 }
1864
1865                 error = g_journal_record_header_decode(buf, &rhdr);
1866                 if (error != 0) {
1867                         GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
1868                             (intmax_t)offset, error);
1869                         /*
1870                          * This is not a record header.
1871                          * If we are lucky, this is next journal header.
1872                          */
1873                         error = g_journal_header_decode(buf, &jhdr);
1874                         if (error != 0) {
1875                                 GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
1876                                     (intmax_t)offset, error);
1877                                 /*
1878                                  * Nope, this is not journal header, which
1879                                  * bascially means that journal is not
1880                                  * terminated properly.
1881                                  */
1882                                 error = ENOENT;
1883                                 break;
1884                         }
1885                         /*
1886                          * Ok. This is header of _some_ journal. Now we need to
1887                          * verify if this is header of the _next_ journal.
1888                          */
1889                         if (jhdr.jh_journal_id != id) {
1890                                 GJ_DEBUG(1, "Journal ID mismatch at %jd "
1891                                     "(0x%08x != 0x%08x).", (intmax_t)offset,
1892                                     (u_int)jhdr.jh_journal_id, (u_int)id);
1893                                 error = ENOENT;
1894                                 break;
1895                         }
1896
1897                         /* Found termination. */
1898                         found++;
1899                         GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
1900                             (intmax_t)offset, (u_int)id);
1901                         sc->sc_active.jj_offset = offset;
1902                         sc->sc_journal_offset =
1903                             offset + cp->provider->sectorsize;
1904                         sc->sc_journal_id = id;
1905                         id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
1906
1907                         while ((tbp = fbp) != NULL) {
1908                                 fbp = tbp->bio_next;
1909                                 GJ_LOGREQ(3, tbp, "Adding request.");
1910                                 g_journal_insert_bio(&sc->sc_inactive.jj_queue,
1911                                     tbp, M_WAITOK);
1912                         }
1913
1914                         /* Skip journal's header. */
1915                         offset += cp->provider->sectorsize;
1916                         continue;
1917                 }
1918
1919                 /* Skip record's header. */
1920                 offset += cp->provider->sectorsize;
1921
1922                 /*
1923                  * Add information about every record entry to the inactive
1924                  * queue.
1925                  */
1926                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1927                         MD5Init(&ctx);
1928                 for (i = 0; i < rhdr.jrh_nentries; i++) {
1929                         ent = &rhdr.jrh_entries[i];
1930                         GJ_DEBUG(3, "Insert entry: %jd %jd.",
1931                             (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
1932                         g_journal_insert(&fbp, ent->je_offset,
1933                             ent->je_offset + ent->je_length, ent->je_joffset,
1934                             NULL, M_WAITOK);
1935                         if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1936                                 u_char *buf2;
1937
1938                                 /*
1939                                  * TODO: Should use faster function (like
1940                                  *       g_journal_sync_read()).
1941                                  */
1942                                 buf2 = g_read_data(cp, offset, ent->je_length,
1943                                     NULL);
1944                                 if (buf2 == NULL)
1945                                         GJ_DEBUG(0, "Cannot read data at %jd.",
1946                                             (intmax_t)offset);
1947                                 else {
1948                                         MD5Update(&ctx, buf2, ent->je_length);
1949                                         g_free(buf2);
1950                                 }
1951                         }
1952                         /* Skip entry's data. */
1953                         offset += ent->je_length;
1954                 }
1955                 if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1956                         MD5Final(sum, &ctx);
1957                         if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
1958                                 GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
1959                                     (intmax_t)offset);
1960                         }
1961                 }
1962         }
1963 end:
1964         gj_free(bp->bio_data, cp->provider->sectorsize);
1965         g_destroy_bio(bp);
1966
1967         /* Remove bios from unterminated journal. */
1968         while ((tbp = fbp) != NULL) {
1969                 fbp = tbp->bio_next;
1970                 g_destroy_bio(tbp);
1971         }
1972
1973         if (found < 1 && joffset > 0) {
1974                 GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
1975                     sc->sc_name);
1976                 while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
1977                         sc->sc_inactive.jj_queue = tbp->bio_next;
1978                         g_destroy_bio(tbp);
1979                 }
1980                 g_journal_initialize(sc);
1981                 g_journal_mark_as_dirty(sc);
1982         } else {
1983                 GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
1984                 g_journal_copy_start(sc);
1985         }
1986 }
1987
1988 /*
1989  * Wait for requests.
1990  * If we have requests in the current queue, flush them after 3 seconds from the
1991  * last flush. In this way we don't wait forever (or for journal switch) with
1992  * storing not full records on journal.
1993  */
1994 static void
1995 g_journal_wait(struct g_journal_softc *sc, time_t last_write)
1996 {
1997         int error, timeout;
1998
1999         GJ_DEBUG(3, "%s: enter", __func__);
2000         if (sc->sc_current_count == 0) {
2001                 if (g_journal_debug < 2)
2002                         msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
2003                 else {
2004                         /*
2005                          * If we have debug turned on, show number of elements
2006                          * in various queues.
2007                          */
2008                         for (;;) {
2009                                 error = msleep(sc, &sc->sc_mtx, PRIBIO,
2010                                     "gj:work", hz * 3);
2011                                 if (error == 0) {
2012                                         mtx_unlock(&sc->sc_mtx);
2013                                         break;
2014                                 }
2015                                 GJ_DEBUG(3, "Report: current count=%d",
2016                                     sc->sc_current_count);
2017                                 GJ_DEBUG(3, "Report: flush count=%d",
2018                                     sc->sc_flush_count);
2019                                 GJ_DEBUG(3, "Report: flush in progress=%d",
2020                                     sc->sc_flush_in_progress);
2021                                 GJ_DEBUG(3, "Report: copy in progress=%d",
2022                                     sc->sc_copy_in_progress);
2023                                 GJ_DEBUG(3, "Report: delayed=%d",
2024                                     sc->sc_delayed_count);
2025                         }
2026                 }
2027                 GJ_DEBUG(3, "%s: exit 1", __func__);
2028                 return;
2029         }
2030
2031         /*
2032          * Flush even not full records every 3 seconds.
2033          */
2034         timeout = (last_write + 3 - time_second) * hz;
2035         if (timeout <= 0) {
2036                 mtx_unlock(&sc->sc_mtx);
2037                 g_journal_flush(sc);
2038                 g_journal_flush_send(sc);
2039                 GJ_DEBUG(3, "%s: exit 2", __func__);
2040                 return;
2041         }
2042         error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
2043         if (error == EWOULDBLOCK)
2044                 g_journal_flush_send(sc);
2045         GJ_DEBUG(3, "%s: exit 3", __func__);
2046 }
2047
2048 /*
2049  * Worker thread.
2050  */
2051 static void
2052 g_journal_worker(void *arg)
2053 {
2054         struct g_journal_softc *sc;
2055         struct g_geom *gp;
2056         struct g_provider *pp;
2057         struct bio *bp;
2058         time_t last_write;
2059         int type;
2060
2061         thread_lock(curthread);
2062         sched_prio(curthread, PRIBIO);
2063         thread_unlock(curthread);
2064
2065         sc = arg;
2066         type = 0;       /* gcc */
2067
2068         if (sc->sc_flags & GJF_DEVICE_CLEAN) {
2069                 GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
2070                 g_journal_initialize(sc);
2071         } else {
2072                 g_journal_sync(sc);
2073         }
2074         /*
2075          * Check if we can use BIO_FLUSH.
2076          */
2077         sc->sc_bio_flush = 0;
2078         if (g_io_flush(sc->sc_jconsumer) == 0) {
2079                 sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
2080                 GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
2081                     sc->sc_jconsumer->provider->name);
2082         } else {
2083                 GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
2084                     sc->sc_jconsumer->provider->name);
2085         }
2086         if (sc->sc_jconsumer != sc->sc_dconsumer) {
2087                 if (g_io_flush(sc->sc_dconsumer) == 0) {
2088                         sc->sc_bio_flush |= GJ_FLUSH_DATA;
2089                         GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
2090                             sc->sc_dconsumer->provider->name);
2091                 } else {
2092                         GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
2093                             sc->sc_dconsumer->provider->name);
2094                 }
2095         }
2096
2097         gp = sc->sc_geom;
2098         g_topology_lock();
2099         pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
2100         pp->mediasize = sc->sc_mediasize;
2101         /*
2102          * There could be a problem when data provider and journal providers
2103          * have different sectorsize, but such scenario is prevented on journal
2104          * creation.
2105          */
2106         pp->sectorsize = sc->sc_sectorsize;
2107         g_error_provider(pp, 0);
2108         g_topology_unlock();
2109         last_write = time_second;
2110
2111         if (sc->sc_rootmount != NULL) {
2112                 GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
2113                 root_mount_rel(sc->sc_rootmount);
2114                 sc->sc_rootmount = NULL;
2115         }
2116
2117         for (;;) {
2118                 /* Get first request from the queue. */
2119                 mtx_lock(&sc->sc_mtx);
2120                 bp = bioq_first(&sc->sc_back_queue);
2121                 if (bp != NULL)
2122                         type = (bp->bio_cflags & GJ_BIO_MASK);
2123                 if (bp == NULL) {
2124                         bp = bioq_first(&sc->sc_regular_queue);
2125                         if (bp != NULL)
2126                                 type = GJ_BIO_REGULAR;
2127                 }
2128                 if (bp == NULL) {
2129 try_switch:
2130                         if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
2131                             (sc->sc_flags & GJF_DEVICE_DESTROY)) {
2132                                 if (sc->sc_current_count > 0) {
2133                                         mtx_unlock(&sc->sc_mtx);
2134                                         g_journal_flush(sc);
2135                                         g_journal_flush_send(sc);
2136                                         continue;
2137                                 }
2138                                 if (sc->sc_flush_in_progress > 0)
2139                                         goto sleep;
2140                                 if (sc->sc_copy_in_progress > 0)
2141                                         goto sleep;
2142                         }
2143                         if (sc->sc_flags & GJF_DEVICE_SWITCH) {
2144                                 mtx_unlock(&sc->sc_mtx);
2145                                 g_journal_switch(sc);
2146                                 wakeup(&sc->sc_journal_copying);
2147                                 continue;
2148                         }
2149                         if (sc->sc_flags & GJF_DEVICE_DESTROY) {
2150                                 GJ_DEBUG(1, "Shutting down worker "
2151                                     "thread for %s.", gp->name);
2152                                 sc->sc_worker = NULL;
2153                                 wakeup(&sc->sc_worker);
2154                                 mtx_unlock(&sc->sc_mtx);
2155                                 kproc_exit(0);
2156                         }
2157 sleep:
2158                         g_journal_wait(sc, last_write);
2159                         continue;
2160                 }
2161                 /*
2162                  * If we're in switch process, we need to delay all new
2163                  * write requests until its done.
2164                  */
2165                 if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
2166                     type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
2167                         GJ_LOGREQ(2, bp, "WRITE on SWITCH");
2168                         goto try_switch;
2169                 }
2170                 if (type == GJ_BIO_REGULAR)
2171                         bioq_remove(&sc->sc_regular_queue, bp);
2172                 else
2173                         bioq_remove(&sc->sc_back_queue, bp);
2174                 mtx_unlock(&sc->sc_mtx);
2175                 switch (type) {
2176                 case GJ_BIO_REGULAR:
2177                         /* Regular request. */
2178                         switch (bp->bio_cmd) {
2179                         case BIO_READ:
2180                                 g_journal_read(sc, bp, bp->bio_offset,
2181                                     bp->bio_offset + bp->bio_length);
2182                                 break;
2183                         case BIO_WRITE:
2184                                 last_write = time_second;
2185                                 g_journal_add_request(sc, bp);
2186                                 g_journal_flush_send(sc);
2187                                 break;
2188                         default:
2189                                 panic("Invalid bio_cmd (%d).", bp->bio_cmd);
2190                         }
2191                         break;
2192                 case GJ_BIO_COPY:
2193                         switch (bp->bio_cmd) {
2194                         case BIO_READ:
2195                                 if (g_journal_copy_read_done(bp))
2196                                         g_journal_copy_send(sc);
2197                                 break;
2198                         case BIO_WRITE:
2199                                 g_journal_copy_write_done(bp);
2200                                 g_journal_copy_send(sc);
2201                                 break;
2202                         default:
2203                                 panic("Invalid bio_cmd (%d).", bp->bio_cmd);
2204                         }
2205                         break;
2206                 case GJ_BIO_JOURNAL:
2207                         g_journal_flush_done(bp);
2208                         g_journal_flush_send(sc);
2209                         break;
2210                 case GJ_BIO_READ:
2211                 default:
2212                         panic("Invalid bio (%d).", type);
2213                 }
2214         }
2215 }
2216
2217 static void
2218 g_journal_destroy_event(void *arg, int flags __unused)
2219 {
2220         struct g_journal_softc *sc;
2221
2222         g_topology_assert();
2223         sc = arg;
2224         g_journal_destroy(sc);
2225 }
2226
2227 static void
2228 g_journal_timeout(void *arg)
2229 {
2230         struct g_journal_softc *sc;
2231
2232         sc = arg;
2233         GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
2234             sc->sc_geom->name);
2235         g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
2236 }
2237
2238 static struct g_geom *
2239 g_journal_create(struct g_class *mp, struct g_provider *pp,
2240     const struct g_journal_metadata *md)
2241 {
2242         struct g_journal_softc *sc;
2243         struct g_geom *gp;
2244         struct g_consumer *cp;
2245         int error;
2246
2247         sc = NULL;      /* gcc */
2248
2249         g_topology_assert();
2250         /*
2251          * There are two possibilities:
2252          * 1. Data and both journals are on the same provider.
2253          * 2. Data and journals are all on separated providers.
2254          */
2255         /* Look for journal device with the same ID. */
2256         LIST_FOREACH(gp, &mp->geom, geom) {
2257                 sc = gp->softc;
2258                 if (sc == NULL)
2259                         continue;
2260                 if (sc->sc_id == md->md_id)
2261                         break;
2262         }
2263         if (gp == NULL)
2264                 sc = NULL;
2265         else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
2266                 GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
2267                 return (NULL);
2268         }
2269         if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
2270                 GJ_DEBUG(0, "Invalid type on %s.", pp->name);
2271                 return (NULL);
2272         }
2273         if (md->md_type & GJ_TYPE_DATA) {
2274                 GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
2275                     pp->name);
2276         }
2277         if (md->md_type & GJ_TYPE_JOURNAL) {
2278                 GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
2279                     pp->name);
2280         }
2281
2282         if (sc == NULL) {
2283                 /* Action geom. */
2284                 sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
2285                 sc->sc_id = md->md_id;
2286                 sc->sc_type = 0;
2287                 sc->sc_flags = 0;
2288                 sc->sc_worker = NULL;
2289
2290                 gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
2291                 gp->start = g_journal_start;
2292                 gp->orphan = g_journal_orphan;
2293                 gp->access = g_journal_access;
2294                 gp->softc = sc;
2295                 gp->flags |= G_GEOM_VOLATILE_BIO;
2296                 sc->sc_geom = gp;
2297
2298                 mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
2299
2300                 bioq_init(&sc->sc_back_queue);
2301                 bioq_init(&sc->sc_regular_queue);
2302                 bioq_init(&sc->sc_delayed_queue);
2303                 sc->sc_delayed_count = 0;
2304                 sc->sc_current_queue = NULL;
2305                 sc->sc_current_count = 0;
2306                 sc->sc_flush_queue = NULL;
2307                 sc->sc_flush_count = 0;
2308                 sc->sc_flush_in_progress = 0;
2309                 sc->sc_copy_queue = NULL;
2310                 sc->sc_copy_in_progress = 0;
2311                 sc->sc_inactive.jj_queue = NULL;
2312                 sc->sc_active.jj_queue = NULL;
2313
2314                 sc->sc_rootmount = root_mount_hold("GJOURNAL");
2315                 GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
2316
2317                 callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2318                 if (md->md_type != GJ_TYPE_COMPLETE) {
2319                         /*
2320                          * Journal and data are on separate providers.
2321                          * At this point we have only one of them.
2322                          * We setup a timeout in case the other part will not
2323                          * appear, so we won't wait forever.
2324                          */
2325                         callout_reset(&sc->sc_callout, 5 * hz,
2326                             g_journal_timeout, sc);
2327                 }
2328         }
2329
2330         /* Remember type of the data provider. */
2331         if (md->md_type & GJ_TYPE_DATA)
2332                 sc->sc_orig_type = md->md_type;
2333         sc->sc_type |= md->md_type;
2334         cp = NULL;
2335
2336         if (md->md_type & GJ_TYPE_DATA) {
2337                 if (md->md_flags & GJ_FLAG_CLEAN)
2338                         sc->sc_flags |= GJF_DEVICE_CLEAN;
2339                 if (md->md_flags & GJ_FLAG_CHECKSUM)
2340                         sc->sc_flags |= GJF_DEVICE_CHECKSUM;
2341                 cp = g_new_consumer(gp);
2342                 error = g_attach(cp, pp);
2343                 KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
2344                     pp->name, error));
2345                 error = g_access(cp, 1, 1, 1);
2346                 if (error != 0) {
2347                         GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
2348                             error);
2349                         g_journal_destroy(sc);
2350                         return (NULL);
2351                 }
2352                 sc->sc_dconsumer = cp;
2353                 sc->sc_mediasize = pp->mediasize - pp->sectorsize;
2354                 sc->sc_sectorsize = pp->sectorsize;
2355                 sc->sc_jstart = md->md_jstart;
2356                 sc->sc_jend = md->md_jend;
2357                 if (md->md_provider[0] != '\0')
2358                         sc->sc_flags |= GJF_DEVICE_HARDCODED;
2359                 sc->sc_journal_offset = md->md_joffset;
2360                 sc->sc_journal_id = md->md_jid;
2361                 sc->sc_journal_previous_id = md->md_jid;
2362         }
2363         if (md->md_type & GJ_TYPE_JOURNAL) {
2364                 if (cp == NULL) {
2365                         cp = g_new_consumer(gp);
2366                         error = g_attach(cp, pp);
2367                         KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
2368                             pp->name, error));
2369                         error = g_access(cp, 1, 1, 1);
2370                         if (error != 0) {
2371                                 GJ_DEBUG(0, "Cannot access %s (error=%d).",
2372                                     pp->name, error);
2373                                 g_journal_destroy(sc);
2374                                 return (NULL);
2375                         }
2376                 } else {
2377                         /*
2378                          * Journal is on the same provider as data, which means
2379                          * that data provider ends where journal starts.
2380                          */
2381                         sc->sc_mediasize = md->md_jstart;
2382                 }
2383                 sc->sc_jconsumer = cp;
2384         }
2385
2386         if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
2387                 /* Journal is not complete yet. */
2388                 return (gp);
2389         } else {
2390                 /* Journal complete, cancel timeout. */
2391                 callout_drain(&sc->sc_callout);
2392         }
2393
2394         error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
2395             "g_journal %s", sc->sc_name);
2396         if (error != 0) {
2397                 GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
2398                     sc->sc_name);
2399                 g_journal_destroy(sc);
2400                 return (NULL);
2401         }
2402
2403         return (gp);
2404 }
2405
2406 static void
2407 g_journal_destroy_consumer(void *arg, int flags __unused)
2408 {
2409         struct g_consumer *cp;
2410
2411         g_topology_assert();
2412         cp = arg;
2413         g_detach(cp);
2414         g_destroy_consumer(cp);
2415 }
2416
2417 static int
2418 g_journal_destroy(struct g_journal_softc *sc)
2419 {
2420         struct g_geom *gp;
2421         struct g_provider *pp;
2422         struct g_consumer *cp;
2423
2424         g_topology_assert();
2425
2426         if (sc == NULL)
2427                 return (ENXIO);
2428
2429         gp = sc->sc_geom;
2430         pp = LIST_FIRST(&gp->provider);
2431         if (pp != NULL) {
2432                 if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
2433                         GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
2434                             pp->name, pp->acr, pp->acw, pp->ace);
2435                         return (EBUSY);
2436                 }
2437                 g_error_provider(pp, ENXIO);
2438
2439                 g_journal_flush(sc);
2440                 g_journal_flush_send(sc);
2441                 g_journal_switch(sc);
2442         }
2443
2444         sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
2445
2446         g_topology_unlock();
2447
2448         if (sc->sc_rootmount != NULL) {
2449                 GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
2450                 root_mount_rel(sc->sc_rootmount);
2451                 sc->sc_rootmount = NULL;
2452         }
2453
2454         callout_drain(&sc->sc_callout);
2455         mtx_lock(&sc->sc_mtx);
2456         wakeup(sc);
2457         while (sc->sc_worker != NULL)
2458                 msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
2459         mtx_unlock(&sc->sc_mtx);
2460
2461         if (pp != NULL) {
2462                 GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
2463                 g_journal_metadata_update(sc);
2464                 g_topology_lock();
2465                 pp->flags |= G_PF_WITHER;
2466                 g_orphan_provider(pp, ENXIO);
2467         } else {
2468                 g_topology_lock();
2469         }
2470         mtx_destroy(&sc->sc_mtx);
2471
2472         if (sc->sc_current_count != 0) {
2473                 GJ_DEBUG(0, "Warning! Number of current requests %d.",
2474                     sc->sc_current_count);
2475         }
2476
2477         LIST_FOREACH(cp, &gp->consumer, consumer) {
2478                 if (cp->acr + cp->acw + cp->ace > 0)
2479                         g_access(cp, -1, -1, -1);
2480                 /*
2481                  * We keep all consumers open for writting, so if I'll detach
2482                  * and destroy consumer here, I'll get providers for taste, so
2483                  * journal will be started again.
2484                  * Sending an event here, prevents this from happening.
2485                  */
2486                 g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
2487         }
2488         gp->softc = NULL;
2489         g_wither_geom(gp, ENXIO);
2490         free(sc, M_JOURNAL);
2491         return (0);
2492 }
2493
2494 static void
2495 g_journal_taste_orphan(struct g_consumer *cp)
2496 {
2497
2498         KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2499             cp->provider->name));
2500 }
2501
2502 static struct g_geom *
2503 g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2504 {
2505         struct g_journal_metadata md;
2506         struct g_consumer *cp;
2507         struct g_geom *gp;
2508         int error;
2509
2510         g_topology_assert();
2511         g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2512         GJ_DEBUG(2, "Tasting %s.", pp->name);
2513         if (pp->geom->class == mp)
2514                 return (NULL);
2515
2516         gp = g_new_geomf(mp, "journal:taste");
2517         /* This orphan function should be never called. */
2518         gp->orphan = g_journal_taste_orphan;
2519         cp = g_new_consumer(gp);
2520         g_attach(cp, pp);
2521         error = g_journal_metadata_read(cp, &md);
2522         g_detach(cp);
2523         g_destroy_consumer(cp);
2524         g_destroy_geom(gp);
2525         if (error != 0)
2526                 return (NULL);
2527         gp = NULL;
2528
2529         if (md.md_provider[0] != '\0' &&
2530             !g_compare_names(md.md_provider, pp->name))
2531                 return (NULL);
2532         if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
2533                 return (NULL);
2534         if (g_journal_debug >= 2)
2535                 journal_metadata_dump(&md);
2536
2537         gp = g_journal_create(mp, pp, &md);
2538         return (gp);
2539 }
2540
2541 static struct g_journal_softc *
2542 g_journal_find_device(struct g_class *mp, const char *name)
2543 {
2544         struct g_journal_softc *sc;
2545         struct g_geom *gp;
2546         struct g_provider *pp;
2547
2548         if (strncmp(name, "/dev/", 5) == 0)
2549                 name += 5;
2550         LIST_FOREACH(gp, &mp->geom, geom) {
2551                 sc = gp->softc;
2552                 if (sc == NULL)
2553                         continue;
2554                 if (sc->sc_flags & GJF_DEVICE_DESTROY)
2555                         continue;
2556                 if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
2557                         continue;
2558                 pp = LIST_FIRST(&gp->provider);
2559                 if (strcmp(sc->sc_name, name) == 0)
2560                         return (sc);
2561                 if (pp != NULL && strcmp(pp->name, name) == 0)
2562                         return (sc);
2563         }
2564         return (NULL);
2565 }
2566
2567 static void
2568 g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
2569 {
2570         struct g_journal_softc *sc;
2571         const char *name;
2572         char param[16];
2573         int *nargs;
2574         int error, i;
2575
2576         g_topology_assert();
2577
2578         nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
2579         if (nargs == NULL) {
2580                 gctl_error(req, "No '%s' argument.", "nargs");
2581                 return;
2582         }
2583         if (*nargs <= 0) {
2584                 gctl_error(req, "Missing device(s).");
2585                 return;
2586         }
2587
2588         for (i = 0; i < *nargs; i++) {
2589                 snprintf(param, sizeof(param), "arg%d", i);
2590                 name = gctl_get_asciiparam(req, param);
2591                 if (name == NULL) {
2592                         gctl_error(req, "No 'arg%d' argument.", i);
2593                         return;
2594                 }
2595                 sc = g_journal_find_device(mp, name);
2596                 if (sc == NULL) {
2597                         gctl_error(req, "No such device: %s.", name);
2598                         return;
2599                 }
2600                 error = g_journal_destroy(sc);
2601                 if (error != 0) {
2602                         gctl_error(req, "Cannot destroy device %s (error=%d).",
2603                             LIST_FIRST(&sc->sc_geom->provider)->name, error);
2604                         return;
2605                 }
2606         }
2607 }
2608
2609 static void
2610 g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
2611 {
2612
2613         g_topology_assert();
2614         g_topology_unlock();
2615         g_journal_sync_requested++;
2616         wakeup(&g_journal_switcher_state);
2617         while (g_journal_sync_requested > 0)
2618                 tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
2619         g_topology_lock();
2620 }
2621
2622 static void
2623 g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
2624 {
2625         uint32_t *version;
2626
2627         g_topology_assert();
2628
2629         version = gctl_get_paraml(req, "version", sizeof(*version));
2630         if (version == NULL) {
2631                 gctl_error(req, "No '%s' argument.", "version");
2632                 return;
2633         }
2634         if (*version != G_JOURNAL_VERSION) {
2635                 gctl_error(req, "Userland and kernel parts are out of sync.");
2636                 return;
2637         }
2638
2639         if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
2640                 g_journal_ctl_destroy(req, mp);
2641                 return;
2642         } else if (strcmp(verb, "sync") == 0) {
2643                 g_journal_ctl_sync(req, mp);
2644                 return;
2645         }
2646
2647         gctl_error(req, "Unknown verb.");
2648 }
2649
2650 static void
2651 g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2652     struct g_consumer *cp, struct g_provider *pp)
2653 {
2654         struct g_journal_softc *sc;
2655
2656         g_topology_assert();
2657
2658         sc = gp->softc;
2659         if (sc == NULL)
2660                 return;
2661         if (pp != NULL) {
2662                 /* Nothing here. */
2663         } else if (cp != NULL) {
2664                 int first = 1;
2665
2666                 sbuf_printf(sb, "%s<Role>", indent);
2667                 if (cp == sc->sc_dconsumer) {
2668                         sbuf_printf(sb, "Data");
2669                         first = 0;
2670                 }
2671                 if (cp == sc->sc_jconsumer) {
2672                         if (!first)
2673                                 sbuf_printf(sb, ",");
2674                         sbuf_printf(sb, "Journal");
2675                 }
2676                 sbuf_printf(sb, "</Role>\n");
2677                 if (cp == sc->sc_jconsumer) {
2678                         sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
2679                             (intmax_t)sc->sc_jstart);
2680                         sbuf_printf(sb, "<Jend>%jd</Jend>\n",
2681                             (intmax_t)sc->sc_jend);
2682                 }
2683         } else {
2684                 sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
2685         }
2686 }
2687
2688 static eventhandler_tag g_journal_event_shutdown = NULL;
2689 static eventhandler_tag g_journal_event_lowmem = NULL;
2690
2691 static void
2692 g_journal_shutdown(void *arg, int howto __unused)
2693 {
2694         struct g_class *mp;
2695         struct g_geom *gp, *gp2;
2696
2697         if (panicstr != NULL)
2698                 return;
2699         mp = arg;
2700         DROP_GIANT();
2701         g_topology_lock();
2702         LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2703                 if (gp->softc == NULL)
2704                         continue;
2705                 GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
2706                 g_journal_destroy(gp->softc);
2707         }
2708         g_topology_unlock();
2709         PICKUP_GIANT();
2710 }
2711
2712 /*
2713  * Free cached requests from inactive queue in case of low memory.
2714  * We free GJ_FREE_AT_ONCE elements at once.
2715  */
2716 #define GJ_FREE_AT_ONCE 4
2717 static void
2718 g_journal_lowmem(void *arg, int howto __unused)
2719 {
2720         struct g_journal_softc *sc;
2721         struct g_class *mp;
2722         struct g_geom *gp;
2723         struct bio *bp;
2724         u_int nfree = GJ_FREE_AT_ONCE;
2725
2726         g_journal_stats_low_mem++;
2727         mp = arg;
2728         DROP_GIANT();
2729         g_topology_lock();
2730         LIST_FOREACH(gp, &mp->geom, geom) {
2731                 sc = gp->softc;
2732                 if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
2733                         continue;
2734                 mtx_lock(&sc->sc_mtx);
2735                 for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
2736                     nfree--, bp = bp->bio_next) {
2737                         /*
2738                          * This is safe to free the bio_data, because:
2739                          * 1. If bio_data is NULL it will be read from the
2740                          *    inactive journal.
2741                          * 2. If bp is sent down, it is first removed from the
2742                          *    inactive queue, so it's impossible to free the
2743                          *    data from under in-flight bio.
2744                          * On the other hand, freeing elements from the active
2745                          * queue, is not safe.
2746                          */
2747                         if (bp->bio_data != NULL) {
2748                                 GJ_DEBUG(2, "Freeing data from %s.",
2749                                     sc->sc_name);
2750                                 gj_free(bp->bio_data, bp->bio_length);
2751                                 bp->bio_data = NULL;
2752                         }
2753                 }
2754                 mtx_unlock(&sc->sc_mtx);
2755                 if (nfree == 0)
2756                         break;
2757         }
2758         g_topology_unlock();
2759         PICKUP_GIANT();
2760 }
2761
2762 static void g_journal_switcher(void *arg);
2763
2764 static void
2765 g_journal_init(struct g_class *mp)
2766 {
2767         int error;
2768
2769         /* Pick a conservative value if provided value sucks. */
2770         if (g_journal_cache_divisor <= 0 ||
2771             (vm_kmem_size / g_journal_cache_divisor == 0)) {
2772                 g_journal_cache_divisor = 5;
2773         }
2774         if (g_journal_cache_limit > 0) {
2775                 g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
2776                 g_journal_cache_low =
2777                     (g_journal_cache_limit / 100) * g_journal_cache_switch;
2778         }
2779         g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
2780             g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
2781         if (g_journal_event_shutdown == NULL)
2782                 GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
2783         g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
2784             g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
2785         if (g_journal_event_lowmem == NULL)
2786                 GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
2787         error = kproc_create(g_journal_switcher, mp, NULL, 0, 0,
2788             "g_journal switcher");
2789         KASSERT(error == 0, ("Cannot create switcher thread."));
2790 }
2791
2792 static void
2793 g_journal_fini(struct g_class *mp)
2794 {
2795
2796         if (g_journal_event_shutdown != NULL) {
2797                 EVENTHANDLER_DEREGISTER(shutdown_post_sync,
2798                     g_journal_event_shutdown);
2799         }
2800         if (g_journal_event_lowmem != NULL)
2801                 EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
2802         g_journal_switcher_state = GJ_SWITCHER_DIE;
2803         wakeup(&g_journal_switcher_state);
2804         while (g_journal_switcher_state != GJ_SWITCHER_DIED)
2805                 tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
2806         GJ_DEBUG(1, "Switcher died.");
2807 }
2808
2809 DECLARE_GEOM_CLASS(g_journal_class, g_journal);
2810
2811 static const struct g_journal_desc *
2812 g_journal_find_desc(const char *fstype)
2813 {
2814         const struct g_journal_desc *desc;
2815         int i;
2816
2817         for (desc = g_journal_filesystems[i = 0]; desc != NULL;
2818              desc = g_journal_filesystems[++i]) {
2819                 if (strcmp(desc->jd_fstype, fstype) == 0)
2820                         break;
2821         }
2822         return (desc);
2823 }
2824
2825 static void
2826 g_journal_switch_wait(struct g_journal_softc *sc)
2827 {
2828         struct bintime bt;
2829
2830         mtx_assert(&sc->sc_mtx, MA_OWNED);
2831         if (g_journal_debug >= 2) {
2832                 if (sc->sc_flush_in_progress > 0) {
2833                         GJ_DEBUG(2, "%d requests flushing.",
2834                             sc->sc_flush_in_progress);
2835                 }
2836                 if (sc->sc_copy_in_progress > 0) {
2837                         GJ_DEBUG(2, "%d requests copying.",
2838                             sc->sc_copy_in_progress);
2839                 }
2840                 if (sc->sc_flush_count > 0) {
2841                         GJ_DEBUG(2, "%d requests to flush.",
2842                             sc->sc_flush_count);
2843                 }
2844                 if (sc->sc_delayed_count > 0) {
2845                         GJ_DEBUG(2, "%d requests delayed.",
2846                             sc->sc_delayed_count);
2847                 }
2848         }
2849         g_journal_stats_switches++;
2850         if (sc->sc_copy_in_progress > 0)
2851                 g_journal_stats_wait_for_copy++;
2852         GJ_TIMER_START(1, &bt);
2853         sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
2854         sc->sc_flags |= GJF_DEVICE_SWITCH;
2855         wakeup(sc);
2856         while (sc->sc_flags & GJF_DEVICE_SWITCH) {
2857                 msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
2858                     "gj:switch", 0);
2859         }
2860         GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
2861 }
2862
2863 static void
2864 g_journal_do_switch(struct g_class *classp)
2865 {
2866         struct g_journal_softc *sc;
2867         const struct g_journal_desc *desc;
2868         struct g_geom *gp;
2869         struct mount *mp;
2870         struct bintime bt;
2871         char *mountpoint;
2872         int error, vfslocked;
2873
2874         DROP_GIANT();
2875         g_topology_lock();
2876         LIST_FOREACH(gp, &classp->geom, geom) {
2877                 sc = gp->softc;
2878                 if (sc == NULL)
2879                         continue;
2880                 if (sc->sc_flags & GJF_DEVICE_DESTROY)
2881                         continue;
2882                 if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
2883                         continue;
2884                 mtx_lock(&sc->sc_mtx);
2885                 sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
2886                 mtx_unlock(&sc->sc_mtx);
2887         }
2888         g_topology_unlock();
2889         PICKUP_GIANT();
2890
2891         mtx_lock(&mountlist_mtx);
2892         TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2893                 if (mp->mnt_gjprovider == NULL)
2894                         continue;
2895                 if (mp->mnt_flag & MNT_RDONLY)
2896                         continue;
2897                 desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
2898                 if (desc == NULL)
2899                         continue;
2900                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
2901                         continue;
2902                 /* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
2903
2904                 DROP_GIANT();
2905                 g_topology_lock();
2906                 sc = g_journal_find_device(classp, mp->mnt_gjprovider);
2907                 g_topology_unlock();
2908                 PICKUP_GIANT();
2909
2910                 if (sc == NULL) {
2911                         GJ_DEBUG(0, "Cannot find journal geom for %s.",
2912                             mp->mnt_gjprovider);
2913                         goto next;
2914                 } else if (JEMPTY(sc)) {
2915                         mtx_lock(&sc->sc_mtx);
2916                         sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
2917                         mtx_unlock(&sc->sc_mtx);
2918                         GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
2919                         goto next;
2920                 }
2921
2922                 mountpoint = mp->mnt_stat.f_mntonname;
2923
2924                 vfslocked = VFS_LOCK_GIANT(mp);
2925
2926                 error = vn_start_write(NULL, &mp, V_WAIT);
2927                 if (error != 0) {
2928                         VFS_UNLOCK_GIANT(vfslocked);
2929                         GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
2930                             mountpoint, error);
2931                         goto next;
2932                 }
2933
2934                 MNT_ILOCK(mp);
2935                 mp->mnt_noasync++;
2936                 mp->mnt_kern_flag &= ~MNTK_ASYNC;
2937                 MNT_IUNLOCK(mp);
2938
2939                 GJ_TIMER_START(1, &bt);
2940                 vfs_msync(mp, MNT_NOWAIT);
2941                 GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
2942
2943                 GJ_TIMER_START(1, &bt);
2944                 error = VFS_SYNC(mp, MNT_NOWAIT);
2945                 if (error == 0)
2946                         GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
2947                 else {
2948                         GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
2949                             mountpoint, error);
2950                 }
2951
2952                 MNT_ILOCK(mp);
2953                 mp->mnt_noasync--;
2954                 if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
2955                         mp->mnt_kern_flag |= MNTK_ASYNC;
2956                 MNT_IUNLOCK(mp);
2957
2958                 vn_finished_write(mp);
2959
2960                 if (error != 0) {
2961                         VFS_UNLOCK_GIANT(vfslocked);
2962                         goto next;
2963                 }
2964
2965                 /*
2966                  * Send BIO_FLUSH before freezing the file system, so it can be
2967                  * faster after the freeze.
2968                  */
2969                 GJ_TIMER_START(1, &bt);
2970                 g_journal_flush_cache(sc);
2971                 GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
2972
2973                 GJ_TIMER_START(1, &bt);
2974                 error = vfs_write_suspend(mp);
2975                 VFS_UNLOCK_GIANT(vfslocked);
2976                 GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
2977                 if (error != 0) {
2978                         GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
2979                             mountpoint, error);
2980                         goto next;
2981                 }
2982
2983                 error = desc->jd_clean(mp);
2984                 if (error != 0)
2985                         goto next;
2986
2987                 mtx_lock(&sc->sc_mtx);
2988                 g_journal_switch_wait(sc);
2989                 mtx_unlock(&sc->sc_mtx);
2990
2991                 vfs_write_resume(mp);
2992 next:
2993                 mtx_lock(&mountlist_mtx);
2994                 vfs_unbusy(mp);
2995         }
2996         mtx_unlock(&mountlist_mtx);
2997
2998         sc = NULL;
2999         for (;;) {
3000                 DROP_GIANT();
3001                 g_topology_lock();
3002                 LIST_FOREACH(gp, &g_journal_class.geom, geom) {
3003                         sc = gp->softc;
3004                         if (sc == NULL)
3005                                 continue;
3006                         mtx_lock(&sc->sc_mtx);
3007                         if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
3008                             !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
3009                             (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
3010                                 break;
3011                         }
3012                         mtx_unlock(&sc->sc_mtx);
3013                         sc = NULL;
3014                 }
3015                 g_topology_unlock();
3016                 PICKUP_GIANT();
3017                 if (sc == NULL)
3018                         break;
3019                 mtx_assert(&sc->sc_mtx, MA_OWNED);
3020                 g_journal_switch_wait(sc);
3021                 mtx_unlock(&sc->sc_mtx);
3022         }
3023 }
3024
3025 /*
3026  * TODO: Switcher thread should be started on first geom creation and killed on
3027  * last geom destruction.
3028  */
3029 static void
3030 g_journal_switcher(void *arg)
3031 {
3032         struct g_class *mp;
3033         struct bintime bt;
3034         int error;
3035
3036         mp = arg;
3037         curthread->td_pflags |= TDP_NORUNNINGBUF;
3038         for (;;) {
3039                 g_journal_switcher_wokenup = 0;
3040                 error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
3041                     g_journal_switch_time * hz);
3042                 if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
3043                         g_journal_switcher_state = GJ_SWITCHER_DIED;
3044                         GJ_DEBUG(1, "Switcher exiting.");
3045                         wakeup(&g_journal_switcher_state);
3046                         kproc_exit(0);
3047                 }
3048                 if (error == 0 && g_journal_sync_requested == 0) {
3049                         GJ_DEBUG(1, "Out of cache, force switch (used=%u "
3050                             "limit=%u).", g_journal_cache_used,
3051                             g_journal_cache_limit);
3052                 }
3053                 GJ_TIMER_START(1, &bt);
3054                 g_journal_do_switch(mp);
3055                 GJ_TIMER_STOP(1, &bt, "Entire switch time");
3056                 if (g_journal_sync_requested > 0) {
3057                         g_journal_sync_requested = 0;
3058                         wakeup(&g_journal_sync_requested);
3059                 }
3060         }
3061 }