/*-
 * Copyright (c) 2009-2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Pawel Jakub Dawidek under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>	/* powerof2() */
#include <sys/queue.h>

#include <bitstring.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "activemap.h"

#ifndef PJDLOG_ASSERT
#include <assert.h>
#define	PJDLOG_ASSERT(...)	assert(__VA_ARGS__)
#endif

52 #define ACTIVEMAP_MAGIC 0xac71e4
54 int am_magic; /* Magic value. */
55 off_t am_mediasize; /* Media size in bytes. */
56 uint32_t am_extentsize; /* Extent size in bytes,
57 must be power of 2. */
58 uint8_t am_extentshift;/* 2 ^ extentbits == extentsize */
59 int am_nextents; /* Number of extents. */
60 size_t am_mapsize; /* Bitmap size in bytes. */
61 uint16_t *am_memtab; /* An array that holds number of pending
63 bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */
64 bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */
65 size_t am_diskmapsize; /* Map size rounded up to sector size. */
66 uint64_t am_ndirty; /* Number of dirty regions. */
67 bitstr_t *am_syncmap; /* Bitmap of extents to sync. */
68 off_t am_syncoff; /* Next synchronization offset. */
69 TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that
70 we keep dirty to reduce bitmap
72 int am_nkeepdirty; /* Number of am_keepdirty elements. */
73 int am_nkeepdirty_limit; /* Maximum number of am_keepdirty
/*
 * One entry of the am_keepdirty list: an extent number plus list linkage.
 */
struct keepdirty {
	int	kd_extent;		/* Extent number held on the list. */
	TAILQ_ENTRY(keepdirty) kd_next;	/* Linkage on am_keepdirty. */
};

/*
 * Helper function taken from sys/systm.h to calculate extentshift.
 *
 * Returns the number of bits set in x.  The count is built by summing
 * neighbouring bit fields of doubling width (1, 2, 4, 8, 16 bits); the
 * final mask keeps only the low byte, which is wide enough for the
 * maximum result of 32.
 */
static uint32_t
bitcount32(uint32_t x)
{

	x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1);
	x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2);
	x = (x + (x >> 4)) & 0x0f0f0f0f;
	x = (x + (x >> 8));
	x = (x + (x >> 16)) & 0x000000ff;
	return (x);
}

98 off2ext(const struct activemap *amp, off_t offset)
102 PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize);
103 extent = (offset >> amp->am_extentshift);
104 PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
108 static __inline off_t
109 ext2off(const struct activemap *amp, int extent)
113 PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
114 offset = ((off_t)extent << amp->am_extentshift);
115 PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize);
120 * Function calculates number of requests needed to synchronize the given
124 ext2reqs(const struct activemap *amp, int ext)
128 if (ext < amp->am_nextents - 1)
129 return (((amp->am_extentsize - 1) / MAXPHYS) + 1);
131 PJDLOG_ASSERT(ext == amp->am_nextents - 1);
132 left = amp->am_mediasize % amp->am_extentsize;
134 left = amp->am_extentsize;
135 return (((left - 1) / MAXPHYS) + 1);
139 * Initialize activemap structure and allocate memory for internal needs.
140 * Function returns 0 on success and -1 if any of the allocations failed.
143 activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize,
144 uint32_t sectorsize, uint32_t keepdirty)
146 struct activemap *amp;
148 PJDLOG_ASSERT(ampp != NULL);
149 PJDLOG_ASSERT(mediasize > 0);
150 PJDLOG_ASSERT(extentsize > 0);
151 PJDLOG_ASSERT(powerof2(extentsize));
152 PJDLOG_ASSERT(sectorsize > 0);
153 PJDLOG_ASSERT(powerof2(sectorsize));
154 PJDLOG_ASSERT(keepdirty > 0);
156 amp = malloc(sizeof(*amp));
160 amp->am_mediasize = mediasize;
161 amp->am_nkeepdirty_limit = keepdirty;
162 amp->am_extentsize = extentsize;
163 amp->am_extentshift = bitcount32(extentsize - 1);
164 amp->am_nextents = ((mediasize - 1) / extentsize) + 1;
165 amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents);
166 amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize);
168 amp->am_syncoff = -2;
169 TAILQ_INIT(&->am_keepdirty);
170 amp->am_nkeepdirty = 0;
172 amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0]));
173 amp->am_diskmap = calloc(1, amp->am_diskmapsize);
174 amp->am_memmap = bit_alloc(amp->am_nextents);
175 amp->am_syncmap = bit_alloc(amp->am_nextents);
178 * Check to see if any of the allocations above failed.
180 if (amp->am_memtab == NULL || amp->am_diskmap == NULL ||
181 amp->am_memmap == NULL || amp->am_syncmap == NULL) {
182 if (amp->am_memtab != NULL)
183 free(amp->am_memtab);
184 if (amp->am_diskmap != NULL)
185 free(amp->am_diskmap);
186 if (amp->am_memmap != NULL)
187 free(amp->am_memmap);
188 if (amp->am_syncmap != NULL)
189 free(amp->am_syncmap);
196 amp->am_magic = ACTIVEMAP_MAGIC;
202 static struct keepdirty *
203 keepdirty_find(struct activemap *amp, int extent)
205 struct keepdirty *kd;
207 TAILQ_FOREACH(kd, &->am_keepdirty, kd_next) {
208 if (kd->kd_extent == extent)
215 keepdirty_add(struct activemap *amp, int extent)
217 struct keepdirty *kd;
219 kd = keepdirty_find(amp, extent);
222 * Only move element at the beginning.
224 TAILQ_REMOVE(&->am_keepdirty, kd, kd_next);
225 TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next);
229 * Add new element, but first remove the most unused one if
232 if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) {
233 kd = TAILQ_LAST(&->am_keepdirty, skeepdirty);
234 PJDLOG_ASSERT(kd != NULL);
235 TAILQ_REMOVE(&->am_keepdirty, kd, kd_next);
236 amp->am_nkeepdirty--;
237 PJDLOG_ASSERT(amp->am_nkeepdirty > 0);
240 kd = malloc(sizeof(*kd));
241 /* We can ignore allocation failure. */
243 kd->kd_extent = extent;
244 amp->am_nkeepdirty++;
245 TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next);
252 keepdirty_fill(struct activemap *amp)
254 struct keepdirty *kd;
256 TAILQ_FOREACH(kd, &->am_keepdirty, kd_next)
257 bit_set(amp->am_diskmap, kd->kd_extent);
261 keepdirty_free(struct activemap *amp)
263 struct keepdirty *kd;
265 while ((kd = TAILQ_FIRST(&->am_keepdirty)) != NULL) {
266 TAILQ_REMOVE(&->am_keepdirty, kd, kd_next);
267 amp->am_nkeepdirty--;
270 PJDLOG_ASSERT(amp->am_nkeepdirty == 0);
274 * Function frees resources allocated by activemap_init() function.
277 activemap_free(struct activemap *amp)
280 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
285 free(amp->am_memtab);
286 free(amp->am_diskmap);
287 free(amp->am_memmap);
288 free(amp->am_syncmap);
292 * Function should be called before we handle write requests. It updates
293 * internal structures and returns true if on-disk metadata should be updated.
296 activemap_write_start(struct activemap *amp, off_t offset, off_t length)
302 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
303 PJDLOG_ASSERT(length > 0);
306 end = offset + length - 1;
308 for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
310 * If the number of pending writes is increased from 0,
311 * we have to mark the extent as dirty also in on-disk bitmap.
312 * By returning true we inform the caller that on-disk bitmap
313 * was modified and has to be flushed to disk.
315 if (amp->am_memtab[ext]++ == 0) {
316 PJDLOG_ASSERT(!bit_test(amp->am_memmap, ext));
317 bit_set(amp->am_memmap, ext);
320 if (keepdirty_add(amp, ext))
328 * Function should be called after receiving write confirmation. It updates
329 * internal structures and returns true if on-disk metadata should be updated.
332 activemap_write_complete(struct activemap *amp, off_t offset, off_t length)
338 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
339 PJDLOG_ASSERT(length > 0);
342 end = offset + length - 1;
344 for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
346 * If the number of pending writes goes down to 0, we have to
347 * mark the extent as clean also in on-disk bitmap.
348 * By returning true we inform the caller that on-disk bitmap
349 * was modified and has to be flushed to disk.
351 PJDLOG_ASSERT(amp->am_memtab[ext] > 0);
352 PJDLOG_ASSERT(bit_test(amp->am_memmap, ext));
353 if (--amp->am_memtab[ext] == 0) {
354 bit_clear(amp->am_memmap, ext);
356 if (keepdirty_find(amp, ext) == NULL)
365 * Function should be called after finishing synchronization of one extent.
366 * It returns true if on-disk metadata should be updated.
369 activemap_extent_complete(struct activemap *amp, int extent)
374 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
375 PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
379 reqs = ext2reqs(amp, extent);
380 PJDLOG_ASSERT(amp->am_memtab[extent] >= reqs);
381 amp->am_memtab[extent] -= reqs;
382 PJDLOG_ASSERT(bit_test(amp->am_memmap, extent));
383 if (amp->am_memtab[extent] == 0) {
384 bit_clear(amp->am_memmap, extent);
393 * Function returns number of dirty regions.
396 activemap_ndirty(const struct activemap *amp)
399 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
401 return (amp->am_ndirty);
405 * Function compare on-disk bitmap and in-memory bitmap and returns true if
406 * they differ and should be flushed to the disk.
409 activemap_differ(const struct activemap *amp)
412 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
414 return (memcmp(amp->am_diskmap, amp->am_memmap,
415 amp->am_mapsize) != 0);
419 * Function returns number of bytes used by bitmap.
422 activemap_size(const struct activemap *amp)
425 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
427 return (amp->am_mapsize);
431 * Function returns number of bytes needed for storing on-disk bitmap.
432 * This is the same as activemap_size(), but rounded up to sector size.
435 activemap_ondisk_size(const struct activemap *amp)
438 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
440 return (amp->am_diskmapsize);
444 * Function copies the given buffer read from disk to the internal bitmap.
447 activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
451 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
452 PJDLOG_ASSERT(size >= amp->am_mapsize);
454 memcpy(amp->am_diskmap, buf, amp->am_mapsize);
455 memcpy(amp->am_memmap, buf, amp->am_mapsize);
456 memcpy(amp->am_syncmap, buf, amp->am_mapsize);
458 bit_ffs(amp->am_memmap, amp->am_nextents, &ext);
460 /* There are no dirty extents, so we can leave now. */
464 * Set synchronization offset to the first dirty extent.
466 activemap_sync_rewind(amp);
468 * We have dirty extents and we want them to stay that way until
469 * we synchronize, so we set number of pending writes to number
470 * of requests needed to synchronize one extent.
473 for (; ext < amp->am_nextents; ext++) {
474 if (bit_test(amp->am_memmap, ext)) {
475 amp->am_memtab[ext] = ext2reqs(amp, ext);
482 * Function merges the given bitmap with existing one.
485 activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
487 bitstr_t *remmap = __DECONST(bitstr_t *, buf);
490 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
491 PJDLOG_ASSERT(size >= amp->am_mapsize);
493 bit_ffs(remmap, amp->am_nextents, &ext);
495 /* There are no dirty extents, so we can leave now. */
499 * We have dirty extents and we want them to stay that way until
500 * we synchronize, so we set number of pending writes to number
501 * of requests needed to synchronize one extent.
503 for (; ext < amp->am_nextents; ext++) {
504 /* Local extent already dirty. */
505 if (bit_test(amp->am_syncmap, ext))
507 /* Remote extent isn't dirty. */
508 if (!bit_test(remmap, ext))
510 bit_set(amp->am_syncmap, ext);
511 bit_set(amp->am_memmap, ext);
512 bit_set(amp->am_diskmap, ext);
513 if (amp->am_memtab[ext] == 0)
515 amp->am_memtab[ext] = ext2reqs(amp, ext);
518 * Set synchronization offset to the first dirty extent.
520 activemap_sync_rewind(amp);
524 * Function returns pointer to internal bitmap that should be written to disk.
526 const unsigned char *
527 activemap_bitmap(struct activemap *amp, size_t *sizep)
530 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
533 *sizep = amp->am_diskmapsize;
534 memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
536 return ((const unsigned char *)amp->am_diskmap);
540 * Function calculates size needed to store bitmap on disk.
543 activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
546 uint64_t nextents, mapsize;
548 PJDLOG_ASSERT(mediasize > 0);
549 PJDLOG_ASSERT(extentsize > 0);
550 PJDLOG_ASSERT(powerof2(extentsize));
551 PJDLOG_ASSERT(sectorsize > 0);
552 PJDLOG_ASSERT(powerof2(sectorsize));
554 nextents = ((mediasize - 1) / extentsize) + 1;
555 mapsize = sizeof(bitstr_t) * bitstr_size(nextents);
556 return (roundup2(mapsize, sectorsize));
560 * Set synchronization offset to the first dirty extent.
563 activemap_sync_rewind(struct activemap *amp)
567 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
569 bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
571 /* There are no extents to synchronize. */
572 amp->am_syncoff = -2;
576 * Mark that we want to start synchronization from the beginning.
578 amp->am_syncoff = -1;
582 * Return next offset of where we should synchronize.
585 activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
590 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
591 PJDLOG_ASSERT(lengthp != NULL);
592 PJDLOG_ASSERT(syncextp != NULL);
596 if (amp->am_syncoff == -2)
599 if (amp->am_syncoff >= 0 &&
600 (amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
601 off2ext(amp, amp->am_syncoff) !=
602 off2ext(amp, amp->am_syncoff + MAXPHYS))) {
604 * We are about to change extent, so mark previous one as clean.
606 ext = off2ext(amp, amp->am_syncoff);
607 bit_clear(amp->am_syncmap, ext);
609 amp->am_syncoff = -1;
612 if (amp->am_syncoff == -1) {
614 * Let's find first extent to synchronize.
616 bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
618 amp->am_syncoff = -2;
621 amp->am_syncoff = ext2off(amp, ext);
624 * We don't change extent, so just increase offset.
626 amp->am_syncoff += MAXPHYS;
627 if (amp->am_syncoff >= amp->am_mediasize) {
628 amp->am_syncoff = -2;
633 syncoff = amp->am_syncoff;
634 left = ext2off(amp, off2ext(amp, syncoff)) +
635 amp->am_extentsize - syncoff;
636 if (syncoff + left > amp->am_mediasize)
637 left = amp->am_mediasize - syncoff;
641 PJDLOG_ASSERT(left >= 0 && left <= MAXPHYS);
642 PJDLOG_ASSERT(syncoff >= 0 && syncoff < amp->am_mediasize);
643 PJDLOG_ASSERT(syncoff + left >= 0 &&
644 syncoff + left <= amp->am_mediasize);
651 * Mark extent(s) containing the given region for synchronization.
652 * Most likely one of the components is unavailable.
655 activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
661 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
664 end = offset + length - 1;
666 for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
667 if (bit_test(amp->am_syncmap, ext)) {
668 /* Already marked for synchronization. */
669 PJDLOG_ASSERT(bit_test(amp->am_memmap, ext));
672 bit_set(amp->am_syncmap, ext);
673 if (!bit_test(amp->am_memmap, ext)) {
674 bit_set(amp->am_memmap, ext);
677 amp->am_memtab[ext] += ext2reqs(amp, ext);
685 activemap_dump(const struct activemap *amp)
690 for (bit = 0; bit < amp->am_nextents; bit++)
691 printf("%d", bit_test(amp->am_memmap, bit) ? 1 : 0);
694 for (bit = 0; bit < amp->am_nextents; bit++)
695 printf("%d", bit_test(amp->am_diskmap, bit) ? 1 : 0);
698 for (bit = 0; bit < amp->am_nextents; bit++)
699 printf("%d", bit_test(amp->am_syncmap, bit) ? 1 : 0);