2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2009-2010 The FreeBSD Foundation
7 * This software was developed by Pawel Jakub Dawidek under sponsorship from
8 * the FreeBSD Foundation.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
35 #include <sys/param.h> /* powerof2() */
36 #include <sys/queue.h>
38 #include <bitstring.h>
47 #include "activemap.h"
51 #define PJDLOG_ASSERT(...) assert(__VA_ARGS__)
54 #define ACTIVEMAP_MAGIC 0xac71e4
56 int am_magic; /* Magic value. */
57 off_t am_mediasize; /* Media size in bytes. */
58 uint32_t am_extentsize; /* Extent size in bytes,
59 must be power of 2. */
60 uint8_t am_extentshift;/* 2 ^ extentbits == extentsize */
61 int am_nextents; /* Number of extents. */
62 size_t am_mapsize; /* Bitmap size in bytes. */
63 uint16_t *am_memtab; /* An array that holds number of pending
65 bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */
66 bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */
67 size_t am_diskmapsize; /* Map size rounded up to sector size. */
68 uint64_t am_ndirty; /* Number of dirty regions. */
69 bitstr_t *am_syncmap; /* Bitmap of extents to sync. */
70 off_t am_syncoff; /* Next synchronization offset. */
71 TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that
72 we keep dirty to reduce bitmap
74 int am_nkeepdirty; /* Number of am_keepdirty elements. */
75 int am_nkeepdirty_limit; /* Maximum number of am_keepdirty
81 TAILQ_ENTRY(keepdirty) kd_next;
85 * Helper function taken from sys/systm.h to calculate extentshift.
88 bitcount32(uint32_t x)
91 x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1);
92 x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2);
93 x = (x + (x >> 4)) & 0x0f0f0f0f;
95 x = (x + (x >> 16)) & 0x000000ff;
100 off2ext(const struct activemap *amp, off_t offset)
104 PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize);
105 extent = (offset >> amp->am_extentshift);
106 PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
110 static __inline off_t
111 ext2off(const struct activemap *amp, int extent)
115 PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
116 offset = ((off_t)extent << amp->am_extentshift);
117 PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize);
122 * Function calculates number of requests needed to synchronize the given
126 ext2reqs(const struct activemap *amp, int ext)
130 if (ext < amp->am_nextents - 1)
131 return (((amp->am_extentsize - 1) / MAXPHYS) + 1);
133 PJDLOG_ASSERT(ext == amp->am_nextents - 1);
134 left = amp->am_mediasize % amp->am_extentsize;
136 left = amp->am_extentsize;
137 return (((left - 1) / MAXPHYS) + 1);
141 * Initialize activemap structure and allocate memory for internal needs.
142 * Function returns 0 on success and -1 if any of the allocations failed.
145 activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize,
146 uint32_t sectorsize, uint32_t keepdirty)
148 struct activemap *amp;
150 PJDLOG_ASSERT(ampp != NULL);
151 PJDLOG_ASSERT(mediasize > 0);
152 PJDLOG_ASSERT(extentsize > 0);
153 PJDLOG_ASSERT(powerof2(extentsize));
154 PJDLOG_ASSERT(sectorsize > 0);
155 PJDLOG_ASSERT(powerof2(sectorsize));
156 PJDLOG_ASSERT(keepdirty > 0);
158 amp = malloc(sizeof(*amp));
162 amp->am_mediasize = mediasize;
163 amp->am_nkeepdirty_limit = keepdirty;
164 amp->am_extentsize = extentsize;
165 amp->am_extentshift = bitcount32(extentsize - 1);
166 amp->am_nextents = ((mediasize - 1) / extentsize) + 1;
167 amp->am_mapsize = bitstr_size(amp->am_nextents);
168 amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize);
170 amp->am_syncoff = -2;
171 TAILQ_INIT(&->am_keepdirty);
172 amp->am_nkeepdirty = 0;
174 amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0]));
175 amp->am_diskmap = calloc(1, amp->am_diskmapsize);
176 amp->am_memmap = bit_alloc(amp->am_nextents);
177 amp->am_syncmap = bit_alloc(amp->am_nextents);
180 * Check to see if any of the allocations above failed.
182 if (amp->am_memtab == NULL || amp->am_diskmap == NULL ||
183 amp->am_memmap == NULL || amp->am_syncmap == NULL) {
184 if (amp->am_memtab != NULL)
185 free(amp->am_memtab);
186 if (amp->am_diskmap != NULL)
187 free(amp->am_diskmap);
188 if (amp->am_memmap != NULL)
189 free(amp->am_memmap);
190 if (amp->am_syncmap != NULL)
191 free(amp->am_syncmap);
198 amp->am_magic = ACTIVEMAP_MAGIC;
204 static struct keepdirty *
205 keepdirty_find(struct activemap *amp, int extent)
207 struct keepdirty *kd;
209 TAILQ_FOREACH(kd, &->am_keepdirty, kd_next) {
210 if (kd->kd_extent == extent)
217 keepdirty_add(struct activemap *amp, int extent)
219 struct keepdirty *kd;
221 kd = keepdirty_find(amp, extent);
224 * Only move element at the beginning.
226 TAILQ_REMOVE(&->am_keepdirty, kd, kd_next);
227 TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next);
231 * Add new element, but first remove the most unused one if
234 if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) {
235 kd = TAILQ_LAST(&->am_keepdirty, skeepdirty);
236 PJDLOG_ASSERT(kd != NULL);
237 TAILQ_REMOVE(&->am_keepdirty, kd, kd_next);
238 amp->am_nkeepdirty--;
239 PJDLOG_ASSERT(amp->am_nkeepdirty > 0);
242 kd = malloc(sizeof(*kd));
243 /* We can ignore allocation failure. */
245 kd->kd_extent = extent;
246 amp->am_nkeepdirty++;
247 TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next);
254 keepdirty_fill(struct activemap *amp)
256 struct keepdirty *kd;
258 TAILQ_FOREACH(kd, &->am_keepdirty, kd_next)
259 bit_set(amp->am_diskmap, kd->kd_extent);
263 keepdirty_free(struct activemap *amp)
265 struct keepdirty *kd;
267 while ((kd = TAILQ_FIRST(&->am_keepdirty)) != NULL) {
268 TAILQ_REMOVE(&->am_keepdirty, kd, kd_next);
269 amp->am_nkeepdirty--;
272 PJDLOG_ASSERT(amp->am_nkeepdirty == 0);
276 * Function frees resources allocated by activemap_init() function.
279 activemap_free(struct activemap *amp)
282 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
287 free(amp->am_memtab);
288 free(amp->am_diskmap);
289 free(amp->am_memmap);
290 free(amp->am_syncmap);
294 * Function should be called before we handle write requests. It updates
295 * internal structures and returns true if on-disk metadata should be updated.
298 activemap_write_start(struct activemap *amp, off_t offset, off_t length)
304 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
305 PJDLOG_ASSERT(length > 0);
308 end = offset + length - 1;
310 for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
312 * If the number of pending writes is increased from 0,
313 * we have to mark the extent as dirty also in on-disk bitmap.
314 * By returning true we inform the caller that on-disk bitmap
315 * was modified and has to be flushed to disk.
317 if (amp->am_memtab[ext]++ == 0) {
318 PJDLOG_ASSERT(!bit_test(amp->am_memmap, ext));
319 bit_set(amp->am_memmap, ext);
322 if (keepdirty_add(amp, ext))
330 * Function should be called after receiving write confirmation. It updates
331 * internal structures and returns true if on-disk metadata should be updated.
334 activemap_write_complete(struct activemap *amp, off_t offset, off_t length)
340 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
341 PJDLOG_ASSERT(length > 0);
344 end = offset + length - 1;
346 for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
348 * If the number of pending writes goes down to 0, we have to
349 * mark the extent as clean also in on-disk bitmap.
350 * By returning true we inform the caller that on-disk bitmap
351 * was modified and has to be flushed to disk.
353 PJDLOG_ASSERT(amp->am_memtab[ext] > 0);
354 PJDLOG_ASSERT(bit_test(amp->am_memmap, ext));
355 if (--amp->am_memtab[ext] == 0) {
356 bit_clear(amp->am_memmap, ext);
358 if (keepdirty_find(amp, ext) == NULL)
367 * Function should be called after finishing synchronization of one extent.
368 * It returns true if on-disk metadata should be updated.
371 activemap_extent_complete(struct activemap *amp, int extent)
376 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
377 PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents);
381 reqs = ext2reqs(amp, extent);
382 PJDLOG_ASSERT(amp->am_memtab[extent] >= reqs);
383 amp->am_memtab[extent] -= reqs;
384 PJDLOG_ASSERT(bit_test(amp->am_memmap, extent));
385 if (amp->am_memtab[extent] == 0) {
386 bit_clear(amp->am_memmap, extent);
395 * Function returns number of dirty regions.
398 activemap_ndirty(const struct activemap *amp)
401 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
403 return (amp->am_ndirty);
407 * Function compare on-disk bitmap and in-memory bitmap and returns true if
408 * they differ and should be flushed to the disk.
411 activemap_differ(const struct activemap *amp)
414 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
416 return (memcmp(amp->am_diskmap, amp->am_memmap,
417 amp->am_mapsize) != 0);
421 * Function returns number of bytes used by bitmap.
424 activemap_size(const struct activemap *amp)
427 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
429 return (amp->am_mapsize);
433 * Function returns number of bytes needed for storing on-disk bitmap.
434 * This is the same as activemap_size(), but rounded up to sector size.
437 activemap_ondisk_size(const struct activemap *amp)
440 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
442 return (amp->am_diskmapsize);
446 * Function copies the given buffer read from disk to the internal bitmap.
449 activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
453 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
454 PJDLOG_ASSERT(size >= amp->am_mapsize);
456 memcpy(amp->am_diskmap, buf, amp->am_mapsize);
457 memcpy(amp->am_memmap, buf, amp->am_mapsize);
458 memcpy(amp->am_syncmap, buf, amp->am_mapsize);
460 bit_ffs(amp->am_memmap, amp->am_nextents, &ext);
462 /* There are no dirty extents, so we can leave now. */
466 * Set synchronization offset to the first dirty extent.
468 activemap_sync_rewind(amp);
470 * We have dirty extents and we want them to stay that way until
471 * we synchronize, so we set number of pending writes to number
472 * of requests needed to synchronize one extent.
475 for (; ext < amp->am_nextents; ext++) {
476 if (bit_test(amp->am_memmap, ext)) {
477 amp->am_memtab[ext] = ext2reqs(amp, ext);
484 * Function merges the given bitmap with existing one.
487 activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
489 bitstr_t *remmap = __DECONST(bitstr_t *, buf);
492 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
493 PJDLOG_ASSERT(size >= amp->am_mapsize);
495 bit_ffs(remmap, amp->am_nextents, &ext);
497 /* There are no dirty extents, so we can leave now. */
501 * We have dirty extents and we want them to stay that way until
502 * we synchronize, so we set number of pending writes to number
503 * of requests needed to synchronize one extent.
505 for (; ext < amp->am_nextents; ext++) {
506 /* Local extent already dirty. */
507 if (bit_test(amp->am_syncmap, ext))
509 /* Remote extent isn't dirty. */
510 if (!bit_test(remmap, ext))
512 bit_set(amp->am_syncmap, ext);
513 bit_set(amp->am_memmap, ext);
514 bit_set(amp->am_diskmap, ext);
515 if (amp->am_memtab[ext] == 0)
517 amp->am_memtab[ext] = ext2reqs(amp, ext);
520 * Set synchronization offset to the first dirty extent.
522 activemap_sync_rewind(amp);
526 * Function returns pointer to internal bitmap that should be written to disk.
528 const unsigned char *
529 activemap_bitmap(struct activemap *amp, size_t *sizep)
532 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
535 *sizep = amp->am_diskmapsize;
536 memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
538 return ((const unsigned char *)amp->am_diskmap);
542 * Function calculates size needed to store bitmap on disk.
545 activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
548 uint64_t nextents, mapsize;
550 PJDLOG_ASSERT(mediasize > 0);
551 PJDLOG_ASSERT(extentsize > 0);
552 PJDLOG_ASSERT(powerof2(extentsize));
553 PJDLOG_ASSERT(sectorsize > 0);
554 PJDLOG_ASSERT(powerof2(sectorsize));
556 nextents = ((mediasize - 1) / extentsize) + 1;
557 mapsize = bitstr_size(nextents);
558 return (roundup2(mapsize, sectorsize));
562 * Set synchronization offset to the first dirty extent.
565 activemap_sync_rewind(struct activemap *amp)
569 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
571 bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
573 /* There are no extents to synchronize. */
574 amp->am_syncoff = -2;
578 * Mark that we want to start synchronization from the beginning.
580 amp->am_syncoff = -1;
584 * Return next offset of where we should synchronize.
587 activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
592 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
593 PJDLOG_ASSERT(lengthp != NULL);
594 PJDLOG_ASSERT(syncextp != NULL);
598 if (amp->am_syncoff == -2)
601 if (amp->am_syncoff >= 0 &&
602 (amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
603 off2ext(amp, amp->am_syncoff) !=
604 off2ext(amp, amp->am_syncoff + MAXPHYS))) {
606 * We are about to change extent, so mark previous one as clean.
608 ext = off2ext(amp, amp->am_syncoff);
609 bit_clear(amp->am_syncmap, ext);
611 amp->am_syncoff = -1;
614 if (amp->am_syncoff == -1) {
616 * Let's find first extent to synchronize.
618 bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
620 amp->am_syncoff = -2;
623 amp->am_syncoff = ext2off(amp, ext);
626 * We don't change extent, so just increase offset.
628 amp->am_syncoff += MAXPHYS;
629 if (amp->am_syncoff >= amp->am_mediasize) {
630 amp->am_syncoff = -2;
635 syncoff = amp->am_syncoff;
636 left = ext2off(amp, off2ext(amp, syncoff)) +
637 amp->am_extentsize - syncoff;
638 if (syncoff + left > amp->am_mediasize)
639 left = amp->am_mediasize - syncoff;
643 PJDLOG_ASSERT(left >= 0 && left <= MAXPHYS);
644 PJDLOG_ASSERT(syncoff >= 0 && syncoff < amp->am_mediasize);
645 PJDLOG_ASSERT(syncoff + left >= 0 &&
646 syncoff + left <= amp->am_mediasize);
653 * Mark extent(s) containing the given region for synchronization.
654 * Most likely one of the components is unavailable.
657 activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
663 PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
666 end = offset + length - 1;
668 for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
669 if (bit_test(amp->am_syncmap, ext)) {
670 /* Already marked for synchronization. */
671 PJDLOG_ASSERT(bit_test(amp->am_memmap, ext));
674 bit_set(amp->am_syncmap, ext);
675 if (!bit_test(amp->am_memmap, ext)) {
676 bit_set(amp->am_memmap, ext);
679 amp->am_memtab[ext] += ext2reqs(amp, ext);
687 activemap_dump(const struct activemap *amp)
692 for (bit = 0; bit < amp->am_nextents; bit++)
693 printf("%d", bit_test(amp->am_memmap, bit) ? 1 : 0);
696 for (bit = 0; bit < amp->am_nextents; bit++)
697 printf("%d", bit_test(amp->am_diskmap, bit) ? 1 : 0);
700 for (bit = 0; bit < amp->am_nextents; bit++)
701 printf("%d", bit_test(amp->am_syncmap, bit) ? 1 : 0);