]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
MFV r353615: 9485 Optimize possible split block search space
[FreeBSD/FreeBSD.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / vdev_indirect_mapping.c
1 /*
2  * CDDL HEADER START
3  *
4  * This file and its contents are supplied under the terms of the
5  * Common Development and Distribution License ("CDDL"), version 1.0.
6  * You may only use this file in accordance with the terms of version
7  * 1.0 of the CDDL.
8  *
9  * A full copy of the text of the CDDL should have accompanied this
10  * source.  A copy of the CDDL is also available via the Internet at
11  * http://www.illumos.org/license/CDDL.
12  *
13  * CDDL HEADER END
14  */
15
16 /*
17  * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
18  */
19
20 #include <sys/dmu_tx.h>
21 #include <sys/dsl_pool.h>
22 #include <sys/spa.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/vdev_indirect_mapping.h>
25 #include <sys/zfeature.h>
26 #include <sys/dmu_objset.h>
27
28 static boolean_t
29 vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
30 {
31         ASSERT(vim != NULL);
32
33         ASSERT(vim->vim_object != 0);
34         ASSERT(vim->vim_objset != NULL);
35         ASSERT(vim->vim_phys != NULL);
36         ASSERT(vim->vim_dbuf != NULL);
37
38         EQUIV(vim->vim_phys->vimp_num_entries > 0,
39             vim->vim_entries != NULL);
40         if (vim->vim_phys->vimp_num_entries > 0) {
41                 vdev_indirect_mapping_entry_phys_t *last_entry =
42                     &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
43                 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry);
44                 uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst);
45
46                 ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
47         }
48         if (vim->vim_havecounts) {
49                 ASSERT(vim->vim_phys->vimp_counts_object != 0);
50         }
51
52         return (B_TRUE);
53 }
54
55 uint64_t
56 vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
57 {
58         ASSERT(vdev_indirect_mapping_verify(vim));
59
60         return (vim->vim_phys->vimp_num_entries);
61 }
62
63 uint64_t
64 vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
65 {
66         ASSERT(vdev_indirect_mapping_verify(vim));
67
68         return (vim->vim_phys->vimp_max_offset);
69 }
70
71 uint64_t
72 vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
73 {
74         ASSERT(vdev_indirect_mapping_verify(vim));
75
76         return (vim->vim_object);
77 }
78
79 uint64_t
80 vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
81 {
82         ASSERT(vdev_indirect_mapping_verify(vim));
83
84         return (vim->vim_phys->vimp_bytes_mapped);
85 }
86
87 /*
88  * The length (in bytes) of the mapping object array in memory and
89  * (logically) on disk.
90  *
91  * Note that unlike most of our accessor functions,
92  * we don't assert that the struct is consistent; therefore it can be
93  * called while there may be concurrent changes, if we don't care about
94  * the value being immediately stale (e.g. from spa_removal_get_stats()).
95  */
96 uint64_t
97 vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
98 {
99         return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
100 }
101
102 /*
103  * Compare an offset with an indirect mapping entry; there are three
104  * possible scenarios:
105  *
106  *     1. The offset is "less than" the mapping entry; meaning the
107  *        offset is less than the source offset of the mapping entry. In
108  *        this case, there is no overlap between the offset and the
109  *        mapping entry and -1 will be returned.
110  *
111  *     2. The offset is "greater than" the mapping entry; meaning the
112  *        offset is greater than the mapping entry's source offset plus
113  *        the entry's size. In this case, there is no overlap between
114  *        the offset and the mapping entry and 1 will be returned.
115  *
116  *        NOTE: If the offset is actually equal to the entry's offset
117  *        plus size, this is considered to be "greater" than the entry,
118  *        and this case applies (i.e. 1 will be returned). Thus, the
119  *        entry's "range" can be considered to be inclusive at its
120  *        start, but exclusive at its end: e.g. [src, src + size).
121  *
122  *     3. The last case to consider is if the offset actually falls
123  *        within the mapping entry's range. If this is the case, the
124  *        offset is considered to be "equal to" the mapping entry and
125  *        0 will be returned.
126  *
127  *        NOTE: If the offset is equal to the entry's source offset,
128  *        this case applies and 0 will be returned. If the offset is
129  *        equal to the entry's source plus its size, this case does
130  *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
131  *        returned.
132  */
133 static int
134 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
135 {
136         const uint64_t *key = v_key;
137         const vdev_indirect_mapping_entry_phys_t *array_elem =
138             v_array_elem;
139         uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
140
141         if (*key < src_offset) {
142                 return (-1);
143         } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
144                 return (0);
145         } else {
146                 return (1);
147         }
148 }
149
150 /*
151  * Returns the mapping entry for the given offset.
152  *
153  * It's possible that the given offset will not be in the mapping table
154  * (i.e. no mapping entries contain this offset), in which case, the
155  * return value value depends on the "next_if_missing" parameter.
156  *
157  * If the offset is not found in the table and "next_if_missing" is
158  * B_FALSE, then NULL will always be returned. The behavior is intended
159  * to allow consumers to get the entry corresponding to the offset
160  * parameter, iff the offset overlaps with an entry in the table.
161  *
162  * If the offset is not found in the table and "next_if_missing" is
163  * B_TRUE, then the entry nearest to the given offset will be returned,
164  * such that the entry's source offset is greater than the offset
165  * passed in (i.e. the "next" mapping entry in the table is returned, if
166  * the offset is missing from the table). If there are no entries whose
167  * source offset is greater than the passed in offset, NULL is returned.
168  */
169 static vdev_indirect_mapping_entry_phys_t *
170 vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
171     uint64_t offset, boolean_t next_if_missing)
172 {
173         ASSERT(vdev_indirect_mapping_verify(vim));
174         ASSERT(vim->vim_phys->vimp_num_entries > 0);
175
176         vdev_indirect_mapping_entry_phys_t *entry = NULL;
177
178         uint64_t last = vim->vim_phys->vimp_num_entries - 1;
179         uint64_t base = 0;
180
181         /*
182          * We don't define these inside of the while loop because we use
183          * their value in the case that offset isn't in the mapping.
184          */
185         uint64_t mid;
186         int result;
187
188         while (last >= base) {
189                 mid = base + ((last - base) >> 1);
190
191                 result = dva_mapping_overlap_compare(&offset,
192                     &vim->vim_entries[mid]);
193
194                 if (result == 0) {
195                         entry = &vim->vim_entries[mid];
196                         break;
197                 } else if (result < 0) {
198                         last = mid - 1;
199                 } else {
200                         base = mid + 1;
201                 }
202         }
203
204         if (entry == NULL && next_if_missing) {
205                 ASSERT3U(base, ==, last + 1);
206                 ASSERT(mid == base || mid == last);
207                 ASSERT3S(result, !=, 0);
208
209                 /*
210                  * The offset we're looking for isn't actually contained
211                  * in the mapping table, thus we need to return the
212                  * closest mapping entry that is greater than the
213                  * offset. We reuse the result of the last comparison,
214                  * comparing the mapping entry at index "mid" and the
215                  * offset. The offset is guaranteed to lie between
216                  * indices one less than "mid", and one greater than
217                  * "mid"; we just need to determine if offset is greater
218                  * than, or less than the mapping entry contained at
219                  * index "mid".
220                  */
221
222                 uint64_t index;
223                 if (result < 0)
224                         index = mid;
225                 else
226                         index = mid + 1;
227
228                 ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
229
230                 if (index == vim->vim_phys->vimp_num_entries) {
231                         /*
232                          * If "index" is past the end of the entries
233                          * array, then not only is the offset not in the
234                          * mapping table, but it's actually greater than
235                          * all entries in the table. In this case, we
236                          * can't return a mapping entry greater than the
237                          * offset (since none exist), so we return NULL.
238                          */
239
240                         ASSERT3S(dva_mapping_overlap_compare(&offset,
241                             &vim->vim_entries[index - 1]), >, 0);
242
243                         return (NULL);
244                 } else {
245                         /*
246                          * Just to be safe, we verify the offset falls
247                          * in between the mapping entries at index and
248                          * one less than index. Since we know the offset
249                          * doesn't overlap an entry, and we're supposed
250                          * to return the entry just greater than the
251                          * offset, both of the following tests must be
252                          * true.
253                          */
254                         ASSERT3S(dva_mapping_overlap_compare(&offset,
255                             &vim->vim_entries[index]), <, 0);
256                         IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
257                             &vim->vim_entries[index - 1]) > 0);
258
259                         return (&vim->vim_entries[index]);
260                 }
261         } else {
262                 return (entry);
263         }
264 }
265
266 vdev_indirect_mapping_entry_phys_t *
267 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
268     uint64_t offset)
269 {
270         return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
271             B_FALSE));
272 }
273
274 vdev_indirect_mapping_entry_phys_t *
275 vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
276     uint64_t offset)
277 {
278         return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
279             B_TRUE));
280 }
281
282 void
283 vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
284 {
285         ASSERT(vdev_indirect_mapping_verify(vim));
286
287         if (vim->vim_phys->vimp_num_entries > 0) {
288                 uint64_t map_size = vdev_indirect_mapping_size(vim);
289                 kmem_free(vim->vim_entries, map_size);
290                 vim->vim_entries = NULL;
291         }
292
293         dmu_buf_rele(vim->vim_dbuf, vim);
294
295         vim->vim_objset = NULL;
296         vim->vim_object = 0;
297         vim->vim_dbuf = NULL;
298         vim->vim_phys = NULL;
299
300         kmem_free(vim, sizeof (*vim));
301 }
302
303 uint64_t
304 vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
305 {
306         uint64_t object;
307         ASSERT(dmu_tx_is_syncing(tx));
308         uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
309
310         if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
311                 bonus_size = sizeof (vdev_indirect_mapping_phys_t);
312         }
313
314         object = dmu_object_alloc(os,
315             DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
316             DMU_OTN_UINT64_METADATA, bonus_size,
317             tx);
318
319         if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
320                 dmu_buf_t *dbuf;
321                 vdev_indirect_mapping_phys_t *vimp;
322
323                 VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
324                 dmu_buf_will_dirty(dbuf, tx);
325                 vimp = dbuf->db_data;
326                 vimp->vimp_counts_object = dmu_object_alloc(os,
327                     DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
328                     DMU_OT_NONE, 0, tx);
329                 spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
330                 dmu_buf_rele(dbuf, FTAG);
331         }
332
333         return (object);
334 }
335
336
337 vdev_indirect_mapping_t *
338 vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
339 {
340         vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
341         dmu_object_info_t doi;
342         VERIFY0(dmu_object_info(os, mapping_object, &doi));
343
344         vim->vim_objset = os;
345         vim->vim_object = mapping_object;
346
347         VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
348             &vim->vim_dbuf));
349         vim->vim_phys = vim->vim_dbuf->db_data;
350
351         vim->vim_havecounts =
352             (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
353
354         if (vim->vim_phys->vimp_num_entries > 0) {
355                 uint64_t map_size = vdev_indirect_mapping_size(vim);
356                 vim->vim_entries = kmem_alloc(map_size, KM_SLEEP);
357                 VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
358                     vim->vim_entries, DMU_READ_PREFETCH));
359         }
360
361         ASSERT(vdev_indirect_mapping_verify(vim));
362
363         return (vim);
364 }
365
366 void
367 vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
368 {
369         vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
370         if (vim->vim_havecounts) {
371                 VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
372                     tx));
373                 spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
374         }
375         vdev_indirect_mapping_close(vim);
376
377         VERIFY0(dmu_object_free(os, object, tx));
378 }
379
380 /*
381  * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
382  * mapping object.  Also remove the entries from the list and free them.
383  * This also implicitly extends the max_offset of the mapping (to the end
384  * of the last entry).
385  */
386 void
387 vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
388     list_t *list, dmu_tx_t *tx)
389 {
390         vdev_indirect_mapping_entry_phys_t *mapbuf;
391         uint64_t old_size;
392         uint32_t *countbuf = NULL;
393         vdev_indirect_mapping_entry_phys_t *old_entries;
394         uint64_t old_count;
395         uint64_t entries_written = 0;
396
397         ASSERT(vdev_indirect_mapping_verify(vim));
398         ASSERT(dmu_tx_is_syncing(tx));
399         ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
400         ASSERT(!list_is_empty(list));
401
402         old_size = vdev_indirect_mapping_size(vim);
403         old_entries = vim->vim_entries;
404         old_count = vim->vim_phys->vimp_num_entries;
405
406         dmu_buf_will_dirty(vim->vim_dbuf, tx);
407
408         mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
409         if (vim->vim_havecounts) {
410                 countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
411                 ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
412                     SPA_FEATURE_OBSOLETE_COUNTS));
413         }
414         while (!list_is_empty(list)) {
415                 uint64_t i;
416                 /*
417                  * Write entries from the list to the
418                  * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
419                  */
420                 for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
421                         vdev_indirect_mapping_entry_t *entry =
422                             list_remove_head(list);
423                         if (entry == NULL)
424                                 break;
425
426                         uint64_t size =
427                             DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
428                         uint64_t src_offset =
429                             DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);
430
431                         /*
432                          * We shouldn't be adding an entry which is fully
433                          * obsolete.
434                          */
435                         ASSERT3U(entry->vime_obsolete_count, <, size);
436                         IMPLY(entry->vime_obsolete_count != 0,
437                             vim->vim_havecounts);
438
439                         mapbuf[i] = entry->vime_mapping;
440                         if (vim->vim_havecounts)
441                                 countbuf[i] = entry->vime_obsolete_count;
442
443                         vim->vim_phys->vimp_bytes_mapped += size;
444                         ASSERT3U(src_offset, >=,
445                             vim->vim_phys->vimp_max_offset);
446                         vim->vim_phys->vimp_max_offset = src_offset + size;
447
448                         entries_written++;
449
450                         kmem_free(entry, sizeof (*entry));
451                 }
452                 dmu_write(vim->vim_objset, vim->vim_object,
453                     vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
454                     i * sizeof (*mapbuf),
455                     mapbuf, tx);
456                 if (vim->vim_havecounts) {
457                         dmu_write(vim->vim_objset,
458                             vim->vim_phys->vimp_counts_object,
459                             vim->vim_phys->vimp_num_entries *
460                             sizeof (*countbuf),
461                             i * sizeof (*countbuf), countbuf, tx);
462                 }
463                 vim->vim_phys->vimp_num_entries += i;
464         }
465         zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
466         if (vim->vim_havecounts)
467                 zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
468
469         /*
470          * Update the entry array to reflect the new entries. First, copy
471          * over any old entries then read back the new entries we just wrote.
472          */
473         uint64_t new_size = vdev_indirect_mapping_size(vim);
474         ASSERT3U(new_size, >, old_size);
475         ASSERT3U(new_size - old_size, ==,
476             entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
477         vim->vim_entries = kmem_alloc(new_size, KM_SLEEP);
478         if (old_size > 0) {
479                 bcopy(old_entries, vim->vim_entries, old_size);
480                 kmem_free(old_entries, old_size);
481         }
482         VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
483             new_size - old_size, &vim->vim_entries[old_count],
484             DMU_READ_PREFETCH));
485
486         zfs_dbgmsg("txg %llu: wrote %llu entries to "
487             "indirect mapping obj %llu; max offset=0x%llx",
488             (u_longlong_t)dmu_tx_get_txg(tx),
489             (u_longlong_t)entries_written,
490             (u_longlong_t)vim->vim_object,
491             (u_longlong_t)vim->vim_phys->vimp_max_offset);
492 }
493
494 /*
495  * Increment the relevant counts for the specified offset and length.
496  * The counts array must be obtained from
497  * vdev_indirect_mapping_load_obsolete_counts().
498  */
499 void
500 vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
501     uint64_t offset, uint64_t length, uint32_t *counts)
502 {
503         vdev_indirect_mapping_entry_phys_t *mapping;
504         uint64_t index;
505
506         mapping = vdev_indirect_mapping_entry_for_offset(vim,  offset);
507
508         ASSERT(length > 0);
509         ASSERT3P(mapping, !=, NULL);
510
511         index = mapping - vim->vim_entries;
512
513         while (length > 0) {
514                 ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
515
516                 uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
517                 uint64_t inner_offset = offset -
518                     DVA_MAPPING_GET_SRC_OFFSET(mapping);
519                 VERIFY3U(inner_offset, <, size);
520                 uint64_t inner_size = MIN(length, size - inner_offset);
521
522                 VERIFY3U(counts[index] + inner_size, <=, size);
523                 counts[index] += inner_size;
524
525                 offset += inner_size;
526                 length -= inner_size;
527                 mapping++;
528                 index++;
529         }
530 }
531
532 typedef struct load_obsolete_space_map_arg {
533         vdev_indirect_mapping_t *losma_vim;
534         uint32_t                *losma_counts;
535 } load_obsolete_space_map_arg_t;
536
537 static int
538 load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
539 {
540         load_obsolete_space_map_arg_t *losma = arg;
541         ASSERT3S(sme->sme_type, ==, SM_ALLOC);
542
543         vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
544             sme->sme_offset, sme->sme_run, losma->losma_counts);
545
546         return (0);
547 }
548
549 /*
550  * Modify the counts (increment them) based on the spacemap.
551  */
552 void
553 vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
554     uint32_t *counts, space_map_t *obsolete_space_sm)
555 {
556         load_obsolete_space_map_arg_t losma;
557         losma.losma_counts = counts;
558         losma.losma_vim = vim;
559         VERIFY0(space_map_iterate(obsolete_space_sm,
560             load_obsolete_sm_callback, &losma));
561 }
562
563 /*
564  * Read the obsolete counts from disk, returning them in an array.
565  */
566 uint32_t *
567 vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
568 {
569         ASSERT(vdev_indirect_mapping_verify(vim));
570
571         uint64_t counts_size =
572             vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
573         uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP);
574         if (vim->vim_havecounts) {
575                 VERIFY0(dmu_read(vim->vim_objset,
576                     vim->vim_phys->vimp_counts_object,
577                     0, counts_size,
578                     counts, DMU_READ_PREFETCH));
579         } else {
580                 bzero(counts, counts_size);
581         }
582         return (counts);
583 }
584
585 extern void
586 vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
587     uint32_t *counts)
588 {
589         ASSERT(vdev_indirect_mapping_verify(vim));
590
591         kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
592 }