]> CyberLeo.Net >> Repos - FreeBSD/releng/10.2.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
- Copy stable/10@285827 to releng/10.2 in preparation for 10.2-RC1
[FreeBSD/releng/10.2.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / dmu_zfetch.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25
26 /*
27  * Copyright (c) 2013 by Delphix. All rights reserved.
28  */
29
30 #include <sys/zfs_context.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dmu_zfetch.h>
34 #include <sys/dmu.h>
35 #include <sys/dbuf.h>
36 #include <sys/kstat.h>
37
38 /*
39  * I'm against tune-ables, but these should probably exist as tweakable globals
40  * until we can get this working the way we want it to.
41  */
42
43 int zfs_prefetch_disable = 0;
44
45 /* max # of streams per zfetch */
46 uint32_t        zfetch_max_streams = 8;
47 /* min time before stream reclaim */
48 uint32_t        zfetch_min_sec_reap = 2;
49 /* max number of blocks to fetch at a time */
50 uint32_t        zfetch_block_cap = 256;
51 /* number of bytes in a array_read at which we stop prefetching (1Mb) */
52 uint64_t        zfetch_array_rd_sz = 1024 * 1024;
53
54 SYSCTL_DECL(_vfs_zfs);
55 SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
56     &zfs_prefetch_disable, 0, "Disable prefetch");
57 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
58 TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams);
59 SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RW,
60     &zfetch_max_streams, 0, "Max # of streams per zfetch");
61 TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap);
62 SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN,
63     &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
64 TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap);
65 SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN,
66     &zfetch_block_cap, 0, "Max number of blocks to fetch at a time");
67 TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz);
68 SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN,
69     &zfetch_array_rd_sz, 0,
70     "Number of bytes in a array_read at which we stop prefetching");
71
72 /* forward decls for static routines */
73 static boolean_t        dmu_zfetch_colinear(zfetch_t *, zstream_t *);
74 static void             dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
75 static uint64_t         dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
76 static uint64_t         dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
77 static boolean_t        dmu_zfetch_find(zfetch_t *, zstream_t *, int);
78 static int              dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
79 static zstream_t        *dmu_zfetch_stream_reclaim(zfetch_t *);
80 static void             dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
81 static int              dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
82
83 typedef struct zfetch_stats {
84         kstat_named_t zfetchstat_hits;
85         kstat_named_t zfetchstat_misses;
86         kstat_named_t zfetchstat_colinear_hits;
87         kstat_named_t zfetchstat_colinear_misses;
88         kstat_named_t zfetchstat_stride_hits;
89         kstat_named_t zfetchstat_stride_misses;
90         kstat_named_t zfetchstat_reclaim_successes;
91         kstat_named_t zfetchstat_reclaim_failures;
92         kstat_named_t zfetchstat_stream_resets;
93         kstat_named_t zfetchstat_stream_noresets;
94         kstat_named_t zfetchstat_bogus_streams;
95 } zfetch_stats_t;
96
97 static zfetch_stats_t zfetch_stats = {
98         { "hits",                       KSTAT_DATA_UINT64 },
99         { "misses",                     KSTAT_DATA_UINT64 },
100         { "colinear_hits",              KSTAT_DATA_UINT64 },
101         { "colinear_misses",            KSTAT_DATA_UINT64 },
102         { "stride_hits",                KSTAT_DATA_UINT64 },
103         { "stride_misses",              KSTAT_DATA_UINT64 },
104         { "reclaim_successes",          KSTAT_DATA_UINT64 },
105         { "reclaim_failures",           KSTAT_DATA_UINT64 },
106         { "streams_resets",             KSTAT_DATA_UINT64 },
107         { "streams_noresets",           KSTAT_DATA_UINT64 },
108         { "bogus_streams",              KSTAT_DATA_UINT64 },
109 };
110
111 #define ZFETCHSTAT_INCR(stat, val) \
112         atomic_add_64(&zfetch_stats.stat.value.ui64, (val));
113
114 #define ZFETCHSTAT_BUMP(stat)           ZFETCHSTAT_INCR(stat, 1);
115
116 kstat_t         *zfetch_ksp;
117
118 /*
119  * Given a zfetch structure and a zstream structure, determine whether the
120  * blocks to be read are part of a co-linear pair of existing prefetch
121  * streams.  If a set is found, coalesce the streams, removing one, and
122  * configure the prefetch so it looks for a strided access pattern.
123  *
124  * In other words: if we find two sequential access streams that are
125  * the same length and distance N appart, and this read is N from the
126  * last stream, then we are probably in a strided access pattern.  So
127  * combine the two sequential streams into a single strided stream.
128  *
129  * Returns whether co-linear streams were found.
130  */
131 static boolean_t
132 dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
133 {
134         zstream_t       *z_walk;
135         zstream_t       *z_comp;
136
137         if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
138                 return (0);
139
140         if (zh == NULL) {
141                 rw_exit(&zf->zf_rwlock);
142                 return (0);
143         }
144
145         for (z_walk = list_head(&zf->zf_stream); z_walk;
146             z_walk = list_next(&zf->zf_stream, z_walk)) {
147                 for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
148                     z_comp = list_next(&zf->zf_stream, z_comp)) {
149                         int64_t         diff;
150
151                         if (z_walk->zst_len != z_walk->zst_stride ||
152                             z_comp->zst_len != z_comp->zst_stride) {
153                                 continue;
154                         }
155
156                         diff = z_comp->zst_offset - z_walk->zst_offset;
157                         if (z_comp->zst_offset + diff == zh->zst_offset) {
158                                 z_walk->zst_offset = zh->zst_offset;
159                                 z_walk->zst_direction = diff < 0 ? -1 : 1;
160                                 z_walk->zst_stride =
161                                     diff * z_walk->zst_direction;
162                                 z_walk->zst_ph_offset =
163                                     zh->zst_offset + z_walk->zst_stride;
164                                 dmu_zfetch_stream_remove(zf, z_comp);
165                                 mutex_destroy(&z_comp->zst_lock);
166                                 kmem_free(z_comp, sizeof (zstream_t));
167
168                                 dmu_zfetch_dofetch(zf, z_walk);
169
170                                 rw_exit(&zf->zf_rwlock);
171                                 return (1);
172                         }
173
174                         diff = z_walk->zst_offset - z_comp->zst_offset;
175                         if (z_walk->zst_offset + diff == zh->zst_offset) {
176                                 z_walk->zst_offset = zh->zst_offset;
177                                 z_walk->zst_direction = diff < 0 ? -1 : 1;
178                                 z_walk->zst_stride =
179                                     diff * z_walk->zst_direction;
180                                 z_walk->zst_ph_offset =
181                                     zh->zst_offset + z_walk->zst_stride;
182                                 dmu_zfetch_stream_remove(zf, z_comp);
183                                 mutex_destroy(&z_comp->zst_lock);
184                                 kmem_free(z_comp, sizeof (zstream_t));
185
186                                 dmu_zfetch_dofetch(zf, z_walk);
187
188                                 rw_exit(&zf->zf_rwlock);
189                                 return (1);
190                         }
191                 }
192         }
193
194         rw_exit(&zf->zf_rwlock);
195         return (0);
196 }
197
198 /*
199  * Given a zstream_t, determine the bounds of the prefetch.  Then call the
200  * routine that actually prefetches the individual blocks.
201  */
202 static void
203 dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
204 {
205         uint64_t        prefetch_tail;
206         uint64_t        prefetch_limit;
207         uint64_t        prefetch_ofst;
208         uint64_t        prefetch_len;
209         uint64_t        blocks_fetched;
210
211         zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
212         zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
213
214         prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
215             (int64_t)(zs->zst_offset + zs->zst_stride));
216         /*
217          * XXX: use a faster division method?
218          */
219         prefetch_limit = zs->zst_offset + zs->zst_len +
220             (zs->zst_cap * zs->zst_stride) / zs->zst_len;
221
222         while (prefetch_tail < prefetch_limit) {
223                 prefetch_ofst = zs->zst_offset + zs->zst_direction *
224                     (prefetch_tail - zs->zst_offset);
225
226                 prefetch_len = zs->zst_len;
227
228                 /*
229                  * Don't prefetch beyond the end of the file, if working
230                  * backwards.
231                  */
232                 if ((zs->zst_direction == ZFETCH_BACKWARD) &&
233                     (prefetch_ofst > prefetch_tail)) {
234                         prefetch_len += prefetch_ofst;
235                         prefetch_ofst = 0;
236                 }
237
238                 /* don't prefetch more than we're supposed to */
239                 if (prefetch_len > zs->zst_len)
240                         break;
241
242                 blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
243                     prefetch_ofst, zs->zst_len);
244
245                 prefetch_tail += zs->zst_stride;
246                 /* stop if we've run out of stuff to prefetch */
247                 if (blocks_fetched < zs->zst_len)
248                         break;
249         }
250         zs->zst_ph_offset = prefetch_tail;
251         zs->zst_last = ddi_get_lbolt();
252 }
253
254 void
255 zfetch_init(void)
256 {
257
258         zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
259             KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
260             KSTAT_FLAG_VIRTUAL);
261
262         if (zfetch_ksp != NULL) {
263                 zfetch_ksp->ks_data = &zfetch_stats;
264                 kstat_install(zfetch_ksp);
265         }
266 }
267
268 void
269 zfetch_fini(void)
270 {
271         if (zfetch_ksp != NULL) {
272                 kstat_delete(zfetch_ksp);
273                 zfetch_ksp = NULL;
274         }
275 }
276
277 /*
278  * This takes a pointer to a zfetch structure and a dnode.  It performs the
279  * necessary setup for the zfetch structure, grokking data from the
280  * associated dnode.
281  */
282 void
283 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
284 {
285         if (zf == NULL) {
286                 return;
287         }
288
289         zf->zf_dnode = dno;
290         zf->zf_stream_cnt = 0;
291         zf->zf_alloc_fail = 0;
292
293         list_create(&zf->zf_stream, sizeof (zstream_t),
294             offsetof(zstream_t, zst_node));
295
296         rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
297 }
298
299 /*
300  * This function computes the actual size, in blocks, that can be prefetched,
301  * and fetches it.
302  */
303 static uint64_t
304 dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
305 {
306         uint64_t        fetchsz;
307         uint64_t        i;
308
309         fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
310
311         for (i = 0; i < fetchsz; i++) {
312                 dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
313         }
314
315         return (fetchsz);
316 }
317
318 /*
319  * this function returns the number of blocks that would be prefetched, based
320  * upon the supplied dnode, blockid, and nblks.  This is used so that we can
321  * update streams in place, and then prefetch with their old value after the
322  * fact.  This way, we can delay the prefetch, but subsequent accesses to the
323  * stream won't result in the same data being prefetched multiple times.
324  */
325 static uint64_t
326 dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
327 {
328         uint64_t        fetchsz;
329
330         if (blkid > dn->dn_maxblkid) {
331                 return (0);
332         }
333
334         /* compute fetch size */
335         if (blkid + nblks + 1 > dn->dn_maxblkid) {
336                 fetchsz = (dn->dn_maxblkid - blkid) + 1;
337                 ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
338         } else {
339                 fetchsz = nblks;
340         }
341
342
343         return (fetchsz);
344 }
345
346 /*
347  * given a zfetch and a zstream structure, see if there is an associated zstream
348  * for this block read.  If so, it starts a prefetch for the stream it
349  * located and returns true, otherwise it returns false
350  */
351 static boolean_t
352 dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
353 {
354         zstream_t       *zs;
355         int64_t         diff;
356         int             reset = !prefetched;
357         int             rc = 0;
358
359         if (zh == NULL)
360                 return (0);
361
362         /*
363          * XXX: This locking strategy is a bit coarse; however, it's impact has
364          * yet to be tested.  If this turns out to be an issue, it can be
365          * modified in a number of different ways.
366          */
367
368         rw_enter(&zf->zf_rwlock, RW_READER);
369 top:
370
371         for (zs = list_head(&zf->zf_stream); zs;
372             zs = list_next(&zf->zf_stream, zs)) {
373
374                 /*
375                  * XXX - should this be an assert?
376                  */
377                 if (zs->zst_len == 0) {
378                         /* bogus stream */
379                         ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
380                         continue;
381                 }
382
383                 /*
384                  * We hit this case when we are in a strided prefetch stream:
385                  * we will read "len" blocks before "striding".
386                  */
387                 if (zh->zst_offset >= zs->zst_offset &&
388                     zh->zst_offset < zs->zst_offset + zs->zst_len) {
389                         if (prefetched) {
390                                 /* already fetched */
391                                 ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
392                                 rc = 1;
393                                 goto out;
394                         } else {
395                                 ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
396                         }
397                 }
398
399                 /*
400                  * This is the forward sequential read case: we increment
401                  * len by one each time we hit here, so we will enter this
402                  * case on every read.
403                  */
404                 if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
405
406                         reset = !prefetched && zs->zst_len > 1;
407
408                         if (mutex_tryenter(&zs->zst_lock) == 0) {
409                                 rc = 1;
410                                 goto out;
411                         }
412
413                         if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
414                                 mutex_exit(&zs->zst_lock);
415                                 goto top;
416                         }
417                         zs->zst_len += zh->zst_len;
418                         diff = zs->zst_len - zfetch_block_cap;
419                         if (diff > 0) {
420                                 zs->zst_offset += diff;
421                                 zs->zst_len = zs->zst_len > diff ?
422                                     zs->zst_len - diff : 0;
423                         }
424                         zs->zst_direction = ZFETCH_FORWARD;
425
426                         break;
427
428                 /*
429                  * Same as above, but reading backwards through the file.
430                  */
431                 } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
432                         /* backwards sequential access */
433
434                         reset = !prefetched && zs->zst_len > 1;
435
436                         if (mutex_tryenter(&zs->zst_lock) == 0) {
437                                 rc = 1;
438                                 goto out;
439                         }
440
441                         if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
442                                 mutex_exit(&zs->zst_lock);
443                                 goto top;
444                         }
445
446                         zs->zst_offset = zs->zst_offset > zh->zst_len ?
447                             zs->zst_offset - zh->zst_len : 0;
448                         zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
449                             zs->zst_ph_offset - zh->zst_len : 0;
450                         zs->zst_len += zh->zst_len;
451
452                         diff = zs->zst_len - zfetch_block_cap;
453                         if (diff > 0) {
454                                 zs->zst_ph_offset = zs->zst_ph_offset > diff ?
455                                     zs->zst_ph_offset - diff : 0;
456                                 zs->zst_len = zs->zst_len > diff ?
457                                     zs->zst_len - diff : zs->zst_len;
458                         }
459                         zs->zst_direction = ZFETCH_BACKWARD;
460
461                         break;
462
463                 } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
464                     zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
465                         /* strided forward access */
466
467                         if (mutex_tryenter(&zs->zst_lock) == 0) {
468                                 rc = 1;
469                                 goto out;
470                         }
471
472                         if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
473                             zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
474                                 mutex_exit(&zs->zst_lock);
475                                 goto top;
476                         }
477
478                         zs->zst_offset += zs->zst_stride;
479                         zs->zst_direction = ZFETCH_FORWARD;
480
481                         break;
482
483                 } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
484                     zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
485                         /* strided reverse access */
486
487                         if (mutex_tryenter(&zs->zst_lock) == 0) {
488                                 rc = 1;
489                                 goto out;
490                         }
491
492                         if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
493                             zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
494                                 mutex_exit(&zs->zst_lock);
495                                 goto top;
496                         }
497
498                         zs->zst_offset = zs->zst_offset > zs->zst_stride ?
499                             zs->zst_offset - zs->zst_stride : 0;
500                         zs->zst_ph_offset = (zs->zst_ph_offset >
501                             (2 * zs->zst_stride)) ?
502                             (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
503                         zs->zst_direction = ZFETCH_BACKWARD;
504
505                         break;
506                 }
507         }
508
509         if (zs) {
510                 if (reset) {
511                         zstream_t *remove = zs;
512
513                         ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
514                         rc = 0;
515                         mutex_exit(&zs->zst_lock);
516                         rw_exit(&zf->zf_rwlock);
517                         rw_enter(&zf->zf_rwlock, RW_WRITER);
518                         /*
519                          * Relocate the stream, in case someone removes
520                          * it while we were acquiring the WRITER lock.
521                          */
522                         for (zs = list_head(&zf->zf_stream); zs;
523                             zs = list_next(&zf->zf_stream, zs)) {
524                                 if (zs == remove) {
525                                         dmu_zfetch_stream_remove(zf, zs);
526                                         mutex_destroy(&zs->zst_lock);
527                                         kmem_free(zs, sizeof (zstream_t));
528                                         break;
529                                 }
530                         }
531                 } else {
532                         ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
533                         rc = 1;
534                         dmu_zfetch_dofetch(zf, zs);
535                         mutex_exit(&zs->zst_lock);
536                 }
537         }
538 out:
539         rw_exit(&zf->zf_rwlock);
540         return (rc);
541 }
542
543 /*
544  * Clean-up state associated with a zfetch structure.  This frees allocated
545  * structure members, empties the zf_stream tree, and generally makes things
546  * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
547  */
548 void
549 dmu_zfetch_rele(zfetch_t *zf)
550 {
551         zstream_t       *zs;
552         zstream_t       *zs_next;
553
554         ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
555
556         for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
557                 zs_next = list_next(&zf->zf_stream, zs);
558
559                 list_remove(&zf->zf_stream, zs);
560                 mutex_destroy(&zs->zst_lock);
561                 kmem_free(zs, sizeof (zstream_t));
562         }
563         list_destroy(&zf->zf_stream);
564         rw_destroy(&zf->zf_rwlock);
565
566         zf->zf_dnode = NULL;
567 }
568
569 /*
570  * Given a zfetch and zstream structure, insert the zstream structure into the
571  * AVL tree contained within the zfetch structure.  Peform the appropriate
572  * book-keeping.  It is possible that another thread has inserted a stream which
573  * matches one that we are about to insert, so we must be sure to check for this
574  * case.  If one is found, return failure, and let the caller cleanup the
575  * duplicates.
576  */
577 static int
578 dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
579 {
580         zstream_t       *zs_walk;
581         zstream_t       *zs_next;
582
583         ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
584
585         for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
586                 zs_next = list_next(&zf->zf_stream, zs_walk);
587
588                 if (dmu_zfetch_streams_equal(zs_walk, zs)) {
589                         return (0);
590                 }
591         }
592
593         list_insert_head(&zf->zf_stream, zs);
594         zf->zf_stream_cnt++;
595         return (1);
596 }
597
598
599 /*
600  * Walk the list of zstreams in the given zfetch, find an old one (by time), and
601  * reclaim it for use by the caller.
602  */
603 static zstream_t *
604 dmu_zfetch_stream_reclaim(zfetch_t *zf)
605 {
606         zstream_t       *zs;
607         clock_t         ticks;
608
609         ticks = zfetch_min_sec_reap * hz;
610         if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
611                 return (0);
612
613         for (zs = list_head(&zf->zf_stream); zs;
614             zs = list_next(&zf->zf_stream, zs)) {
615
616                 if (ddi_get_lbolt() - zs->zst_last > ticks)
617                         break;
618         }
619
620         if (zs) {
621                 dmu_zfetch_stream_remove(zf, zs);
622                 mutex_destroy(&zs->zst_lock);
623                 bzero(zs, sizeof (zstream_t));
624         } else {
625                 zf->zf_alloc_fail++;
626         }
627         rw_exit(&zf->zf_rwlock);
628
629         return (zs);
630 }
631
632 /*
633  * Given a zfetch and zstream structure, remove the zstream structure from its
634  * container in the zfetch structure.  Perform the appropriate book-keeping.
635  */
636 static void
637 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
638 {
639         ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
640
641         list_remove(&zf->zf_stream, zs);
642         zf->zf_stream_cnt--;
643 }
644
645 static int
646 dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
647 {
648         if (zs1->zst_offset != zs2->zst_offset)
649                 return (0);
650
651         if (zs1->zst_len != zs2->zst_len)
652                 return (0);
653
654         if (zs1->zst_stride != zs2->zst_stride)
655                 return (0);
656
657         if (zs1->zst_ph_offset != zs2->zst_ph_offset)
658                 return (0);
659
660         if (zs1->zst_cap != zs2->zst_cap)
661                 return (0);
662
663         if (zs1->zst_direction != zs2->zst_direction)
664                 return (0);
665
666         return (1);
667 }
668
669 /*
670  * This is the prefetch entry point.  It calls all of the other dmu_zfetch
671  * routines to create, delete, find, or operate upon prefetch streams.
672  */
673 void
674 dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
675 {
676         zstream_t       zst;
677         zstream_t       *newstream;
678         boolean_t       fetched;
679         int             inserted;
680         unsigned int    blkshft;
681         uint64_t        blksz;
682
683         if (zfs_prefetch_disable)
684                 return;
685
686         /* files that aren't ln2 blocksz are only one block -- nothing to do */
687         if (!zf->zf_dnode->dn_datablkshift)
688                 return;
689
690         /* convert offset and size, into blockid and nblocks */
691         blkshft = zf->zf_dnode->dn_datablkshift;
692         blksz = (1 << blkshft);
693
694         bzero(&zst, sizeof (zstream_t));
695         zst.zst_offset = offset >> blkshft;
696         zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
697             P2ALIGN(offset, blksz)) >> blkshft;
698
699         fetched = dmu_zfetch_find(zf, &zst, prefetched);
700         if (fetched) {
701                 ZFETCHSTAT_BUMP(zfetchstat_hits);
702         } else {
703                 ZFETCHSTAT_BUMP(zfetchstat_misses);
704                 fetched = dmu_zfetch_colinear(zf, &zst);
705                 if (fetched) {
706                         ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
707                 } else {
708                         ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
709                 }
710         }
711
712         if (!fetched) {
713                 newstream = dmu_zfetch_stream_reclaim(zf);
714
715                 /*
716                  * we still couldn't find a stream, drop the lock, and allocate
717                  * one if possible.  Otherwise, give up and go home.
718                  */
719                 if (newstream) {
720                         ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
721                 } else {
722                         uint64_t        maxblocks;
723                         uint32_t        max_streams;
724                         uint32_t        cur_streams;
725
726                         ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
727                         cur_streams = zf->zf_stream_cnt;
728                         maxblocks = zf->zf_dnode->dn_maxblkid;
729
730                         max_streams = MIN(zfetch_max_streams,
731                             (maxblocks / zfetch_block_cap));
732                         if (max_streams == 0) {
733                                 max_streams++;
734                         }
735
736                         if (cur_streams >= max_streams) {
737                                 return;
738                         }
739                         newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
740                 }
741
742                 newstream->zst_offset = zst.zst_offset;
743                 newstream->zst_len = zst.zst_len;
744                 newstream->zst_stride = zst.zst_len;
745                 newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
746                 newstream->zst_cap = zst.zst_len;
747                 newstream->zst_direction = ZFETCH_FORWARD;
748                 newstream->zst_last = ddi_get_lbolt();
749
750                 mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
751
752                 rw_enter(&zf->zf_rwlock, RW_WRITER);
753                 inserted = dmu_zfetch_stream_insert(zf, newstream);
754                 rw_exit(&zf->zf_rwlock);
755
756                 if (!inserted) {
757                         mutex_destroy(&newstream->zst_lock);
758                         kmem_free(newstream, sizeof (zstream_t));
759                 }
760         }
761 }