1 /* stats.c -- implements the svn_fs_fs__get_stats private API.
3 * ====================================================================
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
20 * ====================================================================
23 #include "svn_dirent_uri.h"
25 #include "svn_pools.h"
26 #include "svn_sorts.h"
28 #include "private/svn_cache.h"
29 #include "private/svn_sorts_private.h"
30 #include "private/svn_string_private.h"
37 #include "cached_data.h"
38 #include "low_level.h"
41 #include "../libsvn_fs/fs-loader.h"
43 #include "svn_private_config.h"
45 /* We group representations into 2x2 different kinds plus one default:
46 * [dir / file] x [text / prop]. The assignment is done by the first node
47 * that references the respective representation.
 * (read_noderev sets the kind when a rep's ref_count goes from 0 to 1.)
49 typedef enum rep_kind_t
51 /* The representation is not used _directly_, i.e. not referenced by any
52 * noderev. However, some other representation may use it as delta base.
53 * Null value. Should not occur in real-world repositories. */
56 /* a representation holding the properties of a directory */
59 /* a representation holding the properties of a file */
69 /* Statistics record for a single representation: where it lives, how big
 * it is, how often it gets referenced and how long its delta chain is.
71 typedef struct rep_stats_t
73 /* offset in the revision file (phys. addressing) or
74 * item index within REVISION (log. addressing) */
75 apr_uint64_t item_index;
77 /* on-disk item length in bytes */
80 /* item length after de-deltification */
81 apr_uint64_t expanded_size;
83 /* revision that contains this representation
84 * (may be referenced by other revisions, though) */
85 svn_revnum_t revision;
87 /* number of nodes that reference this representation */
88 apr_uint32_t ref_count;
90 /* length of the PLAIN / DELTA line in the source file in bytes */
91 apr_uint16_t header_size;
93 /* classification of the representation. values of rep_kind_t */
96 /* length of the delta chain, including this representation,
97 * saturated to 255 - if need be */
98 apr_byte_t chain_length;
101 /* Represents a link in the rep delta chain. REVISION + ITEM_INDEX points
102 * to BASE_REVISION + BASE_ITEM_INDEX. We collect this info while scanning
103 * an f7 repo in a single pass (see read_log_rev_or_packfile) and resolve it
 * afterwards (see resolve_representation_refs). */
104 typedef struct rep_ref_t
106 /* Revision that contains this representation. */
107 svn_revnum_t revision;
109 /* Item index of this rep within REVISION. */
110 apr_uint64_t item_index;
112 /* Revision of the representation we deltified against.
113 * SVN_INVALID_REVNUM if this representation is either PLAIN or a
 * self-delta, i.e. whenever the rep header has no delta base. */
114 svn_revnum_t base_revision;
116 /* Item index of that rep within BASE_REVISION. */
117 apr_uint64_t base_item_index;
119 /* Length of the PLAIN / DELTA line in the source file in bytes.
120 * We use this to update the info in the rep stats after scanning the
122 apr_uint16_t header_size;
126 /* Represents a single revision.
127 * There will be only one instance per revision. */
128 typedef struct revision_info_t
130 /* number of this revision */
131 svn_revnum_t revision;
133 /* pack file offset (manifest value), 0 for non-packed files */
136 /* length of the changes list in bytes */
137 apr_uint64_t changes_len;
139 /* number of entries in the changes list */
140 apr_uint64_t change_count;
142 /* first offset behind the revision data in the pack file (file length
143 * for non-packed revs) */
146 /* number of directory noderevs in this revision */
147 apr_uint64_t dir_noderev_count;
149 /* number of file noderevs in this revision */
150 apr_uint64_t file_noderev_count;
152 /* total size of directory noderevs (i.e. the structs - not the rep) */
153 apr_uint64_t dir_noderev_size;
155 /* total size of file noderevs (i.e. the structs - not the rep) */
156 apr_uint64_t file_noderev_size;
158 /* all rep_stats_t whose REVISION is this revision; kept sorted by
159 * ITEM_INDEX so that find_representation can use a binary search */
160 apr_array_header_t *representations;
162 /* Temporary rev / pack file access object, used in phys. addressing
163 * mode only. NULL when done reading this revision. */
164 svn_fs_fs__revision_file_t *rev_file;
167 /* Root data structure containing all information about a given repository.
168 * We use it as a wrapper around svn_fs_t and pass it around where we would
169 * otherwise just use an svn_fs_t.
171 typedef struct query_t
176 /* The HEAD revision. */
179 /* Number of revs per shard; 0 for non-sharded repos. */
182 /* First non-packed revision. */
183 svn_revnum_t min_unpacked_rev;
/* revision_info_t * of every revision read so far, indexed by revision
 * number (see find_representation) */
186 apr_array_header_t *revisions;
188 /* empty representation.
189 * Used as a dummy base for DELTA reps without base. */
190 rep_stats_t *null_base;
192 /* collected statistics */
193 svn_fs_fs__stats_t *stats;
195 /* Progress notification callback to call after each shard (or every 1000
 * revisions in non-sharded repos). May be NULL. */
196 svn_fs_progress_notify_func_t progress_func;
198 /* Baton for PROGRESS_FUNC. */
199 void *progress_baton;
201 /* Cancellation support callback to call once in a while. May be NULL. */
202 svn_cancel_func_t cancel_func;
204 /* Baton for CANCEL_FUNC. */
208 /* Initialize the LARGEST_CHANGES member in STATS with a capacity of COUNT
209 * entries. Allocate the result in RESULT_POOL.
 * Each entry starts out with SIZE 0, REVISION SVN_INVALID_REVNUM and its own
 * PATH string buffer (created with a capacity hint of 1024). MIN_SIZE starts
 * at 1, which is the threshold add_change compares REP_SIZE against.
212 initialize_largest_changes(svn_fs_fs__stats_t *stats,
214 apr_pool_t *result_pool)
218 stats->largest_changes = apr_pcalloc(result_pool,
219 sizeof(*stats->largest_changes));
220 stats->largest_changes->count = count;
221 stats->largest_changes->min_size = 1;
222 stats->largest_changes->changes
223 = apr_palloc(result_pool, count * sizeof(*stats->largest_changes->changes));
225 /* allocate *all* entries before the path stringbufs. This increases
226 * cache locality and enhances performance significantly. */
227 for (i = 0; i < count; ++i)
228 stats->largest_changes->changes[i]
229 = apr_palloc(result_pool, sizeof(**stats->largest_changes->changes));
231 /* now initialize them and allocate the stringbufs */
232 for (i = 0; i < count; ++i)
234 stats->largest_changes->changes[i]->size = 0;
235 stats->largest_changes->changes[i]->revision = SVN_INVALID_REVNUM;
236 stats->largest_changes->changes[i]->path
237 = svn_stringbuf_create_ensure(1024, result_pool);
/* NOTE(review): SHIFT selects the histogram line; it is derived from the
 * loop condition (1 << SHIFT) <= SIZE, whose body is not visible here
 * (presumably ++shift -- TODO confirm). The TOTAL line and the selected
 * line both get their COUNT incremented and SIZE added to their SUM. */
241 /* Add entry for SIZE to HISTOGRAM.
244 add_to_histogram(svn_fs_fs__histogram_t *histogram,
247 apr_int64_t shift = 0;
249 while (((apr_int64_t)(1) << shift) <= size)
252 histogram->total.count++;
253 histogram->total.sum += size;
254 histogram->lines[(apr_size_t)shift].count++;
255 histogram->lines[(apr_size_t)shift].sum += size;
258 /* Update data aggregators in STATS with this representation of type KIND,
259 * on-disk REP_SIZE and expanded node size EXPANDED_SIZE for PATH in REVISION.
260 * PLAIN_ADDED is what read_noderev passes as !noderev->predecessor_id, i.e.
 * it is TRUE for nodes that have no predecessor.
263 add_change(svn_fs_fs__stats_t *stats,
264 apr_uint64_t rep_size,
265 apr_uint64_t expanded_size,
266 svn_revnum_t revision,
269 svn_boolean_t plain_added)
271 /* identify largest reps */
272 if (rep_size >= stats->largest_changes->min_size)
275 svn_fs_fs__largest_changes_t *largest_changes = stats->largest_changes;
276 svn_fs_fs__large_change_info_t *info
277 = largest_changes->changes[largest_changes->count - 1];
278 info->size = rep_size;
279 info->revision = revision;
280 svn_stringbuf_set(info->path, path);
282 /* linear insertion but not too bad since count is low and insertions
283 * near the end are more likely than close to front */
284 for (i = largest_changes->count - 1; i > 0; --i)
285 if (largest_changes->changes[i-1]->size >= rep_size)
288 largest_changes->changes[i] = largest_changes->changes[i-1];
290 largest_changes->changes[i] = info;
291 largest_changes->min_size
292 = largest_changes->changes[largest_changes->count-1]->size;
295 /* global histograms */
296 add_to_histogram(&stats->rep_size_histogram, rep_size);
297 add_to_histogram(&stats->node_size_histogram, expanded_size);
301 add_to_histogram(&stats->added_rep_size_histogram, rep_size);
302 add_to_histogram(&stats->added_node_size_histogram, expanded_size);
305 /* specific histograms by type */
309 add_to_histogram(&stats->unused_rep_histogram, rep_size);
311 case dir_property_rep:
312 add_to_histogram(&stats->dir_prop_rep_histogram, rep_size);
313 add_to_histogram(&stats->dir_prop_histogram, expanded_size);
315 case file_property_rep:
316 add_to_histogram(&stats->file_prop_rep_histogram, rep_size);
317 add_to_histogram(&stats->file_prop_histogram, expanded_size);
320 add_to_histogram(&stats->dir_rep_histogram, rep_size);
321 add_to_histogram(&stats->dir_histogram, expanded_size);
324 add_to_histogram(&stats->file_rep_histogram, rep_size);
325 add_to_histogram(&stats->file_histogram, expanded_size);
330 if (kind == file_rep)
332 /* determine extension: the text from the last dot of the last path
 * segment onwards; paths without a slash, names without a dot and names
 * whose only dot is the leading character are filed under (none) */
333 svn_fs_fs__extension_info_t *info;
334 const char * file_name = strrchr(path, '/');
335 const char * extension = file_name ? strrchr(file_name, '.') : NULL;
337 if (extension == NULL || extension == file_name + 1)
338 extension = "(none)";
340 /* get / auto-insert entry for this extension */
341 info = apr_hash_get(stats->by_extension, extension, APR_HASH_KEY_STRING);
344 apr_pool_t *pool = apr_hash_pool_get(stats->by_extension);
345 info = apr_pcalloc(pool, sizeof(*info));
346 info->extension = apr_pstrdup(pool, extension);
348 apr_hash_set(stats->by_extension, info->extension,
349 APR_HASH_KEY_STRING, info);
352 /* update per-extension histogram */
353 add_to_histogram(&info->node_histogram, expanded_size);
354 add_to_histogram(&info->rep_histogram, rep_size);
/* NOTE(review): the value compared is rep_stats_t.ITEM_INDEX, and DATA is
 * dereferenced as a pointer to a rep_stats_t pointer (an array element).
 * Only the greater / not-greater result is visible here; the lesser case
 * is presumably handled on a line not shown -- TODO confirm. */
358 /* Comparator used for binary search comparing the absolute file offset
359 * of a representation to some other offset. DATA is a *rep_stats_t,
360 * KEY is a pointer to an apr_uint64_t.
363 compare_representation_item_index(const void *data, const void *key)
365 apr_uint64_t lhs = (*(const rep_stats_t *const *)data)->item_index;
366 apr_uint64_t rhs = *(const apr_uint64_t *)key;
370 return (lhs > rhs ? 1 : 0);
373 /* Find the revision_info_t object for the given REVISION in QUERY and
374 * return it in *REVISION_INFO. For performance reasons, we skip the
375 * lookup if the info is already provided.
 * REVISION_INFO itself may be NULL; several callers pass NULL.
377 * In that revision, look for the rep_stats_t object for item ITEM_INDEX.
378 * If it already exists, set *IDX to its index in *REVISION_INFO's
379 * representations list and return the representation object. Otherwise,
380 * set the index to where it must be inserted and return NULL.
383 find_representation(int *idx,
385 revision_info_t **revision_info,
386 svn_revnum_t revision,
387 apr_uint64_t item_index)
389 revision_info_t *info;
392 /* first let's find the revision */
393 info = revision_info ? *revision_info : NULL;
394 if (info == NULL || info->revision != revision)
396 info = APR_ARRAY_IDX(query->revisions, revision, revision_info_t*);
398 *revision_info = info;
401 /* not found -> no result */
405 /* look for the representation */
406 *idx = svn_sort__bsearch_lower_bound(info->representations,
408 compare_representation_item_index);
409 if (*idx < info->representations->nelts)
411 /* return the representation, if this is the one we were looking for */
413 = APR_ARRAY_IDX(info->representations, *idx, rep_stats_t *);
414 if (result->item_index == item_index)
418 /* not parsed, yet */
422 /* Find / auto-construct the representation stats for REP in QUERY and
423 * return it in *REPRESENTATION.
 * A newly constructed stats object is inserted into REVISION_INFO's
 * REPRESENTATIONS array at the position reported by find_representation.
425 * If necessary, allocate the result in RESULT_POOL; use SCRATCH_POOL for
426 * temporary allocations.
429 parse_representation(rep_stats_t **representation,
431 representation_t *rep,
432 revision_info_t *revision_info,
433 apr_pool_t *result_pool,
434 apr_pool_t *scratch_pool)
439 /* read location (revision, offset) and size */
442 result = find_representation(&idx, query, &revision_info, rep->revision,
446 /* not parsed, yet (probably a rep in the same revision).
447 * Create a new rep object and determine its base rep as well.
449 result = apr_pcalloc(result_pool, sizeof(*result));
450 result->revision = rep->revision;
451 result->expanded_size = rep->expanded_size;
452 result->item_index = rep->item_index;
453 result->size = rep->size;
455 /* In phys. addressing mode, follow link to the actual representation.
456 * In log. addressing mode, we will find it already as part of our
457 * linear walk through the whole file. */
458 if (!svn_fs_fs__use_log_addressing(query->fs))
460 svn_fs_fs__rep_header_t *header;
461 apr_off_t offset = revision_info->offset
462 + (apr_off_t)rep->item_index;
464 SVN_ERR_ASSERT(revision_info->rev_file);
465 SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET,
466 &offset, scratch_pool));
467 SVN_ERR(svn_fs_fs__read_rep_header(&header,
468 revision_info->rev_file->stream,
469 scratch_pool, scratch_pool));
471 result->header_size = header->header_size;
473 /* Determine length of the delta chain.
 * NOTE(review): find_representation may return NULL according to its
 * docs; BASE_REP is used below without a check visible here. */
474 if (header->type == svn_fs_fs__rep_delta)
477 rep_stats_t *base_rep
478 = find_representation(&base_idx, query, NULL,
479 header->base_revision,
480 header->base_item_index);
482 result->chain_length = 1 + MIN(base_rep->chain_length,
487 result->chain_length = 1;
491 SVN_ERR(svn_sort__array_insert2(revision_info->representations, &result, idx));
494 *representation = result;
500 /* forward declaration: read_noderev and parse_dir call each other */
502 read_noderev(query_t *query,
503 svn_stringbuf_t *noderev_str,
504 revision_info_t *revision_info,
505 apr_pool_t *result_pool,
506 apr_pool_t *scratch_pool);
508 /* Read the noderev item at OFFSET in REVISION_INFO from the filesystem
509 * provided by QUERY. Return it in *NODEREV, allocated in RESULT_POOL.
510 * Use SCRATCH_POOL for temporary allocations.
 * OFFSET is relative to the start of the revision; REVISION_INFO->OFFSET
 * is added before seeking.
 * NOTE(review): the name looks like a typo of read_phys_noderev.
512 * The textual representation of the noderev will be used to determine
513 * the on-disk size of the noderev. Only called in phys. addressing mode.
516 read_phsy_noderev(svn_stringbuf_t **noderev,
519 revision_info_t *revision_info,
520 apr_pool_t *result_pool,
521 apr_pool_t *scratch_pool)
523 svn_stringbuf_t *noderev_str = svn_stringbuf_create_empty(result_pool);
524 svn_stringbuf_t *line;
527 apr_pool_t *iterpool = svn_pool_create(scratch_pool);
529 /* Navigate the file stream to the start of noderev. */
530 SVN_ERR_ASSERT(revision_info->rev_file);
532 offset += revision_info->offset;
533 SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET,
534 &offset, scratch_pool));
536 /* Read it (terminated by an empty line). Every line read is appended
 * with a newline re-added, including the terminating empty one. */
539 svn_pool_clear(iterpool);
541 SVN_ERR(svn_stream_readline(revision_info->rev_file->stream, &line,
542 "\n", &eof, iterpool));
543 svn_stringbuf_appendstr(noderev_str, line);
544 svn_stringbuf_appendbyte(noderev_str, '\n');
546 while (line->len > 0 && !eof);
548 /* Return the result. */
549 *noderev = noderev_str;
551 svn_pool_destroy(iterpool);
/* NOTE(review): only directory entries whose id reports REVISION_INFO's own
 * revision are read and passed on to read_noderev; all other entries are
 * skipped. */
556 /* Starting at the directory in NODEREV's text, read all DAG nodes,
557 * directories and representations linked in that tree structure.
558 * Store them in QUERY and REVISION_INFO. Also, read them only once.
560 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
564 parse_dir(query_t *query,
565 node_revision_t *noderev,
566 revision_info_t *revision_info,
567 apr_pool_t *result_pool,
568 apr_pool_t *scratch_pool)
570 apr_pool_t *iterpool = svn_pool_create(scratch_pool);
573 apr_array_header_t *entries;
574 SVN_ERR(svn_fs_fs__rep_contents_dir(&entries, query->fs, noderev,
575 scratch_pool, scratch_pool));
577 for (i = 0; i < entries->nelts; ++i)
579 svn_fs_dirent_t *dirent = APR_ARRAY_IDX(entries, i, svn_fs_dirent_t *);
581 if (svn_fs_fs__id_rev(dirent->id) == revision_info->revision)
583 svn_stringbuf_t *noderev_str;
584 svn_pool_clear(iterpool);
586 SVN_ERR(read_phsy_noderev(&noderev_str, query,
587 svn_fs_fs__id_item(dirent->id),
588 revision_info, iterpool, iterpool));
589 SVN_ERR(read_noderev(query, noderev_str, revision_info,
590 result_pool, iterpool));
594 svn_pool_destroy(iterpool);
599 /* Parse the noderev given as NODEREV_STR and store the info in QUERY and
600 * REVISION_INFO. In phys. addressing mode, continue reading all DAG nodes,
601 * directories and representations linked in that tree structure.
 * The length of NODEREV_STR is what gets added to the per-kind noderev
 * size counters in REVISION_INFO.
603 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
607 read_noderev(query_t *query,
608 svn_stringbuf_t *noderev_str,
609 revision_info_t *revision_info,
610 apr_pool_t *result_pool,
611 apr_pool_t *scratch_pool)
613 rep_stats_t *text = NULL;
614 rep_stats_t *props = NULL;
615 node_revision_t *noderev;
617 svn_stream_t *stream = svn_stream_from_stringbuf(noderev_str, scratch_pool);
618 SVN_ERR(svn_fs_fs__read_noderev(&noderev, stream, scratch_pool,
620 SVN_ERR(svn_fs_fs__fixup_expanded_size(query->fs, noderev->data_rep,
622 SVN_ERR(svn_fs_fs__fixup_expanded_size(query->fs, noderev->prop_rep,
625 if (noderev->data_rep)
627 SVN_ERR(parse_representation(&text, query,
628 noderev->data_rep, revision_info,
629 result_pool, scratch_pool));
631 /* if we are the first to use this rep, mark it as "text rep" */
632 if (++text->ref_count == 1)
633 text->kind = noderev->kind == svn_node_dir ? dir_rep : file_rep;
636 if (noderev->prop_rep)
638 SVN_ERR(parse_representation(&props, query,
639 noderev->prop_rep, revision_info,
640 result_pool, scratch_pool));
642 /* if we are the first to use this rep, mark it as "prop rep" */
643 if (++props->ref_count == 1)
644 props->kind = noderev->kind == svn_node_dir ? dir_property_rep
648 /* feed reps that were referenced for the first time into the statistics
 * (largest changes and histograms, see add_change) */
649 if (text && text->ref_count == 1)
650 add_change(query->stats, text->size, text->expanded_size, text->revision,
651 noderev->created_path, text->kind, !noderev->predecessor_id);
652 if (props && props->ref_count == 1)
653 add_change(query->stats, props->size, props->expanded_size,
654 props->revision, noderev->created_path, props->kind,
655 !noderev->predecessor_id);
657 /* if this is a directory and has not been processed, yet, read and
658 * process it recursively */
659 if ( noderev->kind == svn_node_dir && text && text->ref_count == 1
660 && !svn_fs_fs__use_log_addressing(query->fs))
661 SVN_ERR(parse_dir(query, noderev, revision_info, result_pool,
/* account for the noderev itself, separately for directories and files */
665 if (noderev->kind == svn_node_dir)
667 revision_info->dir_noderev_size += noderev_str->len;
668 revision_info->dir_noderev_count++;
672 revision_info->file_noderev_size += noderev_str->len;
673 revision_info->file_noderev_count++;
679 /* For the revision given as REVISION_INFO within QUERY, determine the number
680 * of entries in its changed paths list and store that info in REVISION_INFO.
 * The changes are fetched block-wise through a changes context; the sizes of
 * all blocks are added up in REVISION_INFO->CHANGE_COUNT.
681 * Use SCRATCH_POOL for temporary allocations.
684 get_phys_change_count(query_t *query,
685 revision_info_t *revision_info,
686 apr_pool_t *scratch_pool)
688 apr_pool_t *iterpool = svn_pool_create(scratch_pool);
689 svn_fs_fs__changes_context_t *context;
691 /* Fetch the first block of data. */
692 SVN_ERR(svn_fs_fs__create_changes_context(&context, query->fs,
693 revision_info->revision,
696 revision_info->change_count = 0;
697 while (!context->eol)
699 apr_array_header_t *changes;
701 svn_pool_clear(iterpool);
702 SVN_ERR(svn_fs_fs__get_changes(&changes, context, iterpool, iterpool));
/* Accumulate: every iteration delivers one more block of the list, so a
 * plain assignment would only keep the size of the last block. */
703 revision_info->change_count += changes->nelts;
706 svn_pool_destroy(iterpool);
711 /* Read the revision described by INFO in physical addressing mode. INFO's
712 * OFFSET, END and REV_FILE are used to locate and parse the revision trailer;
713 * the change count and CHANGES_LEN get stored in INFO, and the noderevs are
714 * then read starting at the root noderev. Use RESULT_POOL for persistent
 * allocations and SCRATCH_POOL for temporary ones. */
716 read_phys_revision(query_t *query,
717 revision_info_t *info,
718 apr_pool_t *result_pool,
719 apr_pool_t *scratch_pool)
722 apr_off_t root_node_offset;
723 apr_off_t changes_offset;
724 svn_stringbuf_t *trailer;
725 svn_stringbuf_t *noderev_str;
727 /* Read the last 64 bytes of the revision (if long enough).
 * NOTE(review): sizeof(buf) is unsigned; should INFO->END ever be smaller
 * than that, the subtraction may not yield the intended negative value --
 * confirm the types involved. */
728 apr_off_t start = MAX(info->offset, info->end - sizeof(buf));
729 apr_size_t len = (apr_size_t)(info->end - start);
730 SVN_ERR(svn_io_file_seek(info->rev_file->file, APR_SET, &start,
732 SVN_ERR(svn_io_file_read_full2(info->rev_file->file, buf, len, NULL, NULL,
734 trailer = svn_stringbuf_ncreate(buf, len, scratch_pool);
736 /* Parse that trailer. */
737 SVN_ERR(svn_fs_fs__parse_revision_trailer(&root_node_offset,
738 &changes_offset, trailer,
740 SVN_ERR(get_phys_change_count(query, info, scratch_pool));
742 /* Calculate the length of the changes list. */
743 trailer = svn_fs_fs__unparse_revision_trailer(root_node_offset,
746 info->changes_len = info->end - info->offset - changes_offset
749 /* Recursively read nodes added in this rev. */
750 SVN_ERR(read_phsy_noderev(&noderev_str, query, root_node_offset, info,
751 scratch_pool, scratch_pool));
752 SVN_ERR(read_noderev(query, noderev_str, info, result_pool, scratch_pool));
757 /* Read the content of the pack file starting at revision BASE in physical
758 * addressing mode and store it in QUERY.
 * The last revision of the shard ends at the pack file size; every other
 * revision ends where its successor starts.
760 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
764 read_phys_pack_file(query_t *query,
766 apr_pool_t *result_pool,
767 apr_pool_t *scratch_pool)
769 apr_pool_t *iterpool = svn_pool_create(scratch_pool);
771 svn_filesize_t file_size = 0;
772 svn_fs_fs__revision_file_t *rev_file;
774 SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base,
775 scratch_pool, scratch_pool));
776 SVN_ERR(svn_io_file_size_get(&file_size, rev_file->file, scratch_pool));
778 /* process each revision in the pack file */
779 for (i = 0; i < query->shard_size; ++i)
781 revision_info_t *info;
783 /* cancellation support */
784 if (query->cancel_func)
785 SVN_ERR(query->cancel_func(query->cancel_baton));
787 /* create the revision info for the current rev */
788 info = apr_pcalloc(result_pool, sizeof(*info));
789 info->representations = apr_array_make(result_pool, 4,
790 sizeof(rep_stats_t*));
791 info->rev_file = rev_file;
793 info->revision = base + i;
794 SVN_ERR(svn_fs_fs__get_packed_offset(&info->offset, query->fs, base + i,
796 if (i + 1 == query->shard_size)
797 info->end = file_size;
799 SVN_ERR(svn_fs_fs__get_packed_offset(&info->end, query->fs,
800 base + i + 1, iterpool));
802 SVN_ERR(read_phys_revision(query, info, result_pool, iterpool));
804 info->representations = apr_array_copy(result_pool,
805 info->representations);
807 /* Done with this revision. */
808 info->rev_file = NULL;
810 /* put it into our container */
811 APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
814 svn_pool_clear(iterpool);
817 /* Done with this pack file. */
818 SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
820 /* one more pack file processed */
821 if (query->progress_func)
822 query->progress_func(base, query->progress_baton, scratch_pool);
827 /* Read the content of the file for REVISION in physical addressing mode
828 * and store its contents in QUERY.
830 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
834 read_phys_revision_file(query_t *query,
835 svn_revnum_t revision,
836 apr_pool_t *result_pool,
837 apr_pool_t *scratch_pool)
839 revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info));
840 svn_filesize_t file_size = 0;
841 svn_fs_fs__revision_file_t *rev_file;
843 /* cancellation support */
844 if (query->cancel_func)
845 SVN_ERR(query->cancel_func(query->cancel_baton));
847 /* open the rev file and determine its size */
848 SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, revision,
849 scratch_pool, scratch_pool));
850 SVN_ERR(svn_io_file_size_get(&file_size, rev_file->file, scratch_pool));
852 /* create the revision info for the current rev */
853 info->representations = apr_array_make(result_pool, 4, sizeof(rep_stats_t*));
855 info->rev_file = rev_file;
856 info->revision = revision;
858 info->end = file_size;
860 SVN_ERR(read_phys_revision(query, info, result_pool, scratch_pool));
862 /* Done with this revision. */
863 SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
864 info->rev_file = NULL;
866 /* put it into our container */
867 APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
869 /* show progress every 1000 revs or so */
870 if (query->progress_func)
872 if (query->shard_size && (revision % query->shard_size == 0))
873 query->progress_func(revision, query->progress_baton, scratch_pool);
874 if (!query->shard_size && (revision % 1000 == 0))
875 query->progress_func(revision, query->progress_baton, scratch_pool);
881 /* Given the unparsed changes list in CHANGES with LEN chars, return the
882 * number of changed paths encoded in it. Only used in log. addressing
 * The function counts the newline characters in CHANGES; each change is
 * taken to occupy two lines (the return statement is not visible here).
886 get_log_change_count(const char *changes,
889 apr_size_t lines = 0;
890 const char *end = changes + len;
893 for (; changes < end; ++changes)
894 if (*changes == '\n')
897 /* two lines per change */
/* NOTE(review): the buffer is set to length ENTRY->SIZE and NUL-terminated
 * up front; the file contents at ENTRY->OFFSET are then read into it. */
901 /* Read the item described by ENTRY from the REV_FILE and return the
902 * respective byte sequence in *CONTENTS, allocated in RESULT_POOL.
903 * Use SCRATCH_POOL for temporary allocations
906 read_item(svn_stringbuf_t **contents,
907 svn_fs_fs__revision_file_t *rev_file,
908 svn_fs_fs__p2l_entry_t *entry,
909 apr_pool_t *result_pool,
910 apr_pool_t *scratch_pool)
912 svn_stringbuf_t *item = svn_stringbuf_create_ensure(entry->size,
914 item->len = entry->size;
915 item->data[item->len] = 0;
917 SVN_ERR(svn_io_file_aligned_seek(rev_file->file, rev_file->block_size,
918 NULL, entry->offset, scratch_pool));
919 SVN_ERR(svn_io_file_read_full2(rev_file->file, item->data, item->len,
920 NULL, NULL, scratch_pool));
/* Sort order used by resolve_representation_refs; only the REVISION member
 * of the referenced rep_ref_t objects is compared. */
927 /* Predicate comparing the two rep_ref_t** LHS and RHS by the respective
928 * representation's revision.
931 compare_representation_refs(const void *lhs, const void *rhs)
933 svn_revnum_t lhs_rev = (*(const rep_ref_t *const *)lhs)->revision;
934 svn_revnum_t rhs_rev = (*(const rep_ref_t *const *)rhs)->revision;
936 if (lhs_rev < rhs_rev)
938 return (lhs_rev > rhs_rev ? 1 : 0);
941 /* Given all the representations found in a single rev / pack file as
942 * rep_ref_t * in REP_REFS, update the delta chain lengths in QUERY.
943 * REP_REFS and its contents can then be discarded.
 * The HEADER_SIZE recorded in each ref is copied to its rep stats as well.
946 resolve_representation_refs(query_t *query,
947 apr_array_header_t *rep_refs)
951 /* Because delta chains can only point to previous revs, after sorting
952 * REP_REFS, all base refs have already been updated.
 * NOTE(review): the sort key is the revision alone, so this relies on a
 * base never living in the same revision as its delta -- TODO confirm. */
953 svn_sort__array(rep_refs, compare_representation_refs);
955 /* Build up the CHAIN_LENGTH values. */
956 for (i = 0; i < rep_refs->nelts; ++i)
959 rep_ref_t *ref = APR_ARRAY_IDX(rep_refs, i, rep_ref_t *);
960 rep_stats_t *rep = find_representation(&idx, query, NULL,
961 ref->revision, ref->item_index);
963 /* No dangling pointers and all base reps have been processed. */
965 SVN_ERR_ASSERT(!rep->chain_length);
967 /* Set the HEADER_SIZE as we found it during the scan. */
968 rep->header_size = ref->header_size;
970 /* The delta chain got 1 element longer. */
971 if (ref->base_revision == SVN_INVALID_REVNUM)
973 rep->chain_length = 1;
979 base = find_representation(&idx, query, NULL, ref->base_revision,
980 ref->base_item_index);
981 SVN_ERR_ASSERT(base);
982 SVN_ERR_ASSERT(base->chain_length);
984 rep->chain_length = 1 + MIN(base->chain_length, (apr_byte_t)0xfe);
991 /* Process the logically addressed revision contents of revisions BASE to
992 * BASE + COUNT - 1 in QUERY.
 * The rev / pack file is walked block by block using the P2L index:
 * noderev items go to read_noderev, change lists are counted with
 * get_log_change_count, and rep headers are recorded as rep_ref_t which
 * resolve_representation_refs turns into delta chain lengths at the end.
994 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
998 read_log_rev_or_packfile(query_t *query,
1001 apr_pool_t *result_pool,
1002 apr_pool_t *scratch_pool)
1004 fs_fs_data_t *ffd = query->fs->fsap_data;
1005 apr_pool_t *iterpool = svn_pool_create(scratch_pool);
1006 apr_off_t max_offset;
1007 apr_off_t offset = 0;
1009 svn_fs_fs__revision_file_t *rev_file;
1011 /* We collect the delta chain links as we scan the file. Afterwards,
1012 * we determine the lengths of those delta chains and throw this
1013 * temporary container away. */
1014 apr_array_header_t *rep_refs = apr_array_make(scratch_pool, 64,
1015 sizeof(rep_ref_t *));
1017 /* we will process every revision in the rev / pack file */
1018 for (i = 0; i < count; ++i)
1020 /* create the revision info for the current rev */
1021 revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info));
1022 info->representations = apr_array_make(result_pool, 4,
1023 sizeof(rep_stats_t*));
1024 info->revision = base + i;
1026 APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
1029 /* open the pack / rev file that is covered by the p2l index */
1030 SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base,
1031 scratch_pool, iterpool));
1032 SVN_ERR(svn_fs_fs__p2l_get_max_offset(&max_offset, query->fs, rev_file,
1033 base, scratch_pool));
1035 /* record the whole pack size in the first rev so the total sum will
1037 APR_ARRAY_IDX(query->revisions, base, revision_info_t*)->end = max_offset;
1039 /* for all offsets in the file, get the P2L index entries and process
1040 the interesting items (change lists, noderevs) */
1041 for (offset = 0; offset < max_offset; )
1043 apr_array_header_t *entries;
1045 svn_pool_clear(iterpool);
1047 /* cancellation support */
1048 if (query->cancel_func)
1049 SVN_ERR(query->cancel_func(query->cancel_baton));
1051 /* get all entries for the current block */
1052 SVN_ERR(svn_fs_fs__p2l_index_lookup(&entries, query->fs, rev_file, base,
1053 offset, ffd->p2l_page_size,
1054 iterpool, iterpool));
1056 /* process all entries (and later continue with the next block) */
1057 for (i = 0; i < entries->nelts; ++i)
1059 svn_stringbuf_t *item;
1060 revision_info_t *info;
1061 svn_fs_fs__p2l_entry_t *entry
1062 = &APR_ARRAY_IDX(entries, i, svn_fs_fs__p2l_entry_t);
1064 /* skip bits we previously processed */
1065 if (i == 0 && entry->offset < offset)
1068 /* skip zero-sized entries */
1069 if (entry->size == 0)
1072 /* read and process interesting items */
1073 info = APR_ARRAY_IDX(query->revisions, entry->item.revision,
1076 if (entry->type == SVN_FS_FS__ITEM_TYPE_NODEREV)
1078 SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool));
1079 SVN_ERR(read_noderev(query, item, info, result_pool, iterpool));
1081 else if (entry->type == SVN_FS_FS__ITEM_TYPE_CHANGES)
1083 SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool));
1085 = get_log_change_count(item->data + 0, item->len);
1086 info->changes_len += entry->size;
1088 else if ( (entry->type == SVN_FS_FS__ITEM_TYPE_FILE_REP)
1089 || (entry->type == SVN_FS_FS__ITEM_TYPE_DIR_REP)
1090 || (entry->type == SVN_FS_FS__ITEM_TYPE_FILE_PROPS)
1091 || (entry->type == SVN_FS_FS__ITEM_TYPE_DIR_PROPS))
1093 /* Collect the delta chain link. The ref lives in SCRATCH_POOL
 * because it must outlive the per-iteration pool. */
1094 svn_fs_fs__rep_header_t *header;
1095 rep_ref_t *ref = apr_pcalloc(scratch_pool, sizeof(*ref));
1097 SVN_ERR(svn_io_file_aligned_seek(rev_file->file,
1098 rev_file->block_size,
1099 NULL, entry->offset,
1101 SVN_ERR(svn_fs_fs__read_rep_header(&header,
1103 iterpool, iterpool));
1105 ref->header_size = header->header_size;
1106 ref->revision = entry->item.revision;
1107 ref->item_index = entry->item.number;
1109 if (header->type == svn_fs_fs__rep_delta)
1111 ref->base_item_index = header->base_item_index;
1112 ref->base_revision = header->base_revision;
1116 ref->base_item_index = SVN_FS_FS__ITEM_INDEX_UNUSED;
1117 ref->base_revision = SVN_INVALID_REVNUM;
1120 APR_ARRAY_PUSH(rep_refs, rep_ref_t *) = ref;
1123 /* advance offset */
1124 offset += entry->size;
1128 /* Resolve the delta chain links. */
1129 SVN_ERR(resolve_representation_refs(query, rep_refs));
1131 /* clean up and close file handles */
1132 svn_pool_destroy(iterpool);
1134 return SVN_NO_ERROR;
1137 /* Read the content of the pack file starting at revision BASE in logical
1138 * addressing mode and store it in QUERY.
 * All QUERY->SHARD_SIZE revisions of the pack are handed to
 * read_log_rev_or_packfile in one call.
1140 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
1143 static svn_error_t *
1144 read_log_pack_file(query_t *query,
1146 apr_pool_t *result_pool,
1147 apr_pool_t *scratch_pool)
1149 SVN_ERR(read_log_rev_or_packfile(query, base, query->shard_size,
1150 result_pool, scratch_pool));
1152 /* one more pack file processed */
1153 if (query->progress_func)
1154 query->progress_func(base, query->progress_baton, scratch_pool);
1156 return SVN_NO_ERROR;
1159 /* Read the content of the file for REVISION in logical addressing mode
1160 * and store its contents in QUERY.
1162 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for temporaries.
1165 static svn_error_t *
1166 read_log_revision_file(query_t *query,
1167 svn_revnum_t revision,
1168 apr_pool_t *result_pool,
1169 apr_pool_t *scratch_pool)
1171 SVN_ERR(read_log_rev_or_packfile(query, revision, 1,
1172 result_pool, scratch_pool));
1174 /* show progress once per QUERY->shard_size revs, or every 1000 revs if shard_size is 0 */
1175 if (query->progress_func)
1177 if (query->shard_size && (revision % query->shard_size == 0))
1178 query->progress_func(revision, query->progress_baton, scratch_pool);
1179 if (!query->shard_size && (revision % 1000 == 0))
1180 query->progress_func(revision, query->progress_baton, scratch_pool);
1183 return SVN_NO_ERROR;
1186 /* Read the repository and collect the stats info in QUERY.
1188 * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for temporaries;
 * a sub-pool of SCRATCH_POOL is cleared before each pack or revision file is read.
1191 static svn_error_t *
1192 read_revisions(query_t *query,
1193 apr_pool_t *result_pool,
1194 apr_pool_t *scratch_pool)
1196 apr_pool_t *iterpool = svn_pool_create(scratch_pool);
1197 svn_revnum_t revision;
1199 /* read all packed revs, advancing by QUERY->shard_size revisions per pack file */
1201 ; revision < query->min_unpacked_rev
1202 ; revision += query->shard_size)
1204 svn_pool_clear(iterpool);
1206 if (svn_fs_fs__use_log_addressing(query->fs))
1207 SVN_ERR(read_log_pack_file(query, revision, result_pool, iterpool));
1209 SVN_ERR(read_phys_pack_file(query, revision, result_pool, iterpool));
1212 /* read non-packed revs, one revision at a time, up to and including QUERY->head */
1213 for ( ; revision <= query->head; ++revision)
1215 svn_pool_clear(iterpool);
1217 if (svn_fs_fs__use_log_addressing(query->fs))
1218 SVN_ERR(read_log_revision_file(query, revision, result_pool,
1221 SVN_ERR(read_phys_revision_file(query, revision, result_pool,
1225 svn_pool_destroy(iterpool);
1227 return SVN_NO_ERROR;
1230 /* Accumulate the packed size, expanded size and overhead of REP in STATS.
 * The overhead is REP's header size plus the 7 bytes annotated below.
1233 add_rep_pack_stats(svn_fs_fs__rep_pack_stats_t *stats,
1238 stats->packed_size += rep->size;
1239 stats->expanded_size += rep->expanded_size;
1240 stats->overhead_size += rep->header_size + 7 /* ENDREP\n */;
1243 /* Accumulate stats of REP in STATS: update the pack statistics buckets
 * (total / uniques / shared) via add_rep_pack_stats() and add REP's reference
 * count, reference-weighted expanded size and delta chain length to the totals.
1246 add_rep_stats(svn_fs_fs__representation_stats_t *stats,
1249 add_rep_pack_stats(&stats->total, rep);
1250 if (rep->ref_count == 1)
1251 add_rep_pack_stats(&stats->uniques, rep);
1253 add_rep_pack_stats(&stats->shared, rep);
1255 stats->references += rep->ref_count;
1256 stats->expanded_size += rep->ref_count * rep->expanded_size;
1257 stats->chain_len += rep->chain_length;
1260 /* Aggregate the info in the revision_info_t * array REVISIONS into the
1261 * respective fields of STATS.
1264 aggregate_stats(const apr_array_header_t *revisions,
1265 svn_fs_fs__stats_t *stats)
1269 /* aggregate info from all revisions */
1270 stats->revision_count = revisions->nelts;
1271 for (i = 0; i < revisions->nelts; ++i)
1273 revision_info_t *revision = APR_ARRAY_IDX(revisions, i,
1276 /* data gathered on a revision level */
1277 stats->change_count += revision->change_count;
1278 stats->change_len += revision->changes_len;
1279 stats->total_size += revision->end - revision->offset;
/* noderev counts and sizes: per node kind (dir / file) and combined */
1281 stats->dir_node_stats.count += revision->dir_noderev_count;
1282 stats->dir_node_stats.size += revision->dir_noderev_size;
1283 stats->file_node_stats.count += revision->file_noderev_count;
1284 stats->file_node_stats.size += revision->file_noderev_size;
1285 stats->total_node_stats.count += revision->dir_noderev_count
1286 + revision->file_noderev_count;
1287 stats->total_node_stats.size += revision->dir_noderev_size
1288 + revision->file_noderev_size;
1290 /* process representations */
1291 for (k = 0; k < revision->representations->nelts; ++k)
1293 rep_stats_t *rep = APR_ARRAY_IDX(revision->representations, k,
1296 /* accumulate in the right bucket */
1300 add_rep_stats(&stats->file_rep_stats, rep);
1303 add_rep_stats(&stats->dir_rep_stats, rep);
1305 case file_property_rep:
1306 add_rep_stats(&stats->file_prop_rep_stats, rep);
1308 case dir_property_rep:
1309 add_rep_stats(&stats->dir_prop_rep_stats, rep);
/* the representation is also added to the overall totals */
1315 add_rep_stats(&stats->total_rep_stats, rep);
1320 /* Return a new svn_fs_fs__stats_t instance, allocated in RESULT_POOL.
 * The struct is zero-filled (apr_pcalloc), its largest-changes data is set up
 * by initialize_largest_changes() and BY_EXTENSION starts out as an empty hash.
 * NOTE(review): 64 is presumably the number of largest changes to track --
 * confirm against initialize_largest_changes().
1322 static svn_fs_fs__stats_t *
1323 create_stats(apr_pool_t *result_pool)
1325 svn_fs_fs__stats_t *stats = apr_pcalloc(result_pool, sizeof(*stats));
1327 initialize_largest_changes(stats, 64, result_pool);
1328 stats->by_extension = apr_hash_make(result_pool);
1333 /* Create a *QUERY, allocated in RESULT_POOL, reading filesystem FS and
1334 * collecting results in STATS. Store the optional PROGRESS_FUNC and
1335 * PROGRESS_BATON as well as CANCEL_FUNC and CANCEL_BATON in *QUERY, too.
1336 * Use SCRATCH_POOL for temporary allocations.
1338 static svn_error_t *
1339 create_query(query_t **query,
1341 svn_fs_fs__stats_t *stats,
1342 svn_fs_progress_notify_func_t progress_func,
1343 void *progress_baton,
1344 svn_cancel_func_t cancel_func,
1346 apr_pool_t *result_pool,
1347 apr_pool_t *scratch_pool)
1349 *query = apr_pcalloc(result_pool, sizeof(**query));
1351 /* Read repository dimensions: shard size, youngest revision (head) and
 * the first non-packed revision. */
1352 (*query)->shard_size = svn_fs_fs__shard_size(fs);
1353 SVN_ERR(svn_fs_fs__youngest_rev(&(*query)->head, fs, scratch_pool));
1354 SVN_ERR(svn_fs_fs__min_unpacked_rev(&(*query)->min_unpacked_rev, fs,
1357 /* create data containers and caches; the revisions array is pre-sized
 * for HEAD + 1 entries.
1358 * Note: this assumes that int is at least 32-bits and that we only support
1359 * 32-bit wide revision numbers (actually 31-bits due to the signedness
1360 * of both the nelts field of the array and our revision numbers). This
1361 * means this code will fail on platforms where int is less than 32-bits
1362 * and the repository has more revisions than int can hold. */
1363 (*query)->revisions = apr_array_make(result_pool, (int) (*query)->head + 1,
1364 sizeof(revision_info_t *));
1365 (*query)->null_base = apr_pcalloc(result_pool,
1366 sizeof(*(*query)->null_base));
1368 /* Store other parameters */
1370 (*query)->stats = stats;
1371 (*query)->progress_func = progress_func;
1372 (*query)->progress_baton = progress_baton;
1373 (*query)->cancel_func = cancel_func;
1374 (*query)->cancel_baton = cancel_baton;
1376 return SVN_NO_ERROR;
/* Collect statistics for filesystem FS and return them in *STATS, which is
 * allocated in RESULT_POOL.  PROGRESS_FUNC / PROGRESS_BATON and CANCEL_FUNC /
 * CANCEL_BATON are handed to the internal query.  The query itself and all
 * per-revision data gathered while reading are allocated in SCRATCH_POOL;
 * only the aggregated numbers end up in *STATS. */
1380 svn_fs_fs__get_stats(svn_fs_fs__stats_t **stats,
1382 svn_fs_progress_notify_func_t progress_func,
1383 void *progress_baton,
1384 svn_cancel_func_t cancel_func,
1386 apr_pool_t *result_pool,
1387 apr_pool_t *scratch_pool)
/* set up the result object and the query that fills it */
1391 *stats = create_stats(result_pool);
1392 SVN_ERR(create_query(&query, fs, *stats, progress_func, progress_baton,
1393 cancel_func, cancel_baton, scratch_pool,
/* read all revisions, then fold the per-revision data into *STATS */
1395 SVN_ERR(read_revisions(query, scratch_pool, scratch_pool));
1396 aggregate_stats(query->revisions, *stats);
1398 return SVN_NO_ERROR;
1401 /* Baton for rev_size_index_entry_cb. */
1402 struct rev_size_baton_t {
/* only index entries belonging to this revision are summed up */
1403 svn_revnum_t revision;
1407 /* Implements svn_fs_fs__dump_index_func_t, summing object sizes for
1408 * revision BATON->revision into BATON->rev_size.
 * Entries that belong to any other revision are ignored.
 * Always returns SVN_NO_ERROR.
1410 static svn_error_t *
1411 rev_size_index_entry_cb(const svn_fs_fs__p2l_entry_t *entry,
1413 apr_pool_t *scratch_pool)
1415 struct rev_size_baton_t *b = baton;
1417 if (entry->item.revision == b->revision)
1418 b->rev_size += entry->size;
1419 return SVN_NO_ERROR;
/* Determine the size of REVISION in filesystem FS.  The size of the revision
 * data itself is stored in *REV_SIZE; afterwards the size of the revision
 * properties is queried as well (see the comment near the end).
 * Use SCRATCH_POOL for temporary allocations. */
1423 svn_fs_fs__revision_size(apr_off_t *rev_size,
1425 svn_revnum_t revision,
1426 apr_pool_t *scratch_pool)
1428 /* Get the size of the revision (excluding rev-props) */
1429 if (svn_fs_fs__use_log_addressing(fs))
1431 /* This works for a packed or a non-packed revision.
1432 We could provide an optimized case for a non-packed revision
1433 using svn_fs_fs__p2l_get_max_offset(). */
1434 struct rev_size_baton_t b = { 0, 0 };
1436 b.revision = revision;
1437 SVN_ERR(svn_fs_fs__dump_index(fs, revision,
1438 rev_size_index_entry_cb, &b,
1439 NULL, NULL, scratch_pool));
1440 *rev_size = b.rev_size;
/* file-offset based computation on the opened pack or revision file */
1444 svn_fs_fs__revision_file_t *rev_file;
1445 svn_revnum_t min_unpacked_rev;
1447 SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, fs, revision,
1448 scratch_pool, scratch_pool));
1449 SVN_ERR(svn_fs_fs__min_unpacked_rev(&min_unpacked_rev, fs,
/* packed revision: its size is the distance from its start offset to its end */
1451 if (revision < min_unpacked_rev)
1453 int shard_size = svn_fs_fs__shard_size(fs);
1454 apr_off_t start_offset, end_offset;
1456 SVN_ERR(svn_fs_fs__get_packed_offset(&start_offset, fs, revision,
/* last revision of its shard: it ends where the file ends */
1458 if (((revision + 1) % shard_size) == 0)
1460 svn_filesize_t file_size;
1462 SVN_ERR(svn_io_file_size_get(&file_size, rev_file->file, scratch_pool));
1463 end_offset = (apr_off_t)file_size;
/* the packed offset of REVISION + 1 marks the end */
1467 SVN_ERR(svn_fs_fs__get_packed_offset(&end_offset, fs,
1468 revision + 1, scratch_pool));
1470 *rev_size = (end_offset - start_offset);
/* here the size of the whole file is used as the revision size */
1474 svn_filesize_t file_size;
1476 SVN_ERR(svn_io_file_size_get(&file_size, rev_file->file, scratch_pool));
1477 *rev_size = (apr_off_t)file_size;
1480 SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
1483 /* Add the size of the rev-props */
1487 SVN_ERR(svn_fs_fs__get_revision_props_size(&size, fs, revision, scratch_pool));
1491 return SVN_NO_ERROR;