summaryrefslogtreecommitdiff
path: root/subversion/libsvn_fs_fs/stats.c
diff options
context:
space:
mode:
Diffstat (limited to 'subversion/libsvn_fs_fs/stats.c')
-rw-r--r--subversion/libsvn_fs_fs/stats.c1255
1 files changed, 1255 insertions, 0 deletions
diff --git a/subversion/libsvn_fs_fs/stats.c b/subversion/libsvn_fs_fs/stats.c
new file mode 100644
index 0000000..97a2ed7
--- /dev/null
+++ b/subversion/libsvn_fs_fs/stats.c
@@ -0,0 +1,1255 @@
+/* stats.c -- implements the svn_fs_fs__get_stats private API.
+ *
+ * ====================================================================
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ * ====================================================================
+ */
+
+#include "svn_dirent_uri.h"
+#include "svn_fs.h"
+#include "svn_pools.h"
+#include "svn_sorts.h"
+
+#include "private/svn_cache.h"
+#include "private/svn_sorts_private.h"
+#include "private/svn_string_private.h"
+#include "private/svn_fs_fs_private.h"
+
+#include "index.h"
+#include "pack.h"
+#include "rev_file.h"
+#include "util.h"
+#include "fs_fs.h"
+#include "cached_data.h"
+#include "low_level.h"
+
+#include "../libsvn_fs/fs-loader.h"
+
+#include "svn_private_config.h"
+
+/* We group representations into 2x2 different kinds plus one default:
+ * [dir / file] x [text / prop]. The assignment is done by the first node
+ * that references the respective representation.
+ */
+typedef enum rep_kind_t
+{
+ /* The representation is not used _directly_, i.e. not referenced by any
+ * noderev. However, some other representation may use it as delta base.
+ * Null value. Should not occur in real-word repositories. */
+ unused_rep,
+
+ /* a properties on directory rep */
+ dir_property_rep,
+
+ /* a properties on file rep */
+ file_property_rep,
+
+ /* a directory rep */
+ dir_rep,
+
+ /* a file rep */
+ file_rep
+} rep_kind_t;
+
+/* A representation fragment.
+ */
+typedef struct rep_stats_t
+{
+ /* absolute offset in the file */
+ apr_off_t offset;
+
+ /* item length in bytes */
+ apr_uint64_t size;
+
+ /* item length after de-deltification */
+ apr_uint64_t expanded_size;
+
+ /* revision that contains this representation
+ * (may be referenced by other revisions, though) */
+ svn_revnum_t revision;
+
+ /* number of nodes that reference this representation */
+ apr_uint32_t ref_count;
+
+ /* length of the PLAIN / DELTA line in the source file in bytes */
+ apr_uint16_t header_size;
+
+ /* classification of the representation. values of rep_kind_t */
+ char kind;
+
+} rep_stats_t;
+
+/* Represents a single revision.
+ * There will be only one instance per revision. */
+typedef struct revision_info_t
+{
+ /* number of this revision */
+ svn_revnum_t revision;
+
+ /* pack file offset (manifest value), 0 for non-packed files */
+ apr_off_t offset;
+
+ /* length of the changes list on bytes */
+ apr_uint64_t changes_len;
+
+ /* offset of the changes list relative to OFFSET */
+ apr_uint64_t change_count;
+
+ /* first offset behind the revision data in the pack file (file length
+ * for non-packed revs) */
+ apr_off_t end;
+
+ /* number of directory noderevs in this revision */
+ apr_uint64_t dir_noderev_count;
+
+ /* number of file noderevs in this revision */
+ apr_uint64_t file_noderev_count;
+
+ /* total size of directory noderevs (i.e. the structs - not the rep) */
+ apr_uint64_t dir_noderev_size;
+
+ /* total size of file noderevs (i.e. the structs - not the rep) */
+ apr_uint64_t file_noderev_size;
+
+ /* all rep_stats_t of this revision (in no particular order),
+ * i.e. those that point back to this struct */
+ apr_array_header_t *representations;
+
+ /* Temporary rev / pack file access object, used in phys. addressing
+ * mode only. NULL when done reading this revision. */
+ svn_fs_fs__revision_file_t *rev_file;
+} revision_info_t;
+
+/* Root data structure containing all information about a given repository.
+ * We use it as a wrapper around svn_fs_t and pass it around where we would
+ * otherwise just use a svn_fs_t.
+ */
+typedef struct query_t
+{
+ /* FS API object*/
+ svn_fs_t *fs;
+
+ /* The HEAD revision. */
+ svn_revnum_t head;
+
+ /* Number of revs per shard; 0 for non-sharded repos. */
+ int shard_size;
+
+ /* First non-packed revision. */
+ svn_revnum_t min_unpacked_rev;
+
+ /* all revisions */
+ apr_array_header_t *revisions;
+
+ /* empty representation.
+ * Used as a dummy base for DELTA reps without base. */
+ rep_stats_t *null_base;
+
+ /* collected statistics */
+ svn_fs_fs__stats_t *stats;
+
+ /* Progress notification callback to call after each shard. May be NULL. */
+ svn_fs_progress_notify_func_t progress_func;
+
+ /* Baton for PROGRESS_FUNC. */
+ void *progress_baton;
+
+ /* Cancellation support callback to call once in a while. May be NULL. */
+ svn_cancel_func_t cancel_func;
+
+ /* Baton for CANCEL_FUNC. */
+ void *cancel_baton;
+} query_t;
+
+/* Return the length of REV_FILE in *FILE_SIZE.
+ * Use SCRATCH_POOL for temporary allocations.
+ */
+static svn_error_t *
+get_file_size(apr_off_t *file_size,
+ svn_fs_fs__revision_file_t *rev_file,
+ apr_pool_t *scratch_pool)
+{
+ apr_finfo_t finfo;
+
+ SVN_ERR(svn_io_file_info_get(&finfo, APR_FINFO_SIZE, rev_file->file,
+ scratch_pool));
+
+ *file_size = finfo.size;
+ return SVN_NO_ERROR;
+}
+
+/* Initialize the LARGEST_CHANGES member in STATS with a capacity of COUNT
+ * entries. Allocate the result in RESULT_POOL.
+ */
+static void
+initialize_largest_changes(svn_fs_fs__stats_t *stats,
+ apr_size_t count,
+ apr_pool_t *result_pool)
+{
+ apr_size_t i;
+
+ stats->largest_changes = apr_pcalloc(result_pool,
+ sizeof(*stats->largest_changes));
+ stats->largest_changes->count = count;
+ stats->largest_changes->min_size = 1;
+ stats->largest_changes->changes
+ = apr_palloc(result_pool, count * sizeof(*stats->largest_changes->changes));
+
+ /* allocate *all* entries before the path stringbufs. This increases
+ * cache locality and enhances performance significantly. */
+ for (i = 0; i < count; ++i)
+ stats->largest_changes->changes[i]
+ = apr_palloc(result_pool, sizeof(**stats->largest_changes->changes));
+
+ /* now initialize them and allocate the stringbufs */
+ for (i = 0; i < count; ++i)
+ {
+ stats->largest_changes->changes[i]->size = 0;
+ stats->largest_changes->changes[i]->revision = SVN_INVALID_REVNUM;
+ stats->largest_changes->changes[i]->path
+ = svn_stringbuf_create_ensure(1024, result_pool);
+ }
+}
+
+/* Add entry for SIZE to HISTOGRAM.
+ */
+static void
+add_to_histogram(svn_fs_fs__histogram_t *histogram,
+ apr_int64_t size)
+{
+ apr_int64_t shift = 0;
+
+ while (((apr_int64_t)(1) << shift) <= size)
+ shift++;
+
+ histogram->total.count++;
+ histogram->total.sum += size;
+ histogram->lines[(apr_size_t)shift].count++;
+ histogram->lines[(apr_size_t)shift].sum += size;
+}
+
+/* Update data aggregators in STATS with this representation of type KIND,
+ * on-disk REP_SIZE and expanded node size EXPANDED_SIZE for PATH in REVSION.
+ * PLAIN_ADDED indicates whether the node has a deltification predecessor.
+ */
+static void
+add_change(svn_fs_fs__stats_t *stats,
+ apr_uint64_t rep_size,
+ apr_uint64_t expanded_size,
+ svn_revnum_t revision,
+ const char *path,
+ rep_kind_t kind,
+ svn_boolean_t plain_added)
+{
+ /* identify largest reps */
+ if (rep_size >= stats->largest_changes->min_size)
+ {
+ apr_size_t i;
+ svn_fs_fs__largest_changes_t *largest_changes = stats->largest_changes;
+ svn_fs_fs__large_change_info_t *info
+ = largest_changes->changes[largest_changes->count - 1];
+ info->size = rep_size;
+ info->revision = revision;
+ svn_stringbuf_set(info->path, path);
+
+ /* linear insertion but not too bad since count is low and insertions
+ * near the end are more likely than close to front */
+ for (i = largest_changes->count - 1; i > 0; --i)
+ if (largest_changes->changes[i-1]->size >= rep_size)
+ break;
+ else
+ largest_changes->changes[i] = largest_changes->changes[i-1];
+
+ largest_changes->changes[i] = info;
+ largest_changes->min_size
+ = largest_changes->changes[largest_changes->count-1]->size;
+ }
+
+ /* global histograms */
+ add_to_histogram(&stats->rep_size_histogram, rep_size);
+ add_to_histogram(&stats->node_size_histogram, expanded_size);
+
+ if (plain_added)
+ {
+ add_to_histogram(&stats->added_rep_size_histogram, rep_size);
+ add_to_histogram(&stats->added_node_size_histogram, expanded_size);
+ }
+
+ /* specific histograms by type */
+ switch (kind)
+ {
+ case unused_rep:
+ add_to_histogram(&stats->unused_rep_histogram, rep_size);
+ break;
+ case dir_property_rep:
+ add_to_histogram(&stats->dir_prop_rep_histogram, rep_size);
+ add_to_histogram(&stats->dir_prop_histogram, expanded_size);
+ break;
+ case file_property_rep:
+ add_to_histogram(&stats->file_prop_rep_histogram, rep_size);
+ add_to_histogram(&stats->file_prop_histogram, expanded_size);
+ break;
+ case dir_rep:
+ add_to_histogram(&stats->dir_rep_histogram, rep_size);
+ add_to_histogram(&stats->dir_histogram, expanded_size);
+ break;
+ case file_rep:
+ add_to_histogram(&stats->file_rep_histogram, rep_size);
+ add_to_histogram(&stats->file_histogram, expanded_size);
+ break;
+ }
+
+ /* by extension */
+ if (kind == file_rep)
+ {
+ /* determine extension */
+ svn_fs_fs__extension_info_t *info;
+ const char * file_name = strrchr(path, '/');
+ const char * extension = file_name ? strrchr(file_name, '.') : NULL;
+
+ if (extension == NULL || extension == file_name + 1)
+ extension = "(none)";
+
+ /* get / auto-insert entry for this extension */
+ info = apr_hash_get(stats->by_extension, extension, APR_HASH_KEY_STRING);
+ if (info == NULL)
+ {
+ apr_pool_t *pool = apr_hash_pool_get(stats->by_extension);
+ info = apr_pcalloc(pool, sizeof(*info));
+ info->extension = apr_pstrdup(pool, extension);
+
+ apr_hash_set(stats->by_extension, info->extension,
+ APR_HASH_KEY_STRING, info);
+ }
+
+ /* update per-extension histogram */
+ add_to_histogram(&info->node_histogram, expanded_size);
+ add_to_histogram(&info->rep_histogram, rep_size);
+ }
+}
+
+/* Comparator used for binary search comparing the absolute file offset
+ * of a representation to some other offset. DATA is a *rep_stats_t,
+ * KEY is a pointer to an apr_off_t.
+ */
+static int
+compare_representation_offsets(const void *data, const void *key)
+{
+ apr_off_t lhs = (*(const rep_stats_t *const *)data)->offset;
+ apr_off_t rhs = *(const apr_off_t *)key;
+
+ if (lhs < rhs)
+ return -1;
+ return (lhs > rhs ? 1 : 0);
+}
+
+/* Find the revision_info_t object to the given REVISION in QUERY and
+ * return it in *REVISION_INFO. For performance reasons, we skip the
+ * lookup if the info is already provided.
+ *
+ * In that revision, look for the rep_stats_t object for offset OFFSET.
+ * If it already exists, set *IDX to its index in *REVISION_INFO's
+ * representations list and return the representation object. Otherwise,
+ * set the index to where it must be inserted and return NULL.
+ */
+static rep_stats_t *
+find_representation(int *idx,
+ query_t *query,
+ revision_info_t **revision_info,
+ svn_revnum_t revision,
+ apr_off_t offset)
+{
+ revision_info_t *info;
+ *idx = -1;
+
+ /* first let's find the revision */
+ info = revision_info ? *revision_info : NULL;
+ if (info == NULL || info->revision != revision)
+ {
+ info = APR_ARRAY_IDX(query->revisions, revision, revision_info_t*);
+ if (revision_info)
+ *revision_info = info;
+ }
+
+ /* not found -> no result */
+ if (info == NULL)
+ return NULL;
+
+ /* look for the representation */
+ *idx = svn_sort__bsearch_lower_bound(info->representations,
+ &offset,
+ compare_representation_offsets);
+ if (*idx < info->representations->nelts)
+ {
+ /* return the representation, if this is the one we were looking for */
+ rep_stats_t *result
+ = APR_ARRAY_IDX(info->representations, *idx, rep_stats_t *);
+ if (result->offset == offset)
+ return result;
+ }
+
+ /* not parsed, yet */
+ return NULL;
+}
+
+/* Find / auto-construct the representation stats for REP in QUERY and
+ * return it in *REPRESENTATION.
+ *
+ * If necessary, allocate the result in RESULT_POOL; use SCRATCH_POOL for
+ * temporary allocations.
+ */
+static svn_error_t *
+parse_representation(rep_stats_t **representation,
+ query_t *query,
+ representation_t *rep,
+ revision_info_t *revision_info,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ rep_stats_t *result;
+ int idx;
+
+ /* read location (revision, offset) and size */
+
+ /* look it up */
+ result = find_representation(&idx, query, &revision_info, rep->revision,
+ (apr_off_t)rep->item_index);
+ if (!result)
+ {
+ /* not parsed, yet (probably a rep in the same revision).
+ * Create a new rep object and determine its base rep as well.
+ */
+ result = apr_pcalloc(result_pool, sizeof(*result));
+ result->revision = rep->revision;
+ result->expanded_size = (rep->expanded_size ? rep->expanded_size
+ : rep->size);
+ result->offset = (apr_off_t)rep->item_index;
+ result->size = rep->size;
+
+ /* In phys. addressing mode, follow link to the actual representation.
+ * In log. addressing mode, we will find it already as part of our
+ * linear walk through the whole file. */
+ if (!svn_fs_fs__use_log_addressing(query->fs))
+ {
+ svn_fs_fs__rep_header_t *header;
+ apr_off_t offset = revision_info->offset + result->offset;
+
+ SVN_ERR_ASSERT(revision_info->rev_file);
+ SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET,
+ &offset, scratch_pool));
+ SVN_ERR(svn_fs_fs__read_rep_header(&header,
+ revision_info->rev_file->stream,
+ scratch_pool, scratch_pool));
+
+ result->header_size = header->header_size;
+ }
+
+ svn_sort__array_insert(revision_info->representations, &result, idx);
+ }
+
+ *representation = result;
+
+ return SVN_NO_ERROR;
+}
+
+
+/* forward declaration */
+static svn_error_t *
+read_noderev(query_t *query,
+ svn_stringbuf_t *noderev_str,
+ revision_info_t *revision_info,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool);
+
+/* Read the noderev item at OFFSET in REVISION_INFO from the filesystem
+ * provided by QUERY. Return it in *NODEREV, allocated in RESULT_POOL.
+ * Use SCRATCH_POOL for temporary allocations.
+ *
+ * The textual representation of the noderev will be used to determine
+ * the on-disk size of the noderev. Only called in phys. addressing mode.
+ */
+static svn_error_t *
+read_phsy_noderev(svn_stringbuf_t **noderev,
+ query_t *query,
+ apr_off_t offset,
+ revision_info_t *revision_info,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ svn_stringbuf_t *noderev_str = svn_stringbuf_create_empty(result_pool);
+ svn_stringbuf_t *line;
+ svn_boolean_t eof;
+
+ apr_pool_t *iterpool = svn_pool_create(scratch_pool);
+
+ /* Navigate the file stream to the start of noderev. */
+ SVN_ERR_ASSERT(revision_info->rev_file);
+
+ offset += revision_info->offset;
+ SVN_ERR(svn_io_file_seek(revision_info->rev_file->file, APR_SET,
+ &offset, scratch_pool));
+
+ /* Read it (terminated by an empty line) */
+ do
+ {
+ svn_pool_clear(iterpool);
+
+ SVN_ERR(svn_stream_readline(revision_info->rev_file->stream, &line,
+ "\n", &eof, iterpool));
+ svn_stringbuf_appendstr(noderev_str, line);
+ svn_stringbuf_appendbyte(noderev_str, '\n');
+ }
+ while (line->len > 0 && !eof);
+
+ /* Return the result. */
+ *noderev = noderev_str;
+
+ svn_pool_destroy(iterpool);
+
+ return SVN_NO_ERROR;
+}
+
+/* Starting at the directory in NODEREV's text, read all DAG nodes,
+ * directories and representations linked in that tree structure.
+ * Store them in QUERY and REVISION_INFO. Also, read them only once.
+ *
+ * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
+ * temporaries.
+ */
+static svn_error_t *
+parse_dir(query_t *query,
+ node_revision_t *noderev,
+ revision_info_t *revision_info,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ apr_pool_t *iterpool = svn_pool_create(scratch_pool);
+
+ int i;
+ apr_array_header_t *entries;
+ SVN_ERR(svn_fs_fs__rep_contents_dir(&entries, query->fs, noderev,
+ scratch_pool, scratch_pool));
+
+ for (i = 0; i < entries->nelts; ++i)
+ {
+ svn_fs_dirent_t *dirent = APR_ARRAY_IDX(entries, i, svn_fs_dirent_t *);
+
+ if (svn_fs_fs__id_rev(dirent->id) == revision_info->revision)
+ {
+ svn_stringbuf_t *noderev_str;
+ svn_pool_clear(iterpool);
+
+ SVN_ERR(read_phsy_noderev(&noderev_str, query,
+ svn_fs_fs__id_item(dirent->id),
+ revision_info, iterpool, iterpool));
+ SVN_ERR(read_noderev(query, noderev_str, revision_info,
+ result_pool, iterpool));
+ }
+ }
+
+ svn_pool_destroy(iterpool);
+
+ return SVN_NO_ERROR;
+}
+
+/* Parse the noderev given as NODEREV_STR and store the info in QUERY and
+ * REVISION_INFO. In phys. addressing mode, continue reading all DAG nodes,
+ * directories and representations linked in that tree structure.
+ *
+ * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
+ * temporaries.
+ */
+static svn_error_t *
+read_noderev(query_t *query,
+ svn_stringbuf_t *noderev_str,
+ revision_info_t *revision_info,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ rep_stats_t *text = NULL;
+ rep_stats_t *props = NULL;
+ node_revision_t *noderev;
+
+ svn_stream_t *stream = svn_stream_from_stringbuf(noderev_str, scratch_pool);
+ SVN_ERR(svn_fs_fs__read_noderev(&noderev, stream, scratch_pool,
+ scratch_pool));
+
+ if (noderev->data_rep)
+ {
+ SVN_ERR(parse_representation(&text, query,
+ noderev->data_rep, revision_info,
+ result_pool, scratch_pool));
+
+ /* if we are the first to use this rep, mark it as "text rep" */
+ if (++text->ref_count == 1)
+ text->kind = noderev->kind == svn_node_dir ? dir_rep : file_rep;
+ }
+
+ if (noderev->prop_rep)
+ {
+ SVN_ERR(parse_representation(&props, query,
+ noderev->prop_rep, revision_info,
+ result_pool, scratch_pool));
+
+ /* if we are the first to use this rep, mark it as "prop rep" */
+ if (++props->ref_count == 1)
+ props->kind = noderev->kind == svn_node_dir ? dir_property_rep
+ : file_property_rep;
+ }
+
+ /* record largest changes */
+ if (text && text->ref_count == 1)
+ add_change(query->stats, text->size, text->expanded_size, text->revision,
+ noderev->created_path, text->kind, !noderev->predecessor_id);
+ if (props && props->ref_count == 1)
+ add_change(query->stats, props->size, props->expanded_size,
+ props->revision, noderev->created_path, props->kind,
+ !noderev->predecessor_id);
+
+ /* if this is a directory and has not been processed, yet, read and
+ * process it recursively */
+ if ( noderev->kind == svn_node_dir && text && text->ref_count == 1
+ && !svn_fs_fs__use_log_addressing(query->fs))
+ SVN_ERR(parse_dir(query, noderev, revision_info, result_pool,
+ scratch_pool));
+
+ /* update stats */
+ if (noderev->kind == svn_node_dir)
+ {
+ revision_info->dir_noderev_size += noderev_str->len;
+ revision_info->dir_noderev_count++;
+ }
+ else
+ {
+ revision_info->file_noderev_size += noderev_str->len;
+ revision_info->file_noderev_count++;
+ }
+
+ return SVN_NO_ERROR;
+}
+
+/* For the revision given as REVISION_INFO within QUERY, determine the number
+ * of entries in its changed paths list and store that info in REVISION_INFO.
+ * Use SCRATCH_POOL for temporary allocations.
+ */
+static svn_error_t *
+get_phys_change_count(query_t *query,
+ revision_info_t *revision_info,
+ apr_pool_t *scratch_pool)
+{
+ /* We are going to use our own sub-pool here because the changes object
+ * may well be >100MB and SCRATCH_POOL may not get cleared until all other
+ * info has been read by read_phys_revision(). Therefore, tidy up early.
+ */
+ apr_pool_t *subpool = svn_pool_create(scratch_pool);
+ apr_array_header_t *changes;
+
+ SVN_ERR(svn_fs_fs__get_changes(&changes, query->fs,
+ revision_info->revision, subpool));
+ revision_info->change_count = changes->nelts;
+
+ /* Release potentially tons of memory. */
+ svn_pool_destroy(subpool);
+
+ return SVN_NO_ERROR;
+}
+
+/* Read header information for the revision stored in FILE_CONTENT (one
+ * whole revision). Return the offsets within FILE_CONTENT for the
+ * *ROOT_NODEREV, the list of *CHANGES and its len in *CHANGES_LEN.
+ * Use POOL for temporary allocations. */
+static svn_error_t *
+read_phys_revision(query_t *query,
+ revision_info_t *info,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ char buf[64];
+ apr_off_t root_node_offset;
+ apr_off_t changes_offset;
+ svn_stringbuf_t *trailer;
+ svn_stringbuf_t *noderev_str;
+
+ /* Read the last 64 bytes of the revision (if long enough). */
+ apr_off_t start = MAX(info->offset, info->end - sizeof(buf));
+ apr_size_t len = (apr_size_t)(info->end - start);
+ SVN_ERR(svn_io_file_seek(info->rev_file->file, APR_SET, &start,
+ scratch_pool));
+ SVN_ERR(svn_io_file_read_full2(info->rev_file->file, buf, len, NULL, NULL,
+ scratch_pool));
+ trailer = svn_stringbuf_ncreate(buf, len, scratch_pool);
+
+ /* Parse that trailer. */
+ SVN_ERR(svn_fs_fs__parse_revision_trailer(&root_node_offset,
+ &changes_offset, trailer,
+ info->revision));
+ SVN_ERR(get_phys_change_count(query, info, scratch_pool));
+
+ /* Calculate the length of the changes list. */
+ trailer = svn_fs_fs__unparse_revision_trailer(root_node_offset,
+ changes_offset,
+ scratch_pool);
+ info->changes_len = info->end - info->offset - changes_offset
+ - trailer->len;
+
+ /* Recursively read nodes added in this rev. */
+ SVN_ERR(read_phsy_noderev(&noderev_str, query, root_node_offset, info,
+ scratch_pool, scratch_pool));
+ SVN_ERR(read_noderev(query, noderev_str, info, result_pool, scratch_pool));
+
+ return SVN_NO_ERROR;
+}
+
+/* Read the content of the pack file staring at revision BASE physical
+ * addressing mode and store it in QUERY.
+ *
+ * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
+ * temporaries.
+ */
+static svn_error_t *
+read_phys_pack_file(query_t *query,
+ svn_revnum_t base,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ apr_pool_t *iterpool = svn_pool_create(scratch_pool);
+ int i;
+ apr_off_t file_size = 0;
+ svn_fs_fs__revision_file_t *rev_file;
+
+ SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base,
+ scratch_pool, scratch_pool));
+ SVN_ERR(get_file_size(&file_size, rev_file, scratch_pool));
+
+ /* process each revision in the pack file */
+ for (i = 0; i < query->shard_size; ++i)
+ {
+ revision_info_t *info;
+
+ /* cancellation support */
+ if (query->cancel_func)
+ SVN_ERR(query->cancel_func(query->cancel_baton));
+
+ /* create the revision info for the current rev */
+ info = apr_pcalloc(result_pool, sizeof(*info));
+ info->representations = apr_array_make(result_pool, 4,
+ sizeof(rep_stats_t*));
+ info->rev_file = rev_file;
+
+ info->revision = base + i;
+ SVN_ERR(svn_fs_fs__get_packed_offset(&info->offset, query->fs, base + i,
+ iterpool));
+ if (i + 1 == query->shard_size)
+ info->end = file_size;
+ else
+ SVN_ERR(svn_fs_fs__get_packed_offset(&info->end, query->fs,
+ base + i + 1, iterpool));
+
+ SVN_ERR(read_phys_revision(query, info, result_pool, iterpool));
+
+ info->representations = apr_array_copy(result_pool,
+ info->representations);
+
+ /* Done with this revision. */
+ info->rev_file = NULL;
+
+ /* put it into our container */
+ APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
+
+ /* destroy temps */
+ svn_pool_clear(iterpool);
+ }
+
+ /* Done with this pack file. */
+ SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
+
+ /* one more pack file processed */
+ if (query->progress_func)
+ query->progress_func(base, query->progress_baton, scratch_pool);
+
+ return SVN_NO_ERROR;
+}
+
+/* Read the content of the file for REVISION in physical addressing mode
+ * and store its contents in QUERY.
+ *
+ * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
+ * temporaries.
+ */
+static svn_error_t *
+read_phys_revision_file(query_t *query,
+ svn_revnum_t revision,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info));
+ apr_off_t file_size = 0;
+ svn_fs_fs__revision_file_t *rev_file;
+
+ /* cancellation support */
+ if (query->cancel_func)
+ SVN_ERR(query->cancel_func(query->cancel_baton));
+
+ /* read the whole pack file into memory */
+ SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, revision,
+ scratch_pool, scratch_pool));
+ SVN_ERR(get_file_size(&file_size, rev_file, scratch_pool));
+
+ /* create the revision info for the current rev */
+ info->representations = apr_array_make(result_pool, 4, sizeof(rep_stats_t*));
+
+ info->rev_file = rev_file;
+ info->revision = revision;
+ info->offset = 0;
+ info->end = file_size;
+
+ SVN_ERR(read_phys_revision(query, info, result_pool, scratch_pool));
+
+ /* Done with this revision. */
+ SVN_ERR(svn_fs_fs__close_revision_file(rev_file));
+ info->rev_file = NULL;
+
+ /* put it into our container */
+ APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
+
+ /* show progress every 1000 revs or so */
+ if (query->progress_func)
+ {
+ if (query->shard_size && (revision % query->shard_size == 0))
+ query->progress_func(revision, query->progress_baton, scratch_pool);
+ if (!query->shard_size && (revision % 1000 == 0))
+ query->progress_func(revision, query->progress_baton, scratch_pool);
+ }
+
+ return SVN_NO_ERROR;
+}
+
+/* Given the unparsed changes list in CHANGES with LEN chars, return the
+ * number of changed paths encoded in it. Only used in log. addressing
+ * mode.
+ */
+static apr_uint64_t
+get_log_change_count(const char *changes,
+ apr_size_t len)
+{
+ apr_size_t lines = 0;
+ const char *end = changes + len;
+
+ /* line count */
+ for (; changes < end; ++changes)
+ if (*changes == '\n')
+ ++lines;
+
+ /* two lines per change */
+ return lines / 2;
+}
+
+/* Read the item described by ENTRY from the REV_FILE and return the
+ * respective byte sequence in *CONTENTS, allocated in RESULT_POOL.
+ * Use SCRATCH_POOL for temporary allocations
+ */
+static svn_error_t *
+read_item(svn_stringbuf_t **contents,
+ svn_fs_fs__revision_file_t *rev_file,
+ svn_fs_fs__p2l_entry_t *entry,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ svn_stringbuf_t *item = svn_stringbuf_create_ensure(entry->size,
+ result_pool);
+ item->len = entry->size;
+ item->data[item->len] = 0;
+
+ SVN_ERR(svn_io_file_aligned_seek(rev_file->file, rev_file->block_size,
+ NULL, entry->offset, scratch_pool));
+ SVN_ERR(svn_io_file_read_full2(rev_file->file, item->data, item->len,
+ NULL, NULL, scratch_pool));
+
+ *contents = item;
+
+ return SVN_NO_ERROR;
+}
+
+/* Process the logically addressed revision contents of revisions BASE to
+ * BASE + COUNT - 1 in QUERY.
+ *
+ * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
+ * temporaries.
+ */
+static svn_error_t *
+read_log_rev_or_packfile(query_t *query,
+ svn_revnum_t base,
+ int count,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ fs_fs_data_t *ffd = query->fs->fsap_data;
+ apr_pool_t *iterpool = svn_pool_create(scratch_pool);
+ apr_off_t max_offset;
+ apr_off_t offset = 0;
+ int i;
+ svn_fs_fs__revision_file_t *rev_file;
+
+ /* we will process every revision in the rev / pack file */
+ for (i = 0; i < count; ++i)
+ {
+ /* create the revision info for the current rev */
+ revision_info_t *info = apr_pcalloc(result_pool, sizeof(*info));
+ info->representations = apr_array_make(result_pool, 4,
+ sizeof(rep_stats_t*));
+ info->revision = base + i;
+
+ APR_ARRAY_PUSH(query->revisions, revision_info_t*) = info;
+ }
+
+ /* open the pack / rev file that is covered by the p2l index */
+ SVN_ERR(svn_fs_fs__open_pack_or_rev_file(&rev_file, query->fs, base,
+ scratch_pool, iterpool));
+ SVN_ERR(svn_fs_fs__p2l_get_max_offset(&max_offset, query->fs, rev_file,
+ base, scratch_pool));
+
+ /* record the whole pack size in the first rev so the total sum will
+ still be correct */
+ APR_ARRAY_IDX(query->revisions, base, revision_info_t*)->end = max_offset;
+
+ /* for all offsets in the file, get the P2L index entries and process
+ the interesting items (change lists, noderevs) */
+ for (offset = 0; offset < max_offset; )
+ {
+ apr_array_header_t *entries;
+
+ svn_pool_clear(iterpool);
+
+ /* cancellation support */
+ if (query->cancel_func)
+ SVN_ERR(query->cancel_func(query->cancel_baton));
+
+ /* get all entries for the current block */
+ SVN_ERR(svn_fs_fs__p2l_index_lookup(&entries, query->fs, rev_file, base,
+ offset, ffd->p2l_page_size,
+ iterpool, iterpool));
+
+ /* process all entries (and later continue with the next block) */
+ for (i = 0; i < entries->nelts; ++i)
+ {
+ svn_fs_fs__p2l_entry_t *entry
+ = &APR_ARRAY_IDX(entries, i, svn_fs_fs__p2l_entry_t);
+
+ /* skip bits we previously processed */
+ if (i == 0 && entry->offset < offset)
+ continue;
+
+ /* skip zero-sized entries */
+ if (entry->size == 0)
+ continue;
+
+ /* read and process interesting items */
+ if (entry->type == SVN_FS_FS__ITEM_TYPE_NODEREV)
+ {
+ svn_stringbuf_t *item;
+ revision_info_t *info = APR_ARRAY_IDX(query->revisions,
+ entry->item.revision,
+ revision_info_t*);
+ SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool));
+ SVN_ERR(read_noderev(query, item, info, result_pool, iterpool));
+ }
+ else if (entry->type == SVN_FS_FS__ITEM_TYPE_CHANGES)
+ {
+ svn_stringbuf_t *item;
+ revision_info_t *info = APR_ARRAY_IDX(query->revisions,
+ entry->item.revision,
+ revision_info_t*);
+ SVN_ERR(read_item(&item, rev_file, entry, iterpool, iterpool));
+ info->change_count
+ = get_log_change_count(item->data + 0, item->len);
+ info->changes_len += entry->size;
+ }
+
+ /* advance offset */
+ offset += entry->size;
+ }
+ }
+
+ /* clean up and close file handles */
+ svn_pool_destroy(iterpool);
+
+ return SVN_NO_ERROR;
+}
+
+/* Read the content of the pack file staring at revision BASE logical
+ * addressing mode and store it in QUERY.
+ *
+ * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
+ * temporaries.
+ */
+static svn_error_t *
+read_log_pack_file(query_t *query,
+ svn_revnum_t base,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ SVN_ERR(read_log_rev_or_packfile(query, base, query->shard_size,
+ result_pool, scratch_pool));
+
+ /* one more pack file processed */
+ if (query->progress_func)
+ query->progress_func(base, query->progress_baton, scratch_pool);
+
+ return SVN_NO_ERROR;
+}
+
+/* Read the content of the file for REVISION in logical addressing mode
+ * and store its contents in QUERY.
+ *
+ * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
+ * temporaries.
+ */
+static svn_error_t *
+read_log_revision_file(query_t *query,
+ svn_revnum_t revision,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ SVN_ERR(read_log_rev_or_packfile(query, revision, 1,
+ result_pool, scratch_pool));
+
+ /* show progress every 1000 revs or so */
+ if (query->progress_func)
+ {
+ if (query->shard_size && (revision % query->shard_size == 0))
+ query->progress_func(revision, query->progress_baton, scratch_pool);
+ if (!query->shard_size && (revision % 1000 == 0))
+ query->progress_func(revision, query->progress_baton, scratch_pool);
+ }
+
+ return SVN_NO_ERROR;
+}
+
+/* Read the repository and collect the stats info in QUERY.
+ *
+ * Use RESULT_POOL for persistent allocations and SCRATCH_POOL for
+ * temporaries.
+ */
+static svn_error_t *
+read_revisions(query_t *query,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ apr_pool_t *iterpool = svn_pool_create(scratch_pool);
+ svn_revnum_t revision;
+
+ /* read all packed revs */
+ for ( revision = 0
+ ; revision < query->min_unpacked_rev
+ ; revision += query->shard_size)
+ {
+ svn_pool_clear(iterpool);
+
+ if (svn_fs_fs__use_log_addressing(query->fs))
+ SVN_ERR(read_log_pack_file(query, revision, result_pool, iterpool));
+ else
+ SVN_ERR(read_phys_pack_file(query, revision, result_pool, iterpool));
+ }
+
+ /* read non-packed revs */
+ for ( ; revision <= query->head; ++revision)
+ {
+ svn_pool_clear(iterpool);
+
+ if (svn_fs_fs__use_log_addressing(query->fs))
+ SVN_ERR(read_log_revision_file(query, revision, result_pool,
+ iterpool));
+ else
+ SVN_ERR(read_phys_revision_file(query, revision, result_pool,
+ iterpool));
+ }
+
+ svn_pool_destroy(iterpool);
+
+ return SVN_NO_ERROR;
+}
+
+/* Accumulate stats of REP in STATS.
+ */
+static void
+add_rep_pack_stats(svn_fs_fs__rep_pack_stats_t *stats,
+ rep_stats_t *rep)
+{
+ stats->count++;
+
+ stats->packed_size += rep->size;
+ stats->expanded_size += rep->expanded_size;
+ stats->overhead_size += rep->header_size + 7 /* ENDREP\n */;
+}
+
+/* Accumulate stats of REP in STATS.
+ */
+static void
+add_rep_stats(svn_fs_fs__representation_stats_t *stats,
+ rep_stats_t *rep)
+{
+ add_rep_pack_stats(&stats->total, rep);
+ if (rep->ref_count == 1)
+ add_rep_pack_stats(&stats->uniques, rep);
+ else
+ add_rep_pack_stats(&stats->shared, rep);
+
+ stats->references += rep->ref_count;
+ stats->expanded_size += rep->ref_count * rep->expanded_size;
+}
+
+/* Aggregate the info the in revision_info_t * array REVISIONS into the
+ * respectve fields of STATS.
+ */
+static void
+aggregate_stats(const apr_array_header_t *revisions,
+ svn_fs_fs__stats_t *stats)
+{
+ int i, k;
+
+ /* aggregate info from all revisions */
+ stats->revision_count = revisions->nelts;
+ for (i = 0; i < revisions->nelts; ++i)
+ {
+ revision_info_t *revision = APR_ARRAY_IDX(revisions, i,
+ revision_info_t *);
+
+ /* data gathered on a revision level */
+ stats->change_count += revision->change_count;
+ stats->change_len += revision->changes_len;
+ stats->total_size += revision->end - revision->offset;
+
+ stats->dir_node_stats.count += revision->dir_noderev_count;
+ stats->dir_node_stats.size += revision->dir_noderev_size;
+ stats->file_node_stats.count += revision->file_noderev_count;
+ stats->file_node_stats.size += revision->file_noderev_size;
+ stats->total_node_stats.count += revision->dir_noderev_count
+ + revision->file_noderev_count;
+ stats->total_node_stats.size += revision->dir_noderev_size
+ + revision->file_noderev_size;
+
+ /* process representations */
+ for (k = 0; k < revision->representations->nelts; ++k)
+ {
+ rep_stats_t *rep = APR_ARRAY_IDX(revision->representations, k,
+ rep_stats_t *);
+
+ /* accumulate in the right bucket */
+ switch(rep->kind)
+ {
+ case file_rep:
+ add_rep_stats(&stats->file_rep_stats, rep);
+ break;
+ case dir_rep:
+ add_rep_stats(&stats->dir_rep_stats, rep);
+ break;
+ case file_property_rep:
+ add_rep_stats(&stats->file_prop_rep_stats, rep);
+ break;
+ case dir_property_rep:
+ add_rep_stats(&stats->dir_prop_rep_stats, rep);
+ break;
+ default:
+ break;
+ }
+
+ add_rep_stats(&stats->total_rep_stats, rep);
+ }
+ }
+}
+
+/* Return a new svn_fs_fs__stats_t instance, allocated in RESULT_POOL.
+ */
+static svn_fs_fs__stats_t *
+create_stats(apr_pool_t *result_pool)
+{
+ svn_fs_fs__stats_t *stats = apr_pcalloc(result_pool, sizeof(*stats));
+
+ initialize_largest_changes(stats, 64, result_pool);
+ stats->by_extension = apr_hash_make(result_pool);
+
+ return stats;
+}
+
+/* Create a *QUERY, allocated in RESULT_POOL, reading filesystem FS and
+ * collecting results in STATS. Store the optional PROCESS_FUNC and
+ * PROGRESS_BATON as well as CANCEL_FUNC and CANCEL_BATON in *QUERY, too.
+ * Use SCRATCH_POOL for temporary allocations.
+ */
+static svn_error_t *
+create_query(query_t **query,
+ svn_fs_t *fs,
+ svn_fs_fs__stats_t *stats,
+ svn_fs_progress_notify_func_t progress_func,
+ void *progress_baton,
+ svn_cancel_func_t cancel_func,
+ void *cancel_baton,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ *query = apr_pcalloc(result_pool, sizeof(**query));
+
+ /* Read repository dimensions. */
+ (*query)->shard_size = svn_fs_fs__shard_size(fs);
+ SVN_ERR(svn_fs_fs__youngest_rev(&(*query)->head, fs, scratch_pool));
+ SVN_ERR(svn_fs_fs__min_unpacked_rev(&(*query)->min_unpacked_rev, fs,
+ scratch_pool));
+
+ /* create data containers and caches
+ * Note: this assumes that int is at least 32-bits and that we only support
+ * 32-bit wide revision numbers (actually 31-bits due to the signedness
+ * of both the nelts field of the array and our revision numbers). This
+ * means this code will fail on platforms where int is less than 32-bits
+ * and the repository has more revisions than int can hold. */
+ (*query)->revisions = apr_array_make(result_pool, (int) (*query)->head + 1,
+ sizeof(revision_info_t *));
+ (*query)->null_base = apr_pcalloc(result_pool,
+ sizeof(*(*query)->null_base));
+
+ /* Store other parameters */
+ (*query)->fs = fs;
+ (*query)->stats = stats;
+ (*query)->progress_func = progress_func;
+ (*query)->progress_baton = progress_baton;
+ (*query)->cancel_func = cancel_func;
+ (*query)->cancel_baton = cancel_baton;
+
+ return SVN_NO_ERROR;
+}
+
+svn_error_t *
+svn_fs_fs__get_stats(svn_fs_fs__stats_t **stats,
+ svn_fs_t *fs,
+ svn_fs_progress_notify_func_t progress_func,
+ void *progress_baton,
+ svn_cancel_func_t cancel_func,
+ void *cancel_baton,
+ apr_pool_t *result_pool,
+ apr_pool_t *scratch_pool)
+{
+ query_t *query;
+
+ *stats = create_stats(result_pool);
+ SVN_ERR(create_query(&query, fs, *stats, progress_func, progress_baton,
+ cancel_func, cancel_baton, scratch_pool,
+ scratch_pool));
+ SVN_ERR(read_revisions(query, scratch_pool, scratch_pool));
+ aggregate_stats(query->revisions, *stats);
+
+ return SVN_NO_ERROR;
+}