diff options
Diffstat (limited to 'tools/dev/fsfs-reorg.c')
-rw-r--r-- | tools/dev/fsfs-reorg.c | 3147 |
1 files changed, 3147 insertions, 0 deletions
diff --git a/tools/dev/fsfs-reorg.c b/tools/dev/fsfs-reorg.c new file mode 100644 index 0000000..052ad39 --- /dev/null +++ b/tools/dev/fsfs-reorg.c @@ -0,0 +1,3147 @@ +/* fsfs-reorg.c -- prototypic tool to reorganize packed FSFS repositories + * to reduce seeks + * + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * ==================================================================== + */ + + +#include <assert.h> + +#include <apr.h> +#include <apr_general.h> +#include <apr_file_io.h> +#include <apr_poll.h> + +#include "svn_pools.h" +#include "svn_diff.h" +#include "svn_io.h" +#include "svn_utf.h" +#include "svn_dirent_uri.h" +#include "svn_sorts.h" +#include "svn_delta.h" +#include "svn_hash.h" + +#include "private/svn_string_private.h" +#include "private/svn_subr_private.h" +#include "private/svn_dep_compat.h" + +#ifndef _ +#define _(x) x +#endif + +#define ERROR_TAG "fsfs-reporg: " + +/* forward declarations */ +typedef struct noderev_t noderev_t; +typedef struct revision_info_t revision_info_t; + +/* A FSFS rev file is sequence of fragments and unused space (the latter + * only being inserted by this tool and not during ordinary SVN operation). + * + * This type defines the type of any fragment. + * + * Please note that the classification as "property", "dir" or "file" + * fragments is only to be used while determining the future placement + * of a representation. If the rep is shared, the same rep may be used + * as *any* of the 3 kinds. + */ +enum fragment_kind_t +{ + /* the 2 number line containing changes and root node offsets */ + header_fragment, + + /* list of all changes in a revision */ + changes_fragment, + + /* (the textual representation of) a noderev */ + noderev_fragment, + + /* a property rep (including PLAIN / DELTA header) */ + property_fragment, + + /* a directory rep (including PLAIN / DELTA header) */ + dir_fragment, + + /* a file rep (including PLAIN / DELTA header) */ + file_fragment +}; + +/* A fragment. This is used to represent the final ordering, i.e. there + * will be an array containing elements of this type that basically put + * a fragment at some location in the target file. + */ +typedef struct fragment_t +{ + /* position in the target file */ + apr_size_t position; + + /* kind of fragment */ + enum fragment_kind_t kind; + + /* pointer to the fragment struct; type depends on KIND */ + void *data; +} fragment_t; + +/* Location info for a single revision. + */ +typedef struct revision_location_t +{ + /* pack file offset (manifest value), 0 for non-packed files */ + apr_size_t offset; + + /* offset of the changes list relative to OFFSET */ + apr_size_t changes; + + /* length of the changes list on bytes */ + apr_size_t changes_len; + + /* first offset behind the revision data in the pack file (file length + * for non-packed revs) */ + apr_size_t end; +} revision_location_t; + +/* Absolute position and size of some item. + */ +typedef struct location_t +{ + /* absolute offset in the file */ + apr_size_t offset; + + /* item length in bytes */ + apr_size_t size; +} location_t; + +/* A parsed directory entry. Note that instances of this struct may be + * shared between different DIRECTORY_T containers. + */ +typedef struct direntry_t +{ + /* (local) entry / path name */ + const char *name; + + /* strlen (name) */ + apr_size_t name_len; + + /* node rev providing ID and representation(s) */ + noderev_t *node; +} direntry_t; + +/* Representation of a parsed directory content. + */ +typedef struct directory_t +{ + /* array of pointers to DIRENTRY_T */ + apr_array_header_t *entries; + + /* MD5 of the textual representation. Will be set lazily as a side-effect + * of determining the length of this dir's textual representation. */ + unsigned char target_md5[16]; + + /* (expanded) length of the textual representation. + * Determined lazily during the write process. */ + apr_size_t size; +} directory_t; + +/* A representation fragment. + */ +typedef struct representation_t +{ + /* location in the source file */ + location_t original; + + /* location in the reordered target file */ + location_t target; + + /* length of the PLAIN / DELTA line in the source file in bytes */ + apr_size_t header_size; + + /* deltification base, or NULL if there is none */ + struct representation_t *delta_base; + + /* revision that contains this representation + * (may be referenced by other revisions, though) */ + revision_info_t *revision; + + /* representation content parsed as a directory. This will be NULL, if + * *no* directory noderev uses this representation. */ + directory_t *dir; + + /* the source content has a PLAIN header, so we may simply copy the + * source content into the target */ + svn_boolean_t is_plain; + + /* coloring flag used in the reordering algorithm to keep track of + * representations that still need to be placed. */ + svn_boolean_t covered; +} representation_t; + +/* A node rev. + */ +struct noderev_t +{ + /* location within the source file */ + location_t original; + + /* location within the reorganized target file. */ + location_t target; + + /* predecessor node, or NULL if there is none */ + noderev_t *predecessor; + + /* content representation; may be NULL if there is none */ + representation_t *text; + + /* properties representation; may be NULL if there is none */ + representation_t *props; + + /* revision that this noderev belongs to */ + revision_info_t *revision; + + /* coloring flag used in the reordering algorithm to keep track of + * representations that still need to be placed. */ + svn_boolean_t covered; +}; + +/* Represents a single revision. + * There will be only one instance per revision. */ +struct revision_info_t +{ + /* number of this revision */ + svn_revnum_t revision; + + /* position in the source file */ + revision_location_t original; + + /* position in the reorganized target file */ + revision_location_t target; + + /* noderev of the root directory */ + noderev_t *root_noderev; + + /* all noderevs_t of this revision (ordered by source file offset), + * i.e. those that point back to this struct */ + apr_array_header_t *node_revs; + + /* all representation_t of this revision (ordered by source file offset), + * i.e. those that point back to this struct */ + apr_array_header_t *representations; +}; + +/* Represents a packed revision file. + */ +typedef struct revision_pack_t +{ + /* first revision in the pack file */ + svn_revnum_t base; + + /* revision_info_t* of all revisions in the pack file; in revision order. */ + apr_array_header_t *info; + + /* list of fragments to place in the target pack file; in target order. */ + apr_array_header_t *fragments; + + /* source pack file length */ + apr_size_t filesize; + + /* temporary value. Equal to the number of bytes in the target pack file + * already allocated to fragments. */ + apr_size_t target_offset; +} revision_pack_t; + +/* Cache for revision source content. All content is stored in DATA and + * the HASH maps revision number to an svn_string_t instance whose data + * member points into DATA. + * + * Once TOTAL_SIZE exceeds LIMIT, all content will be discarded. Similarly, + * the hash gets cleared every 10000 insertions to keep the HASH_POOL + * memory usage in check. + */ +typedef struct content_cache_t +{ + /* pool used for HASH */ + apr_pool_t *hash_pool; + + /* svn_revnum_t -> svn_string_t. + * The strings become (potentially) invalid when adding new cache entries. */ + apr_hash_t *hash; + + /* data buffer. the first TOTAL_SIZE bytes are actually being used. */ + char *data; + + /* DATA capacity */ + apr_size_t limit; + + /* number of bytes used in DATA */ + apr_size_t total_size; + + /* number of insertions since the last hash cleanup */ + apr_size_t insert_count; +} content_cache_t; + +/* A cached directory. In contrast to directory_t, this stored the data as + * the plain hash that the normal FSFS will use to serialize & diff dirs. + */ +typedef struct dir_cache_entry_t +{ + /* revision containing the representation */ + svn_revnum_t revision; + + /* offset of the representation within that revision */ + apr_size_t offset; + + /* key-value representation of the directory entries */ + apr_hash_t *hash; +} dir_cache_entry_t; + +/* Directory cache. (revision, offset) will be mapped directly into the + * ENTRIES array of ENTRY_COUNT buckets (many entries will be NULL). + * Two alternating pools will be used to allocate dir content. + * + * If the INSERT_COUNT exceeds a given limit, the pools get exchanged and + * the older of the two will be cleared. This is to keep dir objects valid + * for at least one insertion. + */ +typedef struct dir_cache_t +{ + /* fixed-size array of ENTRY_COUNT elements */ + dir_cache_entry_t *entries; + + /* currently used for entry allocations */ + apr_pool_t *pool1; + + /* previously used for entry allocations */ + apr_pool_t *pool2; + + /* size of ENTRIES in elements */ + apr_size_t entry_count; + + /* number of directory elements added. I.e. usually >> #cached dirs */ + apr_size_t insert_count; +} dir_cache_t; + +/* A cached, undeltified txdelta window. + */ +typedef struct window_cache_entry_t +{ + /* revision containing the window */ + svn_revnum_t revision; + + /* offset of the deltified window within that revision */ + apr_size_t offset; + + /* window content */ + svn_stringbuf_t *window; +} window_cache_entry_t; + +/* Cache for undeltified txdelta windows. (revision, offset) will be mapped + * directly into the ENTRIES array of INSERT_COUNT buckets (most entries + * will be NULL). + * + * The cache will be cleared when USED exceeds CAPACITY. + */ +typedef struct window_cache_t +{ + /* fixed-size array of ENTRY_COUNT elements */ + window_cache_entry_t *entries; + + /* used to allocate windows */ + apr_pool_t *pool; + + /* size of ENTRIES in elements */ + apr_size_t entry_count; + + /* maximum combined size of all cached windows */ + apr_size_t capacity; + + /* current combined size of all cached windows */ + apr_size_t used; +} window_cache_t; + +/* Root data structure containing all information about a given repository. + */ +typedef struct fs_fs_t +{ + /* repository to reorg */ + const char *path; + + /* revision to start at (must be 0, ATM) */ + svn_revnum_t start_revision; + + /* FSFS format number */ + int format; + + /* highest revision number in the repo */ + svn_revnum_t max_revision; + + /* first non-packed revision */ + svn_revnum_t min_unpacked_rev; + + /* sharing size*/ + int max_files_per_dir; + + /* all revisions */ + apr_array_header_t *revisions; + + /* all packed files */ + apr_array_header_t *packs; + + /* empty representation. + * Used as a dummy base for DELTA reps without base. */ + representation_t *null_base; + + /* revision content cache */ + content_cache_t *cache; + + /* directory hash cache */ + dir_cache_t *dir_cache; + + /* undeltified txdelta window cache */ + window_cache_t *window_cache; +} fs_fs_t; + +/* Return the rev pack folder for revision REV in FS. + */ +static const char * +get_pack_folder(fs_fs_t *fs, + svn_revnum_t rev, + apr_pool_t *pool) +{ + return apr_psprintf(pool, "%s/db/revs/%ld.pack", + fs->path, rev / fs->max_files_per_dir); +} + +/* Return the path of the file containing revision REV in FS. + */ +static const char * +rev_or_pack_file_name(fs_fs_t *fs, + svn_revnum_t rev, + apr_pool_t *pool) +{ + return fs->min_unpacked_rev > rev + ? svn_dirent_join(get_pack_folder(fs, rev, pool), "pack", pool) + : apr_psprintf(pool, "%s/db/revs/%ld/%ld", fs->path, + rev / fs->max_files_per_dir, rev); +} + +/* Open the file containing revision REV in FS and return it in *FILE. + */ +static svn_error_t * +open_rev_or_pack_file(apr_file_t **file, + fs_fs_t *fs, + svn_revnum_t rev, + apr_pool_t *pool) +{ + return svn_io_file_open(file, + rev_or_pack_file_name(fs, rev, pool), + APR_READ | APR_BUFFERED, + APR_OS_DEFAULT, + pool); +} + +/* Read the whole content of the file containing REV in FS and return that + * in *CONTENT. + */ +static svn_error_t * +read_rev_or_pack_file(svn_stringbuf_t **content, + fs_fs_t *fs, + svn_revnum_t rev, + apr_pool_t *pool) +{ + return svn_stringbuf_from_file2(content, + rev_or_pack_file_name(fs, rev, pool), + pool); +} + +/* Return a new content cache with the given size LIMIT. Use POOL for + * all cache-related allocations. + */ +static content_cache_t * +create_content_cache(apr_pool_t *pool, + apr_size_t limit) +{ + content_cache_t *result = apr_pcalloc(pool, sizeof(*result)); + + result->hash_pool = svn_pool_create(pool); + result->hash = svn_hash__make(result->hash_pool); + result->limit = limit; + result->total_size = 0; + result->insert_count = 0; + result->data = apr_palloc(pool, limit); + + return result; +} + +/* Return the content of revision REVISION from CACHE. Return NULL upon a + * cache miss. This is a cache-internal function. + */ +static svn_string_t * +get_cached_content(content_cache_t *cache, + svn_revnum_t revision) +{ + return apr_hash_get(cache->hash, &revision, sizeof(revision)); +} + +/* Take the content in DATA and store it under REVISION in CACHE. + * This is a cache-internal function. + */ +static void +set_cached_content(content_cache_t *cache, + svn_revnum_t revision, + svn_string_t *data) +{ + svn_string_t *content; + svn_revnum_t *key; + + /* double insertion? -> broken cache logic */ + assert(get_cached_content(cache, revision) == NULL); + + /* purge the cache upon overflow */ + if (cache->total_size + data->len > cache->limit) + { + /* the hash pool grows slowly over time; clear it once in a while */ + if (cache->insert_count > 10000) + { + svn_pool_clear(cache->hash_pool); + cache->hash = svn_hash__make(cache->hash_pool); + cache->insert_count = 0; + } + else + cache->hash = svn_hash__make(cache->hash_pool); + + cache->total_size = 0; + + /* buffer overflow / revision too large */ + if (data->len > cache->limit) + SVN_ERR_MALFUNCTION_NO_RETURN(); + } + + /* copy data to cache and update he index (hash) */ + content = apr_palloc(cache->hash_pool, sizeof(*content)); + content->data = cache->data + cache->total_size; + content->len = data->len; + + memcpy(cache->data + cache->total_size, data->data, data->len); + cache->total_size += data->len; + + key = apr_palloc(cache->hash_pool, sizeof(*key)); + *key = revision; + + apr_hash_set(cache->hash, key, sizeof(*key), content); + ++cache->insert_count; +} + +/* Get the file content of revision REVISION in FS and return it in *DATA. + * Use SCRATCH_POOL for temporary allocations. + */ +static svn_error_t * +get_content(svn_string_t **data, + fs_fs_t *fs, + svn_revnum_t revision, + apr_pool_t *scratch_pool) +{ + apr_file_t *file; + revision_info_t *revision_info; + svn_stringbuf_t *temp; + apr_off_t temp_offset; + + /* try getting the data from our cache */ + svn_string_t *result = get_cached_content(fs->cache, revision); + if (result) + { + *data = result; + return SVN_NO_ERROR; + } + + /* not in cache. Is the revision valid at all? */ + if (revision - fs->start_revision > fs->revisions->nelts) + return svn_error_createf(SVN_ERR_BAD_VERSION_FILE_FORMAT, NULL, + _("Unknown revision %ld"), revision); + revision_info = APR_ARRAY_IDX(fs->revisions, + revision - fs->start_revision, + revision_info_t*); + + /* read the revision content. Assume that the file has *not* been + * reorg'ed, yet, i.e. all data is in one place. */ + temp = svn_stringbuf_create_ensure( revision_info->original.end + - revision_info->original.offset, + scratch_pool); + temp->len = revision_info->original.end - revision_info->original.offset; + SVN_ERR(open_rev_or_pack_file(&file, fs, revision, scratch_pool)); + + temp_offset = revision_info->original.offset; + SVN_ERR(svn_io_file_seek(file, APR_SET, &temp_offset, + scratch_pool)); + SVN_ERR_ASSERT(temp_offset < APR_SIZE_MAX); + revision_info->original.offset = (apr_size_t)temp_offset; + SVN_ERR(svn_io_file_read(file, temp->data, &temp->len, scratch_pool)); + + /* cache the result and return it */ + set_cached_content(fs->cache, revision, + svn_stringbuf__morph_into_string(temp)); + *data = get_cached_content(fs->cache, revision); + + return SVN_NO_ERROR; +} + +/* Return a new directory cache with ENTRY_COUNT buckets in its index. + * Use POOL for all cache-related allocations. + */ +static dir_cache_t * +create_dir_cache(apr_pool_t *pool, + apr_size_t entry_count) +{ + dir_cache_t *result = apr_pcalloc(pool, sizeof(*result)); + + result->pool1 = svn_pool_create(pool); + result->pool2 = svn_pool_create(pool); + result->entry_count = entry_count; + result->insert_count = 0; + result->entries = apr_pcalloc(pool, sizeof(*result->entries) * entry_count); + + return result; +} + +/* Return the position within FS' dir cache ENTRIES index for the given + * (REVISION, OFFSET) pair. This is a cache-internal function. + */ +static apr_size_t +get_dir_cache_index(fs_fs_t *fs, + svn_revnum_t revision, + apr_size_t offset) +{ + return (revision + offset * 0xd1f3da69) % fs->dir_cache->entry_count; +} + +/* Return the currently active pool of FS' dir cache. Note that it may be + * cleared after *2* insertions. + */ +static apr_pool_t * +get_cached_dir_pool(fs_fs_t *fs) +{ + return fs->dir_cache->pool1; +} + +/* Return the cached directory content stored in REPRESENTATION within FS. + * If that has not been found in cache, return NULL. + */ +static apr_hash_t * +get_cached_dir(fs_fs_t *fs, + representation_t *representation) +{ + svn_revnum_t revision = representation->revision->revision; + apr_size_t offset = representation->original.offset; + + apr_size_t i = get_dir_cache_index(fs, revision, offset); + dir_cache_entry_t *entry = &fs->dir_cache->entries[i]; + + return entry->offset == offset && entry->revision == revision + ? entry->hash + : NULL; +} + +/* Cache the directory HASH for REPRESENTATION within FS. + */ +static void +set_cached_dir(fs_fs_t *fs, + representation_t *representation, + apr_hash_t *hash) +{ + /* select the entry to use */ + svn_revnum_t revision = representation->revision->revision; + apr_size_t offset = representation->original.offset; + + apr_size_t i = get_dir_cache_index(fs, revision, offset); + dir_cache_entry_t *entry = &fs->dir_cache->entries[i]; + + /* clean the cache and rotate pools at regular intervals */ + fs->dir_cache->insert_count += apr_hash_count(hash); + if (fs->dir_cache->insert_count >= fs->dir_cache->entry_count * 100) + { + apr_pool_t *pool; + + svn_pool_clear(fs->dir_cache->pool2); + memset(fs->dir_cache->entries, + 0, + sizeof(*fs->dir_cache->entries) * fs->dir_cache->entry_count); + fs->dir_cache->insert_count = 0; + + pool = fs->dir_cache->pool2; + fs->dir_cache->pool2 = fs->dir_cache->pool1; + fs->dir_cache->pool1 = pool; + } + + /* write data to cache */ + entry->hash = hash; + entry->offset = offset; + entry->revision = revision; +} + +/* Return a new txdelta window cache with ENTRY_COUNT buckets in its index + * and a the total CAPACITY given in bytes. + * Use POOL for all cache-related allocations. + */ +static window_cache_t * +create_window_cache(apr_pool_t *pool, + apr_size_t entry_count, + apr_size_t capacity) +{ + window_cache_t *result = apr_pcalloc(pool, sizeof(*result)); + + result->pool = svn_pool_create(pool); + result->entry_count = entry_count; + result->capacity = capacity; + result->used = 0; + result->entries = apr_pcalloc(pool, sizeof(*result->entries) * entry_count); + + return result; +} + +/* Return the position within FS' window cache ENTRIES index for the given + * (REVISION, OFFSET) pair. This is a cache-internal function. + */ +static apr_size_t +get_window_cache_index(fs_fs_t *fs, + svn_revnum_t revision, + apr_size_t offset) +{ + return (revision + offset * 0xd1f3da69) % fs->window_cache->entry_count; +} + +/* Return the cached txdelta window stored in REPRESENTATION within FS. + * If that has not been found in cache, return NULL. + */ +static svn_stringbuf_t * +get_cached_window(fs_fs_t *fs, + representation_t *representation, + apr_pool_t *pool) +{ + svn_revnum_t revision = representation->revision->revision; + apr_size_t offset = representation->original.offset; + + apr_size_t i = get_window_cache_index(fs, revision, offset); + window_cache_entry_t *entry = &fs->window_cache->entries[i]; + + return entry->offset == offset && entry->revision == revision + ? svn_stringbuf_dup(entry->window, pool) + : NULL; +} + +/* Cache the undeltified txdelta WINDOW for REPRESENTATION within FS. + */ +static void +set_cached_window(fs_fs_t *fs, + representation_t *representation, + svn_stringbuf_t *window) +{ + /* select entry */ + svn_revnum_t revision = representation->revision->revision; + apr_size_t offset = representation->original.offset; + + apr_size_t i = get_window_cache_index(fs, revision, offset); + window_cache_entry_t *entry = &fs->window_cache->entries[i]; + + /* if the capacity is exceeded, clear the cache */ + fs->window_cache->used += window->len; + if (fs->window_cache->used >= fs->window_cache->capacity) + { + svn_pool_clear(fs->window_cache->pool); + memset(fs->window_cache->entries, + 0, + sizeof(*fs->window_cache->entries) * fs->window_cache->entry_count); + fs->window_cache->used = window->len; + } + + /* set the entry to a copy of the window data */ + entry->window = svn_stringbuf_dup(window, fs->window_cache->pool); + entry->offset = offset; + entry->revision = revision; +} + +/* Given rev pack PATH in FS, read the manifest file and return the offsets + * in *MANIFEST. Use POOL for allocations. + */ +static svn_error_t * +read_manifest(apr_array_header_t **manifest, + fs_fs_t *fs, + const char *path, + apr_pool_t *pool) +{ + svn_stream_t *manifest_stream; + apr_pool_t *iterpool; + + /* Open the manifest file. */ + SVN_ERR(svn_stream_open_readonly(&manifest_stream, + svn_dirent_join(path, "manifest", pool), + pool, pool)); + + /* While we're here, let's just read the entire manifest file into an array, + so we can cache the entire thing. */ + iterpool = svn_pool_create(pool); + *manifest = apr_array_make(pool, fs->max_files_per_dir, sizeof(apr_size_t)); + while (1) + { + svn_stringbuf_t *sb; + svn_boolean_t eof; + apr_uint64_t val; + svn_error_t *err; + + svn_pool_clear(iterpool); + SVN_ERR(svn_stream_readline(manifest_stream, &sb, "\n", &eof, iterpool)); + if (eof) + break; + + err = svn_cstring_strtoui64(&val, sb->data, 0, APR_SIZE_MAX, 10); + if (err) + return svn_error_createf(SVN_ERR_FS_CORRUPT, err, + _("Manifest offset '%s' too large"), + sb->data); + APR_ARRAY_PUSH(*manifest, apr_size_t) = (apr_size_t)val; + } + svn_pool_destroy(iterpool); + + return svn_stream_close(manifest_stream); +} + +/* Read header information for the revision stored in FILE_CONTENT at + * offsets START or END. Return the offsets within FILE_CONTENT for the + * *ROOT_NODEREV, the list of *CHANGES and its len in *CHANGES_LEN. + * Use POOL for temporary allocations. */ +static svn_error_t * +read_revision_header(apr_size_t *changes, + apr_size_t *changes_len, + apr_size_t *root_noderev, + svn_stringbuf_t *file_content, + apr_size_t start, + apr_size_t end, + apr_pool_t *pool) +{ + char buf[64]; + const char *line; + char *space; + apr_uint64_t val; + apr_size_t len; + + /* Read in this last block, from which we will identify the last line. */ + len = sizeof(buf); + if (start + len > end) + len = end - start; + + memcpy(buf, file_content->data + end - len, len); + + /* The last byte should be a newline. */ + if (buf[(apr_ssize_t)len - 1] != '\n') + return svn_error_create(SVN_ERR_FS_CORRUPT, NULL, + _("Revision lacks trailing newline")); + + /* Look for the next previous newline. */ + buf[len - 1] = 0; + line = strrchr(buf, '\n'); + if (line == NULL) + return svn_error_create(SVN_ERR_FS_CORRUPT, NULL, + _("Final line in revision file longer " + "than 64 characters")); + + space = strchr(line, ' '); + if (space == NULL) + return svn_error_create(SVN_ERR_FS_CORRUPT, NULL, + _("Final line in revision file missing space")); + + /* terminate the header line */ + *space = 0; + + /* extract information */ + SVN_ERR(svn_cstring_strtoui64(&val, line+1, 0, APR_SIZE_MAX, 10)); + *root_noderev = (apr_size_t)val; + SVN_ERR(svn_cstring_strtoui64(&val, space+1, 0, APR_SIZE_MAX, 10)); + *changes = (apr_size_t)val; + *changes_len = end - *changes - start - (buf + len - line) + 1; + + return SVN_NO_ERROR; +} + +/* Read the FSFS format number and sharding size from the format file at + * PATH and return it in *PFORMAT and *MAX_FILES_PER_DIR respectively. + * Use POOL for temporary allocations. + */ +static svn_error_t * +read_format(int *pformat, int *max_files_per_dir, + const char *path, apr_pool_t *pool) +{ + svn_error_t *err; + apr_file_t *file; + char buf[80]; + apr_size_t len; + + /* open format file and read the first line */ + err = svn_io_file_open(&file, path, APR_READ | APR_BUFFERED, + APR_OS_DEFAULT, pool); + if (err && APR_STATUS_IS_ENOENT(err->apr_err)) + { + /* Treat an absent format file as format 1. Do not try to + create the format file on the fly, because the repository + might be read-only for us, or this might be a read-only + operation, and the spirit of FSFS is to make no changes + whatseover in read-only operations. See thread starting at + http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=97600 + for more. */ + svn_error_clear(err); + *pformat = 1; + *max_files_per_dir = 0; + + return SVN_NO_ERROR; + } + SVN_ERR(err); + + len = sizeof(buf); + err = svn_io_read_length_line(file, buf, &len, pool); + if (err && APR_STATUS_IS_EOF(err->apr_err)) + { + /* Return a more useful error message. */ + svn_error_clear(err); + return svn_error_createf(SVN_ERR_BAD_VERSION_FILE_FORMAT, NULL, + _("Can't read first line of format file '%s'"), + svn_dirent_local_style(path, pool)); + } + SVN_ERR(err); + + /* Check that the first line contains only digits. */ + SVN_ERR(svn_cstring_atoi(pformat, buf)); + + /* Set the default values for anything that can be set via an option. */ + *max_files_per_dir = 0; + + /* Read any options. */ + while (1) + { + len = sizeof(buf); + err = svn_io_read_length_line(file, buf, &len, pool); + if (err && APR_STATUS_IS_EOF(err->apr_err)) + { + /* No more options; that's okay. */ + svn_error_clear(err); + break; + } + SVN_ERR(err); + + if (strncmp(buf, "layout ", 7) == 0) + { + if (strcmp(buf+7, "linear") == 0) + { + *max_files_per_dir = 0; + continue; + } + + if (strncmp(buf+7, "sharded ", 8) == 0) + { + /* Check that the argument is numeric. */ + SVN_ERR(svn_cstring_atoi(max_files_per_dir, buf + 15)); + continue; + } + } + + return svn_error_createf(SVN_ERR_BAD_VERSION_FILE_FORMAT, NULL, + _("'%s' contains invalid filesystem format option '%s'"), + svn_dirent_local_style(path, pool), buf); + } + + return svn_io_file_close(file, pool); +} + +/* Read the content of the file at PATH and return it in *RESULT. + * Use POOL for temporary allocations. + */ +static svn_error_t * +read_number(svn_revnum_t *result, const char *path, apr_pool_t *pool) +{ + svn_stringbuf_t *content; + apr_uint64_t number; + + SVN_ERR(svn_stringbuf_from_file2(&content, path, pool)); + + content->data[content->len-1] = 0; + SVN_ERR(svn_cstring_strtoui64(&number, content->data, 0, LONG_MAX, 10)); + *result = (svn_revnum_t)number; + + return SVN_NO_ERROR; +} + +/* Create *FS for the repository at PATH and read the format and size info. + * Use POOL for temporary allocations. + */ +static svn_error_t * +fs_open(fs_fs_t **fs, const char *path, apr_pool_t *pool) +{ + *fs = apr_pcalloc(pool, sizeof(**fs)); + (*fs)->path = apr_pstrdup(pool, path); + (*fs)->max_files_per_dir = 1000; + + /* Read the FS format number. */ + SVN_ERR(read_format(&(*fs)->format, + &(*fs)->max_files_per_dir, + svn_dirent_join(path, "db/format", pool), + pool)); + if (((*fs)->format != 4) && ((*fs)->format != 6)) + return svn_error_create(SVN_ERR_FS_UNSUPPORTED_FORMAT, NULL, NULL); + + /* read size (HEAD) info */ + SVN_ERR(read_number(&(*fs)->min_unpacked_rev, + svn_dirent_join(path, "db/min-unpacked-rev", pool), + pool)); + return read_number(&(*fs)->max_revision, + svn_dirent_join(path, "db/current", pool), + pool); +} + +/* Utility function that returns true if STRING->DATA matches KEY. + */ +static svn_boolean_t +key_matches(svn_string_t *string, const char *key) +{ + return strcmp(string->data, key) == 0; +} + +/* Comparator used for binary search comparing the absolute file offset + * of a noderev to some other offset. DATA is a *noderev_t, KEY is pointer + * to an apr_size_t. + */ +static int +compare_noderev_offsets(const void *data, const void *key) +{ + apr_ssize_t diff = (*(const noderev_t *const *)data)->original.offset + - *(const apr_size_t *)key; + + /* sizeof(int) may be < sizeof(ssize_t) */ + if (diff < 0) + return -1; + return diff > 0 ? 1 : 0; +} + +/* Get the revision and offset info from the node ID with FS. Return the + * data as *REVISION_INFO and *OFFSET, respectively. + * + * Note that we assume that the revision_info_t object ID's revision has + * already been created. That can be guaranteed for standard FSFS pack + * files as IDs never point to future revisions. + */ +static svn_error_t * +parse_revnode_pos(revision_info_t **revision_info, + apr_size_t *offset, + fs_fs_t *fs, + svn_string_t *id) +{ + int revision; + apr_uint64_t temp; + + /* split the ID and verify the format */ + const char *revision_pos = strrchr(id->data, 'r'); + char *offset_pos = (char *)strchr(id->data, '/'); + + if (revision_pos == NULL || offset_pos == NULL) + return svn_error_createf(SVN_ERR_BAD_VERSION_FILE_FORMAT, NULL, + _("Invalid node id '%s'"), id->data); + + /* extract the numbers (temp. modifying the ID)*/ + *offset_pos = 0; + SVN_ERR(svn_cstring_atoi(&revision, revision_pos + 1)); + SVN_ERR(svn_cstring_strtoui64(&temp, offset_pos + 1, 0, APR_SIZE_MAX, 10)); + *offset = (apr_size_t)temp; + *offset_pos = '/'; + + /* validate the revision number and return the revision info */ + if (revision - fs->start_revision > fs->revisions->nelts) + return svn_error_createf(SVN_ERR_BAD_VERSION_FILE_FORMAT, NULL, + _("Unknown revision %d"), revision); + + *revision_info = APR_ARRAY_IDX(fs->revisions, + revision - fs->start_revision, + revision_info_t*); + + return SVN_NO_ERROR; +} + +/* Returns in *RESULT the noderev at OFFSET relative the revision given in + * REVISION_INFO. If no such noderev has been parsed, yet, error out. + * + * Since we require the noderev to already have been parsed, we can use + * this functions only to access "older", i.e. predecessor noderevs. + */ +static svn_error_t * +find_noderev(noderev_t **result, + revision_info_t *revision_info, + apr_size_t offset) +{ + int idx = svn_sort__bsearch_lower_bound(&offset, + revision_info->node_revs, + compare_noderev_offsets); + if ((idx < 0) || (idx >= revision_info->node_revs->nelts)) + return svn_error_createf(SVN_ERR_BAD_VERSION_FILE_FORMAT, NULL, + _("No noderev found at offset %ld"), + (long)offset); + + *result = APR_ARRAY_IDX(revision_info->node_revs, idx, noderev_t *); + if ((*result)->original.offset != offset) + return svn_error_createf(SVN_ERR_BAD_VERSION_FILE_FORMAT, NULL, + _("No noderev found at offset %ld"), + (long)offset); + + return SVN_NO_ERROR; +} + +/* In *RESULT, return the noderev given by ID in FS. The noderev must + * already have been parsed and put into the FS data structures. + */ +static svn_error_t * +parse_pred(noderev_t **result, + fs_fs_t *fs, + svn_string_t *id) +{ + apr_size_t offset; + revision_info_t *revision_info; + + SVN_ERR(parse_revnode_pos(&revision_info, &offset, fs, id)); + SVN_ERR(find_noderev(result, revision_info, offset)); + + return SVN_NO_ERROR; +} + +/* Comparator used for binary search comparing the absolute file offset + * of a representation to some other offset. DATA is a *representation_t, + * KEY is a pointer to an apr_size_t. + */ +static int +compare_representation_offsets(const void *data, const void *key) +{ + apr_ssize_t diff = (*(const representation_t *const *)data)->original.offset + - *(const apr_size_t *)key; + + /* sizeof(int) may be < sizeof(ssize_t) */ + if (diff < 0) + return -1; + return diff > 0 ? 1 : 0; +} + +/* Find the revision_info_t object to the given REVISION in FS and return + * it in *REVISION_INFO. For performance reasons, we skip the lookup if + * the info is already provided. + * + * In that revision, look for the representation_t object for offset OFFSET. + * If it already exists, set *idx to its index in *REVISION_INFO's + * representations list and return the representation object. Otherwise, + * set the index to where it must be inserted and return NULL. + */ +static representation_t * +find_representation(int *idx, + fs_fs_t *fs, + revision_info_t **revision_info, + int revision, + apr_size_t offset) +{ + revision_info_t *info; + *idx = -1; + + /* first let's find the revision '*/ + info = revision_info ? *revision_info : NULL; + if (info == NULL || info->revision != revision) + { + info = APR_ARRAY_IDX(fs->revisions, + revision - fs->start_revision, + revision_info_t*); + if (revision_info) + *revision_info = info; + } + + /* not found -> no result */ + if (info == NULL) + return NULL; + + assert(revision == info->revision); + + /* look for the representation */ + *idx = svn_sort__bsearch_lower_bound(&offset, + info->representations, + compare_representation_offsets); + if (*idx < info->representations->nelts) + { + /* return the representation, if this is the one we were looking for */ + representation_t *result + = APR_ARRAY_IDX(info->representations, *idx, representation_t *); + if (result->original.offset == offset) + return result; + } + + /* not parsed, yet */ + return NULL; +} + +/* Read the representation header in FILE_CONTENT at OFFSET. Return its + * size in *HEADER_SIZE, set *IS_PLAIN if no deltification was used and + * return the deltification base representation in *REPRESENTATION. If + * there is none, set it to NULL. Use FS to it look up. + * + * Use SCRATCH_POOL for temporary allocations. + */ +static svn_error_t * +read_rep_base(representation_t **representation, + apr_size_t *header_size, + svn_boolean_t *is_plain, + fs_fs_t *fs, + svn_stringbuf_t *file_content, + apr_size_t offset, + apr_pool_t *scratch_pool) +{ + char *str, *last_str; + int idx, revision; + apr_uint64_t temp; + + /* identify representation header (1 line) */ + const char *buffer = file_content->data + offset; + const char *line_end = strchr(buffer, '\n'); + *header_size = line_end - buffer + 1; + + /* check for PLAIN rep */ + if (strncmp(buffer, "PLAIN\n", *header_size) == 0) + { + *is_plain = TRUE; + *representation = NULL; + return SVN_NO_ERROR; + } + + /* check for DELTA against empty rep */ + *is_plain = FALSE; + if (strncmp(buffer, "DELTA\n", *header_size) == 0) + { + /* This is a delta against the empty stream. */ + *representation = fs->null_base; + return SVN_NO_ERROR; + } + + /* it's delta against some other rep. Duplicate the header info such + * that we may modify it during parsing. */ + str = apr_pstrndup(scratch_pool, buffer, line_end - buffer); + last_str = str; + + /* parse it. */ + str = svn_cstring_tokenize(" ", &last_str); + str = svn_cstring_tokenize(" ", &last_str); + SVN_ERR(svn_cstring_atoi(&revision, str)); + + str = svn_cstring_tokenize(" ", &last_str); + SVN_ERR(svn_cstring_strtoui64(&temp, str, 0, APR_SIZE_MAX, 10)); + + /* it should refer to a rep in an earlier revision. Look it up */ + *representation = find_representation(&idx, fs, NULL, revision, (apr_size_t)temp); + return SVN_NO_ERROR; +} + +/* Parse the representation reference (text: or props:) in VALUE, look + * it up in FS and return it in *REPRESENTATION. To be able to parse the + * base rep, we pass the FILE_CONTENT as well. + * + * If necessary, allocate the result in POOL; use SCRATCH_POOL for temp. + * allocations. + */ +static svn_error_t * +parse_representation(representation_t **representation, + fs_fs_t *fs, + svn_stringbuf_t *file_content, + svn_string_t *value, + revision_info_t *revision_info, + apr_pool_t *pool, + apr_pool_t *scratch_pool) +{ + representation_t *result; + int revision; + + apr_uint64_t offset; + apr_uint64_t size; + int idx; + + /* read location (revision, offset) and size */ + char *c = (char *)value->data; + SVN_ERR(svn_cstring_atoi(&revision, svn_cstring_tokenize(" ", &c))); + SVN_ERR(svn_cstring_strtoui64(&offset, svn_cstring_tokenize(" ", &c), 0, APR_SIZE_MAX, 10)); + SVN_ERR(svn_cstring_strtoui64(&size, svn_cstring_tokenize(" ", &c), 0, APR_SIZE_MAX, 10)); + + /* look it up */ + result = find_representation(&idx, fs, &revision_info, revision, (apr_size_t)offset); + if (!result) + { + /* not parsed, yet (probably a rep in the same revision). + * Create a new rep object and determine its base rep as well. + */ + result = apr_pcalloc(pool, sizeof(*result)); + result->revision = revision_info; + result->original.offset = (apr_size_t)offset; + result->original.size = (apr_size_t)size; + SVN_ERR(read_rep_base(&result->delta_base, &result->header_size, + &result->is_plain, fs, file_content, + (apr_size_t)offset + revision_info->original.offset, + scratch_pool)); + + svn_sort__array_insert(&result, revision_info->representations, idx); + } + + *representation = result; + + return SVN_NO_ERROR; +} + +/* Read the delta window contents of all windows in REPRESENTATION in FS. + * Return the data as svn_txdelta_window_t* instances in *WINDOWS. + * Use POOL for allocations. + */ +static svn_error_t * +read_windows(apr_array_header_t **windows, + fs_fs_t *fs, + representation_t *representation, + apr_pool_t *pool) +{ + svn_string_t *content; + svn_string_t data; + svn_stream_t *stream; + apr_size_t offset = representation->original.offset + + representation->header_size; + char version; + apr_size_t len = sizeof(version); + + *windows = apr_array_make(pool, 0, sizeof(svn_txdelta_window_t *)); + + /* get the whole revision content */ + SVN_ERR(get_content(&content, fs, representation->revision->revision, pool)); + + /* create a read stream and position it directly after the rep header */ + data.data = content->data + offset + 3; + data.len = representation->original.size - 3; + stream = svn_stream_from_string(&data, pool); + SVN_ERR(svn_stream_read(stream, &version, &len)); + + /* read the windows from that stream */ + while (TRUE) + { + svn_txdelta_window_t *window; + svn_stream_mark_t *mark; + char dummy; + + len = sizeof(dummy); + SVN_ERR(svn_stream_mark(stream, &mark, pool)); + SVN_ERR(svn_stream_read(stream, &dummy, &len)); + if (len == 0) + break; + + SVN_ERR(svn_stream_seek(stream, mark)); + SVN_ERR(svn_txdelta_read_svndiff_window(&window, stream, version, pool)); + APR_ARRAY_PUSH(*windows, svn_txdelta_window_t *) = window; + } + + return SVN_NO_ERROR; +} + +/* Read the content of the PLAIN REPRESENTATION in FS and return it in + * *CONTENT. Use POOL for allocations. + */ +static svn_error_t * +read_plain(svn_stringbuf_t **content, + fs_fs_t *fs, + representation_t *representation, + apr_pool_t *pool) +{ + svn_string_t *data; + apr_size_t offset = representation->original.offset + + representation->header_size; + + SVN_ERR(get_content(&data, fs, representation->revision->revision, pool)); + + /* content is stored as fulltext already */ + *content = svn_stringbuf_ncreate(data->data + offset, + representation->original.size, + pool); + + return SVN_NO_ERROR; +} + +/* Get the undeltified representation that is a result of combining all + * deltas from the current desired REPRESENTATION in FS with its base + * representation. Store the result in *CONTENT. + * Use POOL for allocations. */ +static svn_error_t * +get_combined_window(svn_stringbuf_t **content, + fs_fs_t *fs, + representation_t *representation, + apr_pool_t *pool) +{ + int i; + apr_array_header_t *windows; + svn_stringbuf_t *base_content, *result; + const char *source; + apr_pool_t *sub_pool; + apr_pool_t *iter_pool; + + /* special case: no un-deltification necessary */ + if (representation->is_plain) + return read_plain(content, fs, representation, pool); + + /* special case: data already in cache */ + *content = get_cached_window(fs, representation, pool); + if (*content) + return SVN_NO_ERROR; + + /* read the delta windows for this representation */ + sub_pool = svn_pool_create(pool); + iter_pool = svn_pool_create(pool); + SVN_ERR(read_windows(&windows, fs, representation, sub_pool)); + + /* fetch the / create a base content */ + if (representation->delta_base && representation->delta_base->revision) + SVN_ERR(get_combined_window(&base_content, fs, + representation->delta_base, sub_pool)); + else + base_content = svn_stringbuf_create_empty(sub_pool); + + /* apply deltas */ + result = svn_stringbuf_create_empty(pool); + source = base_content->data; + + for (i = 0; i < windows->nelts; ++i) + { + svn_txdelta_window_t *window + = APR_ARRAY_IDX(windows, i, svn_txdelta_window_t *); + svn_stringbuf_t *buf + = svn_stringbuf_create_ensure(window->tview_len, iter_pool); + + buf->len = window->tview_len; + svn_txdelta_apply_instructions(window, window->src_ops ? source : NULL, + buf->data, &buf->len); + + svn_stringbuf_appendbytes(result, buf->data, buf->len); + source += window->sview_len; + + svn_pool_clear(iter_pool); + } + + svn_pool_destroy(iter_pool); + svn_pool_destroy(sub_pool); + + /* cache result and return it */ + set_cached_window(fs, representation, result); + *content = result; + + return SVN_NO_ERROR; +} + +/* forward declaration */ +static svn_error_t * +read_noderev(noderev_t **noderev, + fs_fs_t *fs, + svn_stringbuf_t *file_content, + apr_size_t offset, + revision_info_t *revision_info, + apr_pool_t *pool, + apr_pool_t *scratch_pool); + +/* Get the noderev at OFFSET in FILE_CONTENT in FS. The file content must + * pertain to the revision given in REVISION_INFO. If the data has not + * been read yet, parse it and store it in REVISION_INFO. Return the result + * in *NODEREV. + * + * Use POOL for allocations and SCRATCH_POOL for temporaries. + */ +static svn_error_t * +get_noderev(noderev_t **noderev, + fs_fs_t *fs, + svn_stringbuf_t *file_content, + apr_size_t offset, + revision_info_t *revision_info, + apr_pool_t *pool, + apr_pool_t *scratch_pool) +{ + int idx = svn_sort__bsearch_lower_bound(&offset, + revision_info->node_revs, + compare_noderev_offsets); + if ((idx < 0) || (idx >= revision_info->node_revs->nelts)) + SVN_ERR(read_noderev(noderev, fs, file_content, offset, revision_info, + pool, scratch_pool)); + else + { + *noderev = APR_ARRAY_IDX(revision_info->node_revs, idx, noderev_t *); + if ((*noderev)->original.offset != offset) + SVN_ERR(read_noderev(noderev, fs, file_content, offset, revision_info, + pool, scratch_pool)); + } + + return SVN_NO_ERROR; +} + +/* Read the directory stored in REPRESENTATION in FS into *HASH. The result + * will be allocated in FS' directory cache and it will be plain key-value + * hash. Use SCRATCH_POOL for temporary allocations. + */ +static svn_error_t * +read_dir(apr_hash_t **hash, + fs_fs_t *fs, + representation_t *representation, + apr_pool_t *scratch_pool) +{ + svn_stringbuf_t *text; + apr_pool_t *text_pool; + svn_stream_t *stream; + apr_pool_t *pool; + + /* chances are, we find the info in cache already */ + *hash = get_cached_dir(fs, representation); + if (*hash) + return SVN_NO_ERROR; + + /* create the result container */ + pool = get_cached_dir_pool(fs); + *hash = svn_hash__make(pool); + + /* if this is a non-empty rep, read it and de-serialize the hash */ + if (representation != NULL) + { + text_pool = svn_pool_create(scratch_pool); + SVN_ERR(get_combined_window(&text, fs, representation, text_pool)); + stream = svn_stream_from_stringbuf(text, text_pool); + SVN_ERR(svn_hash_read2(*hash, stream, SVN_HASH_TERMINATOR, pool)); + svn_pool_destroy(text_pool); + } + + /* cache the result */ + set_cached_dir(fs, representation, *hash); + + return SVN_NO_ERROR; +} + +/* Starting at the directory in REPRESENTATION in FILE_CONTENT, read all + * DAG nodes, directories and representations linked in that tree structure. + * Store them in FS and read them only once. + * + * Use POOL for persistent allocations and SCRATCH_POOL for temporaries. + */ +static svn_error_t * +parse_dir(fs_fs_t *fs, + svn_stringbuf_t *file_content, + representation_t *representation, + apr_pool_t *pool, + apr_pool_t *scratch_pool) +{ + apr_hash_t *hash; + apr_hash_index_t *hi; + apr_pool_t *iter_pool = svn_pool_create(scratch_pool); + apr_hash_t *base_dir = svn_hash__make(scratch_pool); + + /* special case: empty dir rep */ + if (representation == NULL) + return SVN_NO_ERROR; + + /* if we have a previous representation of that dir, hash it by name */ + if (representation->delta_base && representation->delta_base->dir) + { + apr_array_header_t *dir = representation->delta_base->dir->entries; + int i; + + for (i = 0; i < dir->nelts; ++i) + { + direntry_t *entry = APR_ARRAY_IDX(dir, i, direntry_t *); + apr_hash_set(base_dir, entry->name, entry->name_len, entry); + } + } + + /* read this directory */ + SVN_ERR(read_dir(&hash, fs, representation, scratch_pool)); + + /* add it as an array to the representation (entries yet to be filled) */ + representation->dir = apr_pcalloc(pool, sizeof(*representation->dir)); + representation->dir->entries + = apr_array_make(pool, apr_hash_count(hash), sizeof(direntry_t *)); + + /* Translate the string dir entries into real entries. Reuse existing + * objects as much as possible to keep memory consumption low. + */ + for (hi = apr_hash_first(pool, hash); hi; hi = apr_hash_next(hi)) + { + const char *name = svn__apr_hash_index_key(hi); + svn_string_t *str_val = svn__apr_hash_index_val(hi); + apr_size_t offset; + revision_info_t *revision_info; + + /* look for corresponding entry in previous version */ + apr_size_t name_len = strlen(name); + direntry_t *entry = base_dir + ? apr_hash_get(base_dir, name, name_len) + : NULL; + + /* parse the new target revnode ID (revision, offset) */ + SVN_ERR(parse_revnode_pos(&revision_info, &offset, fs, str_val)); + + /* if this is a new entry or if the content changed, create a new + * instance for it. */ + if ( !entry + || !entry->node->text + || entry->node->text->revision != revision_info + || entry->node->original.offset != offset) + { + /* create & init the new entry. Reuse the name string if possible */ + direntry_t *new_entry = apr_pcalloc(pool, sizeof(*entry)); + new_entry->name_len = name_len; + if (entry) + new_entry->name = entry->name; + else + new_entry->name = apr_pstrdup(pool, name); + + /* Link it to the content noderev. Recurse. */ + entry = new_entry; + SVN_ERR(get_noderev(&entry->node, fs, file_content, offset, + revision_info, pool, iter_pool)); + } + + /* set the directory entry */ + APR_ARRAY_PUSH(representation->dir->entries, direntry_t *) = entry; + svn_pool_clear(iter_pool); + } + + svn_pool_destroy(iter_pool); + return SVN_NO_ERROR; +} + +/* Starting at the noderev at OFFSET in FILE_CONTENT, read all DAG nodes, + * directories and representations linked in that tree structure. Store + * them in FS and read them only once. Return the result in *NODEREV. + * + * Use POOL for persistent allocations and SCRATCH_POOL for temporaries. + */ +static svn_error_t * +read_noderev(noderev_t **noderev, + fs_fs_t *fs, + svn_stringbuf_t *file_content, + apr_size_t offset, + revision_info_t *revision_info, + apr_pool_t *pool, + apr_pool_t *scratch_pool) +{ + noderev_t *result = apr_pcalloc(pool, sizeof(*result)); + svn_string_t *line; + svn_boolean_t is_dir = FALSE; + + scratch_pool = svn_pool_create(scratch_pool); + + /* parse the noderev line-by-line until we find an empty line */ + result->original.offset = offset; + while (1) + { + /* for this line, extract key and value. Ignore invalid values */ + svn_string_t key; + svn_string_t value; + char *sep; + const char *start = file_content->data + offset + + revision_info->original.offset; + const char *end = strchr(start, '\n'); + + line = svn_string_ncreate(start, end - start, scratch_pool); + offset += end - start + 1; + + /* empty line -> end of noderev data */ + if (line->len == 0) + break; + + sep = strchr(line->data, ':'); + if (sep == NULL) + continue; + + key.data = line->data; + key.len = sep - key.data; + *sep = 0; + + if (key.len + 2 > line->len) + continue; + + value.data = sep + 2; + value.len = line->len - (key.len + 2); + + /* translate (key, value) into noderev elements */ + if (key_matches(&key, "type")) + is_dir = strcmp(value.data, "dir") == 0; + else if (key_matches(&key, "pred")) + SVN_ERR(parse_pred(&result->predecessor, fs, &value)); + else if (key_matches(&key, "text")) + SVN_ERR(parse_representation(&result->text, fs, file_content, + &value, revision_info, + pool, scratch_pool)); + else if (key_matches(&key, "props")) + SVN_ERR(parse_representation(&result->props, fs, file_content, + &value, revision_info, + pool, scratch_pool)); + } + + /* link noderev to revision info */ + result->revision = revision_info; + result->original.size = offset - result->original.offset; + + svn_sort__array_insert(&result, + revision_info->node_revs, + svn_sort__bsearch_lower_bound(&offset, + revision_info->node_revs, + compare_noderev_offsets)); + + /* if this is a directory, read and process that recursively */ + if (is_dir) + SVN_ERR(parse_dir(fs, file_content, result->text, + pool, scratch_pool)); + + /* done */ + svn_pool_destroy(scratch_pool); + *noderev = result; + + return SVN_NO_ERROR; +} + +/* Simple utility to print a REVISION number and make it appear immediately. + */ +static void +print_progress(svn_revnum_t revision) +{ + printf("%8ld", revision); + fflush(stdout); +} + +/* Read the content of the pack file staring at revision BASE and store it + * in FS. Use POOL for allocations. + */ +static svn_error_t * +read_pack_file(fs_fs_t *fs, + svn_revnum_t base, + apr_pool_t *pool) +{ + apr_array_header_t *manifest = NULL; + apr_pool_t *local_pool = svn_pool_create(pool); + apr_pool_t *iter_pool = svn_pool_create(local_pool); + int i; + svn_stringbuf_t *file_content; + revision_pack_t *revisions; + const char *pack_folder = get_pack_folder(fs, base, local_pool); + + /* read the whole pack file into memory */ + SVN_ERR(read_rev_or_pack_file(&file_content, fs, base, local_pool)); + + /* create the revision container */ + revisions = apr_pcalloc(pool, sizeof(*revisions)); + revisions->base = base; + revisions->fragments = NULL; + revisions->info = apr_array_make(pool, + fs->max_files_per_dir, + sizeof(revision_info_t*)); + revisions->filesize = file_content->len; + APR_ARRAY_PUSH(fs->packs, revision_pack_t*) = revisions; + + /* parse the manifest file */ + SVN_ERR(read_manifest(&manifest, fs, pack_folder, local_pool)); + if (manifest->nelts != fs->max_files_per_dir) + return svn_error_create(SVN_ERR_FS_CORRUPT, NULL, NULL); + + /* process each revision in the pack file */ + for (i = 0; i < manifest->nelts; ++i) + { + apr_size_t root_node_offset; + svn_string_t rev_content; + + /* create the revision info for the current rev */ + revision_info_t *info = apr_pcalloc(pool, sizeof(*info)); + info->node_revs = apr_array_make(iter_pool, 4, sizeof(noderev_t*)); + info->representations = apr_array_make(iter_pool, 4, sizeof(representation_t*)); + + info->revision = base + i; + info->original.offset = APR_ARRAY_IDX(manifest, i, apr_size_t); + info->original.end = i+1 < manifest->nelts + ? APR_ARRAY_IDX(manifest, i+1 , apr_size_t) + : file_content->len; + SVN_ERR(read_revision_header(&info->original.changes, + &info->original.changes_len, + &root_node_offset, + file_content, + APR_ARRAY_IDX(manifest, i , apr_size_t), + info->original.end, + iter_pool)); + + /* put it into our containers */ + APR_ARRAY_PUSH(revisions->info, revision_info_t*) = info; + APR_ARRAY_PUSH(fs->revisions, revision_info_t*) = info; + + /* cache the revision content */ + rev_content.data = file_content->data + info->original.offset; + rev_content.len = info->original.end - info->original.offset; + set_cached_content(fs->cache, info->revision, &rev_content); + + /* parse the revision content recursively. */ + SVN_ERR(read_noderev(&info->root_noderev, fs, file_content, + root_node_offset, info, pool, iter_pool)); + + /* copy dynamically grown containers from temp into result pool */ + info->node_revs = apr_array_copy(pool, info->node_revs); + info->representations = apr_array_copy(pool, info->representations); + + /* destroy temps */ + svn_pool_clear(iter_pool); + } + + /* one more pack file processed */ + print_progress(base); + svn_pool_destroy(local_pool); + + return SVN_NO_ERROR; +} + +/* Read the content of REVSION file and store it in FS. + * Use POOL for allocations. + */ +static svn_error_t * +read_revision_file(fs_fs_t *fs, + svn_revnum_t revision, + apr_pool_t *pool) +{ + apr_size_t root_node_offset; + apr_pool_t *local_pool = svn_pool_create(pool); + svn_stringbuf_t *file_content; + svn_string_t rev_content; + revision_pack_t *revisions = apr_pcalloc(pool, sizeof(*revisions)); + revision_info_t *info = apr_pcalloc(pool, sizeof(*info)); + + /* read the whole pack file into memory */ + SVN_ERR(read_rev_or_pack_file(&file_content, fs, revision, local_pool)); + + /* create the revision info for the current rev */ + info->node_revs = apr_array_make(pool, 4, sizeof(noderev_t*)); + info->representations = apr_array_make(pool, 4, sizeof(representation_t*)); + + info->revision = revision; + info->original.offset = 0; + info->original.end = file_content->len; + SVN_ERR(read_revision_header(&info->original.changes, + &info->original.changes_len, + &root_node_offset, + file_content, + 0, + info->original.end, + local_pool)); + + /* put it into our containers */ + APR_ARRAY_PUSH(fs->revisions, revision_info_t*) = info; + + /* create a pseudo-pack file container for just this rev to keep our + * data structures as uniform as possible. + */ + revisions->base = revision; + revisions->fragments = NULL; + revisions->info = apr_array_make(pool, 1, sizeof(revision_info_t*)); + revisions->filesize = file_content->len; + APR_ARRAY_PUSH(revisions->info, revision_info_t*) = info; + APR_ARRAY_PUSH(fs->packs, revision_pack_t*) = revisions; + + /* cache the revision content */ + rev_content.data = file_content->data + info->original.offset; + rev_content.len = info->original.end - info->original.offset; + set_cached_content(fs->cache, info->revision, &rev_content); + + /* parse the revision content recursively. */ + SVN_ERR(read_noderev(&info->root_noderev, fs, file_content, + root_node_offset, info, + pool, local_pool)); + APR_ARRAY_PUSH(info->node_revs, noderev_t*) = info->root_noderev; + + /* show progress every 1000 revs or so */ + if (revision % fs->max_files_per_dir == 0) + print_progress(revision); + + svn_pool_destroy(local_pool); + + return SVN_NO_ERROR; +} + +/* Read the repository at PATH beginning with revision START_REVISION and + * return the result in *FS. Allocate caches with MEMSIZE bytes total + * capacity. Use POOL for non-cache allocations. + */ +static svn_error_t * +read_revisions(fs_fs_t **fs, + const char *path, + svn_revnum_t start_revision, + apr_size_t memsize, + apr_pool_t *pool) +{ + svn_revnum_t revision; + apr_size_t content_cache_size; + apr_size_t window_cache_size; + apr_size_t dir_cache_size; + + /* determine cache sizes */ + if (memsize < 100) + memsize = 100; + + content_cache_size = memsize * 7 / 10 > 4000 ? 4000 : memsize * 7 / 10; + window_cache_size = memsize * 2 / 10 * 1024 * 1024; + dir_cache_size = (memsize / 10) * 16000; + + /* read repo format and such */ + SVN_ERR(fs_open(fs, path, pool)); + + /* create data containers and caches */ + (*fs)->start_revision = start_revision + - (start_revision % (*fs)->max_files_per_dir); + (*fs)->revisions = apr_array_make(pool, + (*fs)->max_revision + 1 - (*fs)->start_revision, + sizeof(revision_info_t *)); + (*fs)->packs = apr_array_make(pool, + ((*fs)->min_unpacked_rev - (*fs)->start_revision) + / (*fs)->max_files_per_dir, + sizeof(revision_pack_t *)); + (*fs)->null_base = apr_pcalloc(pool, sizeof(*(*fs)->null_base)); + (*fs)->cache = create_content_cache + (apr_allocator_owner_get + (svn_pool_create_allocator(FALSE)), + content_cache_size * 1024 * 1024); + (*fs)->dir_cache = create_dir_cache + (apr_allocator_owner_get + (svn_pool_create_allocator(FALSE)), + dir_cache_size); + (*fs)->window_cache = create_window_cache + (apr_allocator_owner_get + (svn_pool_create_allocator(FALSE)), + 10000, window_cache_size); + + /* read all packed revs */ + for ( revision = start_revision + ; revision < (*fs)->min_unpacked_rev + ; revision += (*fs)->max_files_per_dir) + SVN_ERR(read_pack_file(*fs, revision, pool)); + + /* read non-packed revs */ + for ( ; revision <= (*fs)->max_revision; ++revision) + SVN_ERR(read_revision_file(*fs, revision, pool)); + + return SVN_NO_ERROR; +} + +/* Return the maximum number of decimal digits required to represent offsets + * in the given PACK file. + */ +static apr_size_t +get_max_offset_len(const revision_pack_t *pack) +{ + /* the pack files may grow a few percent. + * Fudge it up to be on safe side. + */ + apr_size_t max_future_size = pack->filesize * 2 + 10000; + apr_size_t result = 0; + + while (max_future_size > 0) + { + ++result; + max_future_size /= 10; + } + + return result; +} + +/* Create the fragments container in PACK and add revision header fragments + * to it. Use POOL for allocations. + */ +static svn_error_t * +add_revisions_pack_heads(revision_pack_t *pack, + apr_pool_t *pool) +{ + int i; + revision_info_t *info; + apr_size_t offset_len = get_max_offset_len(pack); + fragment_t fragment; + + /* allocate fragment arrays */ + + int fragment_count = 1; + for (i = 0; i < pack->info->nelts; ++i) + { + info = APR_ARRAY_IDX(pack->info, i, revision_info_t*); + fragment_count += info->node_revs->nelts + + info->representations->nelts + + 2; + } + + pack->target_offset = pack->info->nelts > 1 ? 64 : 0; + pack->fragments = apr_array_make(pool, + fragment_count, + sizeof(fragment_t)); + + /* put revision headers first */ + + for (i = 0; i < pack->info->nelts - 1; ++i) + { + info = APR_ARRAY_IDX(pack->info, i, revision_info_t*); + info->target.offset = pack->target_offset; + + fragment.data = info; + fragment.kind = header_fragment; + fragment.position = pack->target_offset; + APR_ARRAY_PUSH(pack->fragments, fragment_t) = fragment; + + pack->target_offset += 2 * offset_len + 3; + } + + info = APR_ARRAY_IDX(pack->info, pack->info->nelts - 1, revision_info_t*); + info->target.offset = pack->target_offset; + + /* followed by the changes list */ + + for (i = 0; i < pack->info->nelts; ++i) + { + info = APR_ARRAY_IDX(pack->info, i, revision_info_t*); + + info->target.changes = pack->target_offset - info->target.offset; + info->target.changes_len = info->original.changes_len; + + fragment.data = info; + fragment.kind = changes_fragment; + fragment.position = pack->target_offset; + APR_ARRAY_PUSH(pack->fragments, fragment_t) = fragment; + + pack->target_offset += info->original.changes_len; + } + + return SVN_NO_ERROR; +} + +/* For the revision given by INFO in FS, return the fragment container in + * *FRAGMENTS and the current placement offset in *CURRENT_POS. + */ +static svn_error_t * +get_target_offset(apr_size_t **current_pos, + apr_array_header_t **fragments, + fs_fs_t *fs, + revision_info_t *info) +{ + int i; + revision_pack_t *pack; + svn_revnum_t revision = info->revision; + + /* identify the pack object */ + if (fs->min_unpacked_rev > revision) + { + i = (revision - fs->start_revision) / fs->max_files_per_dir; + } + else + { + i = (fs->min_unpacked_rev - fs->start_revision) / fs->max_files_per_dir; + i += revision - fs->min_unpacked_rev; + } + + /* extract the desired info from it */ + pack = APR_ARRAY_IDX(fs->packs, i, revision_pack_t*); + *current_pos = &pack->target_offset; + *fragments = pack->fragments; + + return SVN_NO_ERROR; +} + +/* forward declaration */ +static svn_error_t * +add_noderev_recursively(fs_fs_t *fs, + noderev_t *node, + apr_pool_t *pool); + +/* Place fragments for the given REPRESENTATION of the given KIND, iff it + * has not been covered, yet. Place the base reps along the deltification + * chain as far as those reps have not been covered, yet. If REPRESENTATION + * is a directory, recursively place its elements. + * + * Use POOL for allocations. + */ +static svn_error_t * +add_representation_recursively(fs_fs_t *fs, + representation_t *representation, + enum fragment_kind_t kind, + apr_pool_t *pool) +{ + apr_size_t *current_pos; + apr_array_header_t *fragments; + fragment_t fragment; + + /* place REPRESENTATION only once and only if it exists and will not + * be covered later as a directory. */ + if ( representation == NULL + || representation->covered + || (representation->dir && kind != dir_fragment) + || representation == fs->null_base) + return SVN_NO_ERROR; + + /* add and place a fragment for REPRESENTATION */ + SVN_ERR(get_target_offset(¤t_pos, &fragments, + fs, representation->revision)); + representation->target.offset = *current_pos; + representation->covered = TRUE; + + fragment.data = representation; + fragment.kind = kind; + fragment.position = *current_pos; + APR_ARRAY_PUSH(fragments, fragment_t) = fragment; + + /* determine the size of data to be added to the target file */ + if ( kind != dir_fragment + && representation->delta_base && representation->delta_base->dir) + { + /* base rep is a dir -> would change -> need to store it as fulltext + * in our target file */ + apr_pool_t *text_pool = svn_pool_create(pool); + svn_stringbuf_t *content; + + SVN_ERR(get_combined_window(&content, fs, representation, text_pool)); + representation->target.size = content->len; + *current_pos += representation->target.size + 13; + + svn_pool_destroy(text_pool); + } + else + if ( kind == dir_fragment + || (representation->delta_base && representation->delta_base->dir)) + { + /* deltified directories may grow considerably */ + if (representation->original.size < 50) + *current_pos += 300; + else + *current_pos += representation->original.size * 3 + 150; + } + else + { + /* plain / deltified content will not change but the header may + * grow slightly due to larger offsets. */ + representation->target.size = representation->original.size; + + if (representation->delta_base && + (representation->delta_base != fs->null_base)) + *current_pos += representation->original.size + 50; + else + *current_pos += representation->original.size + 13; + } + + /* follow the delta chain and place base revs immediately after this */ + if (representation->delta_base) + SVN_ERR(add_representation_recursively(fs, + representation->delta_base, + kind, + pool)); + + /* finally, recurse into directories */ + if (representation->dir) + { + int i; + apr_array_header_t *entries = representation->dir->entries; + + for (i = 0; i < entries->nelts; ++i) + { + direntry_t *entry = APR_ARRAY_IDX(entries, i, direntry_t *); + if (entry->node) + SVN_ERR(add_noderev_recursively(fs, entry->node, pool)); + } + } + + return SVN_NO_ERROR; +} + +/* Place fragments for the given NODE in FS, iff it has not been covered, + * yet. Place the reps (text, props) immediately after the node. + * + * Use POOL for allocations. + */ +static svn_error_t * +add_noderev_recursively(fs_fs_t *fs, + noderev_t *node, + apr_pool_t *pool) +{ + apr_size_t *current_pos; + apr_array_header_t *fragments; + fragment_t fragment; + + /* don't add it twice */ + if (node->covered) + return SVN_NO_ERROR; + + /* add and place a fragment for NODE */ + SVN_ERR(get_target_offset(¤t_pos, &fragments, fs, node->revision)); + node->covered = TRUE; + node->target.offset = *current_pos; + + fragment.data = node; + fragment.kind = noderev_fragment; + fragment.position = *current_pos; + APR_ARRAY_PUSH(fragments, fragment_t) = fragment; + + /* size may slightly increase */ + *current_pos += node->original.size + 40; + + /* recurse into representations */ + if (node->text && node->text->dir) + SVN_ERR(add_representation_recursively(fs, node->text, dir_fragment, pool)); + else + SVN_ERR(add_representation_recursively(fs, node->text, file_fragment, pool)); + + SVN_ERR(add_representation_recursively(fs, node->props, property_fragment, pool)); + + return SVN_NO_ERROR; +} + +/* Place a fragment for the last revision in PACK. Use POOL for allocations. + */ +static svn_error_t * +add_revisions_pack_tail(revision_pack_t *pack, + apr_pool_t *pool) +{ + int i; + revision_info_t *info; + apr_size_t offset_len = get_max_offset_len(pack); + fragment_t fragment; + + /* put final revision header last and fix up revision lengths */ + + info = APR_ARRAY_IDX(pack->info, pack->info->nelts-1, revision_info_t*); + + fragment.data = info; + fragment.kind = header_fragment; + fragment.position = pack->target_offset; + APR_ARRAY_PUSH(pack->fragments, fragment_t) = fragment; + + pack->target_offset += 2 * offset_len + 3; + + /* end of target file reached. Store that info in all revs. */ + for (i = 0; i < pack->info->nelts; ++i) + { + info = APR_ARRAY_IDX(pack->info, i, revision_info_t*); + info->target.end = pack->target_offset; + } + + return SVN_NO_ERROR; +} + +/* Place all fragments for all revisions / packs in FS. + * Use POOL for allocations. + */ +static svn_error_t * +reorder_revisions(fs_fs_t *fs, + apr_pool_t *pool) +{ + int i, k; + + /* headers and changes */ + + for (i = 0; i < fs->packs->nelts; ++i) + { + revision_pack_t *pack = APR_ARRAY_IDX(fs->packs, i, revision_pack_t*); + SVN_ERR(add_revisions_pack_heads(pack, pool)); + } + + /* representations & nodes */ + + for (i = fs->revisions->nelts-1; i >= 0; --i) + { + revision_info_t *info = APR_ARRAY_IDX(fs->revisions, i, revision_info_t*); + for (k = info->node_revs->nelts - 1; k >= 0; --k) + { + noderev_t *node = APR_ARRAY_IDX(info->node_revs, k, noderev_t*); + SVN_ERR(add_noderev_recursively(fs, node, pool)); + } + + if (info->revision % fs->max_files_per_dir == 0) + print_progress(info->revision); + } + + /* pack file tails */ + + for (i = 0; i < fs->packs->nelts; ++i) + { + revision_pack_t *pack = APR_ARRAY_IDX(fs->packs, i, revision_pack_t*); + SVN_ERR(add_revisions_pack_tail(pack, pool)); + } + + return SVN_NO_ERROR; +} + +/* forward declaration */ +static svn_error_t * +get_fragment_content(svn_string_t **content, + fs_fs_t *fs, + fragment_t *fragment, + apr_pool_t *pool); + +/* Directory content may change and with it, the deltified representations + * may significantly. This function causes all directory target reps in + * PACK of FS to be built and their new MD5 as well as rep sizes be updated. + * We must do that before attempting to write noderevs. + * + * Use POOL for allocations. + */ +static svn_error_t * +update_noderevs(fs_fs_t *fs, + revision_pack_t *pack, + apr_pool_t *pool) +{ + int i; + apr_pool_t *itempool = svn_pool_create(pool); + + for (i = 0; i < pack->fragments->nelts; ++i) + { + fragment_t *fragment = &APR_ARRAY_IDX(pack->fragments, i, fragment_t); + if (fragment->kind == dir_fragment) + { + svn_string_t *content; + + /* request updated rep content but ignore the result. + * We are only interested in the MD5, content and rep size updates. */ + SVN_ERR(get_fragment_content(&content, fs, fragment, itempool)); + svn_pool_clear(itempool); + } + } + + svn_pool_destroy(itempool); + + return SVN_NO_ERROR; +} + +/* Determine the target size of the FRAGMENT in FS and return the value + * in *LENGTH. If ADD_PADDING has been set, slightly fudge the numbers + * to account for changes in offset lengths etc. Use POOL for temporary + * allocations. + */ +static svn_error_t * +get_content_length(apr_size_t *length, + fs_fs_t *fs, + fragment_t *fragment, + svn_boolean_t add_padding, + apr_pool_t *pool) +{ + svn_string_t *content; + + SVN_ERR(get_fragment_content(&content, fs, fragment, pool)); + if (add_padding) + switch (fragment->kind) + { + case dir_fragment: + *length = content->len + 16; + break; + case noderev_fragment: + *length = content->len + 3; + break; + default: + *length = content->len; + break; + } + else + *length = content->len; + + return SVN_NO_ERROR; +} + +/* Move the FRAGMENT to global file offset NEW_POSITION. Update the target + * location info of the underlying object as well. + */ +static void +move_fragment(fragment_t *fragment, + apr_size_t new_position) +{ + revision_info_t *info; + representation_t *representation; + noderev_t *node; + + /* move the fragment */ + fragment->position = new_position; + + /* move the underlying object */ + switch (fragment->kind) + { + case header_fragment: + info = fragment->data; + info->target.offset = new_position; + break; + + case changes_fragment: + info = fragment->data; + info->target.changes = new_position - info->target.offset; + break; + + case property_fragment: + case file_fragment: + case dir_fragment: + representation = fragment->data; + representation->target.offset = new_position; + break; + + case noderev_fragment: + node = fragment->data; + node->target.offset = new_position; + break; + } +} + +/* Move the fragments in PACK's target fragment list to their final offsets. + * This may require several iterations if the fudge factors turned out to + * be insufficient. Use POOL for allocations. + */ +static svn_error_t * +pack_revisions(fs_fs_t *fs, + revision_pack_t *pack, + apr_pool_t *pool) +{ + int i; + fragment_t *fragment, *next; + svn_boolean_t needed_to_expand; + revision_info_t *info; + apr_size_t current_pos, len, old_len; + + apr_pool_t *itempool = svn_pool_create(pool); + + /* update all directory reps. Chances are that most of the target rep + * sizes are now close to accurate. */ + SVN_ERR(update_noderevs(fs, pack, pool)); + + /* compression phase: pack all fragments tightly with only a very small + * fudge factor. This should cause offsets to shrink, thus all the + * actual fragment rate should tend to be even smaller afterwards. */ + current_pos = pack->info->nelts > 1 ? 64 : 0; + for (i = 0; i + 1 < pack->fragments->nelts; ++i) + { + fragment = &APR_ARRAY_IDX(pack->fragments, i, fragment_t); + SVN_ERR(get_content_length(&len, fs, fragment, TRUE, itempool)); + move_fragment(fragment, current_pos); + current_pos += len; + + svn_pool_clear(itempool); + } + + /* don't forget the final fragment (last revision's revision header) */ + fragment = &APR_ARRAY_IDX(pack->fragments, pack->fragments->nelts-1, fragment_t); + fragment->position = current_pos; + + /* expansion phase: check whether all fragments fit into their allotted + * slots. Grow them geometrically if they don't fit. Retry until they + * all do fit. + * Note: there is an upper limit to which fragments can grow. So, this + * loop will terminate. Often, no expansion will be necessary at all. */ + do + { + needed_to_expand = FALSE; + current_pos = pack->info->nelts > 1 ? 64 : 0; + + for (i = 0; i + 1 < pack->fragments->nelts; ++i) + { + fragment = &APR_ARRAY_IDX(pack->fragments, i, fragment_t); + next = &APR_ARRAY_IDX(pack->fragments, i + 1, fragment_t); + old_len = next->position - fragment->position; + + SVN_ERR(get_content_length(&len, fs, fragment, FALSE, itempool)); + + if (len > old_len) + { + len = (apr_size_t)(len * 1.1) + 10; + needed_to_expand = TRUE; + } + else + len = old_len; + + if (i == pack->info->nelts - 1) + { + info = APR_ARRAY_IDX(pack->info, pack->info->nelts - 1, revision_info_t*); + info->target.offset = current_pos; + } + + move_fragment(fragment, current_pos); + current_pos += len; + + svn_pool_clear(itempool); + } + + fragment = &APR_ARRAY_IDX(pack->fragments, pack->fragments->nelts-1, fragment_t); + fragment->position = current_pos; + + /* update the revision + * sizes (they all end at the end of the pack file now) */ + SVN_ERR(get_content_length(&len, fs, fragment, FALSE, itempool)); + current_pos += len; + + for (i = 0; i < pack->info->nelts; ++i) + { + info = APR_ARRAY_IDX(pack->info, i, revision_info_t*); + info->target.end = current_pos; + } + } + while (needed_to_expand); + + svn_pool_destroy(itempool); + + return SVN_NO_ERROR; +} + +/* Write reorg'ed target content for PACK in FS. Use POOL for allocations. + */ +static svn_error_t * +write_revisions(fs_fs_t *fs, + revision_pack_t *pack, + apr_pool_t *pool) +{ + int i; + fragment_t *fragment = NULL; + svn_string_t *content; + + apr_pool_t *itempool = svn_pool_create(pool); + apr_pool_t *iterpool = svn_pool_create(pool); + + apr_file_t *file; + apr_size_t current_pos = 0; + svn_stringbuf_t *null_buffer = svn_stringbuf_create_empty(iterpool); + + /* create the target file */ + const char *dir = apr_psprintf(iterpool, "%s/new/%ld%s", + fs->path, pack->base / fs->max_files_per_dir, + pack->info->nelts > 1 ? ".pack" : ""); + SVN_ERR(svn_io_make_dir_recursively(dir, pool)); + SVN_ERR(svn_io_file_open(&file, + pack->info->nelts > 1 + ? apr_psprintf(iterpool, "%s/pack", dir) + : apr_psprintf(iterpool, "%s/%ld", dir, pack->base), + APR_WRITE | APR_CREATE | APR_BUFFERED, + APR_OS_DEFAULT, + iterpool)); + + /* write all fragments */ + for (i = 0; i < pack->fragments->nelts; ++i) + { + apr_size_t padding; + + /* get fragment content to write */ + fragment = &APR_ARRAY_IDX(pack->fragments, i, fragment_t); + SVN_ERR(get_fragment_content(&content, fs, fragment, itempool)); + SVN_ERR_ASSERT(fragment->position >= current_pos); + + /* number of bytes between this and the previous fragment */ + if ( fragment->kind == header_fragment + && i+1 < pack->fragments->nelts) + /* special case: header fragments are aligned to the slot end */ + padding = APR_ARRAY_IDX(pack->fragments, i+1, fragment_t).position - + content->len - current_pos; + else + /* standard case: fragments are aligned to the slot start */ + padding = fragment->position - current_pos; + + /* write padding between fragments */ + if (padding) + { + while (null_buffer->len < padding) + svn_stringbuf_appendbyte(null_buffer, 0); + + SVN_ERR(svn_io_file_write_full(file, + null_buffer->data, + padding, + NULL, + itempool)); + current_pos += padding; + } + + /* write fragment content */ + SVN_ERR(svn_io_file_write_full(file, + content->data, + content->len, + NULL, + itempool)); + current_pos += content->len; + + svn_pool_clear(itempool); + } + + apr_file_close(file); + + /* write new manifest file */ + if (pack->info->nelts > 1) + { + svn_stream_t *stream; + SVN_ERR(svn_io_file_open(&file, + apr_psprintf(iterpool, "%s/manifest", dir), + APR_WRITE | APR_CREATE | APR_BUFFERED, + APR_OS_DEFAULT, + iterpool)); + stream = svn_stream_from_aprfile2(file, FALSE, iterpool); + + for (i = 0; i < pack->info->nelts; ++i) + { + revision_info_t *info = APR_ARRAY_IDX(pack->info, i, + revision_info_t *); + SVN_ERR(svn_stream_printf(stream, itempool, + "%" APR_SIZE_T_FMT "\n", + info->target.offset)); + svn_pool_clear(itempool); + } + } + + /* cleanup */ + svn_pool_destroy(itempool); + svn_pool_destroy(iterpool); + + return SVN_NO_ERROR; +} + +/* Write reorg'ed target content for all revisions in FS. To maximize + * data locality, pack and write in one go per pack file. + * Use POOL for allocations. + */ +static svn_error_t * +pack_and_write_revisions(fs_fs_t *fs, + apr_pool_t *pool) +{ + int i; + + SVN_ERR(svn_io_make_dir_recursively(apr_psprintf(pool, "%s/new", + fs->path), + pool)); + + for (i = 0; i < fs->packs->nelts; ++i) + { + revision_pack_t *pack = APR_ARRAY_IDX(fs->packs, i, revision_pack_t*); + if (pack->base % fs->max_files_per_dir == 0) + print_progress(pack->base); + + SVN_ERR(pack_revisions(fs, pack, pool)); + SVN_ERR(write_revisions(fs, pack, pool)); + } + + return SVN_NO_ERROR; +} + +/* For the directory REPRESENTATION in FS, construct the new (target) + * serialized plaintext representation and return it in *CONTENT. + * Allocate the result in POOL and temporaries in SCRATCH_POOL. + */ +static svn_error_t * +get_updated_dir(svn_string_t **content, + fs_fs_t *fs, + representation_t *representation, + apr_pool_t *pool, + apr_pool_t *scratch_pool) +{ + apr_hash_t *hash; + apr_pool_t *hash_pool = svn_pool_create(scratch_pool); + apr_array_header_t *dir = representation->dir->entries; + int i; + svn_stream_t *stream; + svn_stringbuf_t *result; + + /* get the original content */ + SVN_ERR(read_dir(&hash, fs, representation, scratch_pool)); + hash = apr_hash_copy(hash_pool, hash); + + /* update all entries */ + for (i = 0; i < dir->nelts; ++i) + { + char buffer[256]; + svn_string_t *new_val; + apr_size_t pos; + + /* find the original entry for for the current name */ + direntry_t *entry = APR_ARRAY_IDX(dir, i, direntry_t *); + svn_string_t *str_val = apr_hash_get(hash, entry->name, entry->name_len); + if (str_val == NULL) + return svn_error_createf(SVN_ERR_FS_CORRUPT, NULL, + _("Dir entry '%s' not found"), entry->name); + + SVN_ERR_ASSERT(str_val->len < sizeof(buffer)); + + /* create and updated node ID */ + memcpy(buffer, str_val->data, str_val->len+1); + pos = strchr(buffer, '/') - buffer + 1; + pos += svn__ui64toa(buffer + pos, entry->node->target.offset - entry->node->revision->target.offset); + new_val = svn_string_ncreate(buffer, pos, hash_pool); + + /* store it in the hash */ + apr_hash_set(hash, entry->name, entry->name_len, new_val); + } + + /* serialize the updated hash */ + result = svn_stringbuf_create_ensure(representation->target.size, pool); + stream = svn_stream_from_stringbuf(result, hash_pool); + SVN_ERR(svn_hash_write2(hash, stream, SVN_HASH_TERMINATOR, hash_pool)); + svn_pool_destroy(hash_pool); + + /* done */ + *content = svn_stringbuf__morph_into_string(result); + + return SVN_NO_ERROR; +} + +/* Calculate the delta representation for the given CONTENT and BASE. + * Return the rep in *DIFF. Use POOL for allocations. + */ +static svn_error_t * +diff_stringbufs(svn_stringbuf_t *diff, + svn_string_t *base, + svn_string_t *content, + apr_pool_t *pool) +{ + svn_txdelta_window_handler_t diff_wh; + void *diff_whb; + + svn_stream_t *stream; + svn_stream_t *source = svn_stream_from_string(base, pool); + svn_stream_t *target = svn_stream_from_stringbuf(diff, pool); + + /* Prepare to write the svndiff data. */ + svn_txdelta_to_svndiff3(&diff_wh, + &diff_whb, + target, + 1, + SVN_DELTA_COMPRESSION_LEVEL_DEFAULT, + pool); + + /* create delta stream */ + stream = svn_txdelta_target_push(diff_wh, diff_whb, source, pool); + + /* run delta */ + SVN_ERR(svn_stream_write(stream, content->data, &content->len)); + SVN_ERR(svn_stream_close(stream)); + + return SVN_NO_ERROR; +} + +/* Update the noderev id value for KEY in the textual noderev representation + * in NODE_REV. Take the new id from NODE. This is a no-op if the KEY + * cannot be found. + */ +static void +update_id(svn_stringbuf_t *node_rev, + const char *key, + noderev_t *node) +{ + char *newline_pos = 0; + char *pos; + + /* we need to update the offset only -> find its position */ + pos = strstr(node_rev->data, key); + if (pos) + pos = strchr(pos, '/'); + if (pos) + newline_pos = strchr(++pos, '\n'); + + if (pos && newline_pos) + { + /* offset data has been found -> replace it */ + char temp[SVN_INT64_BUFFER_SIZE]; + apr_size_t len = svn__i64toa(temp, node->target.offset - node->revision->target.offset); + svn_stringbuf_replace(node_rev, + pos - node_rev->data, newline_pos - pos, + temp, len); + } +} + +/* Update the representation id value for KEY in the textual noderev + * representation in NODE_REV. Take the offset, sizes and new MD5 from + * REPRESENTATION. Use SCRATCH_POOL for allocations. + * This is a no-op if the KEY cannot be found. + */ +static void +update_text(svn_stringbuf_t *node_rev, + const char *key, + representation_t *representation, + apr_pool_t *scratch_pool) +{ + apr_size_t key_len = strlen(key); + char *pos = strstr(node_rev->data, key); + char *val_pos; + + if (!pos) + return; + + val_pos = pos + key_len; + if (representation->dir) + { + /* for directories, we need to write all rep info anew */ + char *newline_pos = strchr(val_pos, '\n'); + svn_checksum_t checksum; + const char* temp = apr_psprintf(scratch_pool, "%ld %" APR_SIZE_T_FMT " %" + APR_SIZE_T_FMT" %" APR_SIZE_T_FMT " %s", + representation->revision->revision, + representation->target.offset - representation->revision->target.offset, + representation->target.size, + representation->dir->size, + svn_checksum_to_cstring(&checksum, + scratch_pool)); + + checksum.digest = representation->dir->target_md5; + checksum.kind = svn_checksum_md5; + svn_stringbuf_replace(node_rev, + val_pos - node_rev->data, newline_pos - val_pos, + temp, strlen(temp)); + } + else + { + /* ordinary representation: replace offset and rep size only. + * Content size and checksums are unchanged. */ + const char* temp; + char *end_pos = strchr(val_pos, ' '); + + val_pos = end_pos + 1; + end_pos = strchr(strchr(val_pos, ' ') + 1, ' '); + temp = apr_psprintf(scratch_pool, "%" APR_SIZE_T_FMT " %" APR_SIZE_T_FMT, + representation->target.offset - representation->revision->target.offset, + representation->target.size); + + svn_stringbuf_replace(node_rev, + val_pos - node_rev->data, end_pos - val_pos, + temp, strlen(temp)); + } +} + +/* Get the target content (data block as to be written to the file) for + * the given FRAGMENT in FS. Return the content in *CONTENT. Use POOL + * for allocations. + * + * Note that, as a side-effect, this will update the target rep. info for + * directories. + */ +static svn_error_t * +get_fragment_content(svn_string_t **content, + fs_fs_t *fs, + fragment_t *fragment, + apr_pool_t *pool) +{ + revision_info_t *info; + representation_t *representation; + noderev_t *node; + svn_string_t *revision_content, *base_content; + svn_stringbuf_t *header, *node_rev, *text; + apr_size_t header_size; + svn_checksum_t *checksum = NULL; + + switch (fragment->kind) + { + /* revision headers can be constructed from target position info */ + case header_fragment: + info = fragment->data; + *content = svn_string_createf(pool, + "\n%" APR_SIZE_T_FMT " %" APR_SIZE_T_FMT "\n", + info->root_noderev->target.offset - info->target.offset, + info->target.changes); + return SVN_NO_ERROR; + + /* The changes list remains untouched */ + case changes_fragment: + info = fragment->data; + SVN_ERR(get_content(&revision_content, fs, info->revision, pool)); + + *content = svn_string_create_empty(pool); + (*content)->data = revision_content->data + info->original.changes; + (*content)->len = info->target.changes_len; + return SVN_NO_ERROR; + + /* property and file reps get new headers any need to be rewritten, + * iff the base rep is a directory. The actual (deltified) content + * remains unchanged, though. MD5 etc. do not change. */ + case property_fragment: + case file_fragment: + representation = fragment->data; + SVN_ERR(get_content(&revision_content, fs, + representation->revision->revision, pool)); + + if (representation->delta_base) + if (representation->delta_base->dir) + { + /* if the base happens to be a directory, reconstruct the + * full text and represent it as PLAIN rep. */ + SVN_ERR(get_combined_window(&text, fs, representation, pool)); + representation->target.size = text->len; + + svn_stringbuf_insert(text, 0, "PLAIN\n", 6); + svn_stringbuf_appendcstr(text, "ENDREP\n"); + *content = svn_stringbuf__morph_into_string(text); + + return SVN_NO_ERROR; + } + else + /* construct a new rep header */ + if (representation->delta_base == fs->null_base) + header = svn_stringbuf_create("DELTA\n", pool); + else + header = svn_stringbuf_createf(pool, + "DELTA %ld %" APR_SIZE_T_FMT " %" APR_SIZE_T_FMT "\n", + representation->delta_base->revision->revision, + representation->delta_base->target.offset + - representation->delta_base->revision->target.offset, + representation->delta_base->target.size); + else + header = svn_stringbuf_create("PLAIN\n", pool); + + /* if it exists, the actual delta base is unchanged. Hence, this + * rep is unchanged even if it has been deltified. */ + header_size = strchr(revision_content->data + + representation->original.offset, '\n') - + revision_content->data - + representation->original.offset + 1; + svn_stringbuf_appendbytes(header, + revision_content->data + + representation->original.offset + + header_size, + representation->original.size); + svn_stringbuf_appendcstr(header, "ENDREP\n"); + *content = svn_stringbuf__morph_into_string(header); + return SVN_NO_ERROR; + + /* directory reps need to be rewritten (and deltified) completely. + * As a side-effect, update the MD5 and target content size. */ + case dir_fragment: + /* construct new content and update MD5 */ + representation = fragment->data; + SVN_ERR(get_updated_dir(&revision_content, fs, representation, + pool, pool)); + SVN_ERR(svn_checksum(&checksum, svn_checksum_md5, + revision_content->data, revision_content->len, + pool)); + memcpy(representation->dir->target_md5, + checksum->digest, + sizeof(representation->dir->target_md5)); + + /* deltify against the base rep if necessary */ + if (representation->delta_base) + { + if (representation->delta_base->dir == NULL) + { + /* dummy or non-dir base rep -> self-compress only */ + header = svn_stringbuf_create("DELTA\n", pool); + base_content = svn_string_create_empty(pool); + } + else + { + /* deltify against base rep (which is a directory, too)*/ + representation_t *base_rep = representation->delta_base; + header = svn_stringbuf_createf(pool, + "DELTA %ld %" APR_SIZE_T_FMT " %" APR_SIZE_T_FMT "\n", + base_rep->revision->revision, + base_rep->target.offset - base_rep->revision->target.offset, + base_rep->target.size); + SVN_ERR(get_updated_dir(&base_content, fs, base_rep, + pool, pool)); + } + + /* run deltification and update target content size */ + header_size = header->len; + SVN_ERR(diff_stringbufs(header, base_content, + revision_content, pool)); + representation->dir->size = revision_content->len; + representation->target.size = header->len - header_size; + svn_stringbuf_appendcstr(header, "ENDREP\n"); + *content = svn_stringbuf__morph_into_string(header); + } + else + { + /* no delta base (not even a dummy) -> PLAIN rep */ + representation->target.size = revision_content->len; + representation->dir->size = revision_content->len; + *content = svn_string_createf(pool, "PLAIN\n%sENDREP\n", + revision_content->data); + } + + return SVN_NO_ERROR; + + /* construct the new noderev content. No side-effects.*/ + case noderev_fragment: + /* get the original noderev as string */ + node = fragment->data; + SVN_ERR(get_content(&revision_content, fs, + node->revision->revision, pool)); + node_rev = svn_stringbuf_ncreate(revision_content->data + + node->original.offset, + node->original.size, + pool); + + /* update the values that may have hanged for target */ + update_id(node_rev, "id: ", node); + update_id(node_rev, "pred: ", node->predecessor); + update_text(node_rev, "text: ", node->text, pool); + update_text(node_rev, "props: ", node->props, pool); + + *content = svn_stringbuf__morph_into_string(node_rev); + return SVN_NO_ERROR; + } + + SVN_ERR_ASSERT(0); + + return SVN_NO_ERROR; +} + +/* In the repository at PATH, restore the original content in case we ran + * this reorg tool before. Use POOL for allocations. + */ +static svn_error_t * +prepare_repo(const char *path, apr_pool_t *pool) +{ + svn_node_kind_t kind; + + const char *old_path = svn_dirent_join(path, "db/old", pool); + const char *new_path = svn_dirent_join(path, "new", pool); + const char *revs_path = svn_dirent_join(path, "db/revs", pool); + const char *old_rep_cache_path = svn_dirent_join(path, "db/rep-cache.db.old", pool); + const char *rep_cache_path = svn_dirent_join(path, "db/rep-cache.db", pool); + + /* is there a backup? */ + SVN_ERR(svn_io_check_path(old_path, &kind, pool)); + if (kind == svn_node_dir) + { + /* yes, restore the org content from it */ + SVN_ERR(svn_io_remove_dir2(new_path, TRUE, NULL, NULL, pool)); + SVN_ERR(svn_io_file_move(revs_path, new_path, pool)); + SVN_ERR(svn_io_file_move(old_path, revs_path, pool)); + SVN_ERR(svn_io_remove_dir2(new_path, TRUE, NULL, NULL, pool)); + } + + /* same for the rep cache db */ + SVN_ERR(svn_io_check_path(old_rep_cache_path, &kind, pool)); + if (kind == svn_node_file) + SVN_ERR(svn_io_file_move(old_rep_cache_path, rep_cache_path, pool)); + + return SVN_NO_ERROR; +} + +/* In the repository at PATH, create a backup of the orig content and + * replace it with the reorg'ed. Use POOL for allocations. + */ +static svn_error_t * +activate_new_revs(const char *path, apr_pool_t *pool) +{ + svn_node_kind_t kind; + + const char *old_path = svn_dirent_join(path, "db/old", pool); + const char *new_path = svn_dirent_join(path, "new", pool); + const char *revs_path = svn_dirent_join(path, "db/revs", pool); + const char *old_rep_cache_path = svn_dirent_join(path, "db/rep-cache.db.old", pool); + const char *rep_cache_path = svn_dirent_join(path, "db/rep-cache.db", pool); + + /* if there is no backup, yet, move the current repo content to the backup + * and place it with the new (reorg'ed) data. */ + SVN_ERR(svn_io_check_path(old_path, &kind, pool)); + if (kind == svn_node_none) + { + SVN_ERR(svn_io_file_move(revs_path, old_path, pool)); + SVN_ERR(svn_io_file_move(new_path, revs_path, pool)); + } + + /* same for the rep cache db */ + SVN_ERR(svn_io_check_path(old_rep_cache_path, &kind, pool)); + if (kind == svn_node_none) + SVN_ERR(svn_io_file_move(rep_cache_path, old_rep_cache_path, pool)); + + return SVN_NO_ERROR; +} + +/* Write tool usage info text to OSTREAM using PROGNAME as a prefix and + * POOL for allocations. + */ +static void +print_usage(svn_stream_t *ostream, const char *progname, + apr_pool_t *pool) +{ + svn_error_clear(svn_stream_printf(ostream, pool, + "\n" + "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" + "!!! This is an experimental tool. Don't use it on production data !!!\n" + "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" + "\n" + "Usage: %s <repo> <cachesize>\n" + "\n" + "Optimize the repository at local path <repo> staring from revision 0.\n" + "Use up to <cachesize> MB of memory for caching. This does not include\n" + "temporary representation of the repository structure, i.e. the actual\n" + "memory will be higher and <cachesize> be the lower limit.\n", + progname)); +} + +/* linear control flow */ +int main(int argc, const char *argv[]) +{ + apr_pool_t *pool; + svn_stream_t *ostream; + svn_error_t *svn_err; + const char *repo_path = NULL; + svn_revnum_t start_revision = 0; + apr_size_t memsize = 0; + apr_uint64_t temp = 0; + fs_fs_t *fs; + + apr_initialize(); + atexit(apr_terminate); + + pool = apr_allocator_owner_get(svn_pool_create_allocator(FALSE)); + + svn_err = svn_stream_for_stdout(&ostream, pool); + if (svn_err) + { + svn_handle_error2(svn_err, stdout, FALSE, ERROR_TAG); + return 2; + } + + if (argc != 3) + { + print_usage(ostream, argv[0], pool); + return 2; + } + + svn_err = svn_cstring_strtoui64(&temp, argv[2], 0, APR_SIZE_MAX, 10); + if (svn_err) + { + print_usage(ostream, argv[0], pool); + svn_error_clear(svn_err); + return 2; + } + + memsize = (apr_size_t)temp; + repo_path = argv[1]; + start_revision = 0; + + printf("\nPreparing repository\n"); + svn_err = prepare_repo(repo_path, pool); + + if (!svn_err) + { + printf("Reading revisions\n"); + svn_err = read_revisions(&fs, repo_path, start_revision, memsize, pool); + } + + if (!svn_err) + { + printf("\nReordering revision content\n"); + svn_err = reorder_revisions(fs, pool); + } + + if (!svn_err) + { + printf("\nPacking and writing revisions\n"); + svn_err = pack_and_write_revisions(fs, pool); + } + + if (!svn_err) + { + printf("\nSwitch to new revs\n"); + svn_err = activate_new_revs(repo_path, pool); + } + + if (svn_err) + { + svn_handle_error2(svn_err, stdout, FALSE, ERROR_TAG); + return 2; + } + + return 0; +} |