From e6da4551fef241adaefa143df2b708f6df6aacf0 Mon Sep 17 00:00:00 2001 From: Victor Denisov Date: Sat, 20 Jul 2013 00:29:43 +0400 Subject: Hash table optimization for big files. --- search.c | 133 ++++++++++++++++++++++++++++++++++++++------------------------- sumset.h | 7 +++- 2 files changed, 87 insertions(+), 53 deletions(-) diff --git a/search.c b/search.c index 75d023d..58d3568 100644 --- a/search.c +++ b/search.c @@ -48,57 +48,73 @@ #include "search.h" #include "checksum.h" - -#define TABLESIZE (1<<16) +#define TABLE_SIZE (1<<16) #define NULL_TAG (-1) - #define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF) #define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16) - -static int -rs_compare_targets(rs_target_t const *t1, rs_target_t const *t2) -{ - return ((int) t1->t - (int) t2->t); -} - - rs_result rs_build_hash_table(rs_signature_t * sums) { - int i; + int rs_compare_targets(void const *a1, void const *a2) { + rs_target_t const *t1 = a1; + rs_target_t const *t2 = a2; + + int v = (int) t1->t - (int) t2->t; + if (v != 0) + return v; + + rs_weak_sum_t w1 = sums->block_sigs[t1->i].weak_sum; + rs_weak_sum_t w2 = sums->block_sigs[t2->i].weak_sum; + + v = (w1 > w2) - (w1 < w2); + if (v != 0) + return v; + + return memcmp(sums->block_sigs[t1->i].strong_sum, + sums->block_sigs[t2->i].strong_sum, + sums->strong_sum_len); + } + + int i; - sums->tag_table = calloc(TABLESIZE, sizeof sums->tag_table[0]); + sums->tag_table = calloc(TABLE_SIZE, sizeof(sums->tag_table[0])); if (!sums->tag_table) return RS_MEM_ERROR; if (sums->count > 0) { sums->targets = calloc(sums->count, sizeof(rs_target_t)); - if (!sums->targets) + if (!sums->targets) { + free(sums->tag_table); + sums->tag_table = NULL; return RS_MEM_ERROR; + } for (i = 0; i < sums->count; i++) { sums->targets[i].i = i; sums->targets[i].t = gettag(sums->block_sigs[i].weak_sum); } - /* FIXME: Perhaps if this operating system has comparison_fn_t - * like GNU, then use it in the cast. But really does anyone - * care? */ qsort(sums->targets, sums->count, sizeof(sums->targets[0]), - (int (*)(const void *, const void *)) rs_compare_targets); + rs_compare_targets); } - for (i = 0; i < TABLESIZE; i++) - sums->tag_table[i] = NULL_TAG; + for (i = 0; i < TABLE_SIZE; i++) { + sums->tag_table[i].l = NULL_TAG; + sums->tag_table[i].r = NULL_TAG; + } for (i = sums->count - 1; i >= 0; i--) { - sums->tag_table[sums->targets[i].t] = i; + sums->tag_table[sums->targets[i].t].l = i; + } + + for (i = 0; i < sums->count; i++) { + sums->tag_table[sums->targets[i].t].r = i; } - rs_trace("done"); + rs_trace("rs_build_hash_table done"); return RS_DONE; } @@ -119,44 +135,57 @@ rs_search_for_block(rs_weak_sum_t weak_sum, rs_signature_t const *sig, rs_stats_t * stats, rs_long_t * match_where) { - int hash_tag = gettag(weak_sum); - int j = sig->tag_table[hash_tag]; - rs_strong_sum_t strong_sum; - int got_strong = 0; - - if (j == NULL_TAG) { + rs_strong_sum_t strong_sum; + int got_strong = 0; + int hash_tag = gettag(weak_sum); + tag_table_entry_t *bucket = &(sig->tag_table[hash_tag]); + int l = bucket->l; + int r = bucket->r + 1; + int v = 1; + + if (l == NULL_TAG) return 0; - } - for (; j < sig->count && sig->targets[j].t == hash_tag; j++) { - int i = sig->targets[j].i; - int token; - - if (weak_sum != sig->block_sigs[i].weak_sum) - continue; - - token = sig->block_sigs[i].i; + while (l < r) { + int m = (l + r) >> 1; + int i = sig->targets[m].i; + rs_block_sig_t *b = &(sig->block_sigs[i]); + v = (weak_sum > b->weak_sum) - (weak_sum < b->weak_sum); // v < 0 - weak_sum < b->weak_sum + // v == 0 - weak_sum == b->weak_sum + // v > 0 - weak_sum > b->weak_sum + if (v == 0) { + if (!got_strong) { + rs_calc_strong_sum(inbuf, block_len, &strong_sum); + got_strong = 1; + } + v = memcmp(strong_sum, b->strong_sum, sig->strong_sum_len); + + if (v == 0) { + l = m; + r = m; + break; + } + } - rs_trace("found weak match for %08x in token %d", weak_sum, token); + if (v > 0) + l = m + 1; + else + r = m; + } + if (l == r) { + int i = sig->targets[l].i; + rs_block_sig_t *b = &(sig->block_sigs[i]); + if (weak_sum != b->weak_sum) + return 0; if (!got_strong) { rs_calc_strong_sum(inbuf, block_len, &strong_sum); got_strong = 1; } - - /* FIXME: Use correct dynamic sum length! */ - if (memcmp(strong_sum, sig->block_sigs[i].strong_sum, - sig->strong_sum_len) == 0) { - /* XXX: This is a remnant of rsync: token number 1 is the - * block at offset 0. It would be good to clear this - * up. */ - *match_where = (rs_long_t)(token - 1) * sig->block_len; - return 1; - } else { - rs_trace("this was a false positive, the strong sig doesn't match"); - stats->false_matches++; - } + v = memcmp(strong_sum, b->strong_sum, sig->strong_sum_len); + int token = b->i; + *match_where = (rs_long_t)(token - 1) * sig->block_len; } - return 0; + return !v; } diff --git a/sumset.h b/sumset.h index 4be461b..b29c7f0 100644 --- a/sumset.h +++ b/sumset.h @@ -39,6 +39,11 @@ typedef struct rs_target { typedef struct rs_block_sig rs_block_sig_t; +typedef struct tag_table_entry { + int l; + int r; +} tag_table_entry_t ; + /* * This structure describes all the sums generated for an instance of * a file. It incorporates some redundancy to make it easier to @@ -51,7 +56,7 @@ struct rs_signature { int block_len; /* block_length */ int strong_sum_len; rs_block_sig_t *block_sigs; /* points to info for each chunk */ - int *tag_table; + tag_table_entry_t *tag_table; rs_target_t *targets; }; -- cgit v1.2.1