summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Pool <mbp@sourcefrog.net>2015-06-09 21:42:55 -0700
committerMartin Pool <mbp@sourcefrog.net>2015-06-09 21:42:55 -0700
commit4699cc9f3c73f180df07978a7e02254373c162c2 (patch)
tree3b35da869841bdc03731704aa634373f479b1c0d
parent784108c2da9ff16829a1df171f842196c53f8582 (diff)
parent2d68645abf99161c57b9faea635b6802a0ff056e (diff)
downloadlibrsync-4699cc9f3c73f180df07978a7e02254373c162c2.tar.gz
Merge pull request #14 from VictorDenisov/master
performance issue resolution for large files
-rw-r--r--search.c170
-rw-r--r--sumset.h8
2 files changed, 127 insertions, 51 deletions
diff --git a/search.c b/search.c
index b549725..ec5c492 100644
--- a/search.c
+++ b/search.c
@@ -47,57 +47,107 @@
#include "search.h"
#include "checksum.h"
-
-#define TABLESIZE (1<<16)
+#define TABLE_SIZE (1<<16)
#define NULL_TAG (-1)
-
#define gettag2(s1,s2) (((s1) + (s2)) & 0xFFFF)
#define gettag(sum) gettag2((sum)&0xFFFF,(sum)>>16)
+static void swap(rs_target_t *t1, rs_target_t *t2) {
+ unsigned short ts = t1->t;
+ t1->t = t2->t;
+ t2->t = ts;
-static int
-rs_compare_targets(rs_target_t const *t1, rs_target_t const *t2)
-{
- return ((int) t1->t - (int) t2->t);
+ int ti = t1->i;
+ t1->i = t2->i;
+ t2->i = ti;
+}
+
+int rs_compare_targets(rs_target_t const *t1, rs_target_t const *t2, rs_signature_t * sums) {
+ int v = (int) t1->t - (int) t2->t;
+ if (v != 0)
+ return v;
+
+ rs_weak_sum_t w1 = sums->block_sigs[t1->i].weak_sum;
+ rs_weak_sum_t w2 = sums->block_sigs[t2->i].weak_sum;
+
+ v = (w1 > w2) - (w1 < w2);
+ if (v != 0)
+ return v;
+
+ return memcmp(sums->block_sigs[t1->i].strong_sum,
+ sums->block_sigs[t2->i].strong_sum,
+ sums->strong_sum_len);
}
+static int heap_sort(rs_signature_t * sums) {
+ unsigned int i, j, c, n, k, p;
+ for (i = 1; i < sums->count; ++i) {
+ for (j = i; j > 0;) {
+ p = (j - 1) >> 1;
+ if (rs_compare_targets(&sums->targets[j], &sums->targets[p], sums) > 0)
+ swap(&sums->targets[j], &sums->targets[p]);
+ else
+ break;
+ j = p;
+ }
+ }
+
+ for (n = sums->count - 1; n > 0;) {
+ swap(&sums->targets[0], &sums->targets[n]);
+ --n;
+ for (i = 0; ((i << 1) + 1) <= n;) {
+ k = (i << 1) + 1;
+ if ((k + 1 <= n) && (rs_compare_targets(&sums->targets[k], &sums->targets[k + 1], sums) < 0))
+ k = k + 1;
+ if (rs_compare_targets(&sums->targets[k], &sums->targets[i], sums) > 0)
+ swap(&sums->targets[k], &sums->targets[i]);
+ else
+ break;
+ i = k;
+ }
+ }
+}
rs_result
rs_build_hash_table(rs_signature_t * sums)
{
- int i;
+ int i;
- sums->tag_table = calloc(TABLESIZE, sizeof sums->tag_table[0]);
+ sums->tag_table = calloc(TABLE_SIZE, sizeof(sums->tag_table[0]));
if (!sums->tag_table)
return RS_MEM_ERROR;
-
+
if (sums->count > 0) {
sums->targets = calloc(sums->count, sizeof(rs_target_t));
- if (!sums->targets)
+ if (!sums->targets) {
+ free(sums->tag_table);
+ sums->tag_table = NULL;
return RS_MEM_ERROR;
+ }
for (i = 0; i < sums->count; i++) {
sums->targets[i].i = i;
sums->targets[i].t = gettag(sums->block_sigs[i].weak_sum);
}
- /* FIXME: Perhaps if this operating system has comparison_fn_t
- * like GNU, then use it in the cast. But really does anyone
- * care? */
- qsort(sums->targets, sums->count,
- sizeof(sums->targets[0]),
- (int (*)(const void *, const void *)) rs_compare_targets);
+ heap_sort(sums);
}
- for (i = 0; i < TABLESIZE; i++)
- sums->tag_table[i] = NULL_TAG;
+ for (i = 0; i < TABLE_SIZE; i++) {
+ sums->tag_table[i].l = NULL_TAG;
+ sums->tag_table[i].r = NULL_TAG;
+ }
for (i = sums->count - 1; i >= 0; i--) {
- sums->tag_table[sums->targets[i].t] = i;
+ sums->tag_table[sums->targets[i].t].l = i;
}
- rs_trace("done");
+ for (i = 0; i < sums->count; i++) {
+ sums->tag_table[sums->targets[i].t].r = i;
+ }
+
+ rs_trace("rs_build_hash_table done");
return RS_DONE;
}
@@ -119,26 +169,56 @@ rs_search_for_block(rs_weak_sum_t weak_sum,
rs_signature_t const *sig, rs_stats_t * stats,
rs_long_t * match_where)
{
- int hash_tag = gettag(weak_sum);
- int j = sig->tag_table[hash_tag];
- rs_strong_sum_t strong_sum;
- int got_strong = 0;
-
- if (j == NULL_TAG) {
+ rs_strong_sum_t strong_sum;
+ int got_strong = 0;
+ int hash_tag = gettag(weak_sum);
+ rs_tag_table_entry_t *bucket = &(sig->tag_table[hash_tag]);
+ int l = bucket->l;
+ int r = bucket->r + 1;
+ int v = 1;
+
+ if (l == NULL_TAG)
return 0;
- }
- for (; j < sig->count && sig->targets[j].t == hash_tag; j++) {
- int i = sig->targets[j].i;
- int token;
-
- if (weak_sum != sig->block_sigs[i].weak_sum)
- continue;
+ while (l < r) {
+ int m = (l + r) >> 1;
+ int i = sig->targets[m].i;
+ rs_block_sig_t *b = &(sig->block_sigs[i]);
+ v = (weak_sum > b->weak_sum) - (weak_sum < b->weak_sum); // v < 0 - weak_sum < b->weak_sum
+ // v == 0 - weak_sum == b->weak_sum
+ // v > 0 - weak_sum > b->weak_sum
+ if (v == 0) {
+ if (!got_strong) {
+ if(sig->magic == RS_BLAKE2_SIG_MAGIC) {
+ rs_calc_blake2_sum(inbuf, block_len, &strong_sum);
+ } else if (sig->magic == RS_MD4_SIG_MAGIC) {
+ rs_calc_md4_sum(inbuf, block_len, &strong_sum);
+ } else {
+ rs_error("Unknown signature algorithm - this is a BUG");
+ return 0; /* FIXME: Is this the best way to handle this? */
+ }
+ got_strong = 1;
+ }
+ v = memcmp(strong_sum, b->strong_sum, sig->strong_sum_len);
- token = sig->block_sigs[i].i;
+ if (v == 0) {
+ l = m;
+ r = m;
+ break;
+ }
+ }
- rs_trace("found weak match for %08x in token %d", weak_sum, token);
+ if (v > 0)
+ l = m + 1;
+ else
+ r = m;
+ }
+ if (l == r) {
+ int i = sig->targets[l].i;
+ rs_block_sig_t *b = &(sig->block_sigs[i]);
+ if (weak_sum != b->weak_sum)
+ return 0;
if (!got_strong) {
if(sig->magic == RS_BLAKE2_SIG_MAGIC) {
rs_calc_blake2_sum(inbuf, block_len, &strong_sum);
@@ -150,20 +230,10 @@ rs_search_for_block(rs_weak_sum_t weak_sum,
}
got_strong = 1;
}
-
- /* FIXME: Use correct dynamic sum length! */
- if (memcmp(strong_sum, sig->block_sigs[i].strong_sum,
- sig->strong_sum_len) == 0) {
- /* XXX: This is a remnant of rsync: token number 1 is the
- * block at offset 0. It would be good to clear this
- * up. */
- *match_where = (rs_long_t)(token - 1) * sig->block_len;
- return 1;
- } else {
- rs_trace("this was a false positive, the strong sig doesn't match");
- stats->false_matches++;
- }
+ v = memcmp(strong_sum, b->strong_sum, sig->strong_sum_len);
+ int token = b->i;
+ *match_where = (rs_long_t)(token - 1) * sig->block_len;
}
- return 0;
+ return !v;
}
diff --git a/sumset.h b/sumset.h
index 505c0a2..173fd66 100644
--- a/sumset.h
+++ b/sumset.h
@@ -39,6 +39,12 @@ typedef struct rs_target {
typedef struct rs_block_sig rs_block_sig_t;
+typedef struct rs_tag_table_entry {
+ int l; // left bound of the hash tag in sorted array of targets
+ int r; // right bound of the hash tag in sorted array of targets
+ // all tags between l and r inclusively are the same
+} rs_tag_table_entry_t ;
+
/*
* This structure describes all the sums generated for an instance of
* a file. It incorporates some redundancy to make it easier to
@@ -51,7 +57,7 @@ struct rs_signature {
int block_len; /* block_length */
int strong_sum_len;
rs_block_sig_t *block_sigs; /* points to info for each chunk */
- int *tag_table;
+ rs_tag_table_entry_t *tag_table;
rs_target_t *targets;
int magic;
};