summaryrefslogtreecommitdiff
path: root/src/diff_tform.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/diff_tform.c')
-rw-r--r--src/diff_tform.c219
1 files changed, 128 insertions, 91 deletions
diff --git a/src/diff_tform.c b/src/diff_tform.c
index d4b8cf30a..92c4036fb 100644
--- a/src/diff_tform.c
+++ b/src/diff_tform.c
@@ -436,7 +436,7 @@ static int similarity_init(
info->file, &info->odb_obj, info->repo);
}
-static int similarity_calc(
+static int similarity_sig(
similarity_info *info,
const git_diff_find_options *opts,
void **cache)
@@ -566,16 +566,19 @@ static int similarity_measure(
/* check if file sizes are nowhere near each other */
if (a_file->size > 127 &&
b_file->size > 127 &&
- (a_file->size > (b_file->size << 4) ||
- b_file->size > (a_file->size << 4)))
+ (a_file->size > (b_file->size << 3) ||
+ b_file->size > (a_file->size << 3)))
goto cleanup;
/* update signature cache if needed */
- if (!cache[a_idx] && (error = similarity_calc(&a_info, opts, cache)) < 0)
- goto cleanup;
-
- if (!cache[b_idx] && (error = similarity_calc(&b_info, opts, cache)) < 0)
- goto cleanup;
+ if (!cache[a_idx]) {
+ if ((error = similarity_sig(&a_info, opts, cache)) < 0)
+ goto cleanup;
+ }
+ if (!cache[b_idx]) {
+ if ((error = similarity_sig(&b_info, opts, cache)) < 0)
+ goto cleanup;
+ }
/* calculate similarity provided that the metric choose to process
* both the a and b files (some may not if file is too big, etc).
@@ -759,25 +762,30 @@ int git_diff_find_similar(
git_diff_list *diff,
git_diff_find_options *given_opts)
{
- size_t i, j, sigcache_size;
+ size_t s, t;
int error = 0, similarity;
- git_diff_delta *from, *to;
+ git_diff_delta *src, *tgt;
git_diff_find_options opts;
- size_t num_srcs = 0, num_tgts = 0, tried_srcs = 0, tried_tgts = 0;
+ size_t num_deltas, num_srcs = 0, num_tgts = 0;
+ size_t tried_srcs = 0, tried_tgts = 0;
size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
void **sigcache; /* cache of similarity metric file signatures */
- diff_find_match *match_srcs = NULL, *match_tgts = NULL, *best_match;
+ diff_find_match *tgt2src = NULL;
+ diff_find_match *src2tgt = NULL;
+ diff_find_match *tgt2src_copy = NULL;
+ diff_find_match *best_match;
git_diff_file swap;
if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
return error;
+ num_deltas = diff->deltas.length;
+
/* TODO: maybe abort if deltas.length > rename_limit ??? */
- if (!git__is_uint32(diff->deltas.length))
+ if (!git__is_uint32(num_deltas))
return 0;
- sigcache_size = diff->deltas.length * 2; /* keep size b/c diff may change */
- sigcache = git__calloc(sigcache_size, sizeof(void *));
+ sigcache = git__calloc(num_deltas * 2, sizeof(void *));
GITERR_CHECK_ALLOC(sigcache);
/* Label rename sources and targets
@@ -785,11 +793,11 @@ int git_diff_find_similar(
* This will also set self-similarity scores for MODIFIED files and
* mark them for splitting if break-rewrites is enabled
*/
- git_vector_foreach(&diff->deltas, i, to) {
- if (is_rename_source(diff, &opts, i, sigcache))
+ git_vector_foreach(&diff->deltas, t, tgt) {
+ if (is_rename_source(diff, &opts, t, sigcache))
++num_srcs;
- if (is_rename_target(diff, &opts, i, sigcache))
+ if (is_rename_target(diff, &opts, t, sigcache))
++num_tgts;
}
@@ -797,10 +805,15 @@ int git_diff_find_similar(
if (!num_srcs || !num_tgts)
goto cleanup;
- match_tgts = git__calloc(diff->deltas.length, sizeof(diff_find_match));
- GITERR_CHECK_ALLOC(match_tgts);
- match_srcs = git__calloc(diff->deltas.length, sizeof(diff_find_match));
- GITERR_CHECK_ALLOC(match_srcs);
+ src2tgt = git__calloc(num_deltas, sizeof(diff_find_match));
+ GITERR_CHECK_ALLOC(src2tgt);
+ tgt2src = git__calloc(num_deltas, sizeof(diff_find_match));
+ GITERR_CHECK_ALLOC(tgt2src);
+
+ if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
+ tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match));
+ GITERR_CHECK_ALLOC(tgt2src_copy);
+ }
/*
* Find best-fit matches for rename / copy candidates
@@ -809,47 +822,61 @@ int git_diff_find_similar(
find_best_matches:
tried_tgts = num_bumped = 0;
- git_vector_foreach(&diff->deltas, i, to) {
+ git_vector_foreach(&diff->deltas, t, tgt) {
/* skip things that are not rename targets */
- if ((to->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
+ if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
continue;
tried_srcs = 0;
- git_vector_foreach(&diff->deltas, j, from) {
+ git_vector_foreach(&diff->deltas, s, src) {
/* skip things that are not rename sources */
- if ((from->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
+ if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
continue;
/* calculate similarity for this pair and find best match */
- if (i == j)
+ if (s == t)
similarity = -1; /* don't measure self-similarity here */
else if ((error = similarity_measure(
- &similarity, diff, &opts, sigcache, 2 * j, 2 * i + 1)) < 0)
+ &similarity, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0)
goto cleanup;
- /* if this pairing is better for the src and the tgt, keep it */
- if (similarity > 0 &&
- match_tgts[i].similarity < (uint32_t)similarity &&
- match_srcs[j].similarity < (uint32_t)similarity)
+ if (similarity < 0)
+ continue;
+
+ /* is this a better rename? */
+ if (tgt2src[t].similarity < (uint32_t)similarity &&
+ src2tgt[s].similarity < (uint32_t)similarity)
{
- if (match_tgts[i].similarity > 0) {
- match_tgts[match_srcs[j].idx].similarity = 0;
- match_srcs[match_tgts[i].idx].similarity = 0;
- ++num_bumped;
+ /* eject old mapping */
+ if (src2tgt[s].similarity > 0) {
+ tgt2src[src2tgt[s].idx].similarity = 0;
+ num_bumped++;
+ }
+ if (tgt2src[t].similarity > 0) {
+ src2tgt[tgt2src[t].idx].similarity = 0;
+ num_bumped++;
}
- match_tgts[i].similarity = (uint32_t)similarity;
- match_tgts[i].idx = (uint32_t)j;
+ /* write new mapping */
+ tgt2src[t].idx = s;
+ tgt2src[t].similarity = (uint32_t)similarity;
+ src2tgt[s].idx = t;
+ src2tgt[s].similarity = (uint32_t)similarity;
+ }
- match_srcs[j].similarity = (uint32_t)similarity;
- match_srcs[j].idx = (uint32_t)i;
+ /* keep best absolute match for copies */
+ if (tgt2src_copy != NULL &&
+ tgt2src_copy[t].similarity < (uint32_t)similarity)
+ {
+ tgt2src_copy[t].idx = s;
+ tgt2src_copy[t].similarity = (uint32_t)similarity;
}
if (++tried_srcs >= num_srcs)
break;
- /* cap on maximum targets we'll examine (per "to" file) */
+ /* cap on maximum targets we'll examine (per "tgt" file) */
if (tried_srcs > opts.rename_limit)
break;
}
@@ -867,18 +894,21 @@ find_best_matches:
tried_tgts = 0;
- git_vector_foreach(&diff->deltas, i, to) {
+ git_vector_foreach(&diff->deltas, t, tgt) {
/* skip things that are not rename targets */
- if ((to->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
+ if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
continue;
/* check if this delta was the target of a similarity */
- best_match = &match_tgts[i];
- if (!best_match->similarity)
+ if (tgt2src[t].similarity)
+ best_match = &tgt2src[t];
+ else if (tgt2src_copy && tgt2src_copy[t].similarity)
+ best_match = &tgt2src_copy[t];
+ else
continue;
- j = best_match->idx;
- from = GIT_VECTOR_GET(&diff->deltas, j);
+ s = best_match->idx;
+ src = GIT_VECTOR_GET(&diff->deltas, s);
/* possible scenarios:
* 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
@@ -888,101 +918,107 @@ find_best_matches:
* 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
*/
- if (from->status == GIT_DELTA_DELETED) {
+ if (src->status == GIT_DELTA_DELETED) {
- if (delta_is_new_only(to)) {
+ if (delta_is_new_only(tgt)) {
if (best_match->similarity < opts.rename_threshold)
continue;
- delta_make_rename(to, from, best_match->similarity);
+ delta_make_rename(tgt, src, best_match->similarity);
- from->flags |= GIT_DIFF_FLAG__TO_DELETE;
+ src->flags |= GIT_DIFF_FLAG__TO_DELETE;
num_rewrites++;
} else {
- assert(delta_is_split(to));
+ assert(delta_is_split(tgt));
if (best_match->similarity < opts.rename_from_rewrite_threshold)
continue;
- memcpy(&swap, &to->old_file, sizeof(swap));
+ memcpy(&swap, &tgt->old_file, sizeof(swap));
- delta_make_rename(to, from, best_match->similarity);
+ delta_make_rename(tgt, src, best_match->similarity);
num_rewrites--;
- from->status = GIT_DELTA_DELETED;
- memcpy(&from->old_file, &swap, sizeof(from->old_file));
- memset(&from->new_file, 0, sizeof(from->new_file));
- from->new_file.path = from->old_file.path;
- from->new_file.flags |= GIT_DIFF_FLAG_VALID_OID;
+ src->status = GIT_DELTA_DELETED;
+ memcpy(&src->old_file, &swap, sizeof(src->old_file));
+ memset(&src->new_file, 0, sizeof(src->new_file));
+ src->new_file.path = src->old_file.path;
+ src->new_file.flags |= GIT_DIFF_FLAG_VALID_OID;
num_updates++;
}
}
- else if (delta_is_split(from)) {
+ else if (delta_is_split(src)) {
- if (delta_is_new_only(to)) {
+ if (delta_is_new_only(tgt)) {
if (best_match->similarity < opts.rename_threshold)
continue;
- delta_make_rename(to, from, best_match->similarity);
+ delta_make_rename(tgt, src, best_match->similarity);
- from->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ?
+ src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ?
GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
- memset(&from->old_file, 0, sizeof(from->old_file));
- from->old_file.path = from->new_file.path;
- from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
+ memset(&src->old_file, 0, sizeof(src->old_file));
+ src->old_file.path = src->new_file.path;
+ src->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
- from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
+ src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
num_rewrites--;
num_updates++;
} else {
- assert(delta_is_split(from));
+ assert(delta_is_split(src));
if (best_match->similarity < opts.rename_from_rewrite_threshold)
continue;
- memcpy(&swap, &to->old_file, sizeof(swap));
+ memcpy(&swap, &tgt->old_file, sizeof(swap));
- delta_make_rename(to, from, best_match->similarity);
+ delta_make_rename(tgt, src, best_match->similarity);
num_rewrites--;
num_updates++;
- memcpy(&from->old_file, &swap, sizeof(from->old_file));
+ memcpy(&src->old_file, &swap, sizeof(src->old_file));
/* if we've just swapped the new element into the correct
* place, clear the SPLIT flag
*/
- if (match_tgts[j].idx == i &&
- match_tgts[j].similarity >
+ if (tgt2src[s].idx == t &&
+ tgt2src[s].similarity >
opts.rename_from_rewrite_threshold) {
-
- from->status = GIT_DELTA_RENAMED;
- from->similarity = match_tgts[j].similarity;
- match_tgts[j].similarity = 0;
- from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
+ src->status = GIT_DELTA_RENAMED;
+ src->similarity = tgt2src[s].similarity;
+ tgt2src[s].similarity = 0;
+ src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
num_rewrites--;
}
/* otherwise, if we just overwrote a source, update mapping */
- else if (j > i && match_srcs[i].similarity > 0) {
- match_tgts[match_srcs[i].idx].idx = (uint32_t)j;
+ else if (s > t && src2tgt[t].similarity > 0) {
+ /* what used to be at src t is now at src s */
+ tgt2src[src2tgt[t].idx].idx = (uint32_t)s;
}
num_updates++;
}
}
- else if (delta_is_new_only(to)) {
- if (!FLAG_SET(&opts, GIT_DIFF_FIND_COPIES) ||
- best_match->similarity < opts.copy_threshold)
+ else if (delta_is_new_only(tgt)) {
+ if (!FLAG_SET(&opts, GIT_DIFF_FIND_COPIES))
+ continue;
+
+ if (tgt2src_copy[t].similarity < opts.copy_threshold)
continue;
- to->status = GIT_DELTA_COPIED;
- to->similarity = best_match->similarity;
- memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
+ /* always use best possible source for copy */
+ best_match = &tgt2src_copy[t];
+ src = GIT_VECTOR_GET(&diff->deltas, best_match->idx);
+
+ tgt->status = GIT_DELTA_COPIED;
+ tgt->similarity = best_match->similarity;
+ memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file));
num_updates++;
}
@@ -998,12 +1034,13 @@ find_best_matches:
FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES));
cleanup:
- git__free(match_srcs);
- git__free(match_tgts);
+ git__free(tgt2src);
+ git__free(src2tgt);
+ git__free(tgt2src_copy);
- for (i = 0; i < sigcache_size; ++i) {
- if (sigcache[i] != NULL)
- opts.metric->free_signature(sigcache[i], opts.metric->payload);
+ for (t = 0; t < num_deltas * 2; ++t) {
+ if (sigcache[t] != NULL)
+ opts.metric->free_signature(sigcache[t], opts.metric->payload);
}
git__free(sigcache);