/* * Copyright (C) the libgit2 contributors. All rights reserved. * * This file is part of libgit2, distributed under the GNU GPL v2 with * a Linking Exception. For full terms see the included COPYING file. */ #include "common.h" #include "git2/config.h" #include "git2/blob.h" #include "diff.h" #include "hashsig.h" #include "path.h" #include "fileops.h" static git_diff_delta *diff_delta__dup( const git_diff_delta *d, git_pool *pool) { git_diff_delta *delta = git__malloc(sizeof(git_diff_delta)); if (!delta) return NULL; memcpy(delta, d, sizeof(git_diff_delta)); GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags); if (d->old_file.path != NULL) { delta->old_file.path = git_pool_strdup(pool, d->old_file.path); if (delta->old_file.path == NULL) goto fail; } if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) { delta->new_file.path = git_pool_strdup(pool, d->new_file.path); if (delta->new_file.path == NULL) goto fail; } else { delta->new_file.path = delta->old_file.path; } return delta; fail: git__free(delta); return NULL; } static git_diff_delta *diff_delta__merge_like_cgit( const git_diff_delta *a, const git_diff_delta *b, git_pool *pool) { git_diff_delta *dup; /* Emulate C git for merging two diffs (a la 'git diff '). * * When C git does a diff between the work dir and a tree, it actually * diffs with the index but uses the workdir contents. This emulates * those choices so we can emulate the type of diff. * * We have three file descriptions here, let's call them: * f1 = a->old_file * f2 = a->new_file AND b->old_file * f3 = b->new_file */ /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */ if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED) return diff_delta__dup(a, pool); /* otherwise, base this diff on the 'b' diff */ if ((dup = diff_delta__dup(b, pool)) == NULL) return NULL; /* If 'a' status is uninteresting, then we're done */ if (a->status == GIT_DELTA_UNMODIFIED) return dup; assert(a->status != GIT_DELTA_UNMODIFIED); assert(b->status != GIT_DELTA_UNMODIFIED); /* A cgit exception is that the diff of a file that is only in the * index (i.e. not in HEAD nor workdir) is given as empty. */ if (dup->status == GIT_DELTA_DELETED) { if (a->status == GIT_DELTA_ADDED) dup->status = GIT_DELTA_UNMODIFIED; /* else don't overwrite DELETE status */ } else { dup->status = a->status; } git_oid_cpy(&dup->old_file.oid, &a->old_file.oid); dup->old_file.mode = a->old_file.mode; dup->old_file.size = a->old_file.size; dup->old_file.flags = a->old_file.flags; return dup; } int git_diff_merge( git_diff_list *onto, const git_diff_list *from) { int error = 0; git_pool onto_pool; git_vector onto_new; git_diff_delta *delta; bool ignore_case = false; unsigned int i, j; assert(onto && from); if (!from->deltas.length) return 0; if (git_vector_init( &onto_new, onto->deltas.length, git_diff_delta__cmp) < 0 || git_pool_init(&onto_pool, 1, 0) < 0) return -1; if ((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 || (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0) { ignore_case = true; /* This function currently only supports merging diff lists that * are sorted identically. */ assert((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 && (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0); } for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) { git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i); const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j); int cmp = !f ? -1 : !o ? 1 : STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path); if (cmp < 0) { delta = diff_delta__dup(o, &onto_pool); i++; } else if (cmp > 0) { delta = diff_delta__dup(f, &onto_pool); j++; } else { delta = diff_delta__merge_like_cgit(o, f, &onto_pool); i++; j++; } /* the ignore rules for the target may not match the source * or the result of a merged delta could be skippable... */ if (git_diff_delta__should_skip(&onto->opts, delta)) { git__free(delta); continue; } if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0) break; } if (!error) { git_vector_swap(&onto->deltas, &onto_new); git_pool_swap(&onto->pool, &onto_pool); onto->new_src = from->new_src; /* prefix strings also come from old pool, so recreate those.*/ onto->opts.old_prefix = git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix); onto->opts.new_prefix = git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix); } git_vector_foreach(&onto_new, i, delta) git__free(delta); git_vector_free(&onto_new); git_pool_clear(&onto_pool); return error; } int git_diff_find_similar__hashsig_for_file( void **out, const git_diff_file *f, const char *path, void *p) { git_hashsig_option_t opt = (git_hashsig_option_t)p; int error = 0; GIT_UNUSED(f); error = git_hashsig_create_fromfile((git_hashsig **)out, path, opt); if (error == GIT_EBUFS) { error = 0; giterr_clear(); } return error; } int git_diff_find_similar__hashsig_for_buf( void **out, const git_diff_file *f, const char *buf, size_t len, void *p) { git_hashsig_option_t opt = (git_hashsig_option_t)p; int error = 0; GIT_UNUSED(f); error = git_hashsig_create((git_hashsig **)out, buf, len, opt); if (error == GIT_EBUFS) { error = 0; giterr_clear(); } return error; } void git_diff_find_similar__hashsig_free(void *sig, void *payload) { GIT_UNUSED(payload); git_hashsig_free(sig); } int git_diff_find_similar__calc_similarity( int *score, void *siga, void *sigb, void *payload) { GIT_UNUSED(payload); *score = git_hashsig_compare(siga, sigb); return 0; } #define DEFAULT_THRESHOLD 50 #define DEFAULT_BREAK_REWRITE_THRESHOLD 60 #define DEFAULT_RENAME_LIMIT 200 static int normalize_find_opts( git_diff_list *diff, git_diff_find_options *opts, git_diff_find_options *given) { git_config *cfg = NULL; if (diff->repo != NULL && git_repository_config__weakptr(&cfg, diff->repo) < 0) return -1; if (given != NULL) memcpy(opts, given, sizeof(*opts)); else { const char *val = NULL; GIT_INIT_STRUCTURE(opts, GIT_DIFF_FIND_OPTIONS_VERSION); opts->flags = GIT_DIFF_FIND_RENAMES; if (git_config_get_string(&val, cfg, "diff.renames") < 0) giterr_clear(); else if (val && (!strcasecmp(val, "copies") || !strcasecmp(val, "copy"))) opts->flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES; } GITERR_CHECK_VERSION(opts, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options"); /* some flags imply others */ if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) { /* if we are only looking for exact matches, then don't turn * MODIFIED items into ADD/DELETE pairs because it's too picky */ opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES); /* similarly, don't look for self-rewrites to split */ opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES; } if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES) opts->flags |= GIT_DIFF_FIND_RENAMES; if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED) opts->flags |= GIT_DIFF_FIND_COPIES; if (opts->flags & GIT_DIFF_BREAK_REWRITES) opts->flags |= GIT_DIFF_FIND_REWRITES; #define USE_DEFAULT(X) ((X) == 0 || (X) > 100) if (USE_DEFAULT(opts->rename_threshold)) opts->rename_threshold = DEFAULT_THRESHOLD; if (USE_DEFAULT(opts->rename_from_rewrite_threshold)) opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD; if (USE_DEFAULT(opts->copy_threshold)) opts->copy_threshold = DEFAULT_THRESHOLD; if (USE_DEFAULT(opts->break_rewrite_threshold)) opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD; #undef USE_DEFAULT if (!opts->rename_limit) { int32_t limit = 0; opts->rename_limit = DEFAULT_RENAME_LIMIT; if (git_config_get_int32(&limit, cfg, "diff.renameLimit") < 0) giterr_clear(); else if (limit > 0) opts->rename_limit = limit; } /* assign the internal metric with whitespace flag as payload */ if (!opts->metric) { opts->metric = git__malloc(sizeof(git_diff_similarity_metric)); GITERR_CHECK_ALLOC(opts->metric); opts->metric->file_signature = git_diff_find_similar__hashsig_for_file; opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf; opts->metric->free_signature = git_diff_find_similar__hashsig_free; opts->metric->similarity = git_diff_find_similar__calc_similarity; if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE) opts->metric->payload = (void *)GIT_HASHSIG_IGNORE_WHITESPACE; else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE) opts->metric->payload = (void *)GIT_HASHSIG_NORMAL; else opts->metric->payload = (void *)GIT_HASHSIG_SMART_WHITESPACE; } return 0; } static int apply_splits_and_deletes( git_diff_list *diff, size_t expected_size, bool actually_split) { git_vector onto = GIT_VECTOR_INIT; size_t i; git_diff_delta *delta, *deleted; if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0) return -1; /* build new delta list without TO_DELETE and splitting TO_SPLIT */ git_vector_foreach(&diff->deltas, i, delta) { if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0) continue; if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) { delta->similarity = 0; /* make new record for DELETED side of split */ if (!(deleted = diff_delta__dup(delta, &diff->pool))) goto on_error; deleted->status = GIT_DELTA_DELETED; memset(&deleted->new_file, 0, sizeof(deleted->new_file)); deleted->new_file.path = deleted->old_file.path; deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; if (git_vector_insert(&onto, deleted) < 0) goto on_error; if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) delta->status = GIT_DELTA_UNTRACKED; else delta->status = GIT_DELTA_ADDED; memset(&delta->old_file, 0, sizeof(delta->old_file)); delta->old_file.path = delta->new_file.path; delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; } /* clean up delta before inserting into new list */ GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags); if (delta->status != GIT_DELTA_COPIED && delta->status != GIT_DELTA_RENAMED && (delta->status != GIT_DELTA_MODIFIED || actually_split)) delta->similarity = 0; /* insert into new list */ if (git_vector_insert(&onto, delta) < 0) goto on_error; } /* cannot return an error past this point */ /* free deltas from old list that didn't make it to the new one */ git_vector_foreach(&diff->deltas, i, delta) { if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0) git__free(delta); } /* swap new delta list into place */ git_vector_swap(&diff->deltas, &onto); git_vector_free(&onto); git_vector_sort(&diff->deltas); return 0; on_error: git_vector_foreach(&onto, i, delta) git__free(delta); git_vector_free(&onto); return -1; } GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx) { git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2); return (idx & 1) ? &delta->new_file : &delta->old_file; } typedef struct { size_t idx; git_iterator_type_t src; git_repository *repo; git_diff_file *file; git_buf data; git_odb_object *odb_obj; git_blob *blob; } similarity_info; static int similarity_init( similarity_info *info, git_diff_list *diff, size_t file_idx) { info->idx = file_idx; info->src = (file_idx & 1) ? diff->new_src : diff->old_src; info->repo = diff->repo; info->file = similarity_get_file(diff, file_idx); info->odb_obj = NULL; info->blob = NULL; git_buf_init(&info->data, 0); if (info->file->size > 0) return 0; return git_diff_file__resolve_zero_size( info->file, &info->odb_obj, info->repo); } static int similarity_sig( similarity_info *info, const git_diff_find_options *opts, void **cache) { int error = 0; git_diff_file *file = info->file; if (info->src == GIT_ITERATOR_TYPE_WORKDIR) { if ((error = git_buf_joinpath( &info->data, git_repository_workdir(info->repo), file->path)) < 0) return error; /* if path is not a regular file, just skip this item */ if (!git_path_isfile(info->data.ptr)) return 0; /* TODO: apply wd-to-odb filters to file data if necessary */ error = opts->metric->file_signature( &cache[info->idx], info->file, info->data.ptr, opts->metric->payload); } else { /* if we didn't initially know the size, we might have an odb_obj * around from earlier, so convert that, otherwise load the blob now */ if (info->odb_obj != NULL) error = git_object__from_odb_object( (git_object **)&info->blob, info->repo, info->odb_obj, GIT_OBJ_BLOB); else error = git_blob_lookup(&info->blob, info->repo, &file->oid); if (error < 0) { /* if lookup fails, just skip this item in similarity calc */ giterr_clear(); } else { size_t sz; /* index size may not be actual blob size if filtered */ if (file->size != git_blob_rawsize(info->blob)) file->size = git_blob_rawsize(info->blob); sz = (size_t)(git__is_sizet(file->size) ? file->size : -1); error = opts->metric->buffer_signature( &cache[info->idx], info->file, git_blob_rawcontent(info->blob), sz, opts->metric->payload); } } return error; } static void similarity_unload(similarity_info *info) { if (info->odb_obj) git_odb_object_free(info->odb_obj); if (info->blob) git_blob_free(info->blob); else git_buf_free(&info->data); } #define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0) /* - score < 0 means files cannot be compared * - score >= 100 means files are exact match * - score == 0 means files are completely different */ static int similarity_measure( int *score, git_diff_list *diff, const git_diff_find_options *opts, void **cache, size_t a_idx, size_t b_idx) { git_diff_file *a_file = similarity_get_file(diff, a_idx); git_diff_file *b_file = similarity_get_file(diff, b_idx); bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY); int error = 0; similarity_info a_info, b_info; *score = -1; /* don't try to compare files of different types */ if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode)) return 0; /* if exact match is requested, force calculation of missing OIDs now */ if (exact_match) { if (git_oid_iszero(&a_file->oid) && diff->old_src == GIT_ITERATOR_TYPE_WORKDIR && !git_diff__oid_for_file(diff->repo, a_file->path, a_file->mode, a_file->size, &a_file->oid)) a_file->flags |= GIT_DIFF_FLAG_VALID_OID; if (git_oid_iszero(&b_file->oid) && diff->new_src == GIT_ITERATOR_TYPE_WORKDIR && !git_diff__oid_for_file(diff->repo, b_file->path, b_file->mode, b_file->size, &b_file->oid)) b_file->flags |= GIT_DIFF_FLAG_VALID_OID; } /* check OID match as a quick test */ if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0) { *score = 100; return 0; } /* don't calculate signatures if we are doing exact match */ if (exact_match) { *score = 0; return 0; } memset(&a_info, 0, sizeof(a_info)); memset(&b_info, 0, sizeof(b_info)); /* set up similarity data (will try to update missing file sizes) */ if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0) return error; if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0) goto cleanup; /* check if file sizes are nowhere near each other */ if (a_file->size > 127 && b_file->size > 127 && (a_file->size > (b_file->size << 3) || b_file->size > (a_file->size << 3))) goto cleanup; /* update signature cache if needed */ if (!cache[a_idx]) { if ((error = similarity_sig(&a_info, opts, cache)) < 0) goto cleanup; } if (!cache[b_idx]) { if ((error = similarity_sig(&b_info, opts, cache)) < 0) goto cleanup; } /* calculate similarity provided that the metric choose to process * both the a and b files (some may not if file is too big, etc). */ if (cache[a_idx] && cache[b_idx]) error = opts->metric->similarity( score, cache[a_idx], cache[b_idx], opts->metric->payload); cleanup: similarity_unload(&a_info); similarity_unload(&b_info); return error; } static int calc_self_similarity( git_diff_list *diff, const git_diff_find_options *opts, size_t delta_idx, void **cache) { int error, similarity = -1; git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx); if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0) return 0; error = similarity_measure( &similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1); if (error < 0) return error; if (similarity >= 0) { delta->similarity = (uint32_t)similarity; delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY; } return 0; } static bool is_rename_target( git_diff_list *diff, const git_diff_find_options *opts, size_t delta_idx, void **cache) { git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx); /* skip things that aren't plain blobs */ if (!GIT_MODE_ISBLOB(delta->new_file.mode)) return false; /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as * targets; maybe include UNTRACKED and IGNORED if requested. */ switch (delta->status) { case GIT_DELTA_UNMODIFIED: case GIT_DELTA_DELETED: return false; case GIT_DELTA_MODIFIED: if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) && !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES)) return false; if (calc_self_similarity(diff, opts, delta_idx, cache) < 0) return false; if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) && delta->similarity < opts->break_rewrite_threshold) { delta->flags |= GIT_DIFF_FLAG__TO_SPLIT; break; } if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && delta->similarity < opts->rename_from_rewrite_threshold) break; return false; case GIT_DELTA_UNTRACKED: if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED)) return false; break; case GIT_DELTA_IGNORED: return false; default: /* all other status values should be checked */ break; } delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET; return true; } static bool is_rename_source( git_diff_list *diff, const git_diff_find_options *opts, size_t delta_idx, void **cache) { git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx); /* skip things that aren't blobs */ if (!GIT_MODE_ISBLOB(delta->old_file.mode)) return false; switch (delta->status) { case GIT_DELTA_ADDED: case GIT_DELTA_UNTRACKED: case GIT_DELTA_IGNORED: return false; case GIT_DELTA_DELETED: case GIT_DELTA_TYPECHANGE: break; case GIT_DELTA_UNMODIFIED: if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) return false; break; default: /* MODIFIED, RENAMED, COPIED */ /* if we're finding copies, this could be a source */ if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES)) break; /* otherwise, this is only a source if we can split it */ if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) && !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES)) return false; if (calc_self_similarity(diff, opts, delta_idx, cache) < 0) return false; if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) && delta->similarity < opts->break_rewrite_threshold) { delta->flags |= GIT_DIFF_FLAG__TO_SPLIT; break; } if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && delta->similarity < opts->rename_from_rewrite_threshold) break; return false; } delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE; return true; } GIT_INLINE(bool) delta_is_split(git_diff_delta *delta) { return (delta->status == GIT_DELTA_TYPECHANGE || (delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0); } GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta) { return (delta->status == GIT_DELTA_ADDED || delta->status == GIT_DELTA_UNTRACKED || delta->status == GIT_DELTA_IGNORED); } GIT_INLINE(void) delta_make_rename( git_diff_delta *to, const git_diff_delta *from, uint32_t similarity) { to->status = GIT_DELTA_RENAMED; to->similarity = similarity; memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; } typedef struct { uint32_t idx; uint32_t similarity; } diff_find_match; int git_diff_find_similar( git_diff_list *diff, git_diff_find_options *given_opts) { size_t s, t; int error = 0, similarity; git_diff_delta *src, *tgt; git_diff_find_options opts; size_t num_deltas, num_srcs = 0, num_tgts = 0; size_t tried_srcs = 0, tried_tgts = 0; size_t num_rewrites = 0, num_updates = 0, num_bumped = 0; void **sigcache; /* cache of similarity metric file signatures */ diff_find_match *tgt2src = NULL; diff_find_match *src2tgt = NULL; diff_find_match *tgt2src_copy = NULL; diff_find_match *best_match; git_diff_file swap; if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0) return error; num_deltas = diff->deltas.length; /* TODO: maybe abort if deltas.length > rename_limit ??? */ if (!git__is_uint32(num_deltas)) return 0; sigcache = git__calloc(num_deltas * 2, sizeof(void *)); GITERR_CHECK_ALLOC(sigcache); /* Label rename sources and targets * * This will also set self-similarity scores for MODIFIED files and * mark them for splitting if break-rewrites is enabled */ git_vector_foreach(&diff->deltas, t, tgt) { if (is_rename_source(diff, &opts, t, sigcache)) ++num_srcs; if (is_rename_target(diff, &opts, t, sigcache)) ++num_tgts; } /* if there are no candidate srcs or tgts, we're done */ if (!num_srcs || !num_tgts) goto cleanup; src2tgt = git__calloc(num_deltas, sizeof(diff_find_match)); GITERR_CHECK_ALLOC(src2tgt); tgt2src = git__calloc(num_deltas, sizeof(diff_find_match)); GITERR_CHECK_ALLOC(tgt2src); if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) { tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match)); GITERR_CHECK_ALLOC(tgt2src_copy); } /* * Find best-fit matches for rename / copy candidates */ find_best_matches: tried_tgts = num_bumped = 0; git_vector_foreach(&diff->deltas, t, tgt) { /* skip things that are not rename targets */ if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) continue; tried_srcs = 0; git_vector_foreach(&diff->deltas, s, src) { /* skip things that are not rename sources */ if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0) continue; /* calculate similarity for this pair and find best match */ if (s == t) similarity = -1; /* don't measure self-similarity here */ else if ((error = similarity_measure( &similarity, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0) goto cleanup; if (similarity < 0) continue; /* is this a better rename? */ if (tgt2src[t].similarity < (uint32_t)similarity && src2tgt[s].similarity < (uint32_t)similarity) { /* eject old mapping */ if (src2tgt[s].similarity > 0) { tgt2src[src2tgt[s].idx].similarity = 0; num_bumped++; } if (tgt2src[t].similarity > 0) { src2tgt[tgt2src[t].idx].similarity = 0; num_bumped++; } /* write new mapping */ tgt2src[t].idx = (uint32_t)s; tgt2src[t].similarity = (uint32_t)similarity; src2tgt[s].idx = (uint32_t)t; src2tgt[s].similarity = (uint32_t)similarity; } /* keep best absolute match for copies */ if (tgt2src_copy != NULL && tgt2src_copy[t].similarity < (uint32_t)similarity) { tgt2src_copy[t].idx = (uint32_t)s; tgt2src_copy[t].similarity = (uint32_t)similarity; } if (++tried_srcs >= num_srcs) break; /* cap on maximum targets we'll examine (per "tgt" file) */ if (tried_srcs > opts.rename_limit) break; } if (++tried_tgts >= num_tgts) break; } if (num_bumped > 0) /* try again if we bumped some items */ goto find_best_matches; /* * Rewrite the diffs with renames / copies */ tried_tgts = 0; git_vector_foreach(&diff->deltas, t, tgt) { /* skip things that are not rename targets */ if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) continue; /* check if this delta was the target of a similarity */ if (tgt2src[t].similarity) best_match = &tgt2src[t]; else if (tgt2src_copy && tgt2src_copy[t].similarity) best_match = &tgt2src_copy[t]; else continue; s = best_match->idx; src = GIT_VECTOR_GET(&diff->deltas, s); /* possible scenarios: * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY */ if (src->status == GIT_DELTA_DELETED) { if (delta_is_new_only(tgt)) { if (best_match->similarity < opts.rename_threshold) continue; delta_make_rename(tgt, src, best_match->similarity); src->flags |= GIT_DIFF_FLAG__TO_DELETE; num_rewrites++; } else { assert(delta_is_split(tgt)); if (best_match->similarity < opts.rename_from_rewrite_threshold) continue; memcpy(&swap, &tgt->old_file, sizeof(swap)); delta_make_rename(tgt, src, best_match->similarity); num_rewrites--; src->status = GIT_DELTA_DELETED; memcpy(&src->old_file, &swap, sizeof(src->old_file)); memset(&src->new_file, 0, sizeof(src->new_file)); src->new_file.path = src->old_file.path; src->new_file.flags |= GIT_DIFF_FLAG_VALID_OID; num_updates++; if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) { /* what used to be at src t is now at src s */ tgt2src[src2tgt[t].idx].idx = (uint32_t)s; } } } else if (delta_is_split(src)) { if (delta_is_new_only(tgt)) { if (best_match->similarity < opts.rename_threshold) continue; delta_make_rename(tgt, src, best_match->similarity); src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ? GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED; memset(&src->old_file, 0, sizeof(src->old_file)); src->old_file.path = src->new_file.path; src->old_file.flags |= GIT_DIFF_FLAG_VALID_OID; src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; num_rewrites--; num_updates++; } else { assert(delta_is_split(src)); if (best_match->similarity < opts.rename_from_rewrite_threshold) continue; memcpy(&swap, &tgt->old_file, sizeof(swap)); delta_make_rename(tgt, src, best_match->similarity); num_rewrites--; num_updates++; memcpy(&src->old_file, &swap, sizeof(src->old_file)); /* if we've just swapped the new element into the correct * place, clear the SPLIT flag */ if (tgt2src[s].idx == t && tgt2src[s].similarity > opts.rename_from_rewrite_threshold) { src->status = GIT_DELTA_RENAMED; src->similarity = tgt2src[s].similarity; tgt2src[s].similarity = 0; src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; num_rewrites--; } /* otherwise, if we just overwrote a source, update mapping */ else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) { /* what used to be at src t is now at src s */ tgt2src[src2tgt[t].idx].idx = (uint32_t)s; } num_updates++; } } else if (delta_is_new_only(tgt)) { if (!FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) continue; if (tgt2src_copy[t].similarity < opts.copy_threshold) continue; /* always use best possible source for copy */ best_match = &tgt2src_copy[t]; src = GIT_VECTOR_GET(&diff->deltas, best_match->idx); tgt->status = GIT_DELTA_COPIED; tgt->similarity = best_match->similarity; memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file)); num_updates++; } } /* * Actually split and delete entries as needed */ if (num_rewrites > 0 || num_updates > 0) error = apply_splits_and_deletes( diff, diff->deltas.length - num_rewrites, FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES)); cleanup: git__free(tgt2src); git__free(src2tgt); git__free(tgt2src_copy); for (t = 0; t < num_deltas * 2; ++t) { if (sigcache[t] != NULL) opts.metric->free_signature(sigcache[t], opts.metric->payload); } git__free(sigcache); if (!given_opts || !given_opts->metric) git__free(opts.metric); return error; } #undef FLAG_SET