summaryrefslogtreecommitdiff
path: root/levenshtein.c
diff options
context:
space:
mode:
authorJunio C Hamano <gitster@pobox.com>2011-07-06 15:38:18 -0700
committerJunio C Hamano <gitster@pobox.com>2011-07-06 15:38:18 -0700
commit71ee7fd15457a0252c089420b5b66de266dcbd2f (patch)
tree98d4d7d1f72c30fd7695f011be78d3e45158c41a /levenshtein.c
parent4d9e42f8f11c57b32b976a943c8ddaf6214e64b8 (diff)
parente923eaeb901ff056421b9007adcbbce271caa7b6 (diff)
downloadgit-71ee7fd15457a0252c089420b5b66de266dcbd2f.tar.gz
Merge commit 'v1.7.0' into jc/checkout-reflog-fix
* commit 'v1.7.0': (4188 commits) Git 1.7.0 Fix typo in 1.6.6.2 release notes Re-fix check-ref-format documentation mark-up archive documentation: attributes are taken from the tree by default Documentation: minor fixes to RelNotes-1.7.0 bash: support 'git am's new '--continue' option filter-branch: Fix error message for --prune-empty --commit-filter am: switch --resolved to --continue Update draft release notes to 1.7.0 one more time Git 1.6.6.2 t8003: check exit code of command and error message separately check-ref-format documentation: fix enumeration mark-up Documentation: quote braces in {upstream} notation t3902: Protect against OS X normalization blame: prevent a segv when -L given start > EOF git-push: document all the status flags used in the output Fix parsing of imap.preformattedHTML and imap.sslverify git-add documentation: Fix shell quoting example Revert "pack-objects: fix pack generation when using pack_size_limit" archive: simplify archive format guessing ...
Diffstat (limited to 'levenshtein.c')
-rw-r--r--levenshtein.c84
1 files changed, 84 insertions, 0 deletions
diff --git a/levenshtein.c b/levenshtein.c
new file mode 100644
index 0000000000..fc281597fd
--- /dev/null
+++ b/levenshtein.c
@@ -0,0 +1,84 @@
+#include "cache.h"
+#include "levenshtein.h"
+
+/*
+ * This function implements the Damerau-Levenshtein algorithm to
+ * calculate a distance between strings.
+ *
+ * Basically, it says how many letters need to be swapped, substituted,
+ * deleted from, or added to string1, at least, to get string2.
+ *
+ * The idea is to build a distance matrix for the substrings of both
+ * strings. To avoid a large space complexity, only the last three rows
+ * are kept in memory (if swaps had the same or higher cost as one deletion
+ * plus one insertion, only two rows would be needed).
+ *
+ * At any stage, "i + 1" denotes the length of the current substring of
+ * string1 that the distance is calculated for.
+ *
+ * row2 holds the current row, row1 the previous row (i.e. for the substring
+ * of string1 of length "i"), and row0 the row before that.
+ *
+ * In other words, at the start of the big loop, row2[j + 1] contains the
+ * Damerau-Levenshtein distance between the substring of string1 of length
+ * "i" and the substring of string2 of length "j + 1".
+ *
+ * All the big loop does is determine the partial minimum-cost paths.
+ *
+ * It does so by calculating the costs of the path ending in characters
+ * i (in string1) and j (in string2), respectively, given that the last
+ * operation is a substitution, a swap, a deletion, or an insertion.
+ *
+ * This implementation allows the costs to be weighted:
+ *
+ * - w (as in "sWap")
+ * - s (as in "Substitution")
+ * - a (for insertion, AKA "Add")
+ * - d (as in "Deletion")
+ *
+ * Note that this algorithm calculates a distance _iff_ d == a.
+ */
+int levenshtein(const char *string1, const char *string2,
+ int w, int s, int a, int d)
+{
+ int len1 = strlen(string1), len2 = strlen(string2);
+ int *row0 = xmalloc(sizeof(int) * (len2 + 1));
+ int *row1 = xmalloc(sizeof(int) * (len2 + 1));
+ int *row2 = xmalloc(sizeof(int) * (len2 + 1));
+ int i, j;
+
+ for (j = 0; j <= len2; j++)
+ row1[j] = j * a;
+ for (i = 0; i < len1; i++) {
+ int *dummy;
+
+ row2[0] = (i + 1) * d;
+ for (j = 0; j < len2; j++) {
+ /* substitution */
+ row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
+ /* swap */
+ if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
+ string1[i] == string2[j - 1] &&
+ row2[j + 1] > row0[j - 1] + w)
+ row2[j + 1] = row0[j - 1] + w;
+ /* deletion */
+ if (row2[j + 1] > row1[j + 1] + d)
+ row2[j + 1] = row1[j + 1] + d;
+ /* insertion */
+ if (row2[j + 1] > row2[j] + a)
+ row2[j + 1] = row2[j] + a;
+ }
+
+ dummy = row0;
+ row0 = row1;
+ row1 = row2;
+ row2 = dummy;
+ }
+
+ i = row1[len2];
+ free(row0);
+ free(row1);
+ free(row2);
+
+ return i;
+}