author     Lorry <lorry@roadtrain.codethink.co.uk>  2012-08-22 15:47:16 +0100
committer  Lorry <lorry@roadtrain.codethink.co.uk>  2012-08-22 15:47:16 +0100
commit     25335618bf8755ce6b116ee14f47f5a1f2c821e9 (patch)
tree       d889d7ab3f9f985d0c54c534cb8052bd2e6d7163 /bzrlib/_patiencediff_c.c
download   bzr-tarball-25335618bf8755ce6b116ee14f47f5a1f2c821e9.tar.gz
Tarball conversion
Diffstat (limited to 'bzrlib/_patiencediff_c.c')
-rw-r--r--  bzrlib/_patiencediff_c.c  1281
1 file changed, 1281 insertions, 0 deletions
diff --git a/bzrlib/_patiencediff_c.c b/bzrlib/_patiencediff_c.c
new file mode 100644
index 0000000..4de410d
--- /dev/null
+++ b/bzrlib/_patiencediff_c.c
@@ -0,0 +1,1281 @@
+/*
+ Copyright (C) 2007, 2010 Canonical Ltd
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+ Function equate_lines based on bdiff.c from Mercurial.
+ Copyright (C) 2005, 2006 Matt Mackall <mpm@selenic.com>
+
+ Functions unique_lcs/recurse_matches based on _patiencediff_py.py.
+ Copyright (C) 2005 Bram Cohen, Copyright (C) 2005, 2006 Canonical Ltd
+*/
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <Python.h>
+
+#include "python-compat.h"
+
+
+#if defined(__GNUC__)
+# define inline __inline__
+#elif defined(_MSC_VER)
+# define inline __inline
+#else
+# define inline
+#endif
+
+
+#define MIN(a, b) (((a) > (b)) ? (b) : (a))
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+
+#define SENTINEL -1
+
+
+/* malloc(0) returns NULL on some platforms, causing
+ * <https://bugs.launchpad.net/bzr/+bug/511267> and
+ * <https://bugs.launchpad.net/bzr/+bug/331095>. On glibc it succeeds, but
+ * let's make it fail everywhere to aid testing. */
+#define guarded_malloc(x) ( (x) ? malloc(x) : NULL )
+
+enum {
+ OP_EQUAL = 0,
+ OP_INSERT,
+ OP_DELETE,
+ OP_REPLACE
+};
+
+
+/* values from this array need to correspond to the order of the enum above */
+static char *opcode_names[] = {
+ "equal",
+ "insert",
+ "delete",
+ "replace",
+};
+
+
+struct line {
+ long hash; /* hash code of the string/object */
+ Py_ssize_t next; /* next line from the same equivalence class */
+ Py_ssize_t equiv; /* equivalence class */
+ PyObject *data;
+};
+
+
+struct bucket {
+ Py_ssize_t a_head; /* first item in `a` from this equivalence class */
+ Py_ssize_t a_count;
+ Py_ssize_t b_head; /* first item in `b` from this equivalence class */
+ Py_ssize_t b_count;
+    Py_ssize_t a_pos;    /* cursor into the `a` list, cached between unique_lcs calls */
+    Py_ssize_t b_pos;    /* cursor into the `b` list, cached between unique_lcs calls */
+};
+
+
+struct hashtable {
+    Py_ssize_t last_a_pos;    /* `alo` of the previous unique_lcs call */
+    Py_ssize_t last_b_pos;    /* `blo` of the previous unique_lcs call */
+    Py_ssize_t size;          /* number of buckets, always a power of two */
+    struct bucket *table;
+};
+
+struct matching_line {
+ Py_ssize_t a; /* index of the line in `a` */
+ Py_ssize_t b; /* index of the line in `b` */
+};
+
+
+struct matching_block {
+ Py_ssize_t a; /* index of the first line in `a` */
+ Py_ssize_t b; /* index of the first line in `b` */
+ Py_ssize_t len; /* length of the block */
+};
+
+
+struct matching_blocks {
+ struct matching_block *matches;
+ Py_ssize_t count;
+};
+
+
+struct opcode {
+ int tag;
+ Py_ssize_t i1;
+ Py_ssize_t i2;
+ Py_ssize_t j1;
+ Py_ssize_t j2;
+};
+
+
+typedef struct {
+ PyObject_HEAD
+ Py_ssize_t asize;
+ Py_ssize_t bsize;
+ struct line *a;
+ struct line *b;
+ struct hashtable hashtable;
+ Py_ssize_t *backpointers;
+} PatienceSequenceMatcher;
+
+
+static inline Py_ssize_t
+bisect_left(Py_ssize_t *list, Py_ssize_t item, Py_ssize_t lo, Py_ssize_t hi)
+{
+ while (lo < hi) {
+        /* overflow-safe computation of (lo + hi) / 2 */
+        Py_ssize_t mid = lo / 2 + hi / 2 + (lo % 2 + hi % 2) / 2;
+ if (list[mid] < item)
+ lo = mid + 1;
+ else
+ hi = mid;
+ }
+ return lo;
+}
+
+
+static inline int
+compare_lines(struct line *a, struct line *b)
+{
+ return ((a->hash != b->hash)
+ || PyObject_Compare(a->data, b->data));
+}
+
+
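+/* Finds the bucket for lines[i] using open addressing with linear probing.
+   `hsize` is the table size minus one, used as a bitmask (the table size is a
+   power of two). Returns the index of the first bucket that is either empty
+   or holds a line from ref_lines equal to lines[i]. */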
+static inline int
+find_equivalence_class(struct bucket *hashtable, Py_ssize_t hsize,
+ struct line *lines, struct line *ref_lines,
+ Py_ssize_t i)
+{
+ Py_ssize_t j;
+ for (j = lines[i].hash & hsize; hashtable[j].b_head != SENTINEL; j = (j + 1) & hsize) {
+ if (!compare_lines(lines + i, ref_lines + hashtable[j].b_head)) {
+ break;
+ }
+ }
+ return j;
+}
+
+
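+/* Builds the hash table of equivalence classes shared by lines_a and lines_b
+   and stores it in `result`. Returns 1 on success, or 0 with a Python
+   exception set (overflow or out of memory). */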
+static int
+equate_lines(struct hashtable *result,
+ struct line *lines_a, struct line *lines_b,
+ Py_ssize_t asize, Py_ssize_t bsize)
+{
+ Py_ssize_t i, j, hsize;
+ struct bucket *hashtable;
+
+ /* check for overflow, we need the table to be at least bsize+1 */
+ if (bsize == PY_SSIZE_T_MAX) {
+ PyErr_SetNone(PyExc_OverflowError);
+ return 0;
+ }
+
+ /* build a hash table of the next highest power of 2 */
+ hsize = 1;
+ while (hsize < bsize + 1)
+ hsize *= 2;
+
+    /* hsize is at least 1 here, so the allocation is never zero-sized */
+ hashtable = (struct bucket *) guarded_malloc(sizeof(struct bucket) * hsize);
+ if (hashtable == NULL) {
+ PyErr_NoMemory();
+ return 0;
+ }
+
+ /* initialise the hashtable */
+ for (i = 0; i < hsize; i++) {
+ hashtable[i].a_count = 0;
+ hashtable[i].b_count = 0;
+ hashtable[i].a_head = SENTINEL;
+ hashtable[i].b_head = SENTINEL;
+ }
+ hsize--;
+
+    /* add lines from lines_b to the hash table chains, iterating backwards
+       so that each equivalence-class list ends up sorted by line number
+       (because we always add new lines to the head of the list) */
+ for (i = bsize - 1; i >= 0; i--) {
+ /* find the first hashtable entry, which is either empty or contains
+ the same line as lines_b[i] */
+ j = find_equivalence_class(hashtable, hsize, lines_b, lines_b, i);
+
+ /* set the equivalence class */
+ lines_b[i].equiv = j;
+
+ /* add to the head of the equivalence class */
+ lines_b[i].next = hashtable[j].b_head;
+ hashtable[j].b_head = i;
+ hashtable[j].b_count++;
+ }
+
+ /* match items from lines_a to their equivalence class in lines_b.
+ again, iterating backwards for the right order of the linked lists */
+ for (i = asize - 1; i >= 0; i--) {
+ /* find the first hash entry, which is either empty or contains
+ the same line as lines_a[i] */
+ j = find_equivalence_class(hashtable, hsize, lines_a, lines_b, i);
+
+ /* set the equivalence class, even if we are not interested in this
+ line, because the values are not pre-filled */
+ lines_a[i].equiv = j;
+
+ /* we are not interested in lines which are not also in lines_b */
+ if (hashtable[j].b_head == SENTINEL)
+ continue;
+
+ /* add to the head of the equivalence class */
+ lines_a[i].next = hashtable[j].a_head;
+ hashtable[j].a_head = i;
+ hashtable[j].a_count++;
+ }
+
+ result->last_a_pos = -1;
+ result->last_b_pos = -1;
+ result->size = hsize + 1;
+ result->table = hashtable;
+
+ return 1;
+}
+
+
+
+/* Finds the longest common subsequence of lines that are unique within
+   a[alo:ahi] and b[blo:bhi].
+   Parameter backpointers must point to allocated memory for at least
+   4 * (bhi - blo) Py_ssize_t values. */
+Py_ssize_t
+unique_lcs(struct matching_line *answer,
+ struct hashtable *hashtable, Py_ssize_t *backpointers,
+ struct line *lines_a, struct line *lines_b,
+ Py_ssize_t alo, Py_ssize_t blo, Py_ssize_t ahi, Py_ssize_t bhi)
+{
+ Py_ssize_t i, k, equiv, apos, bpos, norm_apos, norm_bpos, bsize, stacksize;
+ Py_ssize_t *stacks, *lasts, *btoa;
+ struct bucket *h;
+
+ k = 0;
+ stacksize = 0;
+ bsize = bhi - blo;
+ h = hashtable->table;
+
+    /* "unpack" the single allocated block: it holds the four arrays
+       [ backpointers | stacks | lasts | btoa ], each bsize entries long */
+ stacks = backpointers + bsize;
+ lasts = stacks + bsize;
+ btoa = lasts + bsize;
+
+ /* initialise the backpointers */
+ for (i = 0; i < bsize; i++)
+ backpointers[i] = SENTINEL;
+
+    /* reset the cached a_pos cursors if this call starts before the range
+       covered by the previous call */
+    if (hashtable->last_a_pos == -1 || hashtable->last_a_pos > alo)
+ for (i = 0; i < hashtable->size; i++)
+ h[i].a_pos = h[i].a_head;
+ hashtable->last_a_pos = alo;
+
+    /* likewise for the b_pos cursors */
+    if (hashtable->last_b_pos == -1 || hashtable->last_b_pos > blo)
+ for (i = 0; i < hashtable->size; i++)
+ h[i].b_pos = h[i].b_head;
+ hashtable->last_b_pos = blo;
+
+ for (bpos = blo; bpos < bhi; bpos++) {
+ equiv = lines_b[bpos].equiv;
+
+        /* skip equivalence classes with no members in a or in b */
+ if (h[equiv].a_count == 0 || h[equiv].b_count == 0)
+ continue;
+
+        /* find a unique line in lines_a that matches lines_b[bpos]; if we
+           find more than one such line within the range alo:ahi, jump to
+           the next line from lines_b immediately */
+ apos = SENTINEL;
+ /* loop through all lines in the linked list */
+ for (i = h[equiv].a_pos; i != SENTINEL; i = lines_a[i].next) {
+ /* the index is lower than alo, continue to the next line */
+ if (i < alo) {
+ h[equiv].a_pos = i;
+ continue;
+ }
+ /* the index is higher than ahi, stop searching */
+ if (i >= ahi)
+ break;
+ /* if the line is within our range, check if it's a duplicate */
+ if (apos != SENTINEL)
+ goto nextb;
+ /* save index to the line */
+ apos = i;
+ }
+ /* this line has no equivalent in lines_a[alo:ahi] */
+ if (apos == SENTINEL)
+ goto nextb;
+
+ /* check for duplicates of this line in lines_b[blo:bhi] */
+ /* loop through all lines in the linked list */
+ for (i = h[equiv].b_pos; i != SENTINEL; i = lines_b[i].next) {
+ /* the index is lower than blo, continue to the next line */
+ if (i < blo) {
+ h[equiv].b_pos = i;
+ continue;
+ }
+ /* the index is higher than bhi, stop searching */
+ if (i >= bhi)
+ break;
+            /* if this isn't the line we started with and it's within
+               our range, it's a duplicate */
+ if (i != bpos)
+ goto nextb;
+ }
+
+ /* use normalised indexes ([0,ahi-alo) instead of [alo,ahi))
+ for the patience sorting algorithm */
+ norm_bpos = bpos - blo;
+ norm_apos = apos - alo;
+ btoa[norm_bpos] = norm_apos;
+
+ /*
+ Ok, how does this work...
+
+ We have a list of matching lines from two lists, a and b. These
+ matches are stored in variable `btoa`. As we are iterating over this
+ table by bpos, the lines from b already form an increasing sequence.
+          We also need to "sort" the lines from a using the patience sorting
+          algorithm, ignoring the lines which would have to be swapped.
+
+ http://en.wikipedia.org/wiki/Patience_sorting
+
+          For each pair of lines, we either place the line from a on an
+          existing pile whose top has a higher value, or we start a new
+          pile. Variable `stacks` holds the tops of these piles, and in
+          variable `lasts` we store the lines from b that correspond to the
+          lines from a in `stacks`.
+
+          Whenever we place a new line on top of a pile, we store a
+          backpointer to the line (b) on top of the previous pile. This means
+          that after the loop, variable `backpointers` holds, for each
+          matching pair, the index (in b) of the previous matching pair that
+          forms an increasing sequence (in both the a and b indexes) with it,
+          or SENTINEL when no such earlier pair exists (i.e. the line from a
+          was placed on the first pile).
+
+ To construct the LCS, we will just need to follow these backpointers
+ from the top of the last pile and stop when we reach SENTINEL.
+ */
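+
+        /* An illustrative trace (not part of the original sources): assume
+           the matching pairs arrive as (norm_bpos, norm_apos) =
+           (0,3), (1,1), (2,2), (3,4). Then:
+
+             bpos 0, apos 3:  stacks = [3]      lasts = [0]      backpointers[0] = SENTINEL
+             bpos 1, apos 1:  stacks = [1]      lasts = [1]      backpointers[1] = SENTINEL
+             bpos 2, apos 2:  stacks = [1,2]    lasts = [1,2]    backpointers[2] = 1
+             bpos 3, apos 4:  stacks = [1,2,4]  lasts = [1,2,3]  backpointers[3] = 2
+
+           Backtracking from lasts[2] == 3 yields the pairs (a,b) = (4,3),
+           (2,2), (1,1): an LCS of length 3 that skips the pair (3,0), whose
+           index in a is out of order. */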
+
+ /* as an optimization, check if the next line comes at the end,
+ because it usually does */
+ if (stacksize && stacks[stacksize - 1] < norm_apos)
+ k = stacksize;
+ /* as an optimization, check if the next line comes right after
+ the previous line, because usually it does */
+ else if (stacksize && (stacks[k] < norm_apos) &&
+ (k == stacksize - 1 || stacks[k + 1] > norm_apos))
+ k += 1;
+ else
+ k = bisect_left(stacks, norm_apos, 0, stacksize);
+
+ if (k > 0)
+ backpointers[norm_bpos] = lasts[k - 1];
+
+ if (k < stacksize) {
+ stacks[k] = norm_apos;
+ lasts[k] = norm_bpos;
+ }
+ else {
+ stacks[stacksize] = norm_apos;
+ lasts[stacksize] = norm_bpos;
+ stacksize += 1;
+ }
+
+
+nextb:
+ ;
+ }
+
+ if (stacksize == 0)
+ return 0;
+
+ /* backtrace the structures to find the LCS */
+ i = 0;
+ k = lasts[stacksize - 1];
+ while (k != SENTINEL) {
+ answer[i].a = btoa[k];
+ answer[i].b = k;
+ k = backpointers[k];
+ i++;
+ }
+
+ return i;
+}
+
+/* Adds a new line to the list of matching blocks, either extending the
+ current block or adding a new one. */
+static inline void
+add_matching_line(struct matching_blocks *answer, Py_ssize_t a, Py_ssize_t b)
+{
+ Py_ssize_t last_index = answer->count - 1;
+ if ((last_index >= 0) &&
+ (a == answer->matches[last_index].a +
+ answer->matches[last_index].len) &&
+ (b == answer->matches[last_index].b +
+ answer->matches[last_index].len)) {
+ /* enlarge the last block */
+ answer->matches[last_index].len++;
+ }
+ else {
+ /* create a new block */
+ last_index++;
+ answer->matches[last_index].a = a;
+ answer->matches[last_index].b = b;
+ answer->matches[last_index].len = 1;
+ answer->count++;
+ }
+}
+
+
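+/* Finds all matching blocks between a[alo:ahi] and b[blo:bhi] and appends
+   them to `answer`: unique_lcs provides a skeleton of matches between lines
+   that are unique on both sides, and the gaps between those matches are
+   handled by recursion (bounded by maxrecursion). Returns 1 on success and
+   0 on allocation failure. */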
+static int
+recurse_matches(struct matching_blocks *answer, struct hashtable *hashtable,
+ Py_ssize_t *backpointers, struct line *a, struct line *b,
+ Py_ssize_t alo, Py_ssize_t blo, Py_ssize_t ahi, Py_ssize_t bhi,
+ int maxrecursion)
+{
+ int res;
+ Py_ssize_t new, last_a_pos, last_b_pos, lcs_size, nahi, nbhi, i, apos, bpos;
+ struct matching_line *lcs;
+
+ if (maxrecursion < 0)
+ return 1;
+
+ if (alo == ahi || blo == bhi)
+ return 1;
+
+ new = 0;
+ last_a_pos = alo - 1;
+ last_b_pos = blo - 1;
+
+ lcs = (struct matching_line *)guarded_malloc(sizeof(struct matching_line) * (bhi - blo));
+ if (lcs == NULL)
+ return 0;
+
+ lcs_size = unique_lcs(lcs, hashtable, backpointers, a, b, alo, blo, ahi, bhi);
+
+ /* recurse between lines which are unique in each file and match */
+ for (i = lcs_size - 1; i >= 0; i--) {
+ apos = alo + lcs[i].a;
+ bpos = blo + lcs[i].b;
+ if (last_a_pos + 1 != apos || last_b_pos + 1 != bpos) {
+ res = recurse_matches(answer, hashtable,
+ backpointers, a, b,
+ last_a_pos + 1, last_b_pos + 1,
+ apos, bpos, maxrecursion - 1);
+ if (!res)
+ goto error;
+ }
+ last_a_pos = apos;
+ last_b_pos = bpos;
+ add_matching_line(answer, apos, bpos);
+ new = 1;
+ }
+
+ free(lcs);
+ lcs = NULL;
+
+ /* find matches between the last match and the end */
+ if (new > 0) {
+ res = recurse_matches(answer, hashtable,
+ backpointers, a, b,
+ last_a_pos + 1, last_b_pos + 1,
+ ahi, bhi, maxrecursion - 1);
+ if (!res)
+ goto error;
+ }
+
+
+ /* find matching lines at the very beginning */
+ else if (a[alo].equiv == b[blo].equiv) {
+ while (alo < ahi && blo < bhi && a[alo].equiv == b[blo].equiv)
+ add_matching_line(answer, alo++, blo++);
+ res = recurse_matches(answer, hashtable,
+ backpointers, a, b,
+ alo, blo, ahi, bhi, maxrecursion - 1);
+ if (!res)
+ goto error;
+ }
+
+ /* find matching lines at the very end */
+ else if (a[ahi - 1].equiv == b[bhi - 1].equiv) {
+ nahi = ahi - 1;
+ nbhi = bhi - 1;
+ while (nahi > alo && nbhi > blo && a[nahi - 1].equiv == b[nbhi - 1].equiv) {
+ nahi--;
+ nbhi--;
+ }
+ res = recurse_matches(answer, hashtable,
+ backpointers, a, b,
+ last_a_pos + 1, last_b_pos + 1,
+ nahi, nbhi, maxrecursion - 1);
+ if (!res)
+ goto error;
+ for (i = 0; i < ahi - nahi; i++)
+ add_matching_line(answer, nahi + i, nbhi + i);
+ }
+
+ return 1;
+
+error:
+ free(lcs);
+ return 0;
+}
+
+
+static void
+delete_lines(struct line *lines, Py_ssize_t size)
+{
+ struct line *line = lines;
+ while (size-- > 0) {
+ Py_XDECREF(line->data);
+ line++;
+ }
+ free(lines);
+}
+
+
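+/* Loads the items of the Python sequence `orig` into a newly allocated array
+   of struct line stored in *lines. Returns the number of lines, 0 for an
+   empty sequence (leaving *lines untouched), or -1 with a Python exception
+   set. */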
+static Py_ssize_t
+load_lines(PyObject *orig, struct line **lines)
+{
+ Py_ssize_t size, i;
+ struct line *line;
+ PyObject *seq, *item;
+
+ seq = PySequence_Fast(orig, "sequence expected");
+ if (seq == NULL) {
+ return -1;
+ }
+
+ size = PySequence_Fast_GET_SIZE(seq);
+ if (size == 0) {
+ Py_DECREF(seq);
+ return 0;
+ }
+
+ /* Allocate a memory block for line data, initialized to 0 */
+ line = *lines = (struct line *)calloc(size, sizeof(struct line));
+ if (line == NULL) {
+ PyErr_NoMemory();
+ Py_DECREF(seq);
+ return -1;
+ }
+
+ for (i = 0; i < size; i++) {
+ item = PySequence_Fast_GET_ITEM(seq, i);
+ Py_INCREF(item);
+ line->data = item;
+ line->hash = PyObject_Hash(item);
+ if (line->hash == (-1)) {
+            /* Propagate the hash exception */
+ size = -1;
+ goto cleanup;
+ }
+ line->next = SENTINEL;
+ line++;
+ }
+
+ cleanup:
+ Py_DECREF(seq);
+ if (size == -1) {
+ /* Error -- cleanup unused object references */
+ delete_lines(*lines, i);
+ *lines = NULL;
+ }
+ return size;
+}
+
+
+static PyObject *
+py_unique_lcs(PyObject *self, PyObject *args)
+{
+ PyObject *aseq, *bseq, *res, *item;
+ Py_ssize_t asize, bsize, i, nmatches, *backpointers = NULL;
+ struct line *a = NULL, *b = NULL;
+ struct matching_line *matches = NULL;
+ struct hashtable hashtable;
+
+ if (!PyArg_ParseTuple(args, "OO", &aseq, &bseq))
+ return NULL;
+
+ hashtable.table = NULL;
+
+ asize = load_lines(aseq, &a);
+ bsize = load_lines(bseq, &b);
+ if (asize == -1 || bsize == -1)
+ goto error;
+
+ if (!equate_lines(&hashtable, a, b, asize, bsize))
+ goto error;
+
+ if (bsize > 0) {
+ matches = (struct matching_line *)guarded_malloc(sizeof(struct matching_line) * bsize);
+ if (matches == NULL)
+ goto error;
+
+ backpointers = (Py_ssize_t *)guarded_malloc(sizeof(Py_ssize_t) * bsize * 4);
+ if (backpointers == NULL)
+ goto error;
+ }
+
+ nmatches = unique_lcs(matches, &hashtable, backpointers, a, b, 0, 0, asize, bsize);
+
+    res = PyList_New(nmatches);
+    if (res == NULL)
+        goto error;
+ for (i = 0; i < nmatches; i++) {
+#if PY_VERSION_HEX < 0x02050000
+ item = Py_BuildValue("ii", matches[nmatches - i - 1].a, matches[nmatches - i - 1].b);
+#else
+ item = Py_BuildValue("nn", matches[nmatches - i - 1].a, matches[nmatches - i - 1].b);
+#endif
+ if (item == NULL)
+ goto error;
+ if (PyList_SetItem(res, i, item) != 0)
+ goto error;
+ }
+
+ free(backpointers);
+ free(matches);
+ free(hashtable.table);
+ delete_lines(b, bsize);
+ delete_lines(a, asize);
+ return res;
+
+error:
+ free(backpointers);
+ free(matches);
+ free(hashtable.table);
+ delete_lines(b, bsize);
+ delete_lines(a, asize);
+ return NULL;
+}
+
+
+static PyObject *
+py_recurse_matches(PyObject *self, PyObject *args)
+{
+ PyObject *aseq, *bseq, *item, *answer;
+ int maxrecursion, res;
+ Py_ssize_t i, j, asize, bsize, alo, blo, ahi, bhi;
+ Py_ssize_t *backpointers = NULL;
+ struct line *a = NULL, *b = NULL;
+ struct hashtable hashtable;
+ struct matching_blocks matches;
+
+#if PY_VERSION_HEX < 0x02050000
+ if (!PyArg_ParseTuple(args, "OOiiiiOi", &aseq, &bseq, &alo, &blo,
+ &ahi, &bhi, &answer, &maxrecursion))
+#else
+ if (!PyArg_ParseTuple(args, "OOnnnnOi", &aseq, &bseq, &alo, &blo,
+ &ahi, &bhi, &answer, &maxrecursion))
+#endif
+ return NULL;
+
+ hashtable.table = NULL;
+ matches.matches = NULL;
+
+ asize = load_lines(aseq, &a);
+ bsize = load_lines(bseq, &b);
+ if (asize == -1 || bsize == -1)
+ goto error;
+
+ if (!equate_lines(&hashtable, a, b, asize, bsize))
+ goto error;
+
+ matches.count = 0;
+
+ if (bsize > 0) {
+ matches.matches = (struct matching_block *)guarded_malloc(sizeof(struct matching_block) * bsize);
+ if (matches.matches == NULL)
+ goto error;
+
+ backpointers = (Py_ssize_t *)guarded_malloc(sizeof(Py_ssize_t) * bsize * 4);
+ if (backpointers == NULL)
+ goto error;
+ } else {
+ matches.matches = NULL;
+ backpointers = NULL;
+ }
+
+ res = recurse_matches(&matches, &hashtable, backpointers,
+ a, b, alo, blo, ahi, bhi, maxrecursion);
+ if (!res)
+ goto error;
+
+ for (i = 0; i < matches.count; i++) {
+ for (j = 0; j < matches.matches[i].len; j++) {
+#if PY_VERSION_HEX < 0x02050000
+ item = Py_BuildValue("ii", matches.matches[i].a + j,
+ matches.matches[i].b + j);
+#else
+ item = Py_BuildValue("nn", matches.matches[i].a + j,
+ matches.matches[i].b + j);
+#endif
+ if (item == NULL)
+ goto error;
+ if (PyList_Append(answer, item) != 0)
+ goto error;
+ }
+ }
+
+ free(backpointers);
+ free(matches.matches);
+ free(hashtable.table);
+ delete_lines(b, bsize);
+ delete_lines(a, asize);
+ Py_RETURN_NONE;
+
+error:
+ free(backpointers);
+ free(matches.matches);
+ free(hashtable.table);
+ delete_lines(b, bsize);
+ delete_lines(a, asize);
+ return NULL;
+}
+
+
+static PyObject *
+PatienceSequenceMatcher_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+ PyObject *junk, *a, *b;
+ PatienceSequenceMatcher *self;
+
+ self = (PatienceSequenceMatcher *)type->tp_alloc(type, 0);
+ if (self != NULL) {
+
+ if (!PyArg_ParseTuple(args, "OOO", &junk, &a, &b)) {
+ Py_DECREF(self);
+ return NULL;
+ }
+
+ self->asize = load_lines(a, &(self->a));
+ self->bsize = load_lines(b, &(self->b));
+
+ if (self->asize == -1 || self->bsize == -1) {
+ Py_DECREF(self);
+ return NULL;
+ }
+
+ if (!equate_lines(&self->hashtable, self->a, self->b, self->asize, self->bsize)) {
+ Py_DECREF(self);
+ return NULL;
+ }
+
+ if (self->bsize > 0) {
+ self->backpointers = (Py_ssize_t *)guarded_malloc(sizeof(Py_ssize_t) * self->bsize * 4);
+ if (self->backpointers == NULL) {
+ Py_DECREF(self);
+ PyErr_NoMemory();
+ return NULL;
+ }
+ } else {
+ self->backpointers = NULL;
+ }
+
+ }
+
+ return (PyObject *)self;
+}
+
+
+static void
+PatienceSequenceMatcher_dealloc(PatienceSequenceMatcher* self)
+{
+ free(self->backpointers);
+ free(self->hashtable.table);
+ delete_lines(self->b, self->bsize);
+ delete_lines(self->a, self->asize);
+ self->ob_type->tp_free((PyObject *)self);
+}
+
+
+static char PatienceSequenceMatcher_get_matching_blocks_doc[] =
+ "Return list of triples describing matching subsequences.\n"
+ "\n"
+ "Each triple is of the form (i, j, n), and means that\n"
+ "a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in\n"
+ "i and in j.\n"
+ "\n"
+ "The last triple is a dummy, (len(a), len(b), 0), and is the only\n"
+ "triple with n==0.\n"
+ "\n"
+ ">>> s = PatienceSequenceMatcher(None, \"abxcd\", \"abcd\")\n"
+ ">>> s.get_matching_blocks()\n"
+ "[(0, 0, 2), (3, 2, 2), (5, 4, 0)]\n";
+
+static PyObject *
+PatienceSequenceMatcher_get_matching_blocks(PatienceSequenceMatcher* self)
+{
+ PyObject *answer, *item;
+ int res;
+ Py_ssize_t i;
+ struct matching_blocks matches;
+
+ matches.count = 0;
+ if (self->bsize > 0) {
+ matches.matches = (struct matching_block *)
+ guarded_malloc(sizeof(struct matching_block) * self->bsize);
+ if (matches.matches == NULL)
+ return PyErr_NoMemory();
+ } else
+ matches.matches = NULL;
+
+ res = recurse_matches(&matches, &self->hashtable, self->backpointers,
+ self->a, self->b, 0, 0,
+ self->asize, self->bsize, 10);
+ if (!res) {
+ free(matches.matches);
+ return PyErr_NoMemory();
+ }
+
+ answer = PyList_New(matches.count + 1);
+ if (answer == NULL) {
+ free(matches.matches);
+ return NULL;
+ }
+
+ for (i = 0; i < matches.count; i++) {
+#if PY_VERSION_HEX < 0x02050000
+ item = Py_BuildValue("iii", matches.matches[i].a,
+ matches.matches[i].b, matches.matches[i].len);
+#else
+ item = Py_BuildValue("nnn", matches.matches[i].a,
+ matches.matches[i].b, matches.matches[i].len);
+#endif
+ if (item == NULL)
+ goto error;
+ if (PyList_SetItem(answer, i, item) != 0)
+ goto error;
+ }
+#if PY_VERSION_HEX < 0x02050000
+ item = Py_BuildValue("iii", self->asize, self->bsize, 0);
+#else
+ item = Py_BuildValue("nnn", self->asize, self->bsize, 0);
+#endif
+ if (item == NULL)
+ goto error;
+ if (PyList_SetItem(answer, i, item) != 0)
+ goto error;
+
+ free(matches.matches);
+ return answer;
+
+error:
+ free(matches.matches);
+ Py_DECREF(answer);
+ return NULL;
+}
+
+
+static char PatienceSequenceMatcher_get_opcodes_doc[] =
+ "Return list of 5-tuples describing how to turn a into b.\n"
+ "\n"
+ "Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple\n"
+ "has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the\n"
+ "tuple preceding it, and likewise for j1 == the previous j2.\n"
+ "\n"
+ "The tags are strings, with these meanings:\n"
+ "\n"
+ "'replace': a[i1:i2] should be replaced by b[j1:j2]\n"
+ "'delete': a[i1:i2] should be deleted.\n"
+ " Note that j1==j2 in this case.\n"
+ "'insert': b[j1:j2] should be inserted at a[i1:i1].\n"
+ " Note that i1==i2 in this case.\n"
+ "'equal': a[i1:i2] == b[j1:j2]\n"
+ "\n"
+ ">>> a = \"qabxcd\"\n"
+ ">>> b = \"abycdf\"\n"
+ ">>> s = PatienceSequenceMatcher(None, a, b)\n"
+ ">>> for tag, i1, i2, j1, j2 in s.get_opcodes():\n"
+ "... print (\"%7s a[%d:%d] (%s) b[%d:%d] (%s)\" %\n"
+ "... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))\n"
+ " delete a[0:1] (q) b[0:0] ()\n"
+ " equal a[1:3] (ab) b[0:2] (ab)\n"
+ "replace a[3:4] (x) b[2:3] (y)\n"
+ " equal a[4:6] (cd) b[3:5] (cd)\n"
+ " insert a[6:6] () b[5:6] (f)\n";
+
+static PyObject *
+PatienceSequenceMatcher_get_opcodes(PatienceSequenceMatcher* self)
+{
+ PyObject *answer, *item;
+ Py_ssize_t i, j, k, ai, bj;
+ int tag, res;
+ struct matching_blocks matches;
+
+ matches.count = 0;
+ matches.matches = (struct matching_block *)guarded_malloc(sizeof(struct matching_block) * (self->bsize + 1));
+ if (matches.matches == NULL)
+ return PyErr_NoMemory();
+
+ res = recurse_matches(&matches, &self->hashtable, self->backpointers,
+ self->a, self->b, 0, 0,
+ self->asize, self->bsize, 10);
+ if (!res) {
+ free(matches.matches);
+ return PyErr_NoMemory();
+ }
+
+ matches.matches[matches.count].a = self->asize;
+ matches.matches[matches.count].b = self->bsize;
+ matches.matches[matches.count].len = 0;
+ matches.count++;
+
+ answer = PyList_New(0);
+ if (answer == NULL) {
+ free(matches.matches);
+ return NULL;
+ }
+
+ i = j = 0;
+ for (k = 0; k < matches.count; k++) {
+ ai = matches.matches[k].a;
+ bj = matches.matches[k].b;
+
+ tag = -1;
+ if (i < ai && j < bj)
+ tag = OP_REPLACE;
+ else if (i < ai)
+ tag = OP_DELETE;
+ else if (j < bj)
+ tag = OP_INSERT;
+
+ if (tag != -1) {
+#if PY_VERSION_HEX < 0x02050000
+ item = Py_BuildValue("siiii", opcode_names[tag], i, ai, j, bj);
+#else
+ item = Py_BuildValue("snnnn", opcode_names[tag], i, ai, j, bj);
+#endif
+ if (item == NULL)
+ goto error;
+ if (PyList_Append(answer, item) != 0)
+ goto error;
+ }
+
+ i = ai + matches.matches[k].len;
+ j = bj + matches.matches[k].len;
+
+ if (matches.matches[k].len > 0) {
+#if PY_VERSION_HEX < 0x02050000
+ item = Py_BuildValue("siiii", opcode_names[OP_EQUAL], ai, i, bj, j);
+#else
+ item = Py_BuildValue("snnnn", opcode_names[OP_EQUAL], ai, i, bj, j);
+#endif
+ if (item == NULL)
+ goto error;
+ if (PyList_Append(answer, item) != 0)
+ goto error;
+ }
+ }
+
+ free(matches.matches);
+ return answer;
+
+error:
+ free(matches.matches);
+ Py_DECREF(answer);
+ return NULL;
+}
+
+
+static char PatienceSequenceMatcher_get_grouped_opcodes_doc[] =
+ "Isolate change clusters by eliminating ranges with no changes.\n"
+ "\n"
+    "Return a list of groups with up to n lines of context.\n"
+ "Each group is in the same format as returned by get_opcodes().\n"
+ "\n"
+ ">>> from pprint import pprint\n"
+ ">>> a = map(str, range(1,40))\n"
+ ">>> b = a[:]\n"
+ ">>> b[8:8] = ['i'] # Make an insertion\n"
+ ">>> b[20] += 'x' # Make a replacement\n"
+ ">>> b[23:28] = [] # Make a deletion\n"
+ ">>> b[30] += 'y' # Make another replacement\n"
+ ">>> pprint(PatienceSequenceMatcher(None,a,b).get_grouped_opcodes())\n"
+ "[[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],\n"
+ " [('equal', 16, 19, 17, 20),\n"
+ " ('replace', 19, 20, 20, 21),\n"
+ " ('equal', 20, 22, 21, 23),\n"
+ " ('delete', 22, 27, 23, 23),\n"
+ " ('equal', 27, 30, 23, 26)],\n"
+ " [('equal', 31, 34, 27, 30),\n"
+ " ('replace', 34, 35, 30, 31),\n"
+ " ('equal', 35, 38, 31, 34)]]\n";
+
+static PyObject *
+PatienceSequenceMatcher_get_grouped_opcodes(PatienceSequenceMatcher* self,
+ PyObject *args)
+{
+ PyObject *answer, *group, *item;
+ Py_ssize_t i, j, k, ai, bj, size, ncodes, tag;
+ Py_ssize_t i1, i2, j1, j2;
+ int n = 3, nn, res;
+ struct matching_blocks matches;
+ struct opcode *codes;
+
+ if (!PyArg_ParseTuple(args, "|i", &n))
+ return NULL;
+
+ matches.count = 0;
+ matches.matches = (struct matching_block *)guarded_malloc(sizeof(struct matching_block) * (self->bsize + 1));
+ if (matches.matches == NULL)
+ return PyErr_NoMemory();
+
+ res = recurse_matches(&matches, &self->hashtable, self->backpointers,
+ self->a, self->b, 0, 0,
+ self->asize, self->bsize, 10);
+ if (!res) {
+ free(matches.matches);
+ return PyErr_NoMemory();
+ }
+
+ matches.matches[matches.count].a = self->asize;
+ matches.matches[matches.count].b = self->bsize;
+ matches.matches[matches.count].len = 0;
+ matches.count++;
+
+ ncodes = 0;
+ codes = (struct opcode *)guarded_malloc(sizeof(struct opcode) * matches.count * 2);
+ if (codes == NULL) {
+ free(matches.matches);
+ return PyErr_NoMemory();
+ }
+
+ i = j = 0;
+ for (k = 0; k < matches.count; k++) {
+ ai = matches.matches[k].a;
+ bj = matches.matches[k].b;
+
+ tag = -1;
+ if (i < ai && j < bj)
+ tag = OP_REPLACE;
+ else if (i < ai)
+ tag = OP_DELETE;
+ else if (j < bj)
+ tag = OP_INSERT;
+
+ if (tag != -1) {
+ codes[ncodes].tag = tag;
+ codes[ncodes].i1 = i;
+ codes[ncodes].i2 = ai;
+ codes[ncodes].j1 = j;
+ codes[ncodes].j2 = bj;
+ ncodes++;
+ }
+
+ i = ai + matches.matches[k].len;
+ j = bj + matches.matches[k].len;
+
+ if (matches.matches[k].len > 0) {
+ codes[ncodes].tag = OP_EQUAL;
+ codes[ncodes].i1 = ai;
+ codes[ncodes].i2 = i;
+ codes[ncodes].j1 = bj;
+ codes[ncodes].j2 = j;
+ ncodes++;
+ }
+ }
+
+ if (ncodes == 0) {
+ codes[ncodes].tag = OP_EQUAL;
+ codes[ncodes].i1 = 0;
+ codes[ncodes].i2 = 1;
+ codes[ncodes].j1 = 0;
+ codes[ncodes].j2 = 1;
+ ncodes++;
+ }
+
+ /* fixup leading and trailing groups if they show no changes. */
+ if (codes[0].tag == OP_EQUAL) {
+ codes[0].i1 = MAX(codes[0].i1, codes[0].i2 - n);
+ codes[0].j1 = MAX(codes[0].j1, codes[0].j2 - n);
+ }
+ if (codes[ncodes - 1].tag == OP_EQUAL) {
+ codes[ncodes - 1].i2 = MIN(codes[ncodes - 1].i2,
+ codes[ncodes - 1].i1 + n);
+ codes[ncodes - 1].j2 = MIN(codes[ncodes - 1].j2,
+ codes[ncodes - 1].j1 + n);
+ }
+
+ group = NULL;
+
+ answer = PyList_New(0);
+ if (answer == NULL)
+ goto error;
+
+ group = PyList_New(0);
+ if (group == NULL)
+ goto error;
+
+ nn = n + n;
+ tag = -1;
+ for (i = 0; i < ncodes; i++) {
+ tag = codes[i].tag;
+ i1 = codes[i].i1;
+ i2 = codes[i].i2;
+ j1 = codes[i].j1;
+ j2 = codes[i].j2;
+ /* end the current group and start a new one whenever
+ there is a large range with no changes. */
+ if (tag == OP_EQUAL && i2 - i1 > nn) {
+#if PY_VERSION_HEX < 0x02050000
+ item = Py_BuildValue("siiii", opcode_names[tag],
+ i1, MIN(i2, i1 + n), j1, MIN(j2, j1 + n));
+#else
+ item = Py_BuildValue("snnnn", opcode_names[tag],
+ i1, MIN(i2, i1 + n), j1, MIN(j2, j1 + n));
+#endif
+ if (item == NULL)
+ goto error;
+ if (PyList_Append(group, item) != 0)
+ goto error;
+ if (PyList_Append(answer, group) != 0)
+ goto error;
+ group = PyList_New(0);
+ if (group == NULL)
+ goto error;
+ i1 = MAX(i1, i2 - n);
+ j1 = MAX(j1, j2 - n);
+ }
+#if PY_VERSION_HEX < 0x02050000
+        item = Py_BuildValue("siiii", opcode_names[tag], i1, i2, j1, j2);
+#else
+        item = Py_BuildValue("snnnn", opcode_names[tag], i1, i2, j1, j2);
+#endif
+ if (item == NULL)
+ goto error;
+ if (PyList_Append(group, item) != 0)
+ goto error;
+ }
+ size = PyList_Size(group);
+ if (size > 0 && !(size == 1 && tag == OP_EQUAL)) {
+ if (PyList_Append(answer, group) != 0)
+ goto error;
+ }
+ else
+ Py_DECREF(group);
+
+ free(codes);
+ free(matches.matches);
+ return answer;
+
+error:
+ free(codes);
+ free(matches.matches);
+ Py_DECREF(group);
+ Py_DECREF(answer);
+ return NULL;
+}
+
+
+static PyMethodDef PatienceSequenceMatcher_methods[] = {
+ {"get_matching_blocks",
+ (PyCFunction)PatienceSequenceMatcher_get_matching_blocks,
+ METH_NOARGS,
+ PatienceSequenceMatcher_get_matching_blocks_doc},
+ {"get_opcodes",
+ (PyCFunction)PatienceSequenceMatcher_get_opcodes,
+ METH_NOARGS,
+ PatienceSequenceMatcher_get_opcodes_doc},
+ {"get_grouped_opcodes",
+ (PyCFunction)PatienceSequenceMatcher_get_grouped_opcodes,
+ METH_VARARGS,
+ PatienceSequenceMatcher_get_grouped_opcodes_doc},
+ {NULL}
+};
+
+
+static char PatienceSequenceMatcher_doc[] =
+ "C implementation of PatienceSequenceMatcher";
+
+
+static PyTypeObject PatienceSequenceMatcherType = {
+ PyObject_HEAD_INIT(NULL)
+ 0, /* ob_size */
+ "PatienceSequenceMatcher", /* tp_name */
+ sizeof(PatienceSequenceMatcher), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)PatienceSequenceMatcher_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT, /* tp_flags*/
+ PatienceSequenceMatcher_doc, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ PatienceSequenceMatcher_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ PatienceSequenceMatcher_new, /* tp_new */
+};
+
+
+static PyMethodDef cpatiencediff_methods[] = {
+ {"unique_lcs_c", py_unique_lcs, METH_VARARGS},
+ {"recurse_matches_c", py_recurse_matches, METH_VARARGS},
+ {NULL, NULL}
+};
+
+
+PyMODINIT_FUNC
+init_patiencediff_c(void)
+{
+ PyObject* m;
+
+ if (PyType_Ready(&PatienceSequenceMatcherType) < 0)
+ return;
+
+ m = Py_InitModule3("_patiencediff_c", cpatiencediff_methods,
+ "C implementation of PatienceSequenceMatcher");
+ if (m == NULL)
+ return;
+
+ Py_INCREF(&PatienceSequenceMatcherType);
+ PyModule_AddObject(m, "PatienceSequenceMatcher_c",
+ (PyObject *)&PatienceSequenceMatcherType);
+}
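+
+/* A minimal usage sketch from Python, assuming the extension is importable
+   as bzrlib._patiencediff_c (the exact import path depends on how the
+   extension is built); the names mirror the registrations above:
+
+       >>> from bzrlib._patiencediff_c import PatienceSequenceMatcher_c
+       >>> m = PatienceSequenceMatcher_c(None, ["a", "b"], ["a", "c"])
+       >>> m.get_matching_blocks()
+       [(0, 0, 1), (2, 2, 0)]
+*/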
+
+
+/* vim: sw=4 et
+ */