diff options
author | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-08-22 14:49:51 +0100 |
---|---|---|
committer | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-08-22 14:49:51 +0100 |
commit | a498da43c7fdb9f24b73680c02a4a3588cc62d9a (patch) | |
tree | daf8119dae1749b5165b68033a1b23a7375ce9ce /mercurial/bdiff.c | |
download | mercurial-tarball-a498da43c7fdb9f24b73680c02a4a3588cc62d9a.tar.gz |
Tarball conversion
Diffstat (limited to 'mercurial/bdiff.c')
-rw-r--r-- | mercurial/bdiff.c | 483 |
1 files changed, 483 insertions, 0 deletions
diff --git a/mercurial/bdiff.c b/mercurial/bdiff.c new file mode 100644 index 0000000..be38b53 --- /dev/null +++ b/mercurial/bdiff.c @@ -0,0 +1,483 @@ +/* + bdiff.c - efficient binary diff extension for Mercurial + + Copyright 2005, 2006 Matt Mackall <mpm@selenic.com> + + This software may be used and distributed according to the terms of + the GNU General Public License, incorporated herein by reference. + + Based roughly on Python difflib +*/ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> + +#include "util.h" + +struct line { + int hash, n, e; + Py_ssize_t len; + const char *l; +}; + +struct pos { + int pos, len; +}; + +struct hunk; +struct hunk { + int a1, a2, b1, b2; + struct hunk *next; +}; + +static int splitlines(const char *a, Py_ssize_t len, struct line **lr) +{ + unsigned hash; + int i; + const char *p, *b = a; + const char * const plast = a + len - 1; + struct line *l; + + /* count the lines */ + i = 1; /* extra line for sentinel */ + for (p = a; p < a + len; p++) + if (*p == '\n' || p == plast) + i++; + + *lr = l = (struct line *)malloc(sizeof(struct line) * i); + if (!l) + return -1; + + /* build the line array and calculate hashes */ + hash = 0; + for (p = a; p < a + len; p++) { + /* Leonid Yuriev's hash */ + hash = (hash * 1664525) + (unsigned char)*p + 1013904223; + + if (*p == '\n' || p == plast) { + l->hash = hash; + hash = 0; + l->len = p - b + 1; + l->l = b; + l->n = INT_MAX; + l++; + b = p + 1; + } + } + + /* set up a sentinel */ + l->hash = 0; + l->len = 0; + l->l = a + len; + return i - 1; +} + +static inline int cmp(struct line *a, struct line *b) +{ + return a->hash != b->hash || a->len != b->len || memcmp(a->l, b->l, a->len); +} + +static int equatelines(struct line *a, int an, struct line *b, int bn) +{ + int i, j, buckets = 1, t, scale; + struct pos *h = NULL; + + /* build a hash table of the next highest power of 2 */ + while (buckets < bn + 1) + buckets *= 2; + + /* try to allocate a large hash table to avoid collisions */ + for (scale = 4; scale; scale /= 2) { + h = (struct pos *)malloc(scale * buckets * sizeof(struct pos)); + if (h) + break; + } + + if (!h) + return 0; + + buckets = buckets * scale - 1; + + /* clear the hash table */ + for (i = 0; i <= buckets; i++) { + h[i].pos = INT_MAX; + h[i].len = 0; + } + + /* add lines to the hash table chains */ + for (i = bn - 1; i >= 0; i--) { + /* find the equivalence class */ + for (j = b[i].hash & buckets; h[j].pos != INT_MAX; + j = (j + 1) & buckets) + if (!cmp(b + i, b + h[j].pos)) + break; + + /* add to the head of the equivalence class */ + b[i].n = h[j].pos; + b[i].e = j; + h[j].pos = i; + h[j].len++; /* keep track of popularity */ + } + + /* compute popularity threshold */ + t = (bn >= 31000) ? bn / 1000 : 1000000 / (bn + 1); + + /* match items in a to their equivalence class in b */ + for (i = 0; i < an; i++) { + /* find the equivalence class */ + for (j = a[i].hash & buckets; h[j].pos != INT_MAX; + j = (j + 1) & buckets) + if (!cmp(a + i, b + h[j].pos)) + break; + + a[i].e = j; /* use equivalence class for quick compare */ + if (h[j].len <= t) + a[i].n = h[j].pos; /* point to head of match list */ + else + a[i].n = INT_MAX; /* too popular */ + } + + /* discard hash tables */ + free(h); + return 1; +} + +static int longest_match(struct line *a, struct line *b, struct pos *pos, + int a1, int a2, int b1, int b2, int *omi, int *omj) +{ + int mi = a1, mj = b1, mk = 0, mb = 0, i, j, k; + + for (i = a1; i < a2; i++) { + /* skip things before the current block */ + for (j = a[i].n; j < b1; j = b[j].n) + ; + + /* loop through all lines match a[i] in b */ + for (; j < b2; j = b[j].n) { + /* does this extend an earlier match? */ + if (i > a1 && j > b1 && pos[j - 1].pos == i - 1) + k = pos[j - 1].len + 1; + else + k = 1; + pos[j].pos = i; + pos[j].len = k; + + /* best match so far? */ + if (k > mk) { + mi = i; + mj = j; + mk = k; + } + } + } + + if (mk) { + mi = mi - mk + 1; + mj = mj - mk + 1; + } + + /* expand match to include neighboring popular lines */ + while (mi - mb > a1 && mj - mb > b1 && + a[mi - mb - 1].e == b[mj - mb - 1].e) + mb++; + while (mi + mk < a2 && mj + mk < b2 && + a[mi + mk].e == b[mj + mk].e) + mk++; + + *omi = mi - mb; + *omj = mj - mb; + + return mk + mb; +} + +static struct hunk *recurse(struct line *a, struct line *b, struct pos *pos, + int a1, int a2, int b1, int b2, struct hunk *l) +{ + int i, j, k; + + while (1) { + /* find the longest match in this chunk */ + k = longest_match(a, b, pos, a1, a2, b1, b2, &i, &j); + if (!k) + return l; + + /* and recurse on the remaining chunks on either side */ + l = recurse(a, b, pos, a1, i, b1, j, l); + if (!l) + return NULL; + + l->next = (struct hunk *)malloc(sizeof(struct hunk)); + if (!l->next) + return NULL; + + l = l->next; + l->a1 = i; + l->a2 = i + k; + l->b1 = j; + l->b2 = j + k; + l->next = NULL; + + /* tail-recursion didn't happen, so do equivalent iteration */ + a1 = i + k; + b1 = j + k; + } +} + +static int diff(struct line *a, int an, struct line *b, int bn, + struct hunk *base) +{ + struct hunk *curr; + struct pos *pos; + int t, count = 0; + + /* allocate and fill arrays */ + t = equatelines(a, an, b, bn); + pos = (struct pos *)calloc(bn ? bn : 1, sizeof(struct pos)); + + if (pos && t) { + /* generate the matching block list */ + + curr = recurse(a, b, pos, 0, an, 0, bn, base); + if (!curr) + return -1; + + /* sentinel end hunk */ + curr->next = (struct hunk *)malloc(sizeof(struct hunk)); + if (!curr->next) + return -1; + curr = curr->next; + curr->a1 = curr->a2 = an; + curr->b1 = curr->b2 = bn; + curr->next = NULL; + } + + free(pos); + + /* normalize the hunk list, try to push each hunk towards the end */ + for (curr = base->next; curr; curr = curr->next) { + struct hunk *next = curr->next; + int shift = 0; + + if (!next) + break; + + if (curr->a2 == next->a1) + while (curr->a2 + shift < an && curr->b2 + shift < bn + && !cmp(a + curr->a2 + shift, + b + curr->b2 + shift)) + shift++; + else if (curr->b2 == next->b1) + while (curr->b2 + shift < bn && curr->a2 + shift < an + && !cmp(b + curr->b2 + shift, + a + curr->a2 + shift)) + shift++; + if (!shift) + continue; + curr->b2 += shift; + next->b1 += shift; + curr->a2 += shift; + next->a1 += shift; + } + + for (curr = base->next; curr; curr = curr->next) + count++; + return count; +} + +static void freehunks(struct hunk *l) +{ + struct hunk *n; + for (; l; l = n) { + n = l->next; + free(l); + } +} + +static PyObject *blocks(PyObject *self, PyObject *args) +{ + PyObject *sa, *sb, *rl = NULL, *m; + struct line *a, *b; + struct hunk l, *h; + int an, bn, count, pos = 0; + + if (!PyArg_ParseTuple(args, "SS:bdiff", &sa, &sb)) + return NULL; + + an = splitlines(PyBytes_AsString(sa), PyBytes_Size(sa), &a); + bn = splitlines(PyBytes_AsString(sb), PyBytes_Size(sb), &b); + + if (!a || !b) + goto nomem; + + l.next = NULL; + count = diff(a, an, b, bn, &l); + if (count < 0) + goto nomem; + + rl = PyList_New(count); + if (!rl) + goto nomem; + + for (h = l.next; h; h = h->next) { + m = Py_BuildValue("iiii", h->a1, h->a2, h->b1, h->b2); + PyList_SetItem(rl, pos, m); + pos++; + } + +nomem: + free(a); + free(b); + freehunks(l.next); + return rl ? rl : PyErr_NoMemory(); +} + +static PyObject *bdiff(PyObject *self, PyObject *args) +{ + char *sa, *sb, *rb; + PyObject *result = NULL; + struct line *al, *bl; + struct hunk l, *h; + int an, bn, count; + Py_ssize_t len = 0, la, lb; + PyThreadState *_save; + + if (!PyArg_ParseTuple(args, "s#s#:bdiff", &sa, &la, &sb, &lb)) + return NULL; + + _save = PyEval_SaveThread(); + an = splitlines(sa, la, &al); + bn = splitlines(sb, lb, &bl); + if (!al || !bl) + goto nomem; + + l.next = NULL; + count = diff(al, an, bl, bn, &l); + if (count < 0) + goto nomem; + + /* calculate length of output */ + la = lb = 0; + for (h = l.next; h; h = h->next) { + if (h->a1 != la || h->b1 != lb) + len += 12 + bl[h->b1].l - bl[lb].l; + la = h->a2; + lb = h->b2; + } + PyEval_RestoreThread(_save); + _save = NULL; + + result = PyBytes_FromStringAndSize(NULL, len); + + if (!result) + goto nomem; + + /* build binary patch */ + rb = PyBytes_AsString(result); + la = lb = 0; + + for (h = l.next; h; h = h->next) { + if (h->a1 != la || h->b1 != lb) { + len = bl[h->b1].l - bl[lb].l; + +#define checkputbe32(__x, __c) \ + if (__x > UINT_MAX) { \ + PyErr_SetString(PyExc_ValueError, \ + "bdiff: value too large for putbe32"); \ + goto nomem; \ + } \ + putbe32((uint32_t)(__x), __c); + + checkputbe32(al[la].l - al->l, rb); + checkputbe32(al[h->a1].l - al->l, rb + 4); + checkputbe32(len, rb + 8); + memcpy(rb + 12, bl[lb].l, len); + rb += 12 + len; + } + la = h->a2; + lb = h->b2; + } + +nomem: + if (_save) + PyEval_RestoreThread(_save); + free(al); + free(bl); + freehunks(l.next); + return result ? result : PyErr_NoMemory(); +} + +/* + * If allws != 0, remove all whitespace (' ', \t and \r). Otherwise, + * reduce whitespace sequences to a single space and trim remaining whitespace + * from end of lines. + */ +static PyObject *fixws(PyObject *self, PyObject *args) +{ + PyObject *s, *result = NULL; + char allws, c; + const char *r; + Py_ssize_t i, rlen, wlen = 0; + char *w; + + if (!PyArg_ParseTuple(args, "Sb:fixws", &s, &allws)) + return NULL; + r = PyBytes_AsString(s); + rlen = PyBytes_Size(s); + + w = (char *)malloc(rlen ? rlen : 1); + if (!w) + goto nomem; + + for (i = 0; i != rlen; i++) { + c = r[i]; + if (c == ' ' || c == '\t' || c == '\r') { + if (!allws && (wlen == 0 || w[wlen - 1] != ' ')) + w[wlen++] = ' '; + } else if (c == '\n' && !allws + && wlen > 0 && w[wlen - 1] == ' ') { + w[wlen - 1] = '\n'; + } else { + w[wlen++] = c; + } + } + + result = PyBytes_FromStringAndSize(w, wlen); + +nomem: + free(w); + return result ? result : PyErr_NoMemory(); +} + + +static char mdiff_doc[] = "Efficient binary diff."; + +static PyMethodDef methods[] = { + {"bdiff", bdiff, METH_VARARGS, "calculate a binary diff\n"}, + {"blocks", blocks, METH_VARARGS, "find a list of matching lines\n"}, + {"fixws", fixws, METH_VARARGS, "normalize diff whitespaces\n"}, + {NULL, NULL} +}; + +#ifdef IS_PY3K +static struct PyModuleDef bdiff_module = { + PyModuleDef_HEAD_INIT, + "bdiff", + mdiff_doc, + -1, + methods +}; + +PyMODINIT_FUNC PyInit_bdiff(void) +{ + return PyModule_Create(&bdiff_module); +} +#else +PyMODINIT_FUNC initbdiff(void) +{ + Py_InitModule3("bdiff", methods, mdiff_doc); +} +#endif + |