/* Find a best match between two vectors. Copyright (C) 2009, 2010 Free Software Foundation, Inc. Written by Andreas Gruenbacher , 2009. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ /* Before including this file, you need to define: EQUAL_IDX(x, y) A two-argument macro that tests elements at index x and y for equality. OFFSET A signed integer type sufficient to hold the difference between two indices. Usually something like ssize_t. */ /* * Shortest Edit Sequence * * Based on the Greedy LCS/SES Algorithm (Figure 2) in: * * Eugene W. Myers, "An O(ND) Difference Algorithm and Its Variations", * Algorithmica, Vol. 1, No. 1, pp. 251-266, March 1986. * Available: http://dx.doi.org/10.1007/BF01840446 * http://xmailserver.org/diff2.pdf * * Returns the number of changes (insertions and deletions) required to get * from a[] to b[]. Returns MAX + 1 if a[] cannot be turned into b[] with * MAX or fewer changes. * * MIN specifies the minimum number of elements in which a[] and b[] must * match. This allows to prevent trivial matches in which a sequence is * completely discarded, or completely made up. * * If PY is not NULL, matches a[] against a prefix of b[], and returns the * number of elements in b[] that were matched in *PY. Otherwise, matches * all elements of b[]. * * Note that the divide-and-conquer strategy discussed in section 4b of the * paper is more efficient, but does not allow an open-ended prefix string * search. */ OFFSET bestmatch(OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, OFFSET min, OFFSET max, OFFSET *py) { const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */ const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */ const OFFSET fmid = xoff - yoff; /* Center diagonal. */ OFFSET fmin = fmid; OFFSET fmax = fmid; OFFSET *V, *fd; OFFSET fmid_plus_2_min, ymax = -1; OFFSET c; V = malloc ((2 * max + 3) * sizeof (OFFSET)); fd = V + max + 1 - fmid; /* The number of elements that were matched in x and in y can be computed as either (x - x_skipped) or (y - y_skipped), with: delta = (x - xoff) - (y - yoff) x_skipped = (c + delta) / 2 y_skipped = (c - delta) / 2 For searching for a minimum number of matching elements, we end up with this check: (x - x_skipped) >= min ... x + y - c >= (xoff - yoff) + 2 * min x + y - c >= fmid + 2 * min */ if (min) { fmid_plus_2_min = fmid + 2 * min; min += yoff; if (min > ylim) return max + 1; } else fmid_plus_2_min = 0; /* disable this check */ if (!py) min = ylim; /* Handle the exact-match case. */ while (xoff < xlim && yoff < ylim && EQUAL_IDX (xoff, yoff)) { xoff++; yoff++; } if (xoff == xlim && yoff >= min && xoff + yoff >= fmid_plus_2_min) { ymax = yoff; c = 0; } else { fd[fmid] = xoff; for (c = 1; c <= max; c++) { OFFSET d; if (fmin > dmin) fd[--fmin - 1] = -1; else ++fmin; if (fmax < dmax) fd[++fmax + 1] = -1; else --fmax; for (d = fmax; d >= fmin; d -= 2) { OFFSET x, y; if (fd[d - 1] < fd[d + 1]) x = fd[d + 1]; else x = fd[d - 1] + 1; for (y = x - d; x < xlim && y < ylim && EQUAL_IDX (x, y); x++, y++) /* do nothing */ ; fd[d] = x; if (x == xlim && y >= min && x + y - c >= fmid_plus_2_min) { if (ymax < y) ymax = y; if (y == ylim) goto done; } } if (ymax != -1) goto done; } } done: if (py) *py = ymax; free (V); return c; } #undef OFFSET #undef EQUAL_IDX