summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-05-17 17:28:51 -0600
committerKarl Williamson <public@khwilliamson.com>2011-05-19 12:07:09 -0600
commitc4a302578b4cf06e68534ed5fff0a27b6584d38b (patch)
tree4c76af195016f61f166916ca0d65ce15c78eb18c /regcomp.c
parentbac5f0aea9d556b93f0da5cd585da691894e585a (diff)
downloadperl-c4a302578b4cf06e68534ed5fff0a27b6584d38b.tar.gz
regcomp.c: Fix bug in inversion list intersection
This code was derived from published code, which says use at your own risk. And in fact had bugs. I don't believe these show up in 5.14, as I think you have to have a list that has been inverted for this to happen. The comments describe what should have been done.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c56
1 files changed, 35 insertions, 21 deletions
diff --git a/regcomp.c b/regcomp.c
index 8518750b46..a1183d5707 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6256,7 +6256,8 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b)
/* Return the intersection of two inversion lists. The basis for this
* comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
* by Addison-Wesley, and explained at some length there. The preface says
- * to incorporate its examples into your code at your own risk.
+ * to incorporate its examples into your code at your own risk. In fact,
+ * it had bugs
*
* The algorithm is like a merge sort, and is essentially the same as the
* union above
@@ -6297,17 +6298,17 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b)
array */
bool cp_in_set; /* Is it in the input list's set or not */
- /* We need to take one or the other of the two inputs for the union.
- * Since we are merging two sorted lists, we take the smaller of the
- * next items. In case of a tie, we take the one that is not in its
- * set first (a difference from the union algorithm). If we took one
- * in the set first, it would increment the count, possibly to 2 which
- * would cause it to be output as starting a range in the intersection,
- * and the next time through we would take that same number, and output
- * it again as ending the set. By doing it the opposite of this, we
- * there is no possibility that the count will be momentarily
- * incremented to 2. (In a tie and both are in the set or both not in
- * the set, it doesn't matter which we take first.) */
+ /* We need to take one or the other of the two inputs for the
+ * intersection. Since we are merging two sorted lists, we take the
+ * smaller of the next items. In case of a tie, we take the one that
+ * is not in its set first (a difference from the union algorithm). If
+ * we took one in the set first, it would increment the count, possibly
+ * to 2 which would cause it to be output as starting a range in the
+ * intersection, and the next time through we would take that same
+ * number, and output it again as ending the set. By doing it the
+ * opposite of this, there is no possibility that the count will be
+ * momentarily incremented to 2. (In a tie and both are in the set or
+ * both not in the set, it doesn't matter which we take first.) */
if (array_a[i_a] < array_b[i_b]
|| (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
{
@@ -6336,19 +6337,32 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b)
}
}
- /* Here, we are finished going through at least one of the sets, which
- * means there is something remaining in at most one. See the comments in
- * the union code */
- if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
- || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
+ /* Here, we are finished going through at least one of the lists, which
+ * means there is something remaining in at most one. We check if the list
+ * that has been exhausted is positioned such that we are in the middle
+ * of a range in its set or not. (i_a and i_b point to elements 1 beyond
+ * the ones we care about.) There are four cases:
+ * 1) Both weren't in their sets, count is 0, and remains 0. There's
+ * nothing left in the intersection.
+ * 2) Both were in their sets, count is 2 and perhaps is incremented to
+ * above 2. What should be output is exactly that which is in the
+ * non-exhausted set, as everything it has is also in the intersection
+ * set, and everything it doesn't have can't be in the intersection
+ * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
+ * gets incremented to 2. Like the previous case, the intersection is
+ * everything that remains in the non-exhausted set.
+ * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
+ * remains 1. And the intersection has nothing more. */
+ if ((i_a == len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
+ || (i_b == len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
{
- count--;
+ count++;
}
/* The final length is what we've output so far plus what else is in the
- * intersection. Only one of the subexpressions below will be non-zero */
+ * intersection. At most one of the subexpressions below will be non-zero */
len_r = i_r;
- if (count == 2) {
+ if (count >= 2) {
len_r += (len_a - i_a) + (len_b - i_b);
}
@@ -6361,7 +6375,7 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b)
}
/* Finish outputting any remaining */
- if (count == 2) { /* Only one of will have a non-zero copy count */
+ if (count >= 2) { /* At most one will have a non-zero copy count */
IV copy_count;
if ((copy_count = len_a - i_a) > 0) {
Copy(array_a + i_a, array_r + i_r, copy_count, UV);