diff options
author | Ito Kazumitsu <kaz@maczuka.gcd.org> | 2006-01-22 02:22:21 +0000 |
---|---|---|
committer | Ito Kazumitsu <kaz@maczuka.gcd.org> | 2006-01-22 02:22:21 +0000 |
commit | d1a39538fc2f13d850d701e9298f62f2f7a505cb (patch) | |
tree | d4cdce49dc6b78a00524f6547f514763186ee9e0 | |
parent | 138e5757547f8462ed3452971dfb3dcca5023a32 (diff) | |
download | classpath-d1a39538fc2f13d850d701e9298f62f2f7a505cb.tar.gz |
2006-01-22 Ito Kazumitsu <kaz@maczuka.gcd.org>
Fixes bug #25837
* gnu/regexp/REMatch.java(empty): New boolean indicating
an empty string matched.
* gnu/regexp/RE.java(match): Sets empty flag when an empty
string matched.
(initialize): Support back reference \10, \11, and so on.
(parseInt): renamed from getEscapedChar and returns int.
* gnu/regexp/RETokenRepeated.java(match): Sets empty flag
when an empty string matched. Fixed a bug of the case where
an empty string matched. Added special handling of {0}.
* gnu/regexp/RETokenBackRef.java(match): Sets empty flag
when an empty string matched. Fixed the case insensitive matching.
-rw-r--r-- | ChangeLog | 15 | ||||
-rw-r--r-- | gnu/regexp/RE.java | 43 | ||||
-rw-r--r-- | gnu/regexp/REMatch.java | 3 | ||||
-rw-r--r-- | gnu/regexp/RETokenBackRef.java | 21 | ||||
-rw-r--r-- | gnu/regexp/RETokenRepeated.java | 52 |
5 files changed, 117 insertions, 17 deletions
@@ -1,3 +1,18 @@ +2006-01-22 Ito Kazumitsu <kaz@maczuka.gcd.org> + + Fixes bug #25837 + * gnu/regexp/REMatch.java(empty): New boolean indicating + an empty string matched. + * gnu/regexp/RE.java(match): Sets empty flag when an empty + string matched. + (initialize): Support back reference \10, \11, and so on. + (parseInt): renamed from getEscapedChar and returns int. + * gnu/regexp/RETokenRepeated.java(match): Sets empty flag + when an empty string matched. Fixed a bug of the case where + an empty string matched. Added special handling of {0}. + * gnu/regexp/RETokenBackRef.java(match): Sets empty flag + when an empty string matched. Fixed the case insensitive matching. + 2006-01-21 Roman Kennke <kennke@aicas.com> * javax/swing/plaf/metal/MetalSplitPaneDivider.java diff --git a/gnu/regexp/RE.java b/gnu/regexp/RE.java index 607aa1a6f..ad2630e68 100644 --- a/gnu/regexp/RE.java +++ b/gnu/regexp/RE.java @@ -825,12 +825,31 @@ public class RE extends REToken { } // BACKREFERENCE OPERATOR - // \1 \2 ... \9 + // \1 \2 ... \9 and \10 \11 \12 ... // not available if RE_NO_BK_REFS is set + // Perl recognizes \10, \11, and so on only if enough number of + // parentheses have opened before it, otherwise they are treated + // as aliases of \010, \011, ... (octal characters). In case of + // Sun's JDK, octal character expression must always begin with \0. + // We will do as JDK does. But FIXME, take a look at "(a)(b)\29". + // JDK treats \2 as a back reference to the 2nd group because + // there are only two groups. But in our poor implementation, + // we cannot help but treat \29 as a back reference to the 29th group. else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) { addToken(currentToken); - currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens); + int numBegin = index - 1; + int numEnd = pLength; + for (int i = index; i < pLength; i++) { + if (! Character.isDigit(pattern[i])) { + numEnd = i; + break; + } + } + int num = parseInt(pattern, numBegin, numEnd-numBegin, 10); + + currentToken = new RETokenBackRef(subIndex,num,insens); + index = numEnd; } // START OF STRING OPERATOR @@ -999,12 +1018,12 @@ public class RE extends REToken { return index; } - private static char getEscapedChar(char[] input, int pos, int len, int radix) { + private static int parseInt(char[] input, int pos, int len, int radix) { int ret = 0; for (int i = pos; i < pos + len; i++) { ret = ret * radix + Character.digit(input[i], radix); } - return (char)ret; + return ret; } /** @@ -1059,7 +1078,7 @@ public class RE extends REToken { l++; } if (l != expectedLength) return null; - ce.ch = getEscapedChar(input, pos + 2, l, 16); + ce.ch = (char)(parseInt(input, pos + 2, l, 16)); ce.len = l + 2; } else { @@ -1077,7 +1096,7 @@ public class RE extends REToken { } if (l == 3 && input[pos + 2] > '3') l--; if (l <= 0) return null; - ce.ch = getEscapedChar(input, pos + 2, l, 8); + ce.ch = (char)(parseInt(input, pos + 2, l, 8)); ce.len = l + 2; } else { @@ -1246,12 +1265,20 @@ public class RE extends REToken { /* Implements abstract method REToken.match() */ boolean match(CharIndexed input, REMatch mymatch) { - if (firstToken == null) return next(input, mymatch); + int origin = mymatch.index; + boolean b; + if (firstToken == null) { + b = next(input, mymatch); + if (b) mymatch.empty = (mymatch.index == origin); + return b; + } // Note the start of this subexpression mymatch.start[subIndex] = mymatch.index; - return firstToken.match(input, mymatch); + b = firstToken.match(input, mymatch); + if (b) mymatch.empty = (mymatch.index == origin); + return b; } /** diff --git a/gnu/regexp/REMatch.java b/gnu/regexp/REMatch.java index cf25bb331..e06ae36cf 100644 --- a/gnu/regexp/REMatch.java +++ b/gnu/regexp/REMatch.java @@ -67,6 +67,7 @@ public final class REMatch implements Serializable, Cloneable { int[] start; // start positions (relative to offset) for each (sub)exp. int[] end; // end positions for the same REMatch next; // other possibility (to avoid having to use arrays) + boolean empty; // empty string matched public Object clone() { try { @@ -88,6 +89,7 @@ public final class REMatch implements Serializable, Cloneable { index = other.index; // need to deep clone? next = other.next; + empty = other.empty; } REMatch(int subs, int anchor, int eflags) { @@ -124,6 +126,7 @@ public final class REMatch implements Serializable, Cloneable { start[i] = end[i] = -1; } next = null; // cut off alternates + empty = false; } /** diff --git a/gnu/regexp/RETokenBackRef.java b/gnu/regexp/RETokenBackRef.java index 674822abd..3414ecf97 100644 --- a/gnu/regexp/RETokenBackRef.java +++ b/gnu/regexp/RETokenBackRef.java @@ -51,17 +51,32 @@ final class RETokenBackRef extends REToken { // should implement getMinimumLength() -- any ideas? boolean match(CharIndexed input, REMatch mymatch) { + if (num >= mymatch.start.length) return false; + if (num >= mymatch.end.length) return false; int b,e; b = mymatch.start[num]; e = mymatch.end[num]; if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but... + int origin = mymatch.index; for (int i=b; i<e; i++) { - if (input.charAt(mymatch.index+i-b) != input.charAt(i)) { - return false; + char c1 = input.charAt(mymatch.index+i-b); + char c2 = input.charAt(i); + if (c1 != c2) { + if (insens) { + if (c1 != Character.toLowerCase(c2) && + c1 != Character.toUpperCase(c2)) { + return false; + } + } + else { + return false; + } } } mymatch.index += e-b; - return next(input, mymatch); + boolean result = next(input, mymatch); + if (result) mymatch.empty = (mymatch.index == origin); + return result; } void dump(StringBuffer os) { diff --git a/gnu/regexp/RETokenRepeated.java b/gnu/regexp/RETokenRepeated.java index 6291a3c39..167ca9991 100644 --- a/gnu/regexp/RETokenRepeated.java +++ b/gnu/regexp/RETokenRepeated.java @@ -45,12 +45,14 @@ final class RETokenRepeated extends REToken { private int min,max; private boolean stingy; private boolean possessive; + private boolean alwaysEmpty; // Special case of {0} RETokenRepeated(int subIndex, REToken token, int min, int max) { super(subIndex); this.token = token; this.min = min; this.max = max; + alwaysEmpty = (min == 0 && max == 0); } /** Sets the minimal matching mode to true. */ @@ -91,6 +93,7 @@ final class RETokenRepeated extends REToken { // the subexpression back-reference operator allow that? boolean match(CharIndexed input, REMatch mymatch) { + int origin = mymatch.index; // number of times we've matched so far int numRepeats = 0; @@ -112,12 +115,17 @@ final class RETokenRepeated extends REToken { do { // Check for stingy match for each possibility. - if (stingy && (numRepeats >= min)) { + if ((stingy && (numRepeats >= min)) || alwaysEmpty) { REMatch result = matchRest(input, newMatch); if (result != null) { mymatch.assignFrom(result); + mymatch.empty = (mymatch.index == origin); return true; } + else { + // Special case of {0}. It must always match an empty string. + if (alwaysEmpty) return false; + } } doables = null; @@ -153,12 +161,43 @@ final class RETokenRepeated extends REToken { positions.addElement(newMatch); - // doables.index == lastIndex means an empty string - // was the longest that matched this token. - // We break here, otherwise we will fall into an endless loop. + // doables.index == lastIndex occurs either + // (1) when an empty string was the longest + // that matched this token. + // And this case occurs either + // (1-1) when this token is always empty, + // for example "()" or "(())". + // (1-2) when this token is not always empty + // but can match an empty string, for example, + // "a*", "(a|)". + // or + // (2) when the same string matches this token many times. + // For example, "acbab" itself matches "a.*b" and + // its substrings "acb" and "ab" also match. if (doables.index == lastIndex) { - if (numRepeats < min) numRepeats = min; - break; + if (doables.empty) { + // Case (1): We break here, otherwise we will fall + // into an endless loop. + if (numRepeats < min) numRepeats = min; + break; + } + else { + // Case (2): We cannot break here because, for example, + // "acbacb" matches "a.*b" up to 2 times but + // not 3 times. So we have to check numRepeats >= min. + // But we do not have to go further until numRepeats == max + // because the more numRepeats grows, the shorter the + // substring matching this token becomes. + if (numRepeats > min) { + // This means the previous match was successful, + // and that must be the best match. This match + // resulted in shortening the matched substring. + numRepeats--; + positions.remove(positions.size() - 1); + break; + } + if (numRepeats == min) break; + } } lastIndex = doables.index; } while (numRepeats < max); @@ -207,6 +246,7 @@ final class RETokenRepeated extends REToken { } if (allResults != null) { mymatch.assignFrom(allResults); // does this get all? + mymatch.empty = (mymatch.index == origin); return true; } // If we fall out, no matches. |