summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ito Kazumitsu <kaz@maczuka.gcd.org> 2006-01-22 02:22:21 +0000
committer: Ito Kazumitsu <kaz@maczuka.gcd.org> 2006-01-22 02:22:21 +0000
commit: d1a39538fc2f13d850d701e9298f62f2f7a505cb (patch)
tree: d4cdce49dc6b78a00524f6547f514763186ee9e0
parent: 138e5757547f8462ed3452971dfb3dcca5023a32 (diff)
download: classpath-d1a39538fc2f13d850d701e9298f62f2f7a505cb.tar.gz
2006-01-22 Ito Kazumitsu <kaz@maczuka.gcd.org>
Fixes bug #25837
* gnu/regexp/REMatch.java(empty): New boolean indicating an empty string matched.
* gnu/regexp/RE.java(match): Sets empty flag when an empty string matched.
(initialize): Support back reference \10, \11, and so on.
(parseInt): renamed from getEscapedChar and returns int.
* gnu/regexp/RETokenRepeated.java(match): Sets empty flag when an empty string matched. Fixed a bug of the case where an empty string matched. Added special handling of {0}.
* gnu/regexp/RETokenBackRef.java(match): Sets empty flag when an empty string matched. Fixed the case insensitive matching.
-rw-r--r-- ChangeLog 15
-rw-r--r-- gnu/regexp/RE.java 43
-rw-r--r-- gnu/regexp/REMatch.java 3
-rw-r--r-- gnu/regexp/RETokenBackRef.java 21
-rw-r--r-- gnu/regexp/RETokenRepeated.java 52
5 files changed, 117 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog
index f0ea7958f..41a95ae57 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2006-01-22 Ito Kazumitsu <kaz@maczuka.gcd.org>
+
+ Fixes bug #25837
+ * gnu/regexp/REMatch.java(empty): New boolean indicating
+ an empty string matched.
+ * gnu/regexp/RE.java(match): Sets empty flag when an empty
+ string matched.
+ (initialize): Support back reference \10, \11, and so on.
+ (parseInt): renamed from getEscapedChar and returns int.
+ * gnu/regexp/RETokenRepeated.java(match): Sets empty flag
+ when an empty string matched. Fixed a bug of the case where
+ an empty string matched. Added special handling of {0}.
+ * gnu/regexp/RETokenBackRef.java(match): Sets empty flag
+ when an empty string matched. Fixed the case insensitive matching.
+
2006-01-21 Roman Kennke <kennke@aicas.com>
* javax/swing/plaf/metal/MetalSplitPaneDivider.java
diff --git a/gnu/regexp/RE.java b/gnu/regexp/RE.java
index 607aa1a6f..ad2630e68 100644
--- a/gnu/regexp/RE.java
+++ b/gnu/regexp/RE.java
@@ -825,12 +825,31 @@ public class RE extends REToken {
}
// BACKREFERENCE OPERATOR
- // \1 \2 ... \9
+ // \1 \2 ... \9 and \10 \11 \12 ...
// not available if RE_NO_BK_REFS is set
+ // Perl recognizes \10, \11, and so on only if enough number of
+ // parentheses have opened before it, otherwise they are treated
+ // as aliases of \010, \011, ... (octal characters). In case of
+ // Sun's JDK, octal character expression must always begin with \0.
+ // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
+ // JDK treats \2 as a back reference to the 2nd group because
+ // there are only two groups. But in our poor implementation,
+ // we cannot help but treat \29 as a back reference to the 29th group.
else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
addToken(currentToken);
- currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
+ int numBegin = index - 1;
+ int numEnd = pLength;
+ for (int i = index; i < pLength; i++) {
+ if (! Character.isDigit(pattern[i])) {
+ numEnd = i;
+ break;
+ }
+ }
+ int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
+
+ currentToken = new RETokenBackRef(subIndex,num,insens);
+ index = numEnd;
}
// START OF STRING OPERATOR
@@ -999,12 +1018,12 @@ public class RE extends REToken {
return index;
}
- private static char getEscapedChar(char[] input, int pos, int len, int radix) {
+ private static int parseInt(char[] input, int pos, int len, int radix) {
int ret = 0;
for (int i = pos; i < pos + len; i++) {
ret = ret * radix + Character.digit(input[i], radix);
}
- return (char)ret;
+ return ret;
}
/**
@@ -1059,7 +1078,7 @@ public class RE extends REToken {
l++;
}
if (l != expectedLength) return null;
- ce.ch = getEscapedChar(input, pos + 2, l, 16);
+ ce.ch = (char)(parseInt(input, pos + 2, l, 16));
ce.len = l + 2;
}
else {
@@ -1077,7 +1096,7 @@ public class RE extends REToken {
}
if (l == 3 && input[pos + 2] > '3') l--;
if (l <= 0) return null;
- ce.ch = getEscapedChar(input, pos + 2, l, 8);
+ ce.ch = (char)(parseInt(input, pos + 2, l, 8));
ce.len = l + 2;
}
else {
@@ -1246,12 +1265,20 @@ public class RE extends REToken {
/* Implements abstract method REToken.match() */
boolean match(CharIndexed input, REMatch mymatch) {
- if (firstToken == null) return next(input, mymatch);
+ int origin = mymatch.index;
+ boolean b;
+ if (firstToken == null) {
+ b = next(input, mymatch);
+ if (b) mymatch.empty = (mymatch.index == origin);
+ return b;
+ }
// Note the start of this subexpression
mymatch.start[subIndex] = mymatch.index;
- return firstToken.match(input, mymatch);
+ b = firstToken.match(input, mymatch);
+ if (b) mymatch.empty = (mymatch.index == origin);
+ return b;
}
/**
diff --git a/gnu/regexp/REMatch.java b/gnu/regexp/REMatch.java
index cf25bb331..e06ae36cf 100644
--- a/gnu/regexp/REMatch.java
+++ b/gnu/regexp/REMatch.java
@@ -67,6 +67,7 @@ public final class REMatch implements Serializable, Cloneable {
int[] start; // start positions (relative to offset) for each (sub)exp.
int[] end; // end positions for the same
REMatch next; // other possibility (to avoid having to use arrays)
+ boolean empty; // empty string matched
public Object clone() {
try {
@@ -88,6 +89,7 @@ public final class REMatch implements Serializable, Cloneable {
index = other.index;
// need to deep clone?
next = other.next;
+ empty = other.empty;
}
REMatch(int subs, int anchor, int eflags) {
@@ -124,6 +126,7 @@ public final class REMatch implements Serializable, Cloneable {
start[i] = end[i] = -1;
}
next = null; // cut off alternates
+ empty = false;
}
/**
diff --git a/gnu/regexp/RETokenBackRef.java b/gnu/regexp/RETokenBackRef.java
index 674822abd..3414ecf97 100644
--- a/gnu/regexp/RETokenBackRef.java
+++ b/gnu/regexp/RETokenBackRef.java
@@ -51,17 +51,32 @@ final class RETokenBackRef extends REToken {
// should implement getMinimumLength() -- any ideas?
boolean match(CharIndexed input, REMatch mymatch) {
+ if (num >= mymatch.start.length) return false;
+ if (num >= mymatch.end.length) return false;
int b,e;
b = mymatch.start[num];
e = mymatch.end[num];
if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but...
+ int origin = mymatch.index;
for (int i=b; i<e; i++) {
- if (input.charAt(mymatch.index+i-b) != input.charAt(i)) {
- return false;
+ char c1 = input.charAt(mymatch.index+i-b);
+ char c2 = input.charAt(i);
+ if (c1 != c2) {
+ if (insens) {
+ if (c1 != Character.toLowerCase(c2) &&
+ c1 != Character.toUpperCase(c2)) {
+ return false;
+ }
+ }
+ else {
+ return false;
+ }
}
}
mymatch.index += e-b;
- return next(input, mymatch);
+ boolean result = next(input, mymatch);
+ if (result) mymatch.empty = (mymatch.index == origin);
+ return result;
}
void dump(StringBuffer os) {
diff --git a/gnu/regexp/RETokenRepeated.java b/gnu/regexp/RETokenRepeated.java
index 6291a3c39..167ca9991 100644
--- a/gnu/regexp/RETokenRepeated.java
+++ b/gnu/regexp/RETokenRepeated.java
@@ -45,12 +45,14 @@ final class RETokenRepeated extends REToken {
private int min,max;
private boolean stingy;
private boolean possessive;
+ private boolean alwaysEmpty; // Special case of {0}
RETokenRepeated(int subIndex, REToken token, int min, int max) {
super(subIndex);
this.token = token;
this.min = min;
this.max = max;
+ alwaysEmpty = (min == 0 && max == 0);
}
/** Sets the minimal matching mode to true. */
@@ -91,6 +93,7 @@ final class RETokenRepeated extends REToken {
// the subexpression back-reference operator allow that?
boolean match(CharIndexed input, REMatch mymatch) {
+ int origin = mymatch.index;
// number of times we've matched so far
int numRepeats = 0;
@@ -112,12 +115,17 @@ final class RETokenRepeated extends REToken {
do {
// Check for stingy match for each possibility.
- if (stingy && (numRepeats >= min)) {
+ if ((stingy && (numRepeats >= min)) || alwaysEmpty) {
REMatch result = matchRest(input, newMatch);
if (result != null) {
mymatch.assignFrom(result);
+ mymatch.empty = (mymatch.index == origin);
return true;
}
+ else {
+ // Special case of {0}. It must always match an empty string.
+ if (alwaysEmpty) return false;
+ }
}
doables = null;
@@ -153,12 +161,43 @@ final class RETokenRepeated extends REToken {
positions.addElement(newMatch);
- // doables.index == lastIndex means an empty string
- // was the longest that matched this token.
- // We break here, otherwise we will fall into an endless loop.
+ // doables.index == lastIndex occurs either
+ // (1) when an empty string was the longest
+ // that matched this token.
+ // And this case occurs either
+ // (1-1) when this token is always empty,
+ // for example "()" or "(())".
+ // (1-2) when this token is not always empty
+ // but can match an empty string, for example,
+ // "a*", "(a|)".
+ // or
+ // (2) when the same string matches this token many times.
+ // For example, "acbab" itself matches "a.*b" and
+ // its substrings "acb" and "ab" also match.
if (doables.index == lastIndex) {
- if (numRepeats < min) numRepeats = min;
- break;
+ if (doables.empty) {
+ // Case (1): We break here, otherwise we will fall
+ // into an endless loop.
+ if (numRepeats < min) numRepeats = min;
+ break;
+ }
+ else {
+ // Case (2): We cannot break here because, for example,
+ // "acbacb" matches "a.*b" up to 2 times but
+ // not 3 times. So we have to check numRepeats >= min.
+ // But we do not have to go further until numRepeats == max
+ // because the more numRepeats grows, the shorter the
+ // substring matching this token becomes.
+ if (numRepeats > min) {
+ // This means the previous match was successful,
+ // and that must be the best match. This match
+ // resulted in shortening the matched substring.
+ numRepeats--;
+ positions.remove(positions.size() - 1);
+ break;
+ }
+ if (numRepeats == min) break;
+ }
}
lastIndex = doables.index;
} while (numRepeats < max);
@@ -207,6 +246,7 @@ final class RETokenRepeated extends REToken {
}
if (allResults != null) {
mymatch.assignFrom(allResults); // does this get all?
+ mymatch.empty = (mymatch.index == origin);
return true;
}
// If we fall out, no matches.