summaryrefslogtreecommitdiff
path: root/gnu/regexp/RE.java
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/regexp/RE.java')
-rw-r--r--gnu/regexp/RE.java436
1 files changed, 309 insertions, 127 deletions
diff --git a/gnu/regexp/RE.java b/gnu/regexp/RE.java
index bda977999..ef606a6d8 100644
--- a/gnu/regexp/RE.java
+++ b/gnu/regexp/RE.java
@@ -1,5 +1,5 @@
/* gnu/regexp/RE.java
- Copyright (C) 1998-2001, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -136,12 +136,13 @@ public class RE extends REToken {
/** Minimum length, in characters, of any possible match. */
private int minimumLength;
+ private int maximumLength;
/**
* Compilation flag. Do not differentiate case. Subsequent
* searches using this RE will be case insensitive.
*/
- public static final int REG_ICASE = 2;
+ public static final int REG_ICASE = 0x02;
/**
* Compilation flag. The match-any-character operator (dot)
@@ -149,14 +150,14 @@ public class RE extends REToken {
* bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
* the "/s" operator in Perl.
*/
- public static final int REG_DOT_NEWLINE = 4;
+ public static final int REG_DOT_NEWLINE = 0x04;
/**
* Compilation flag. Use multiline mode. In this mode, the ^ and $
* anchors will match based on newlines within the input. This is
* equivalent to the "/m" operator in Perl.
*/
- public static final int REG_MULTILINE = 8;
+ public static final int REG_MULTILINE = 0x08;
/**
* Execution flag.
@@ -185,14 +186,14 @@ public class RE extends REToken {
* // m4.toString(): "fool"<BR>
* </CODE>
*/
- public static final int REG_NOTBOL = 16;
+ public static final int REG_NOTBOL = 0x10;
/**
* Execution flag.
* The match-end operator ($) does not match at the end
* of the input string. Useful for matching on substrings.
*/
- public static final int REG_NOTEOL = 32;
+ public static final int REG_NOTEOL = 0x20;
/**
* Execution flag.
@@ -206,7 +207,7 @@ public class RE extends REToken {
* the example under REG_NOTBOL. It also affects the use of the \&lt;
* and \b operators.
*/
- public static final int REG_ANCHORINDEX = 64;
+ public static final int REG_ANCHORINDEX = 0x40;
/**
* Execution flag.
@@ -215,14 +216,24 @@ public class RE extends REToken {
* the corresponding subexpressions. For example, you may want to
* replace all matches of "one dollar" with "$1".
*/
- public static final int REG_NO_INTERPOLATE = 128;
+ public static final int REG_NO_INTERPOLATE = 0x80;
/**
* Execution flag.
* Try to match the whole input string. An implicit match-end operator
* is added to this regexp.
*/
- public static final int REG_TRY_ENTIRE_MATCH = 256;
+ public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
+
+ /**
+ * Execution flag.
+ * The substitute and substituteAll methods will treat the
+ * character '\' in the replacement as an escape to a literal
+ * character. In this case "\n", "\$", "\\", "\x40" and "\012"
+ * will become "n", "$", "\", "x40" and "012" respectively.
+ * This flag has no effect if REG_NO_INTERPOLATE is set on.
+ */
+ public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
/** Returns a string representing the version of the gnu.regexp package. */
public static final String version() {
@@ -280,12 +291,13 @@ public class RE extends REToken {
}
// internal constructor used for alternation
- private RE(REToken first, REToken last,int subs, int subIndex, int minLength) {
+ private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) {
super(subIndex);
firstToken = first;
lastToken = last;
numSubs = subs;
minimumLength = minLength;
+ maximumLength = maxLength;
addToken(new RETokenEndSub(subIndex));
}
@@ -371,8 +383,9 @@ public class RE extends REToken {
&& !syntax.get(RESyntax.RE_LIMITED_OPS)) {
// make everything up to here be a branch. create vector if nec.
addToken(currentToken);
- RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength);
+ RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength);
minimumLength = 0;
+ maximumLength = 0;
if (branches == null) {
branches = new Vector();
}
@@ -414,116 +427,12 @@ public class RE extends REToken {
// [...] | [^...]
else if ((unit.ch == '[') && !(unit.bk || quot)) {
- Vector options = new Vector();
- boolean negative = false;
- // FIXME: lastChar == 0 means lastChar is not set. But what if
- // \u0000 is used as a meaningful character?
- char lastChar = 0;
- if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
-
- // Check for initial caret, negation
- if ((ch = pattern[index]) == '^') {
- negative = true;
- if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- ch = pattern[index];
- }
-
- // Check for leading right bracket literal
- if (ch == ']') {
- lastChar = ch;
- if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- }
-
- while ((ch = pattern[index++]) != ']') {
- if ((ch == '-') && (lastChar != 0)) {
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- if ((ch = pattern[index]) == ']') {
- options.addElement(new RETokenChar(subIndex,lastChar,insens));
- lastChar = '-';
- } else {
- if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
- CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
- if (ce == null)
- throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
- ch = ce.ch;
- index = index + ce.len - 1;
- }
- options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
- lastChar = 0;
- index++;
- }
- } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- int posixID = -1;
- boolean negate = false;
- // FIXME: asciiEsc == 0 means asciiEsc is not set. But what if
- // \u0000 is used as a meaningful character?
- char asciiEsc = 0;
- NamedProperty np = null;
- if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
- switch (pattern[index]) {
- case 'D':
- negate = true;
- case 'd':
- posixID = RETokenPOSIX.DIGIT;
- break;
- case 'S':
- negate = true;
- case 's':
- posixID = RETokenPOSIX.SPACE;
- break;
- case 'W':
- negate = true;
- case 'w':
- posixID = RETokenPOSIX.ALNUM;
- break;
- }
- }
- if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
- np = getNamedProperty(pattern, index - 1, pLength);
- if (np == null)
- throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
- index = index - 1 + np.len - 1;
- }
- else {
- CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
- if (ce == null)
- throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
- asciiEsc = ce.ch;
- index = index - 1 + ce.len - 1;
- }
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
-
- if (posixID != -1) {
- options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
- } else if (np != null) {
- options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
- } else if (asciiEsc != 0) {
- lastChar = asciiEsc;
- } else {
- lastChar = pattern[index];
- }
- ++index;
- } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
- StringBuffer posixSet = new StringBuffer();
- index = getPosixSet(pattern,index+1,posixSet);
- int posixId = RETokenPOSIX.intValue(posixSet.toString());
- if (posixId != -1)
- options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
- } else {
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
- lastChar = ch;
- }
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- } // while in list
- // Out of list, index is one past ']'
-
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
-
// Create a new RETokenOneOf
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index, pLength, cflags, syntax, 0);
addToken(currentToken);
- options.trimToSize();
- currentToken = new RETokenOneOf(subIndex,options,negative);
+ currentToken = result.token;
+ index = result.index;
}
// SUBEXPRESSIONS
@@ -533,7 +442,10 @@ public class RE extends REToken {
boolean pure = false;
boolean comment = false;
boolean lookAhead = false;
+ boolean lookBehind = false;
+ boolean independent = false;
boolean negativelh = false;
+ boolean negativelb = false;
if ((index+1 < pLength) && (pattern[index] == '?')) {
switch (pattern[index+1]) {
case '!':
@@ -551,6 +463,34 @@ public class RE extends REToken {
index += 2;
}
break;
+ case '<':
+ // We assume that if the syntax supports look-ahead,
+ // it also supports look-behind.
+ if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
+ index++;
+ switch (pattern[index +1]) {
+ case '!':
+ pure = true;
+ negativelb = true;
+ lookBehind = true;
+ index += 2;
+ break;
+ case '=':
+ pure = true;
+ lookBehind = true;
+ index += 2;
+ }
+ }
+ break;
+ case '>':
+ // We assume that if the syntax supports look-ahead,
+ // it also supports independent group.
+ if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
+ pure = true;
+ independent = true;
+ index += 2;
+ }
+ break;
case 'i':
case 'd':
case 'm':
@@ -713,13 +653,20 @@ public class RE extends REToken {
numSubs++;
}
- int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs;
+ int useIndex = (pure || lookAhead || lookBehind || independent) ?
+ 0 : nextSub + numSubs;
currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
numSubs += ((RE) currentToken).getNumSubs();
if (lookAhead) {
currentToken = new RETokenLookAhead(currentToken,negativelh);
}
+ else if (lookBehind) {
+ currentToken = new RETokenLookBehind(currentToken,negativelb);
+ }
+ else if (independent) {
+ currentToken = new RETokenIndependent(currentToken);
+ }
index = nextIndex;
if (flagsSaved) {
@@ -1026,9 +973,10 @@ public class RE extends REToken {
addToken(currentToken);
if (branches != null) {
- branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength));
+ branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength));
branches.trimToSize(); // compact the Vector
minimumLength = 0;
+ maximumLength = 0;
firstToken = lastToken = null;
addToken(new RETokenOneOf(subIndex,branches,false));
}
@@ -1036,6 +984,199 @@ public class RE extends REToken {
}
+ private static class ParseCharClassResult {
+ RETokenOneOf token;
+ int index;
+ boolean returnAtAndOperator = false;
+ }
+
+ /**
+ * Parse [...] or [^...] and make an RETokenOneOf instance.
+ * @param subIndex subIndex to be given to the created RETokenOneOf instance.
+ * @param pattern Input array of characters to be parsed.
+ * @param index Index pointing to the character next to the beginning '['.
+ * @param pLength Limit of the input array.
+ * @param cflags Compilation flags used to parse the pattern.
+ * @param pflags Flags that affect the behavior of this method.
+ * @param syntax Syntax used to parse the pattern.
+ */
+ private static ParseCharClassResult parseCharClass(int subIndex,
+ char[] pattern, int index,
+ int pLength, int cflags, RESyntax syntax, int pflags)
+ throws REException {
+
+ boolean insens = ((cflags & REG_ICASE) > 0);
+ Vector options = new Vector();
+ Vector addition = new Vector();
+ boolean additionAndAppeared = false;
+ final int RETURN_AT_AND = 0x01;
+ boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
+ boolean negative = false;
+ char ch;
+
+ char lastChar = 0;
+ boolean lastCharIsSet = false;
+ if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
+
+ // Check for initial caret, negation
+ if ((ch = pattern[index]) == '^') {
+ negative = true;
+ if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ ch = pattern[index];
+ }
+
+ // Check for leading right bracket literal
+ if (ch == ']') {
+ lastChar = ch; lastCharIsSet = true;
+ if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ }
+
+ while ((ch = pattern[index++]) != ']') {
+ if ((ch == '-') && (lastCharIsSet)) {
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ if ((ch = pattern[index]) == ']') {
+ options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ lastChar = '-';
+ } else {
+ if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+ CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ ch = ce.ch;
+ index = index + ce.len - 1;
+ }
+ options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
+ lastChar = 0; lastCharIsSet = false;
+ index++;
+ }
+ } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ int posixID = -1;
+ boolean negate = false;
+ char asciiEsc = 0;
+ boolean asciiEscIsSet = false;
+ NamedProperty np = null;
+ if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
+ switch (pattern[index]) {
+ case 'D':
+ negate = true;
+ case 'd':
+ posixID = RETokenPOSIX.DIGIT;
+ break;
+ case 'S':
+ negate = true;
+ case 's':
+ posixID = RETokenPOSIX.SPACE;
+ break;
+ case 'W':
+ negate = true;
+ case 'w':
+ posixID = RETokenPOSIX.ALNUM;
+ break;
+ }
+ }
+ if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
+ np = getNamedProperty(pattern, index - 1, pLength);
+ if (np == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ index = index - 1 + np.len - 1;
+ }
+ else {
+ CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ asciiEsc = ce.ch; asciiEscIsSet = true;
+ index = index - 1 + ce.len - 1;
+ }
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+
+ if (posixID != -1) {
+ options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
+ } else if (np != null) {
+ options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
+ } else if (asciiEscIsSet) {
+ lastChar = asciiEsc; lastCharIsSet = true;
+ } else {
+ lastChar = pattern[index]; lastCharIsSet = true;
+ }
+ ++index;
+ } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
+ StringBuffer posixSet = new StringBuffer();
+ index = getPosixSet(pattern,index+1,posixSet);
+ int posixId = RETokenPOSIX.intValue(posixSet.toString());
+ if (posixId != -1)
+ options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
+ } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index, pLength, cflags, syntax, 0);
+ addition.addElement(result.token);
+ addition.addElement("|");
+ index = result.index;
+ } else if ((ch == '&') &&
+ (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
+ (index < pLength) && (pattern[index] == '&')) {
+ if (returnAtAndOperator) {
+ ParseCharClassResult result = new ParseCharClassResult();
+ options.trimToSize();
+ if (additionAndAppeared) addition.addElement("&");
+ if (addition.size() == 0) addition = null;
+ result.token = new RETokenOneOf(subIndex,
+ options, addition, negative);
+ result.index = index - 1;
+ result.returnAtAndOperator = true;
+ return result;
+ }
+ // The precedence of the operator "&&" is the lowest.
+ // So we postpone adding "&" until other elements
+ // are added. And we insert Boolean.FALSE at the
+ // beginning of the list of tokens following "&&".
+ // So, "&&[a-b][k-m]" will be stored in the Vecter
+ // addition in this order:
+ // Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
+ if (additionAndAppeared) addition.addElement("&");
+ addition.addElement(Boolean.FALSE);
+ additionAndAppeared = true;
+
+ // The part on which "&&" operates may be either
+ // (1) explicitly enclosed by []
+ // or
+ // (2) not enclosed by [] and terminated by the
+ // next "&&" or the end of the character list.
+ // Let the preceding else if block do the case (1).
+ // We must do something in case of (2).
+ if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index+1, pLength, cflags, syntax,
+ RETURN_AT_AND);
+ addition.addElement(result.token);
+ addition.addElement("|");
+ // If the method returned at the next "&&", it is OK.
+ // Otherwise we have eaten the mark of the end of this
+ // character list "]". In this case we must give back
+ // the end mark.
+ index = (result.returnAtAndOperator ?
+ result.index: result.index - 1);
+ }
+ } else {
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ lastChar = ch; lastCharIsSet = true;
+ }
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ } // while in list
+ // Out of list, index is one past ']'
+
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+
+ ParseCharClassResult result = new ParseCharClassResult();
+ // Create a new RETokenOneOf
+ options.trimToSize();
+ if (additionAndAppeared) addition.addElement("&");
+ if (addition.size() == 0) addition = null;
+ result.token = new RETokenOneOf(subIndex,options, addition, negative);
+ result.index = index;
+ return result;
+ }
+
private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
unit.ch = input[index++];
unit.bk = (unit.ch == '\\'
@@ -1072,7 +1213,7 @@ public class RE extends REToken {
public String toString() { return expr; }
}
- private CharExpression getCharExpression(char[] input, int pos, int lim,
+ private static CharExpression getCharExpression(char[] input, int pos, int lim,
RESyntax syntax) {
CharExpression ce = new CharExpression();
char c = input[pos];
@@ -1164,7 +1305,7 @@ public class RE extends REToken {
int len;
}
- private NamedProperty getNamedProperty(char[] input, int pos, int lim) {
+ private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
NamedProperty np = new NamedProperty();
char c = input[pos];
if (c == '\\') {
@@ -1297,6 +1438,10 @@ public class RE extends REToken {
return minimumLength;
}
+ public int getMaximumLength() {
+ return maximumLength;
+ }
+
/**
* Returns an array of all matches found in the input.
*
@@ -1568,8 +1713,7 @@ public class RE extends REToken {
StringBuffer buffer = new StringBuffer();
REMatch m = getMatchImpl(input,index,eflags,buffer);
if (m==null) return buffer.toString();
- buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
- replace : m.substituteInto(replace) );
+ buffer.append(getReplacement(replace, m, eflags));
if (input.move(m.end[0])) {
do {
buffer.append(input.charAt(0));
@@ -1630,8 +1774,7 @@ public class RE extends REToken {
StringBuffer buffer = new StringBuffer();
REMatch m;
while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
- buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
- replace : m.substituteInto(replace) );
+ buffer.append(getReplacement(replace, m, eflags));
index = m.getEndIndex();
if (m.end[0] == 0) {
char ch = input.charAt(0);
@@ -1646,11 +1789,50 @@ public class RE extends REToken {
}
return buffer.toString();
}
+
+ public static String getReplacement(String replace, REMatch m, int eflags) {
+ if ((eflags & REG_NO_INTERPOLATE) > 0)
+ return replace;
+ else {
+ if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) {
+ StringBuffer sb = new StringBuffer();
+ int l = replace.length();
+ for (int i = 0; i < l; i++) {
+ char c = replace.charAt(i);
+ switch(c) {
+ case '\\':
+ i++;
+ // Let StringIndexOutOfBoundsException be thrown.
+ sb.append(replace.charAt(i));
+ break;
+ case '$':
+ int i1 = i + 1;
+ while (i1 < replace.length() &&
+ Character.isDigit(replace.charAt(i1))) i1++;
+ sb.append(m.substituteInto(replace.substring(i, i1)));
+ i = i1 - 1;
+ break;
+ default:
+ sb.append(c);
+ }
+ }
+ return sb.toString();
+ }
+ else
+ return m.substituteInto(replace);
+ }
+ }
/* Helper function for constructor */
private void addToken(REToken next) {
if (next == null) return;
minimumLength += next.getMinimumLength();
+ int nmax = next.getMaximumLength();
+ if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
+ maximumLength += nmax;
+ else
+ maximumLength = Integer.MAX_VALUE;
+
if (firstToken == null) {
lastToken = firstToken = next;
} else {