diff options
Diffstat (limited to 'libjava/classpath/gnu/java/util/regex/RE.java')
-rw-r--r-- | libjava/classpath/gnu/java/util/regex/RE.java | 3022 |
1 files changed, 1511 insertions, 1511 deletions
diff --git a/libjava/classpath/gnu/java/util/regex/RE.java b/libjava/classpath/gnu/java/util/regex/RE.java index d064f7a3579..5e9974a4928 100644 --- a/libjava/classpath/gnu/java/util/regex/RE.java +++ b/libjava/classpath/gnu/java/util/regex/RE.java @@ -53,7 +53,7 @@ import java.util.ResourceBundle; * expressions. * <P> * A regular expression object (class RE) is compiled by constructing it - * from a String, StringBuffer or character array, with optional + * from a String, StringBuffer or character array, with optional * compilation flags (below) * and an optional syntax specification (see RESyntax; if not specified, * <code>RESyntax.RE_SYNTAX_PERL5</code> is used). @@ -110,7 +110,7 @@ import java.util.ResourceBundle; * <P> * You can optionally affect the execution environment by using a * combination of execution flags (constants listed below). - * + * * <P> * All operations on a regular expression are performed in a * thread-safe manner. @@ -281,7 +281,7 @@ public class RE extends REToken { if (messages == null) messages = - PropertyResourceBundle.getBundle (bundle, Locale.getDefault ()); + PropertyResourceBundle.getBundle (bundle, Locale.getDefault ()); return messages.getString (key); } @@ -335,7 +335,7 @@ public class RE extends REToken // internal constructor used for alternation private RE (REToken first, REToken last, int subs, int subIndex, - int minLength, int maxLength) + int minLength, int maxLength) { super (subIndex); firstToken = first; @@ -347,9 +347,9 @@ public class RE extends REToken } private RE (Object patternObj, int cflags, RESyntax syntax, int myIndex, - int nextSub) throws REException + int nextSub) throws REException { - super (myIndex); // Subexpression index of this token. + super (myIndex); // Subexpression index of this token. initialize (patternObj, cflags, syntax, myIndex, nextSub); } @@ -361,41 +361,41 @@ public class RE extends REToken // The meat of construction protected void initialize (Object patternObj, int cflags, RESyntax syntax, - int myIndex, int nextSub) throws REException + int myIndex, int nextSub) throws REException { char[] pattern; if (patternObj instanceof String) { - pattern = ((String) patternObj).toCharArray (); + pattern = ((String) patternObj).toCharArray (); } else if (patternObj instanceof char[]) { - pattern = (char[]) patternObj; + pattern = (char[]) patternObj; } else if (patternObj instanceof StringBuffer) { - pattern = new char[((StringBuffer) patternObj).length ()]; - ((StringBuffer) patternObj).getChars (0, pattern.length, pattern, 0); + pattern = new char[((StringBuffer) patternObj).length ()]; + ((StringBuffer) patternObj).getChars (0, pattern.length, pattern, 0); } else if (patternObj instanceof StringBuilder) { - pattern = new char[((StringBuilder) patternObj).length ()]; - ((StringBuilder) patternObj).getChars (0, pattern.length, pattern, 0); + pattern = new char[((StringBuilder) patternObj).length ()]; + ((StringBuilder) patternObj).getChars (0, pattern.length, pattern, 0); } else if (patternObj instanceof CPStringBuilder) { - pattern = new char[((CPStringBuilder) patternObj).length ()]; - ((CPStringBuilder) patternObj).getChars (0, pattern.length, pattern, - 0); + pattern = new char[((CPStringBuilder) patternObj).length ()]; + ((CPStringBuilder) patternObj).getChars (0, pattern.length, pattern, + 0); } else { - pattern = patternObj.toString ().toCharArray (); + pattern = patternObj.toString ().toCharArray (); } int pLength = pattern.length; - numSubs = 0; // Number of subexpressions in this token. + numSubs = 0; // Number of subexpressions in this token. ArrayList < REToken > branches = null; // linked list of tokens (sort of -- some closed loops can exist) @@ -429,956 +429,956 @@ public class RE extends REToken while (index < pLength) { - // read the next character unit (including backslash escapes) - index = getCharUnit (pattern, index, unit, quot); - - if (unit.bk) - if (unit.ch == 'Q') - { - quot = true; - continue; - } - else if (unit.ch == 'E') - { - quot = false; - continue; - } - if (quot) - unit.bk = false; - - if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot)) - { - if (Character.isWhitespace (unit.ch)) - { - continue; - } - if (unit.ch == '#') - { - for (int i = index; i < pLength; i++) - { - if (pattern[i] == '\n') - { - index = i + 1; - continue; - } - else if (pattern[i] == '\r') - { - if (i + 1 < pLength && pattern[i + 1] == '\n') - { - index = i + 2; - } - else - { - index = i + 1; - } - continue; - } - } - index = pLength; - continue; - } - } - - // ALTERNATION OPERATOR - // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT) - // not available if RE_LIMITED_OPS is set - - // TODO: the '\n' literal here should be a test against REToken.newline, - // which unfortunately may be more than a single character. - if (((unit.ch == '|' - && (syntax.get (RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot))) - || (syntax.get (RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') - && !(unit.bk || quot))) - && !syntax.get (RESyntax.RE_LIMITED_OPS)) - { - // make everything up to here be a branch. create vector if nec. - addToken (currentToken); - RE theBranch = - new RE (firstToken, lastToken, numSubs, subIndex, minimumLength, - maximumLength); - minimumLength = 0; - maximumLength = 0; - if (branches == null) - { - branches = new ArrayList < REToken > (); - } - branches.add (theBranch); - firstToken = lastToken = currentToken = null; - } - - // INTERVAL OPERATOR: - // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES) - // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES) - // - // OPEN QUESTION: - // what is proper interpretation of '{' at start of string? - // - // This method used to check "repeat.empty.token" to avoid such regexp - // as "(a*){2,}", but now "repeat.empty.token" is allowed. - - else if ((unit.ch == '{') && syntax.get (RESyntax.RE_INTERVALS) - && (syntax. - get (RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) - { - int newIndex = getMinMax (pattern, index, minMax, syntax); - if (newIndex > index) - { - if (minMax.first > minMax.second) - throw new - REException (getLocalizedMessage ("interval.order"), - REException.REG_BADRPT, newIndex); - if (currentToken == null) - throw new - REException (getLocalizedMessage ("repeat.no.token"), - REException.REG_BADRPT, newIndex); - if (currentToken instanceof RETokenRepeated) - throw new - REException (getLocalizedMessage ("repeat.chained"), - REException.REG_BADRPT, newIndex); - if (currentToken instanceof RETokenWordBoundary - || currentToken instanceof RETokenWordBoundary) - throw new - REException (getLocalizedMessage ("repeat.assertion"), - REException.REG_BADRPT, newIndex); - index = newIndex; - currentToken = - setRepeated (currentToken, minMax.first, minMax.second, - index); - } - else - { - addToken (currentToken); - currentToken = new RETokenChar (subIndex, unit.ch, insens); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - } - - // LIST OPERATOR: - // [...] | [^...] - - else if ((unit.ch == '[') && !(unit.bk || quot)) - { - // Create a new RETokenOneOf - ParseCharClassResult result = - parseCharClass (subIndex, pattern, index, pLength, cflags, - syntax, 0); - addToken (currentToken); - currentToken = result.token; - index = result.index; - } - - // SUBEXPRESSIONS - // (...) | \(...\) depending on RE_NO_BK_PARENS - - else if ((unit.ch == '(') - && (syntax. - get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) - { - boolean pure = false; - boolean comment = false; - boolean lookAhead = false; - boolean lookBehind = false; - boolean independent = false; - boolean negativelh = false; - boolean negativelb = false; - if ((index + 1 < pLength) && (pattern[index] == '?')) - { - switch (pattern[index + 1]) - { - case '!': - if (syntax.get (RESyntax.RE_LOOKAHEAD)) - { - pure = true; - negativelh = true; - lookAhead = true; - index += 2; - } - break; - case '=': - if (syntax.get (RESyntax.RE_LOOKAHEAD)) - { - pure = true; - lookAhead = true; - index += 2; - } - break; - case '<': - // We assume that if the syntax supports look-ahead, - // it also supports look-behind. - if (syntax.get (RESyntax.RE_LOOKAHEAD)) - { - index++; - switch (pattern[index + 1]) - { - case '!': - pure = true; - negativelb = true; - lookBehind = true; - index += 2; - break; - case '=': - pure = true; - lookBehind = true; - index += 2; - } - } - break; - case '>': - // We assume that if the syntax supports look-ahead, - // it also supports independent group. - if (syntax.get (RESyntax.RE_LOOKAHEAD)) - { - pure = true; - independent = true; - index += 2; - } - break; - case 'i': - case 'd': - case 'm': - case 's': - case 'u': - case 'x': - case '-': - if (!syntax.get (RESyntax.RE_EMBEDDED_FLAGS)) - break; - // Set or reset syntax flags. - int flagIndex = index + 1; - int endFlag = -1; - RESyntax newSyntax = new RESyntax (syntax); - int newCflags = cflags; - boolean negate = false; - while (flagIndex < pLength && endFlag < 0) - { - switch (pattern[flagIndex]) - { - case 'i': - if (negate) - newCflags &= ~REG_ICASE; - else - newCflags |= REG_ICASE; - flagIndex++; - break; - case 'd': - if (negate) - newSyntax.setLineSeparator (RESyntax. - DEFAULT_LINE_SEPARATOR); - else - newSyntax.setLineSeparator ("\n"); - flagIndex++; - break; - case 'm': - if (negate) - newCflags &= ~REG_MULTILINE; - else - newCflags |= REG_MULTILINE; - flagIndex++; - break; - case 's': - if (negate) - newCflags &= ~REG_DOT_NEWLINE; - else - newCflags |= REG_DOT_NEWLINE; - flagIndex++; - break; - case 'u': - if (negate) - newCflags |= REG_ICASE_USASCII; - else - newCflags &= ~REG_ICASE_USASCII; - flagIndex++; - break; - case 'x': - if (negate) - newCflags &= ~REG_X_COMMENTS; - else - newCflags |= REG_X_COMMENTS; - flagIndex++; - break; - case '-': - negate = true; - flagIndex++; - break; - case ':': - case ')': - endFlag = pattern[flagIndex]; - break; - default: - throw new - REException (getLocalizedMessage - ("repeat.no.token"), - REException.REG_BADRPT, index); - } - } - if (endFlag == ')') - { - syntax = newSyntax; - cflags = newCflags; - insens = ((cflags & REG_ICASE) > 0); - insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); - // This can be treated as though it were a comment. - comment = true; - index = flagIndex - 1; - break; - } - if (endFlag == ':') - { - savedSyntax = syntax; - savedCflags = cflags; - flagsSaved = true; - syntax = newSyntax; - cflags = newCflags; - insens = ((cflags & REG_ICASE) > 0); - insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); - index = flagIndex - 1; - // Fall through to the next case. - } - else - { - throw new - REException (getLocalizedMessage - ("unmatched.paren"), - REException.REG_ESUBREG, index); - } - case ':': - if (syntax.get (RESyntax.RE_PURE_GROUPING)) - { - pure = true; - index += 2; - } - break; - case '#': - if (syntax.get (RESyntax.RE_COMMENTS)) - { - comment = true; - } - break; - default: - throw new - REException (getLocalizedMessage ("repeat.no.token"), - REException.REG_BADRPT, index); - } - } - - if (index >= pLength) - { - throw new - REException (getLocalizedMessage ("unmatched.paren"), - REException.REG_ESUBREG, index); - } - - // find end of subexpression - int endIndex = index; - int nextIndex = index; - int nested = 0; - - while (((nextIndex = - getCharUnit (pattern, endIndex, unit, false)) > 0) - && !(nested == 0 && (unit.ch == ')') - && (syntax. - get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk - || quot)))) - { - if ((endIndex = nextIndex) >= pLength) - throw new - REException (getLocalizedMessage ("subexpr.no.end"), - REException.REG_ESUBREG, nextIndex); - else - if ((unit.ch == '[') && !(unit.bk || quot)) - { - // I hate to do something similar to the LIST OPERATOR matters - // above, but ... - int listIndex = nextIndex; - if (listIndex < pLength && pattern[listIndex] == '^') - listIndex++; - if (listIndex < pLength && pattern[listIndex] == ']') - listIndex++; - int listEndIndex = -1; - int listNest = 0; - while (listIndex < pLength && listEndIndex < 0) - { - switch (pattern[listIndex++]) - { - case '\\': - listIndex++; - break; - case '[': - // Sun's API document says that regexp like "[a-d[m-p]]" - // is legal. Even something like "[[[^]]]]" is accepted. - listNest++; - if (listIndex < pLength - && pattern[listIndex] == '^') - listIndex++; - if (listIndex < pLength - && pattern[listIndex] == ']') - listIndex++; - break; - case ']': - if (listNest == 0) - listEndIndex = listIndex; - listNest--; - break; - } - } - if (listEndIndex >= 0) - { - nextIndex = listEndIndex; - if ((endIndex = nextIndex) >= pLength) - throw new - REException (getLocalizedMessage ("subexpr.no.end"), - REException.REG_ESUBREG, nextIndex); - else - continue; - } - throw new - REException (getLocalizedMessage ("subexpr.no.end"), - REException.REG_ESUBREG, nextIndex); - } - else if (unit.ch == '(' - && (syntax. - get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk - || quot))) - nested++; - else if (unit.ch == ')' - && (syntax. - get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk - || quot))) - nested--; - } - - // endIndex is now position at a ')','\)' - // nextIndex is end of string or position after ')' or '\)' - - if (comment) - index = nextIndex; - else - { // not a comment - // create RE subexpression as token. - addToken (currentToken); - if (!pure) - { - numSubs++; - } - - int useIndex = (pure || lookAhead || lookBehind - || independent) ? 0 : nextSub + numSubs; - currentToken = - new RE (String.valueOf (pattern, index, endIndex - index). - toCharArray (), cflags, syntax, useIndex, - nextSub + numSubs); - numSubs += ((RE) currentToken).getNumSubs (); - - if (lookAhead) - { - currentToken = - new RETokenLookAhead (currentToken, negativelh); - } - else if (lookBehind) - { - currentToken = - new RETokenLookBehind (currentToken, negativelb); - } - else if (independent) - { - currentToken = new RETokenIndependent (currentToken); - } - - index = nextIndex; - if (flagsSaved) - { - syntax = savedSyntax; - cflags = savedCflags; - insens = ((cflags & REG_ICASE) > 0); - insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); - flagsSaved = false; - } - } // not a comment - } // subexpression - - // UNMATCHED RIGHT PAREN - // ) or \) throw exception if - // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) - else if (!syntax.get (RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) - && ((unit.ch == ')') - && (syntax. - get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) - { - throw new REException (getLocalizedMessage ("unmatched.paren"), - REException.REG_EPAREN, index); - } - - // START OF LINE OPERATOR - // ^ - - else if ((unit.ch == '^') && !(unit.bk || quot)) - { - addToken (currentToken); - currentToken = null; - RETokenStart token = null; - if ((cflags & REG_MULTILINE) > 0) - { - String sep = syntax.getLineSeparator (); - if (sep == null) - { - token = new RETokenStart (subIndex, null, true); - } - else - { - token = new RETokenStart (subIndex, sep); - } - } - else - { - token = new RETokenStart (subIndex, null); - } - addToken (token); - } - - // END OF LINE OPERATOR - // $ - - else if ((unit.ch == '$') && !(unit.bk || quot)) - { - addToken (currentToken); - currentToken = null; - RETokenEnd token = null; - if ((cflags & REG_MULTILINE) > 0) - { - String sep = syntax.getLineSeparator (); - if (sep == null) - { - token = new RETokenEnd (subIndex, null, true); - } - else - { - token = new RETokenEnd (subIndex, sep); - } - } - else - { - token = new RETokenEnd (subIndex, null); - } - addToken (token); - } - - // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null) - // . - - else if ((unit.ch == '.') && !(unit.bk || quot)) - { - addToken (currentToken); - currentToken = - new RETokenAny (subIndex, syntax.get (RESyntax.RE_DOT_NEWLINE) - || ((cflags & REG_DOT_NEWLINE) > 0), - syntax.get (RESyntax.RE_DOT_NOT_NULL)); - } - - // ZERO-OR-MORE REPEAT OPERATOR - // * - // - // This method used to check "repeat.empty.token" to avoid such regexp - // as "(a*)*", but now "repeat.empty.token" is allowed. - - else if ((unit.ch == '*') && !(unit.bk || quot)) - { - if (currentToken == null) - throw new REException (getLocalizedMessage ("repeat.no.token"), - REException.REG_BADRPT, index); - if (currentToken instanceof RETokenRepeated) - throw new REException (getLocalizedMessage ("repeat.chained"), - REException.REG_BADRPT, index); - if (currentToken instanceof RETokenWordBoundary - || currentToken instanceof RETokenWordBoundary) - throw new REException (getLocalizedMessage ("repeat.assertion"), - REException.REG_BADRPT, index); - currentToken = - setRepeated (currentToken, 0, Integer.MAX_VALUE, index); - } - - // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR - // + | \+ depending on RE_BK_PLUS_QM - // not available if RE_LIMITED_OPS is set - // - // This method used to check "repeat.empty.token" to avoid such regexp - // as "(a*)+", but now "repeat.empty.token" is allowed. - - else if ((unit.ch == '+') && !syntax.get (RESyntax.RE_LIMITED_OPS) - && (!syntax. - get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) - { - if (currentToken == null) - throw new REException (getLocalizedMessage ("repeat.no.token"), - REException.REG_BADRPT, index); - - // Check for possessive matching on RETokenRepeated - if (currentToken instanceof RETokenRepeated) - { - RETokenRepeated tokenRep = (RETokenRepeated) currentToken; - if (syntax.get (RESyntax.RE_POSSESSIVE_OPS) - && !tokenRep.isPossessive () && !tokenRep.isStingy ()) - tokenRep.makePossessive (); - else - throw new - REException (getLocalizedMessage ("repeat.chained"), - REException.REG_BADRPT, index); - - } - else if (currentToken instanceof RETokenWordBoundary - || currentToken instanceof RETokenWordBoundary) - throw new REException (getLocalizedMessage ("repeat.assertion"), - REException.REG_BADRPT, index); - else - currentToken = - setRepeated (currentToken, 1, Integer.MAX_VALUE, index); - } - - // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR - // ? | \? depending on RE_BK_PLUS_QM - // not available if RE_LIMITED_OPS is set - // stingy matching if RE_STINGY_OPS is set and it follows a quantifier - - else if ((unit.ch == '?') && !syntax.get (RESyntax.RE_LIMITED_OPS) - && (!syntax. - get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) - { - if (currentToken == null) - throw new REException (getLocalizedMessage ("repeat.no.token"), - REException.REG_BADRPT, index); - - // Check for stingy matching on RETokenRepeated - if (currentToken instanceof RETokenRepeated) - { - RETokenRepeated tokenRep = (RETokenRepeated) currentToken; - if (syntax.get (RESyntax.RE_STINGY_OPS) - && !tokenRep.isStingy () && !tokenRep.isPossessive ()) - tokenRep.makeStingy (); - else - throw new - REException (getLocalizedMessage ("repeat.chained"), - REException.REG_BADRPT, index); - } - else if (currentToken instanceof RETokenWordBoundary - || currentToken instanceof RETokenWordBoundary) - throw new REException (getLocalizedMessage ("repeat.assertion"), - REException.REG_BADRPT, index); - else - currentToken = setRepeated (currentToken, 0, 1, index); - } - - // OCTAL CHARACTER - // \0377 - - else if (unit.bk && (unit.ch == '0') - && syntax.get (RESyntax.RE_OCTAL_CHAR)) - { - CharExpression ce = - getCharExpression (pattern, index - 2, pLength, syntax); - if (ce == null) - throw new REException ("invalid octal character", - REException.REG_ESCAPE, index); - index = index - 2 + ce.len; - addToken (currentToken); - currentToken = new RETokenChar (subIndex, ce.ch, insens); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - - // BACKREFERENCE OPERATOR - // \1 \2 ... \9 and \10 \11 \12 ... - // not available if RE_NO_BK_REFS is set - // Perl recognizes \10, \11, and so on only if enough number of - // parentheses have opened before it, otherwise they are treated - // as aliases of \010, \011, ... (octal characters). In case of - // Sun's JDK, octal character expression must always begin with \0. - // We will do as JDK does. But FIXME, take a look at "(a)(b)\29". - // JDK treats \2 as a back reference to the 2nd group because - // there are only two groups. But in our poor implementation, - // we cannot help but treat \29 as a back reference to the 29th group. - - else if (unit.bk && Character.isDigit (unit.ch) - && !syntax.get (RESyntax.RE_NO_BK_REFS)) - { - addToken (currentToken); - int numBegin = index - 1; - int numEnd = pLength; - for (int i = index; i < pLength; i++) - { - if (!Character.isDigit (pattern[i])) - { - numEnd = i; - break; - } - } - int num = parseInt (pattern, numBegin, numEnd - numBegin, 10); - - currentToken = new RETokenBackRef (subIndex, num, insens); - if (insensUSASCII) - currentToken.unicodeAware = false; - index = numEnd; - } - - // START OF STRING OPERATOR - // \A if RE_STRING_ANCHORS is set - - else if (unit.bk && (unit.ch == 'A') - && syntax.get (RESyntax.RE_STRING_ANCHORS)) - { - addToken (currentToken); - currentToken = new RETokenStart (subIndex, null); - } - - // WORD BREAK OPERATOR - // \b if ???? - - else if (unit.bk && (unit.ch == 'b') - && syntax.get (RESyntax.RE_STRING_ANCHORS)) - { - addToken (currentToken); - currentToken = - new RETokenWordBoundary (subIndex, - RETokenWordBoundary. - BEGIN | RETokenWordBoundary.END, - false); - } - - // WORD BEGIN OPERATOR - // \< if ???? - else if (unit.bk && (unit.ch == '<')) - { - addToken (currentToken); - currentToken = - new RETokenWordBoundary (subIndex, RETokenWordBoundary.BEGIN, - false); - } - - // WORD END OPERATOR - // \> if ???? - else if (unit.bk && (unit.ch == '>')) - { - addToken (currentToken); - currentToken = - new RETokenWordBoundary (subIndex, RETokenWordBoundary.END, - false); - } - - // NON-WORD BREAK OPERATOR - // \B if ???? - - else if (unit.bk && (unit.ch == 'B') - && syntax.get (RESyntax.RE_STRING_ANCHORS)) - { - addToken (currentToken); - currentToken = - new RETokenWordBoundary (subIndex, - RETokenWordBoundary. - BEGIN | RETokenWordBoundary.END, true); - } - - - // DIGIT OPERATOR - // \d if RE_CHAR_CLASS_ESCAPES is set - - else if (unit.bk && (unit.ch == 'd') - && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) - { - addToken (currentToken); - currentToken = - new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, false); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - - // NON-DIGIT OPERATOR - // \D - - else if (unit.bk && (unit.ch == 'D') - && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) - { - addToken (currentToken); - currentToken = - new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, true); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - - // NEWLINE ESCAPE - // \n - - else if (unit.bk && (unit.ch == 'n')) - { - addToken (currentToken); - currentToken = new RETokenChar (subIndex, '\n', false); - } - - // RETURN ESCAPE - // \r - - else if (unit.bk && (unit.ch == 'r')) - { - addToken (currentToken); - currentToken = new RETokenChar (subIndex, '\r', false); - } - - // WHITESPACE OPERATOR - // \s if RE_CHAR_CLASS_ESCAPES is set - - else if (unit.bk && (unit.ch == 's') - && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) - { - addToken (currentToken); - currentToken = - new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, false); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - - // NON-WHITESPACE OPERATOR - // \S - - else if (unit.bk && (unit.ch == 'S') - && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) - { - addToken (currentToken); - currentToken = - new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, true); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - - // TAB ESCAPE - // \t - - else if (unit.bk && (unit.ch == 't')) - { - addToken (currentToken); - currentToken = new RETokenChar (subIndex, '\t', false); - } - - // ALPHANUMERIC OPERATOR - // \w - - else if (unit.bk && (unit.ch == 'w') - && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) - { - addToken (currentToken); - currentToken = - new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, false); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - - // NON-ALPHANUMERIC OPERATOR - // \W - - else if (unit.bk && (unit.ch == 'W') - && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) - { - addToken (currentToken); - currentToken = - new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, true); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - - // END OF STRING OPERATOR - // \Z, \z - - // FIXME: \Z and \z are different in that if the input string - // ends with a line terminator, \Z matches the position before - // the final terminator. This special behavior of \Z is yet - // to be implemented. - - else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') && - syntax.get (RESyntax.RE_STRING_ANCHORS)) - { - addToken (currentToken); - currentToken = new RETokenEnd (subIndex, null); - } - - // HEX CHARACTER, UNICODE CHARACTER - // \x1B, \u1234 - - else - if ((unit.bk && (unit.ch == 'x') - && syntax.get (RESyntax.RE_HEX_CHAR)) || (unit.bk - && (unit.ch == 'u') - && syntax. - get (RESyntax. - RE_UNICODE_CHAR))) - { - CharExpression ce = - getCharExpression (pattern, index - 2, pLength, syntax); - if (ce == null) - throw new REException ("invalid hex character", - REException.REG_ESCAPE, index); - index = index - 2 + ce.len; - addToken (currentToken); - currentToken = new RETokenChar (subIndex, ce.ch, insens); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - - // NAMED PROPERTY - // \p{prop}, \P{prop} - - else - if ((unit.bk && (unit.ch == 'p') - && syntax.get (RESyntax.RE_NAMED_PROPERTY)) || (unit.bk - && (unit.ch == - 'P') - && syntax. - get (RESyntax. - RE_NAMED_PROPERTY))) - { - NamedProperty np = getNamedProperty (pattern, index - 2, pLength); - if (np == null) - throw new REException ("invalid escape sequence", - REException.REG_ESCAPE, index); - index = index - 2 + np.len; - addToken (currentToken); - currentToken = - getRETokenNamedProperty (subIndex, np, insens, index); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - - // END OF PREVIOUS MATCH - // \G - - else if (unit.bk && (unit.ch == 'G') && - syntax.get (RESyntax.RE_STRING_ANCHORS)) - { - addToken (currentToken); - currentToken = new RETokenEndOfPreviousMatch (subIndex); - } - - // NON-SPECIAL CHARACTER (or escape to make literal) - // c | \* for example - - else - { // not a special character - addToken (currentToken); - currentToken = new RETokenChar (subIndex, unit.ch, insens); - if (insensUSASCII) - currentToken.unicodeAware = false; - } - } // end while + // read the next character unit (including backslash escapes) + index = getCharUnit (pattern, index, unit, quot); + + if (unit.bk) + if (unit.ch == 'Q') + { + quot = true; + continue; + } + else if (unit.ch == 'E') + { + quot = false; + continue; + } + if (quot) + unit.bk = false; + + if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot)) + { + if (Character.isWhitespace (unit.ch)) + { + continue; + } + if (unit.ch == '#') + { + for (int i = index; i < pLength; i++) + { + if (pattern[i] == '\n') + { + index = i + 1; + continue; + } + else if (pattern[i] == '\r') + { + if (i + 1 < pLength && pattern[i + 1] == '\n') + { + index = i + 2; + } + else + { + index = i + 1; + } + continue; + } + } + index = pLength; + continue; + } + } + + // ALTERNATION OPERATOR + // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT) + // not available if RE_LIMITED_OPS is set + + // TODO: the '\n' literal here should be a test against REToken.newline, + // which unfortunately may be more than a single character. + if (((unit.ch == '|' + && (syntax.get (RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot))) + || (syntax.get (RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') + && !(unit.bk || quot))) + && !syntax.get (RESyntax.RE_LIMITED_OPS)) + { + // make everything up to here be a branch. create vector if nec. + addToken (currentToken); + RE theBranch = + new RE (firstToken, lastToken, numSubs, subIndex, minimumLength, + maximumLength); + minimumLength = 0; + maximumLength = 0; + if (branches == null) + { + branches = new ArrayList < REToken > (); + } + branches.add (theBranch); + firstToken = lastToken = currentToken = null; + } + + // INTERVAL OPERATOR: + // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES) + // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES) + // + // OPEN QUESTION: + // what is proper interpretation of '{' at start of string? + // + // This method used to check "repeat.empty.token" to avoid such regexp + // as "(a*){2,}", but now "repeat.empty.token" is allowed. + + else if ((unit.ch == '{') && syntax.get (RESyntax.RE_INTERVALS) + && (syntax. + get (RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) + { + int newIndex = getMinMax (pattern, index, minMax, syntax); + if (newIndex > index) + { + if (minMax.first > minMax.second) + throw new + REException (getLocalizedMessage ("interval.order"), + REException.REG_BADRPT, newIndex); + if (currentToken == null) + throw new + REException (getLocalizedMessage ("repeat.no.token"), + REException.REG_BADRPT, newIndex); + if (currentToken instanceof RETokenRepeated) + throw new + REException (getLocalizedMessage ("repeat.chained"), + REException.REG_BADRPT, newIndex); + if (currentToken instanceof RETokenWordBoundary + || currentToken instanceof RETokenWordBoundary) + throw new + REException (getLocalizedMessage ("repeat.assertion"), + REException.REG_BADRPT, newIndex); + index = newIndex; + currentToken = + setRepeated (currentToken, minMax.first, minMax.second, + index); + } + else + { + addToken (currentToken); + currentToken = new RETokenChar (subIndex, unit.ch, insens); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + } + + // LIST OPERATOR: + // [...] | [^...] + + else if ((unit.ch == '[') && !(unit.bk || quot)) + { + // Create a new RETokenOneOf + ParseCharClassResult result = + parseCharClass (subIndex, pattern, index, pLength, cflags, + syntax, 0); + addToken (currentToken); + currentToken = result.token; + index = result.index; + } + + // SUBEXPRESSIONS + // (...) | \(...\) depending on RE_NO_BK_PARENS + + else if ((unit.ch == '(') + && (syntax. + get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) + { + boolean pure = false; + boolean comment = false; + boolean lookAhead = false; + boolean lookBehind = false; + boolean independent = false; + boolean negativelh = false; + boolean negativelb = false; + if ((index + 1 < pLength) && (pattern[index] == '?')) + { + switch (pattern[index + 1]) + { + case '!': + if (syntax.get (RESyntax.RE_LOOKAHEAD)) + { + pure = true; + negativelh = true; + lookAhead = true; + index += 2; + } + break; + case '=': + if (syntax.get (RESyntax.RE_LOOKAHEAD)) + { + pure = true; + lookAhead = true; + index += 2; + } + break; + case '<': + // We assume that if the syntax supports look-ahead, + // it also supports look-behind. + if (syntax.get (RESyntax.RE_LOOKAHEAD)) + { + index++; + switch (pattern[index + 1]) + { + case '!': + pure = true; + negativelb = true; + lookBehind = true; + index += 2; + break; + case '=': + pure = true; + lookBehind = true; + index += 2; + } + } + break; + case '>': + // We assume that if the syntax supports look-ahead, + // it also supports independent group. + if (syntax.get (RESyntax.RE_LOOKAHEAD)) + { + pure = true; + independent = true; + index += 2; + } + break; + case 'i': + case 'd': + case 'm': + case 's': + case 'u': + case 'x': + case '-': + if (!syntax.get (RESyntax.RE_EMBEDDED_FLAGS)) + break; + // Set or reset syntax flags. + int flagIndex = index + 1; + int endFlag = -1; + RESyntax newSyntax = new RESyntax (syntax); + int newCflags = cflags; + boolean negate = false; + while (flagIndex < pLength && endFlag < 0) + { + switch (pattern[flagIndex]) + { + case 'i': + if (negate) + newCflags &= ~REG_ICASE; + else + newCflags |= REG_ICASE; + flagIndex++; + break; + case 'd': + if (negate) + newSyntax.setLineSeparator (RESyntax. + DEFAULT_LINE_SEPARATOR); + else + newSyntax.setLineSeparator ("\n"); + flagIndex++; + break; + case 'm': + if (negate) + newCflags &= ~REG_MULTILINE; + else + newCflags |= REG_MULTILINE; + flagIndex++; + break; + case 's': + if (negate) + newCflags &= ~REG_DOT_NEWLINE; + else + newCflags |= REG_DOT_NEWLINE; + flagIndex++; + break; + case 'u': + if (negate) + newCflags |= REG_ICASE_USASCII; + else + newCflags &= ~REG_ICASE_USASCII; + flagIndex++; + break; + case 'x': + if (negate) + newCflags &= ~REG_X_COMMENTS; + else + newCflags |= REG_X_COMMENTS; + flagIndex++; + break; + case '-': + negate = true; + flagIndex++; + break; + case ':': + case ')': + endFlag = pattern[flagIndex]; + break; + default: + throw new + REException (getLocalizedMessage + ("repeat.no.token"), + REException.REG_BADRPT, index); + } + } + if (endFlag == ')') + { + syntax = newSyntax; + cflags = newCflags; + insens = ((cflags & REG_ICASE) > 0); + insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); + // This can be treated as though it were a comment. + comment = true; + index = flagIndex - 1; + break; + } + if (endFlag == ':') + { + savedSyntax = syntax; + savedCflags = cflags; + flagsSaved = true; + syntax = newSyntax; + cflags = newCflags; + insens = ((cflags & REG_ICASE) > 0); + insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); + index = flagIndex - 1; + // Fall through to the next case. + } + else + { + throw new + REException (getLocalizedMessage + ("unmatched.paren"), + REException.REG_ESUBREG, index); + } + case ':': + if (syntax.get (RESyntax.RE_PURE_GROUPING)) + { + pure = true; + index += 2; + } + break; + case '#': + if (syntax.get (RESyntax.RE_COMMENTS)) + { + comment = true; + } + break; + default: + throw new + REException (getLocalizedMessage ("repeat.no.token"), + REException.REG_BADRPT, index); + } + } + + if (index >= pLength) + { + throw new + REException (getLocalizedMessage ("unmatched.paren"), + REException.REG_ESUBREG, index); + } + + // find end of subexpression + int endIndex = index; + int nextIndex = index; + int nested = 0; + + while (((nextIndex = + getCharUnit (pattern, endIndex, unit, false)) > 0) + && !(nested == 0 && (unit.ch == ')') + && (syntax. + get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk + || quot)))) + { + if ((endIndex = nextIndex) >= pLength) + throw new + REException (getLocalizedMessage ("subexpr.no.end"), + REException.REG_ESUBREG, nextIndex); + else + if ((unit.ch == '[') && !(unit.bk || quot)) + { + // I hate to do something similar to the LIST OPERATOR matters + // above, but ... + int listIndex = nextIndex; + if (listIndex < pLength && pattern[listIndex] == '^') + listIndex++; + if (listIndex < pLength && pattern[listIndex] == ']') + listIndex++; + int listEndIndex = -1; + int listNest = 0; + while (listIndex < pLength && listEndIndex < 0) + { + switch (pattern[listIndex++]) + { + case '\\': + listIndex++; + break; + case '[': + // Sun's API document says that regexp like "[a-d[m-p]]" + // is legal. Even something like "[[[^]]]]" is accepted. + listNest++; + if (listIndex < pLength + && pattern[listIndex] == '^') + listIndex++; + if (listIndex < pLength + && pattern[listIndex] == ']') + listIndex++; + break; + case ']': + if (listNest == 0) + listEndIndex = listIndex; + listNest--; + break; + } + } + if (listEndIndex >= 0) + { + nextIndex = listEndIndex; + if ((endIndex = nextIndex) >= pLength) + throw new + REException (getLocalizedMessage ("subexpr.no.end"), + REException.REG_ESUBREG, nextIndex); + else + continue; + } + throw new + REException (getLocalizedMessage ("subexpr.no.end"), + REException.REG_ESUBREG, nextIndex); + } + else if (unit.ch == '(' + && (syntax. + get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk + || quot))) + nested++; + else if (unit.ch == ')' + && (syntax. + get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk + || quot))) + nested--; + } + + // endIndex is now position at a ')','\)' + // nextIndex is end of string or position after ')' or '\)' + + if (comment) + index = nextIndex; + else + { // not a comment + // create RE subexpression as token. + addToken (currentToken); + if (!pure) + { + numSubs++; + } + + int useIndex = (pure || lookAhead || lookBehind + || independent) ? 0 : nextSub + numSubs; + currentToken = + new RE (String.valueOf (pattern, index, endIndex - index). + toCharArray (), cflags, syntax, useIndex, + nextSub + numSubs); + numSubs += ((RE) currentToken).getNumSubs (); + + if (lookAhead) + { + currentToken = + new RETokenLookAhead (currentToken, negativelh); + } + else if (lookBehind) + { + currentToken = + new RETokenLookBehind (currentToken, negativelb); + } + else if (independent) + { + currentToken = new RETokenIndependent (currentToken); + } + + index = nextIndex; + if (flagsSaved) + { + syntax = savedSyntax; + cflags = savedCflags; + insens = ((cflags & REG_ICASE) > 0); + insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); + flagsSaved = false; + } + } // not a comment + } // subexpression + + // UNMATCHED RIGHT PAREN + // ) or \) throw exception if + // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) + else if (!syntax.get (RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) + && ((unit.ch == ')') + && (syntax. + get (RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) + { + throw new REException (getLocalizedMessage ("unmatched.paren"), + REException.REG_EPAREN, index); + } + + // START OF LINE OPERATOR + // ^ + + else if ((unit.ch == '^') && !(unit.bk || quot)) + { + addToken (currentToken); + currentToken = null; + RETokenStart token = null; + if ((cflags & REG_MULTILINE) > 0) + { + String sep = syntax.getLineSeparator (); + if (sep == null) + { + token = new RETokenStart (subIndex, null, true); + } + else + { + token = new RETokenStart (subIndex, sep); + } + } + else + { + token = new RETokenStart (subIndex, null); + } + addToken (token); + } + + // END OF LINE OPERATOR + // $ + + else if ((unit.ch == '$') && !(unit.bk || quot)) + { + addToken (currentToken); + currentToken = null; + RETokenEnd token = null; + if ((cflags & REG_MULTILINE) > 0) + { + String sep = syntax.getLineSeparator (); + if (sep == null) + { + token = new RETokenEnd (subIndex, null, true); + } + else + { + token = new RETokenEnd (subIndex, sep); + } + } + else + { + token = new RETokenEnd (subIndex, null); + } + addToken (token); + } + + // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null) + // . + + else if ((unit.ch == '.') && !(unit.bk || quot)) + { + addToken (currentToken); + currentToken = + new RETokenAny (subIndex, syntax.get (RESyntax.RE_DOT_NEWLINE) + || ((cflags & REG_DOT_NEWLINE) > 0), + syntax.get (RESyntax.RE_DOT_NOT_NULL)); + } + + // ZERO-OR-MORE REPEAT OPERATOR + // * + // + // This method used to check "repeat.empty.token" to avoid such regexp + // as "(a*)*", but now "repeat.empty.token" is allowed. + + else if ((unit.ch == '*') && !(unit.bk || quot)) + { + if (currentToken == null) + throw new REException (getLocalizedMessage ("repeat.no.token"), + REException.REG_BADRPT, index); + if (currentToken instanceof RETokenRepeated) + throw new REException (getLocalizedMessage ("repeat.chained"), + REException.REG_BADRPT, index); + if (currentToken instanceof RETokenWordBoundary + || currentToken instanceof RETokenWordBoundary) + throw new REException (getLocalizedMessage ("repeat.assertion"), + REException.REG_BADRPT, index); + currentToken = + setRepeated (currentToken, 0, Integer.MAX_VALUE, index); + } + + // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR + // + | \+ depending on RE_BK_PLUS_QM + // not available if RE_LIMITED_OPS is set + // + // This method used to check "repeat.empty.token" to avoid such regexp + // as "(a*)+", but now "repeat.empty.token" is allowed. + + else if ((unit.ch == '+') && !syntax.get (RESyntax.RE_LIMITED_OPS) + && (!syntax. + get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) + { + if (currentToken == null) + throw new REException (getLocalizedMessage ("repeat.no.token"), + REException.REG_BADRPT, index); + + // Check for possessive matching on RETokenRepeated + if (currentToken instanceof RETokenRepeated) + { + RETokenRepeated tokenRep = (RETokenRepeated) currentToken; + if (syntax.get (RESyntax.RE_POSSESSIVE_OPS) + && !tokenRep.isPossessive () && !tokenRep.isStingy ()) + tokenRep.makePossessive (); + else + throw new + REException (getLocalizedMessage ("repeat.chained"), + REException.REG_BADRPT, index); + + } + else if (currentToken instanceof RETokenWordBoundary + || currentToken instanceof RETokenWordBoundary) + throw new REException (getLocalizedMessage ("repeat.assertion"), + REException.REG_BADRPT, index); + else + currentToken = + setRepeated (currentToken, 1, Integer.MAX_VALUE, index); + } + + // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR + // ? | \? depending on RE_BK_PLUS_QM + // not available if RE_LIMITED_OPS is set + // stingy matching if RE_STINGY_OPS is set and it follows a quantifier + + else if ((unit.ch == '?') && !syntax.get (RESyntax.RE_LIMITED_OPS) + && (!syntax. + get (RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) + { + if (currentToken == null) + throw new REException (getLocalizedMessage ("repeat.no.token"), + REException.REG_BADRPT, index); + + // Check for stingy matching on RETokenRepeated + if (currentToken instanceof RETokenRepeated) + { + RETokenRepeated tokenRep = (RETokenRepeated) currentToken; + if (syntax.get (RESyntax.RE_STINGY_OPS) + && !tokenRep.isStingy () && !tokenRep.isPossessive ()) + tokenRep.makeStingy (); + else + throw new + REException (getLocalizedMessage ("repeat.chained"), + REException.REG_BADRPT, index); + } + else if (currentToken instanceof RETokenWordBoundary + || currentToken instanceof RETokenWordBoundary) + throw new REException (getLocalizedMessage ("repeat.assertion"), + REException.REG_BADRPT, index); + else + currentToken = setRepeated (currentToken, 0, 1, index); + } + + // OCTAL CHARACTER + // \0377 + + else if (unit.bk && (unit.ch == '0') + && syntax.get (RESyntax.RE_OCTAL_CHAR)) + { + CharExpression ce = + getCharExpression (pattern, index - 2, pLength, syntax); + if (ce == null) + throw new REException ("invalid octal character", + REException.REG_ESCAPE, index); + index = index - 2 + ce.len; + addToken (currentToken); + currentToken = new RETokenChar (subIndex, ce.ch, insens); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + + // BACKREFERENCE OPERATOR + // \1 \2 ... \9 and \10 \11 \12 ... + // not available if RE_NO_BK_REFS is set + // Perl recognizes \10, \11, and so on only if enough number of + // parentheses have opened before it, otherwise they are treated + // as aliases of \010, \011, ... (octal characters). In case of + // Sun's JDK, octal character expression must always begin with \0. + // We will do as JDK does. But FIXME, take a look at "(a)(b)\29". + // JDK treats \2 as a back reference to the 2nd group because + // there are only two groups. But in our poor implementation, + // we cannot help but treat \29 as a back reference to the 29th group. + + else if (unit.bk && Character.isDigit (unit.ch) + && !syntax.get (RESyntax.RE_NO_BK_REFS)) + { + addToken (currentToken); + int numBegin = index - 1; + int numEnd = pLength; + for (int i = index; i < pLength; i++) + { + if (!Character.isDigit (pattern[i])) + { + numEnd = i; + break; + } + } + int num = parseInt (pattern, numBegin, numEnd - numBegin, 10); + + currentToken = new RETokenBackRef (subIndex, num, insens); + if (insensUSASCII) + currentToken.unicodeAware = false; + index = numEnd; + } + + // START OF STRING OPERATOR + // \A if RE_STRING_ANCHORS is set + + else if (unit.bk && (unit.ch == 'A') + && syntax.get (RESyntax.RE_STRING_ANCHORS)) + { + addToken (currentToken); + currentToken = new RETokenStart (subIndex, null); + } + + // WORD BREAK OPERATOR + // \b if ???? + + else if (unit.bk && (unit.ch == 'b') + && syntax.get (RESyntax.RE_STRING_ANCHORS)) + { + addToken (currentToken); + currentToken = + new RETokenWordBoundary (subIndex, + RETokenWordBoundary. + BEGIN | RETokenWordBoundary.END, + false); + } + + // WORD BEGIN OPERATOR + // \< if ???? + else if (unit.bk && (unit.ch == '<')) + { + addToken (currentToken); + currentToken = + new RETokenWordBoundary (subIndex, RETokenWordBoundary.BEGIN, + false); + } + + // WORD END OPERATOR + // \> if ???? + else if (unit.bk && (unit.ch == '>')) + { + addToken (currentToken); + currentToken = + new RETokenWordBoundary (subIndex, RETokenWordBoundary.END, + false); + } + + // NON-WORD BREAK OPERATOR + // \B if ???? + + else if (unit.bk && (unit.ch == 'B') + && syntax.get (RESyntax.RE_STRING_ANCHORS)) + { + addToken (currentToken); + currentToken = + new RETokenWordBoundary (subIndex, + RETokenWordBoundary. + BEGIN | RETokenWordBoundary.END, true); + } + + + // DIGIT OPERATOR + // \d if RE_CHAR_CLASS_ESCAPES is set + + else if (unit.bk && (unit.ch == 'd') + && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) + { + addToken (currentToken); + currentToken = + new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, false); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + + // NON-DIGIT OPERATOR + // \D + + else if (unit.bk && (unit.ch == 'D') + && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) + { + addToken (currentToken); + currentToken = + new RETokenPOSIX (subIndex, RETokenPOSIX.DIGIT, insens, true); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + + // NEWLINE ESCAPE + // \n + + else if (unit.bk && (unit.ch == 'n')) + { + addToken (currentToken); + currentToken = new RETokenChar (subIndex, '\n', false); + } + + // RETURN ESCAPE + // \r + + else if (unit.bk && (unit.ch == 'r')) + { + addToken (currentToken); + currentToken = new RETokenChar (subIndex, '\r', false); + } + + // WHITESPACE OPERATOR + // \s if RE_CHAR_CLASS_ESCAPES is set + + else if (unit.bk && (unit.ch == 's') + && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) + { + addToken (currentToken); + currentToken = + new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, false); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + + // NON-WHITESPACE OPERATOR + // \S + + else if (unit.bk && (unit.ch == 'S') + && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) + { + addToken (currentToken); + currentToken = + new RETokenPOSIX (subIndex, RETokenPOSIX.SPACE, insens, true); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + + // TAB ESCAPE + // \t + + else if (unit.bk && (unit.ch == 't')) + { + addToken (currentToken); + currentToken = new RETokenChar (subIndex, '\t', false); + } + + // ALPHANUMERIC OPERATOR + // \w + + else if (unit.bk && (unit.ch == 'w') + && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) + { + addToken (currentToken); + currentToken = + new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, false); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + + // NON-ALPHANUMERIC OPERATOR + // \W + + else if (unit.bk && (unit.ch == 'W') + && syntax.get (RESyntax.RE_CHAR_CLASS_ESCAPES)) + { + addToken (currentToken); + currentToken = + new RETokenPOSIX (subIndex, RETokenPOSIX.ALNUM, insens, true); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + + // END OF STRING OPERATOR + // \Z, \z + + // FIXME: \Z and \z are different in that if the input string + // ends with a line terminator, \Z matches the position before + // the final terminator. This special behavior of \Z is yet + // to be implemented. + + else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') && + syntax.get (RESyntax.RE_STRING_ANCHORS)) + { + addToken (currentToken); + currentToken = new RETokenEnd (subIndex, null); + } + + // HEX CHARACTER, UNICODE CHARACTER + // \x1B, \u1234 + + else + if ((unit.bk && (unit.ch == 'x') + && syntax.get (RESyntax.RE_HEX_CHAR)) || (unit.bk + && (unit.ch == 'u') + && syntax. + get (RESyntax. + RE_UNICODE_CHAR))) + { + CharExpression ce = + getCharExpression (pattern, index - 2, pLength, syntax); + if (ce == null) + throw new REException ("invalid hex character", + REException.REG_ESCAPE, index); + index = index - 2 + ce.len; + addToken (currentToken); + currentToken = new RETokenChar (subIndex, ce.ch, insens); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + + // NAMED PROPERTY + // \p{prop}, \P{prop} + + else + if ((unit.bk && (unit.ch == 'p') + && syntax.get (RESyntax.RE_NAMED_PROPERTY)) || (unit.bk + && (unit.ch == + 'P') + && syntax. + get (RESyntax. + RE_NAMED_PROPERTY))) + { + NamedProperty np = getNamedProperty (pattern, index - 2, pLength); + if (np == null) + throw new REException ("invalid escape sequence", + REException.REG_ESCAPE, index); + index = index - 2 + np.len; + addToken (currentToken); + currentToken = + getRETokenNamedProperty (subIndex, np, insens, index); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + + // END OF PREVIOUS MATCH + // \G + + else if (unit.bk && (unit.ch == 'G') && + syntax.get (RESyntax.RE_STRING_ANCHORS)) + { + addToken (currentToken); + currentToken = new RETokenEndOfPreviousMatch (subIndex); + } + + // NON-SPECIAL CHARACTER (or escape to make literal) + // c | \* for example + + else + { // not a special character + addToken (currentToken); + currentToken = new RETokenChar (subIndex, unit.ch, insens); + if (insensUSASCII) + currentToken.unicodeAware = false; + } + } // end while // Add final buffered token and an EndSub marker addToken (currentToken); if (branches != null) { - branches. - add (new - RE (firstToken, lastToken, numSubs, subIndex, minimumLength, - maximumLength)); - branches.trimToSize (); // compact the Vector - minimumLength = 0; - maximumLength = 0; - firstToken = lastToken = null; - addToken (new RETokenOneOf (subIndex, branches, false)); + branches. + add (new + RE (firstToken, lastToken, numSubs, subIndex, minimumLength, + maximumLength)); + branches.trimToSize (); // compact the Vector + minimumLength = 0; + maximumLength = 0; + firstToken = lastToken = null; + addToken (new RETokenOneOf (subIndex, branches, false)); } else addToken (new RETokenEndSub (subIndex)); @@ -1403,11 +1403,11 @@ public class RE extends REToken * @param syntax Syntax used to parse the pattern. */ private static ParseCharClassResult parseCharClass (int subIndex, - char[]pattern, - int index, int pLength, - int cflags, - RESyntax syntax, - int pflags) throws + char[]pattern, + int index, int pLength, + int cflags, + RESyntax syntax, + int pflags) throws REException { @@ -1425,256 +1425,256 @@ public class RE extends REToken boolean lastCharIsSet = false; if (index == pLength) throw new REException (getLocalizedMessage ("unmatched.bracket"), - REException.REG_EBRACK, index); + REException.REG_EBRACK, index); // Check for initial caret, negation if ((ch = pattern[index]) == '^') { - negative = true; - if (++index == pLength) - throw new REException (getLocalizedMessage ("class.no.end"), - REException.REG_EBRACK, index); - ch = pattern[index]; + negative = true; + if (++index == pLength) + throw new REException (getLocalizedMessage ("class.no.end"), + REException.REG_EBRACK, index); + ch = pattern[index]; } // Check for leading right bracket literal if (ch == ']') { - lastChar = ch; - lastCharIsSet = true; - if (++index == pLength) - throw new REException (getLocalizedMessage ("class.no.end"), - REException.REG_EBRACK, index); + lastChar = ch; + lastCharIsSet = true; + if (++index == pLength) + throw new REException (getLocalizedMessage ("class.no.end"), + REException.REG_EBRACK, index); } while ((ch = pattern[index++]) != ']') { - if ((ch == '-') && (lastCharIsSet)) - { - if (index == pLength) - throw new REException (getLocalizedMessage ("class.no.end"), - REException.REG_EBRACK, index); - if ((ch = pattern[index]) == ']') - { - RETokenChar t = new RETokenChar (subIndex, lastChar, insens); - if (insensUSASCII) - t.unicodeAware = false; - options.add (t); - lastChar = '-'; - } - else - { - if ((ch == '\\') - && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) - { - CharExpression ce = - getCharExpression (pattern, index, pLength, syntax); - if (ce == null) - throw new REException ("invalid escape sequence", - REException.REG_ESCAPE, index); - ch = ce.ch; - index = index + ce.len - 1; - } - RETokenRange t = - new RETokenRange (subIndex, lastChar, ch, insens); - if (insensUSASCII) - t.unicodeAware = false; - options.add (t); - lastChar = 0; - lastCharIsSet = false; - index++; - } - } - else if ((ch == '\\') - && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) - { - if (index == pLength) - throw new REException (getLocalizedMessage ("class.no.end"), - REException.REG_EBRACK, index); - int posixID = -1; - boolean negate = false; - char asciiEsc = 0; - boolean asciiEscIsSet = false; - NamedProperty np = null; - if (("dswDSW".indexOf (pattern[index]) != -1) - && syntax.get (RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) - { - switch (pattern[index]) - { - case 'D': - negate = true; - case 'd': - posixID = RETokenPOSIX.DIGIT; - break; - case 'S': - negate = true; - case 's': - posixID = RETokenPOSIX.SPACE; - break; - case 'W': - negate = true; - case 'w': - posixID = RETokenPOSIX.ALNUM; - break; - } - } - if (("pP".indexOf (pattern[index]) != -1) - && syntax.get (RESyntax.RE_NAMED_PROPERTY)) - { - np = getNamedProperty (pattern, index - 1, pLength); - if (np == null) - throw new REException ("invalid escape sequence", - REException.REG_ESCAPE, index); - index = index - 1 + np.len - 1; - } - else - { - CharExpression ce = - getCharExpression (pattern, index - 1, pLength, syntax); - if (ce == null) - throw new REException ("invalid escape sequence", - REException.REG_ESCAPE, index); - asciiEsc = ce.ch; - asciiEscIsSet = true; - index = index - 1 + ce.len - 1; - } - if (lastCharIsSet) - { - RETokenChar t = new RETokenChar (subIndex, lastChar, insens); - if (insensUSASCII) - t.unicodeAware = false; - options.add (t); - } - - if (posixID != -1) - { - RETokenPOSIX t = - new RETokenPOSIX (subIndex, posixID, insens, negate); - if (insensUSASCII) - t.unicodeAware = false; - options.add (t); - } - else if (np != null) - { - RETokenNamedProperty t = - getRETokenNamedProperty (subIndex, np, insens, index); - if (insensUSASCII) - t.unicodeAware = false; - options.add (t); - } - else if (asciiEscIsSet) - { - lastChar = asciiEsc; - lastCharIsSet = true; - } - else - { - lastChar = pattern[index]; - lastCharIsSet = true; - } - ++index; - } - else if ((ch == '[') && (syntax.get (RESyntax.RE_CHAR_CLASSES)) - && (index < pLength) && (pattern[index] == ':')) - { - CPStringBuilder posixSet = new CPStringBuilder (); - index = getPosixSet (pattern, index + 1, posixSet); - int posixId = RETokenPOSIX.intValue (posixSet.toString ()); - if (posixId != -1) - { - RETokenPOSIX t = - new RETokenPOSIX (subIndex, posixId, insens, false); - if (insensUSASCII) - t.unicodeAware = false; - options.add (t); - } - } - else if ((ch == '[') && (syntax.get (RESyntax.RE_NESTED_CHARCLASS))) - { - ParseCharClassResult result = - parseCharClass (subIndex, pattern, index, pLength, cflags, - syntax, 0); - addition.add (result.token); - addition.add ("|"); - index = result.index; - } - else if ((ch == '&') && - (syntax.get (RESyntax.RE_NESTED_CHARCLASS)) && - (index < pLength) && (pattern[index] == '&')) - { - if (returnAtAndOperator) - { - ParseCharClassResult result = new ParseCharClassResult (); - options.trimToSize (); - if (additionAndAppeared) - addition.add ("&"); - if (addition.size () == 0) - addition = null; - result.token = new RETokenOneOf (subIndex, - options, addition, negative); - result.index = index - 1; - result.returnAtAndOperator = true; - return result; - } - // The precedence of the operator "&&" is the lowest. - // So we postpone adding "&" until other elements - // are added. And we insert Boolean.FALSE at the - // beginning of the list of tokens following "&&". - // So, "&&[a-b][k-m]" will be stored in the Vecter - // addition in this order: - // Boolean.FALSE, [a-b], "|", [k-m], "|", "&" - if (additionAndAppeared) - addition.add ("&"); - addition.add (Boolean.FALSE); - additionAndAppeared = true; - - // The part on which "&&" operates may be either - // (1) explicitly enclosed by [] - // or - // (2) not enclosed by [] and terminated by the - // next "&&" or the end of the character list. - // Let the preceding else if block do the case (1). - // We must do something in case of (2). - if ((index + 1 < pLength) && (pattern[index + 1] != '[')) - { - ParseCharClassResult result = - parseCharClass (subIndex, pattern, index + 1, pLength, - cflags, syntax, - RETURN_AT_AND); - addition.add (result.token); - addition.add ("|"); - // If the method returned at the next "&&", it is OK. - // Otherwise we have eaten the mark of the end of this - // character list "]". In this case we must give back - // the end mark. - index = (result.returnAtAndOperator ? - result.index : result.index - 1); - } - } - else - { - if (lastCharIsSet) - { - RETokenChar t = new RETokenChar (subIndex, lastChar, insens); - if (insensUSASCII) - t.unicodeAware = false; - options.add (t); - } - lastChar = ch; - lastCharIsSet = true; - } - if (index == pLength) - throw new REException (getLocalizedMessage ("class.no.end"), - REException.REG_EBRACK, index); - } // while in list + if ((ch == '-') && (lastCharIsSet)) + { + if (index == pLength) + throw new REException (getLocalizedMessage ("class.no.end"), + REException.REG_EBRACK, index); + if ((ch = pattern[index]) == ']') + { + RETokenChar t = new RETokenChar (subIndex, lastChar, insens); + if (insensUSASCII) + t.unicodeAware = false; + options.add (t); + lastChar = '-'; + } + else + { + if ((ch == '\\') + && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) + { + CharExpression ce = + getCharExpression (pattern, index, pLength, syntax); + if (ce == null) + throw new REException ("invalid escape sequence", + REException.REG_ESCAPE, index); + ch = ce.ch; + index = index + ce.len - 1; + } + RETokenRange t = + new RETokenRange (subIndex, lastChar, ch, insens); + if (insensUSASCII) + t.unicodeAware = false; + options.add (t); + lastChar = 0; + lastCharIsSet = false; + index++; + } + } + else if ((ch == '\\') + && syntax.get (RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) + { + if (index == pLength) + throw new REException (getLocalizedMessage ("class.no.end"), + REException.REG_EBRACK, index); + int posixID = -1; + boolean negate = false; + char asciiEsc = 0; + boolean asciiEscIsSet = false; + NamedProperty np = null; + if (("dswDSW".indexOf (pattern[index]) != -1) + && syntax.get (RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) + { + switch (pattern[index]) + { + case 'D': + negate = true; + case 'd': + posixID = RETokenPOSIX.DIGIT; + break; + case 'S': + negate = true; + case 's': + posixID = RETokenPOSIX.SPACE; + break; + case 'W': + negate = true; + case 'w': + posixID = RETokenPOSIX.ALNUM; + break; + } + } + if (("pP".indexOf (pattern[index]) != -1) + && syntax.get (RESyntax.RE_NAMED_PROPERTY)) + { + np = getNamedProperty (pattern, index - 1, pLength); + if (np == null) + throw new REException ("invalid escape sequence", + REException.REG_ESCAPE, index); + index = index - 1 + np.len - 1; + } + else + { + CharExpression ce = + getCharExpression (pattern, index - 1, pLength, syntax); + if (ce == null) + throw new REException ("invalid escape sequence", + REException.REG_ESCAPE, index); + asciiEsc = ce.ch; + asciiEscIsSet = true; + index = index - 1 + ce.len - 1; + } + if (lastCharIsSet) + { + RETokenChar t = new RETokenChar (subIndex, lastChar, insens); + if (insensUSASCII) + t.unicodeAware = false; + options.add (t); + } + + if (posixID != -1) + { + RETokenPOSIX t = + new RETokenPOSIX (subIndex, posixID, insens, negate); + if (insensUSASCII) + t.unicodeAware = false; + options.add (t); + } + else if (np != null) + { + RETokenNamedProperty t = + getRETokenNamedProperty (subIndex, np, insens, index); + if (insensUSASCII) + t.unicodeAware = false; + options.add (t); + } + else if (asciiEscIsSet) + { + lastChar = asciiEsc; + lastCharIsSet = true; + } + else + { + lastChar = pattern[index]; + lastCharIsSet = true; + } + ++index; + } + else if ((ch == '[') && (syntax.get (RESyntax.RE_CHAR_CLASSES)) + && (index < pLength) && (pattern[index] == ':')) + { + CPStringBuilder posixSet = new CPStringBuilder (); + index = getPosixSet (pattern, index + 1, posixSet); + int posixId = RETokenPOSIX.intValue (posixSet.toString ()); + if (posixId != -1) + { + RETokenPOSIX t = + new RETokenPOSIX (subIndex, posixId, insens, false); + if (insensUSASCII) + t.unicodeAware = false; + options.add (t); + } + } + else if ((ch == '[') && (syntax.get (RESyntax.RE_NESTED_CHARCLASS))) + { + ParseCharClassResult result = + parseCharClass (subIndex, pattern, index, pLength, cflags, + syntax, 0); + addition.add (result.token); + addition.add ("|"); + index = result.index; + } + else if ((ch == '&') && + (syntax.get (RESyntax.RE_NESTED_CHARCLASS)) && + (index < pLength) && (pattern[index] == '&')) + { + if (returnAtAndOperator) + { + ParseCharClassResult result = new ParseCharClassResult (); + options.trimToSize (); + if (additionAndAppeared) + addition.add ("&"); + if (addition.size () == 0) + addition = null; + result.token = new RETokenOneOf (subIndex, + options, addition, negative); + result.index = index - 1; + result.returnAtAndOperator = true; + return result; + } + // The precedence of the operator "&&" is the lowest. + // So we postpone adding "&" until other elements + // are added. And we insert Boolean.FALSE at the + // beginning of the list of tokens following "&&". + // So, "&&[a-b][k-m]" will be stored in the Vecter + // addition in this order: + // Boolean.FALSE, [a-b], "|", [k-m], "|", "&" + if (additionAndAppeared) + addition.add ("&"); + addition.add (Boolean.FALSE); + additionAndAppeared = true; + + // The part on which "&&" operates may be either + // (1) explicitly enclosed by [] + // or + // (2) not enclosed by [] and terminated by the + // next "&&" or the end of the character list. + // Let the preceding else if block do the case (1). + // We must do something in case of (2). + if ((index + 1 < pLength) && (pattern[index + 1] != '[')) + { + ParseCharClassResult result = + parseCharClass (subIndex, pattern, index + 1, pLength, + cflags, syntax, + RETURN_AT_AND); + addition.add (result.token); + addition.add ("|"); + // If the method returned at the next "&&", it is OK. + // Otherwise we have eaten the mark of the end of this + // character list "]". In this case we must give back + // the end mark. + index = (result.returnAtAndOperator ? + result.index : result.index - 1); + } + } + else + { + if (lastCharIsSet) + { + RETokenChar t = new RETokenChar (subIndex, lastChar, insens); + if (insensUSASCII) + t.unicodeAware = false; + options.add (t); + } + lastChar = ch; + lastCharIsSet = true; + } + if (index == pLength) + throw new REException (getLocalizedMessage ("class.no.end"), + REException.REG_EBRACK, index); + } // while in list // Out of list, index is one past ']' if (lastCharIsSet) { - RETokenChar t = new RETokenChar (subIndex, lastChar, insens); - if (insensUSASCII) - t.unicodeAware = false; - options.add (t); + RETokenChar t = new RETokenChar (subIndex, lastChar, insens); + if (insensUSASCII) + t.unicodeAware = false; + options.add (t); } ParseCharClassResult result = new ParseCharClassResult (); @@ -1690,17 +1690,17 @@ public class RE extends REToken } private static int getCharUnit (char[]input, int index, CharUnit unit, - boolean quot) throws REException + boolean quot) throws REException { unit.ch = input[index++]; unit.bk = (unit.ch == '\\' - && (!quot || index >= input.length || input[index] == 'E')); + && (!quot || index >= input.length || input[index] == 'E')); if (unit.bk) if (index < input.length) - unit.ch = input[index++]; + unit.ch = input[index++]; else - throw new REException (getLocalizedMessage ("ends.with.backslash"), - REException.REG_ESCAPE, index); + throw new REException (getLocalizedMessage ("ends.with.backslash"), + REException.REG_ESCAPE, index); return index; } @@ -1709,7 +1709,7 @@ public class RE extends REToken int ret = 0; for (int i = pos; i < pos + len; i++) { - ret = ret * radix + Character.digit (input[i], radix); + ret = ret * radix + Character.digit (input[i], radix); } return ret; } @@ -1736,92 +1736,92 @@ public class RE extends REToken } private static CharExpression getCharExpression (char[]input, int pos, - int lim, RESyntax syntax) + int lim, RESyntax syntax) { CharExpression ce = new CharExpression (); char c = input[pos]; if (c == '\\') { - if (pos + 1 >= lim) - return null; - c = input[pos + 1]; - switch (c) - { - case 't': - ce.ch = '\t'; - ce.len = 2; - break; - case 'n': - ce.ch = '\n'; - ce.len = 2; - break; - case 'r': - ce.ch = '\r'; - ce.len = 2; - break; - case 'x': - case 'u': - if ((c == 'x' && syntax.get (RESyntax.RE_HEX_CHAR)) || - (c == 'u' && syntax.get (RESyntax.RE_UNICODE_CHAR))) - { - int l = 0; - int expectedLength = (c == 'x' ? 2 : 4); - for (int i = pos + 2; i < pos + 2 + expectedLength; i++) - { - if (i >= lim) - break; - if (!((input[i] >= '0' && input[i] <= '9') || - (input[i] >= 'A' && input[i] <= 'F') || - (input[i] >= 'a' && input[i] <= 'f'))) - break; - l++; - } - if (l != expectedLength) - return null; - ce.ch = (char) (parseInt (input, pos + 2, l, 16)); - ce.len = l + 2; - } - else - { - ce.ch = c; - ce.len = 2; - } - break; - case '0': - if (syntax.get (RESyntax.RE_OCTAL_CHAR)) - { - int l = 0; - for (int i = pos + 2; i < pos + 2 + 3; i++) - { - if (i >= lim) - break; - if (input[i] < '0' || input[i] > '7') - break; - l++; - } - if (l == 3 && input[pos + 2] > '3') - l--; - if (l <= 0) - return null; - ce.ch = (char) (parseInt (input, pos + 2, l, 8)); - ce.len = l + 2; - } - else - { - ce.ch = c; - ce.len = 2; - } - break; - default: - ce.ch = c; - ce.len = 2; - break; - } + if (pos + 1 >= lim) + return null; + c = input[pos + 1]; + switch (c) + { + case 't': + ce.ch = '\t'; + ce.len = 2; + break; + case 'n': + ce.ch = '\n'; + ce.len = 2; + break; + case 'r': + ce.ch = '\r'; + ce.len = 2; + break; + case 'x': + case 'u': + if ((c == 'x' && syntax.get (RESyntax.RE_HEX_CHAR)) || + (c == 'u' && syntax.get (RESyntax.RE_UNICODE_CHAR))) + { + int l = 0; + int expectedLength = (c == 'x' ? 2 : 4); + for (int i = pos + 2; i < pos + 2 + expectedLength; i++) + { + if (i >= lim) + break; + if (!((input[i] >= '0' && input[i] <= '9') || + (input[i] >= 'A' && input[i] <= 'F') || + (input[i] >= 'a' && input[i] <= 'f'))) + break; + l++; + } + if (l != expectedLength) + return null; + ce.ch = (char) (parseInt (input, pos + 2, l, 16)); + ce.len = l + 2; + } + else + { + ce.ch = c; + ce.len = 2; + } + break; + case '0': + if (syntax.get (RESyntax.RE_OCTAL_CHAR)) + { + int l = 0; + for (int i = pos + 2; i < pos + 2 + 3; i++) + { + if (i >= lim) + break; + if (input[i] < '0' || input[i] > '7') + break; + l++; + } + if (l == 3 && input[pos + 2] > '3') + l--; + if (l <= 0) + return null; + ce.ch = (char) (parseInt (input, pos + 2, l, 8)); + ce.len = l + 2; + } + else + { + ce.ch = c; + ce.len = 2; + } + break; + default: + ce.ch = c; + ce.len = 2; + break; + } } else { - ce.ch = input[pos]; - ce.len = 1; + ce.ch = input[pos]; + ce.len = 1; } ce.expr = new String (input, pos, ce.len); return ce; @@ -1846,60 +1846,60 @@ public class RE extends REToken } private static NamedProperty getNamedProperty (char[]input, int pos, - int lim) + int lim) { NamedProperty np = new NamedProperty (); char c = input[pos]; if (c == '\\') { - if (++pos >= lim) - return null; - c = input[pos++]; - switch (c) - { - case 'p': - np.negate = false; - break; - case 'P': - np.negate = true; - break; - default: - return null; - } - c = input[pos++]; - if (c == '{') - { - int p = -1; - for (int i = pos; i < lim; i++) - { - if (input[i] == '}') - { - p = i; - break; - } - } - if (p < 0) - return null; - int len = p - pos; - np.name = new String (input, pos, len); - np.len = len + 4; - } - else - { - np.name = new String (input, pos - 1, 1); - np.len = 3; - } - return np; + if (++pos >= lim) + return null; + c = input[pos++]; + switch (c) + { + case 'p': + np.negate = false; + break; + case 'P': + np.negate = true; + break; + default: + return null; + } + c = input[pos++]; + if (c == '{') + { + int p = -1; + for (int i = pos; i < lim; i++) + { + if (input[i] == '}') + { + p = i; + break; + } + } + if (p < 0) + return null; + int len = p - pos; + np.name = new String (input, pos, len); + np.len = len + 4; + } + else + { + np.name = new String (input, pos - 1, 1); + np.len = 3; + } + return np; } else return null; } private static RETokenNamedProperty getRETokenNamedProperty (int subIndex, - NamedProperty - np, - boolean insens, - int index) + NamedProperty + np, + boolean insens, + int index) throws REException { try @@ -1953,18 +1953,18 @@ public class RE extends REToken private boolean isMatchImpl (CharIndexed input, int index, int eflags) { - if (firstToken == null) // Trivial case + if (firstToken == null) // Trivial case return (input.charAt (0) == CharIndexed.OUT_OF_BOUNDS); REMatch m = new REMatch (numSubs, index, eflags); if (firstToken.match (input, m)) { - if (m != null) - { - if (input.charAt (m.index) == CharIndexed.OUT_OF_BOUNDS) - { - return true; - } - } + if (m != null) + { + if (input.charAt (m.index) == CharIndexed.OUT_OF_BOUNDS) + { + return true; + } + } } return false; } @@ -1984,10 +1984,10 @@ public class RE extends REToken { if (lastToken != null) { - lastToken.setUncle (uncle); + lastToken.setUncle (uncle); } else - super.setUncle (uncle); // to deal with empty subexpressions + super.setUncle (uncle); // to deal with empty subexpressions } // Overrides REToken.chain @@ -2063,25 +2063,25 @@ public class RE extends REToken // this has been changed since 1.03 to be non-overlapping matches private REMatch[] getAllMatchesImpl (CharIndexed input, int index, - int eflags) + int eflags) { List < REMatch > all = new ArrayList < REMatch > (); REMatch m = null; while ((m = getMatchImpl (input, index, eflags, null)) != null) { - all.add (m); - index = m.getEndIndex (); - if (m.end[0] == 0) - { // handle pathological case of zero-length match - index++; - input.move (1); - } - else - { - input.move (m.end[0]); - } - if (!input.isValid ()) - break; + all.add (m); + index = m.getEndIndex (); + if (m.end[0] == 0) + { // handle pathological case of zero-length match + index++; + input.move (1); + } + else + { + input.move (m.end[0]); + } + if (!input.isValid ()) + break; } return all.toArray (new REMatch[all.size ()]); } @@ -2092,7 +2092,7 @@ public class RE extends REToken input.setHitEnd (mymatch); if (firstToken == null) { - return next (input, mymatch); + return next (input, mymatch); } // Note the start of this subexpression @@ -2108,7 +2108,7 @@ public class RE extends REToken boolean b = match (input, mymatch); if (b) { - return mymatch; + return mymatch; } return null; } @@ -2168,78 +2168,78 @@ public class RE extends REToken * @param buffer The StringBuffer to save pre-match text in. * @return An REMatch instance referencing the match, or null if none. */ public REMatch getMatch (Object input, int index, int eflags, - CPStringBuilder buffer) + CPStringBuilder buffer) { return getMatchImpl (makeCharIndexed (input, index), index, eflags, - buffer); + buffer); } REMatch getMatchImpl (CharIndexed input, int anchor, int eflags, - CPStringBuilder buffer) + CPStringBuilder buffer) { boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0); boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0); RE re = (tryEntireMatch ? (RE) this.clone () : this); if (tryEntireMatch) { - RETokenEnd reEnd = new RETokenEnd (0, null); - reEnd.setFake (true); - re.chain (reEnd); + RETokenEnd reEnd = new RETokenEnd (0, null); + reEnd.setFake (true); + re.chain (reEnd); } // Create a new REMatch to hold results REMatch mymatch = new REMatch (numSubs, anchor, eflags); do { - /* The following potimization is commented out because - the matching should be tried even if the length of - input is obviously too short in order that - java.util.regex.Matcher#hitEnd() may work correctly. - // Optimization: check if anchor + minimumLength > length - if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) { - */ - if (re.match (input, mymatch)) - { - REMatch best = mymatch; - // We assume that the match that coms first is the best. - // And the following "The longer, the better" rule has - // been commented out. The longest is not neccesarily - // the best. For example, "a" out of "aaa" is the best - // match for /a+?/. - /* - // Find best match of them all to observe leftmost longest - while ((mymatch = mymatch.next) != null) { - if (mymatch.index > best.index) { - best = mymatch; - } - } - */ - best.end[0] = best.index; - best.finish (input); - input.setLastMatch (best); - return best; - } - /* End of the optimization commented out - } - */ - mymatch.clear (++anchor); - // Append character to buffer if needed - if (buffer != null && input.charAt (0) != CharIndexed.OUT_OF_BOUNDS) - { - buffer.append (input.charAt (0)); - } - // java.util.regex.Matcher#hitEnd() requires that the search should - // be tried at the end of input, so we use move1(1) instead of move(1) + /* The following potimization is commented out because + the matching should be tried even if the length of + input is obviously too short in order that + java.util.regex.Matcher#hitEnd() may work correctly. + // Optimization: check if anchor + minimumLength > length + if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) { + */ + if (re.match (input, mymatch)) + { + REMatch best = mymatch; + // We assume that the match that coms first is the best. + // And the following "The longer, the better" rule has + // been commented out. The longest is not neccesarily + // the best. For example, "a" out of "aaa" is the best + // match for /a+?/. + /* + // Find best match of them all to observe leftmost longest + while ((mymatch = mymatch.next) != null) { + if (mymatch.index > best.index) { + best = mymatch; + } + } + */ + best.end[0] = best.index; + best.finish (input); + input.setLastMatch (best); + return best; + } + /* End of the optimization commented out + } + */ + mymatch.clear (++anchor); + // Append character to buffer if needed + if (buffer != null && input.charAt (0) != CharIndexed.OUT_OF_BOUNDS) + { + buffer.append (input.charAt (0)); + } + // java.util.regex.Matcher#hitEnd() requires that the search should + // be tried at the end of input, so we use move1(1) instead of move(1) } while (doMove && input.move1 (1)); // Special handling at end of input for e.g. "$" if (minimumLength == 0) { - if (match (input, mymatch)) - { - mymatch.finish (input); - return mymatch; - } + if (match (input, mymatch)) + { + mymatch.finish (input); + return mymatch; + } } return null; @@ -2283,10 +2283,10 @@ public class RE extends REToken * set to the index position specified. */ public REMatchEnumeration getMatchEnumeration (Object input, int index, - int eflags) + int eflags) { return new REMatchEnumeration (this, makeCharIndexed (input, index), - index, eflags); + index, eflags); } @@ -2307,7 +2307,7 @@ public class RE extends REToken * Substitutes the replacement text for the first match found in the input * beginning at the specified index position. Specifying an index * effectively causes the regular expression engine to throw away the - * specified number of characters. + * specified number of characters. * * @param input The input text. * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto). @@ -2335,14 +2335,14 @@ public class RE extends REToken * @see REMatch#substituteInto */ public String substitute (Object input, String replace, int index, - int eflags) + int eflags) { return substituteImpl (makeCharIndexed (input, index), replace, index, - eflags); + eflags); } private String substituteImpl (CharIndexed input, String replace, int index, - int eflags) + int eflags) { CPStringBuilder buffer = new CPStringBuilder (); REMatch m = getMatchImpl (input, index, eflags, buffer); @@ -2351,17 +2351,17 @@ public class RE extends REToken buffer.append (getReplacement (replace, m, eflags)); if (input.move (m.end[0])) { - do - { - buffer.append (input.charAt (0)); - } - while (input.move (1)); + do + { + buffer.append (input.charAt (0)); + } + while (input.move (1)); } return buffer.toString (); } /** - * Substitutes the replacement text for each non-overlapping match found + * Substitutes the replacement text for each non-overlapping match found * in the input text. * * @param input The input text. @@ -2375,7 +2375,7 @@ public class RE extends REToken } /** - * Substitutes the replacement text for each non-overlapping match found + * Substitutes the replacement text for each non-overlapping match found * in the input text, starting at the specified index. * * If the regular expression allows the empty string to match, it will @@ -2394,7 +2394,7 @@ public class RE extends REToken } /** - * Substitutes the replacement text for each non-overlapping match found + * Substitutes the replacement text for each non-overlapping match found * in the input text, starting at the specified index and using the * specified execution flags. * @@ -2407,35 +2407,35 @@ public class RE extends REToken * @see REMatch#substituteInto */ public String substituteAll (Object input, String replace, int index, - int eflags) + int eflags) { return substituteAllImpl (makeCharIndexed (input, index), replace, index, - eflags); + eflags); } private String substituteAllImpl (CharIndexed input, String replace, - int index, int eflags) + int index, int eflags) { CPStringBuilder buffer = new CPStringBuilder (); REMatch m; while ((m = getMatchImpl (input, index, eflags, buffer)) != null) { - buffer.append (getReplacement (replace, m, eflags)); - index = m.getEndIndex (); - if (m.end[0] == 0) - { - char ch = input.charAt (0); - if (ch != CharIndexed.OUT_OF_BOUNDS) - buffer.append (ch); - input.move (1); - } - else - { - input.move (m.end[0]); - } - - if (!input.isValid ()) - break; + buffer.append (getReplacement (replace, m, eflags)); + index = m.getEndIndex (); + if (m.end[0] == 0) + { + char ch = input.charAt (0); + if (ch != CharIndexed.OUT_OF_BOUNDS) + buffer.append (ch); + input.move (1); + } + else + { + input.move (m.end[0]); + } + + if (!input.isValid ()) + break; } return buffer.toString (); } @@ -2446,36 +2446,36 @@ public class RE extends REToken return replace; else { - if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) - { - CPStringBuilder sb = new CPStringBuilder (); - int l = replace.length (); - for (int i = 0; i < l; i++) - { - char c = replace.charAt (i); - switch (c) - { - case '\\': - i++; - // Let StringIndexOutOfBoundsException be thrown. - sb.append (replace.charAt (i)); - break; - case '$': - int i1 = i + 1; - while (i1 < replace.length () && - Character.isDigit (replace.charAt (i1))) - i1++; - sb.append (m.substituteInto (replace.substring (i, i1))); - i = i1 - 1; - break; - default: - sb.append (c); - } - } - return sb.toString (); - } - else - return m.substituteInto (replace); + if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) + { + CPStringBuilder sb = new CPStringBuilder (); + int l = replace.length (); + for (int i = 0; i < l; i++) + { + char c = replace.charAt (i); + switch (c) + { + case '\\': + i++; + // Let StringIndexOutOfBoundsException be thrown. + sb.append (replace.charAt (i)); + break; + case '$': + int i1 = i + 1; + while (i1 < replace.length () && + Character.isDigit (replace.charAt (i1))) + i1++; + sb.append (m.substituteInto (replace.substring (i, i1))); + i = i1 - 1; + break; + default: + sb.append (c); + } + } + return sb.toString (); + } + else + return m.substituteInto (replace); } } @@ -2493,45 +2493,45 @@ public class RE extends REToken if (firstToken == null) { - lastToken = firstToken = next; + lastToken = firstToken = next; } else { - // if chain returns false, it "rejected" the token due to - // an optimization, and next was combined with lastToken - if (lastToken.chain (next)) - { - lastToken = next; - } + // if chain returns false, it "rejected" the token due to + // an optimization, and next was combined with lastToken + if (lastToken.chain (next)) + { + lastToken = next; + } } } private static REToken setRepeated (REToken current, int min, int max, - int index) throws REException + int index) throws REException { if (current == null) throw new REException (getLocalizedMessage ("repeat.no.token"), - REException.REG_BADRPT, index); + REException.REG_BADRPT, index); return new RETokenRepeated (current.subIndex, current, min, max); } private static int getPosixSet (char[]pattern, int index, - CPStringBuilder buf) + CPStringBuilder buf) { // Precondition: pattern[index-1] == ':' // we will return pos of closing ']'. int i; for (i = index; i < (pattern.length - 1); i++) { - if ((pattern[i] == ':') && (pattern[i + 1] == ']')) - return i + 2; - buf.append (pattern[i]); + if ((pattern[i] == ':') && (pattern[i + 1] == ']')) + return i + 2; + buf.append (pattern[i]); } - return index; // didn't match up + return index; // didn't match up } private int getMinMax (char[]input, int index, IntPair minMax, - RESyntax syntax) throws REException + RESyntax syntax) throws REException { // Precondition: input[index-1] == '{', minMax != null @@ -2539,11 +2539,11 @@ public class RE extends REToken int startIndex = index; if (index == input.length) { - if (mustMatch) - throw new REException (getLocalizedMessage ("unmatched.brace"), - REException.REG_EBRACE, index); - else - return startIndex; + if (mustMatch) + throw new REException (getLocalizedMessage ("unmatched.brace"), + REException.REG_EBRACE, index); + else + return startIndex; } int min, max = 0; @@ -2553,20 +2553,20 @@ public class RE extends REToken // Read string of digits do { - index = getCharUnit (input, index, unit, false); - if (Character.isDigit (unit.ch)) - buf.append (unit.ch); + index = getCharUnit (input, index, unit, false); + if (Character.isDigit (unit.ch)) + buf.append (unit.ch); } while ((index != input.length) && Character.isDigit (unit.ch)); // Check for {} tomfoolery if (buf.length () == 0) { - if (mustMatch) - throw new REException (getLocalizedMessage ("interval.error"), - REException.REG_EBRACE, index); - else - return startIndex; + if (mustMatch) + throw new REException (getLocalizedMessage ("interval.error"), + REException.REG_EBRACE, index); + else + return startIndex; } min = Integer.parseInt (buf.toString ()); @@ -2575,8 +2575,8 @@ public class RE extends REToken max = min; else if (index == input.length) if (mustMatch) - throw new REException (getLocalizedMessage ("interval.no.end"), - REException.REG_EBRACE, index); + throw new REException (getLocalizedMessage ("interval.no.end"), + REException.REG_EBRACE, index); else return startIndex; else @@ -2585,28 +2585,28 @@ public class RE extends REToken buf = new CPStringBuilder (); // Read string of digits while (((index = - getCharUnit (input, index, unit, false)) != input.length) - && Character.isDigit (unit.ch)) - buf.append (unit.ch); + getCharUnit (input, index, unit, false)) != input.length) + && Character.isDigit (unit.ch)) + buf.append (unit.ch); if (! - ((unit.ch == '}') - && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk))) - if (mustMatch) - throw new REException (getLocalizedMessage ("interval.error"), - REException.REG_EBRACE, index); + ((unit.ch == '}') + && (syntax.get (RESyntax.RE_NO_BK_BRACES) ^ unit.bk))) + if (mustMatch) + throw new REException (getLocalizedMessage ("interval.error"), + REException.REG_EBRACE, index); else return startIndex; // This is the case of {x,} if (buf.length () == 0) - max = Integer.MAX_VALUE; + max = Integer.MAX_VALUE; else - max = Integer.parseInt (buf.toString ()); + max = Integer.parseInt (buf.toString ()); } else if (mustMatch) throw new REException (getLocalizedMessage ("interval.error"), - REException.REG_EBRACE, index); + REException.REG_EBRACE, index); else return startIndex; @@ -2651,13 +2651,13 @@ public class RE extends REToken // be the most likely because this is the case with // java.util.regex.Matcher. // We could let a String or a CharSequence fall through - // to final input, but since it'a very likely input type, + // to final input, but since it'a very likely input type, // we check it first. if (input instanceof CharIndexed) { - CharIndexed ci = (CharIndexed) input; - ci.setAnchor (index); - return ci; + CharIndexed ci = (CharIndexed) input; + ci.setAnchor (index); + return ci; } else if (input instanceof CharSequence) return new CharIndexedCharSequence ((CharSequence) input, index); |