diff options
author | Paul McGuire <ptmcg@austin.rr.com> | 2019-08-18 22:47:08 -0500 |
---|---|---|
committer | Paul McGuire <ptmcg@austin.rr.com> | 2019-08-18 22:47:08 -0500 |
commit | 072f0ddd3c7907bf061c12a21d352a0f31001508 (patch) | |
tree | e9e9cf107a701d643d85403c98019cb3de1d4b1a | |
parent | d5c036d138d83160dfb9ad24c87d42f2d25dd7c3 (diff) | |
download | pyparsing-git-072f0ddd3c7907bf061c12a21d352a0f31001508.tar.gz |
Add regex range collapsing to compress large character ranges for faster re performance; update CHANGES to reflect new booleansearchparser example
-rw-r--r-- | CHANGES | 11 | ||||
-rw-r--r-- | pyparsing.py | 50 |
2 files changed, 45 insertions, 16 deletions
@@ -55,6 +55,13 @@ Version 2.5.0a1 suppression. As part of resolution to a question posted by John Greene on StackOverflow. +- Potentially *huge* performance enhancement when parsing Word + expressions built from pyparsing_unicode character sets. Word now + internally converts ranges of consecutive characters to regex + character ranges (converting "0123456789" to "0-9" for instance), + resulting in as much as 50X improvement in performance! Work + inspired by a question posted by Midnighter on StackOverflow. + - Fixed bug in CloseMatch where end location was incorrectly computed; and updated partial_gene_match.py example. @@ -65,6 +72,10 @@ Version 2.5.0a1 - BigQueryViewParser.py added to examples directory, PR submitted by Michael Smedberg, nice work! +- booleansearchparser.py added to examples directory, PR submitted + by xecgr. Builds on searchparser.py, adding support for '*' + wildcards and non-Western alphabets. + Version 2.4.2 - July, 2019 -------------------------- diff --git a/pyparsing.py b/pyparsing.py index 43f1abc..d6c0c34 100644 --- a/pyparsing.py +++ b/pyparsing.py @@ -96,7 +96,7 @@ classes inherit from. Use the docstrings for examples of how to: """ __version__ = "2.5.0a1" -__versionTime__ = "10 Aug 2019 11:56 UTC" +__versionTime__ = "19 Aug 2019 03:39 UTC" __author__ = "Paul McGuire <ptmcg@users.sourceforge.net>" import string @@ -239,16 +239,6 @@ singleArgBuiltins = [sum, len, sorted, reversed, list, tuple, set, any, all, min _generatorType = type((y for y in range(1))) -def _xml_escape(data): - """Escape &, <, >, ", ', etc. in a string of data.""" - - # ampersand must be replaced first - from_symbols = '&><"\'' - to_symbols = ('&' + s + ';' for s in "amp gt lt quot apos".split()) - for from_, to_ in zip(from_symbols, to_symbols): - data = data.replace(from_, to_) - return data - alphas = string.ascii_uppercase + string.ascii_lowercase nums = "0123456789" hexnums = nums + "ABCDEFabcdef" @@ -2987,13 +2977,13 @@ class Word(Token): if ' ' not in self.initCharsOrig + self.bodyCharsOrig and (min == 1 and max == 0 and exact == 0): if self.bodyCharsOrig == self.initCharsOrig: - self.reString = "[%s]+" % _escapeRegexRangeChars(self.initCharsOrig) + self.reString = "[%s]+" % _collapseAndEscapeRegexRangeChars(self.initCharsOrig) elif len(self.initCharsOrig) == 1: self.reString = "%s[%s]*" % (re.escape(self.initCharsOrig), - _escapeRegexRangeChars(self.bodyCharsOrig),) + _collapseAndEscapeRegexRangeChars(self.bodyCharsOrig),) else: - self.reString = "[%s][%s]*" % (_escapeRegexRangeChars(self.initCharsOrig), - _escapeRegexRangeChars(self.bodyCharsOrig),) + self.reString = "[%s][%s]*" % (_collapseAndEscapeRegexRangeChars(self.initCharsOrig), + _collapseAndEscapeRegexRangeChars(self.bodyCharsOrig),) if self.asKeyword: self.reString = r"\b" + self.reString + r"\b" @@ -3071,7 +3061,7 @@ class Char(_WordRegex): """ def __init__(self, charset, asKeyword=False, excludeChars=None): super().__init__(charset, exact=1, asKeyword=asKeyword, excludeChars=excludeChars) - self.reString = "[%s]" % _escapeRegexRangeChars(''.join(self.initChars)) + self.reString = "[%s]" % _collapseAndEscapeRegexRangeChars(self.initChars) if asKeyword: self.reString = r"\b%s\b" % self.reString self.re = re.compile(self.reString) @@ -5301,6 +5291,34 @@ def _escapeRegexRangeChars(s): s = s.replace("\t", r"\t") return str(s) +def _collapseAndEscapeRegexRangeChars(s): + def is_consecutive(c): + c_int = ord(c) + is_consecutive.prev, prev = c_int, is_consecutive.prev + if c_int - prev > 1: + is_consecutive.value = next(is_consecutive.counter) + return is_consecutive.value + + is_consecutive.prev = 0 + is_consecutive.counter = itertools.count() + is_consecutive.value = -1 + + def escape_re_range_char(c): + return '\\' + c if c in r"\^-]" else c + + ret = [] + for _, chars in itertools.groupby(sorted(s), key=is_consecutive): + first = last = next(chars) + for c in chars: + last = c + if first == last: + ret.append(first) + else: + ret.append("{}-{}".format(escape_re_range_char(first), + escape_re_range_char(last))) + return ''.join(ret) + + def oneOf(strs, caseless=False, useRegex=True, asKeyword=False): """Helper to quickly define a set of alternative Literals, and makes sure to do longest-first testing when there is a conflict, |