From f1fc1015646028bee1a8c5052e85def25847324f Mon Sep 17 00:00:00 2001
From: ptmcg
Date: Wed, 17 Aug 2016 23:14:44 +0000
Subject: Added CloseMatch class

git-svn-id: svn://svn.code.sf.net/p/pyparsing/code/trunk@425 9bf210a0-9d2d-494c-87cf-cfb32e7dff7b
---
 src/CHANGES      |  7 ++++++
 src/pyparsing.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 src/unitTests.py | 28 +++++++++++++++++++++++
 3 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/src/CHANGES b/src/CHANGES
index c2156cc..40c08ff 100644
--- a/src/CHANGES
+++ b/src/CHANGES
@@ -2,6 +2,13 @@
 Change Log
 ==========
 
+Version 2.1.9 -
+------------------------------
+- Added class CloseMatch, a variation on Literal which matches
+  "close" matches, that is, strings with at most 'n' mismatching
+  characters.
+
+
 Version 2.1.8 -
 ------------------------------
 - Fixed issue in the optimization to _trim_arity, when the full
diff --git a/src/pyparsing.py b/src/pyparsing.py
index 5b63a73..269e789 100644
--- a/src/pyparsing.py
+++ b/src/pyparsing.py
@@ -61,7 +61,7 @@ The pyparsing module handles some of the problems that are typically vexing when
 """
 
 __version__ = "2.1.9"
-__versionTime__ = "15 Aug 2016 18:14 UTC"
+__versionTime__ = "17 Aug 2016 23:06 UTC"
 __author__ = "Paul McGuire "
 
 import string
@@ -110,7 +110,7 @@ __all__ = [
 'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd',
 'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute',
 'indentedBlock', 'originalTextFor', 'ungroup', 'infixNotation','locatedExpr', 'withClass',
-'tokenMap', 'pyparsing_common',
+'CloseMatch', 'tokenMap', 'pyparsing_common',
 ]
 
 system_version = tuple(sys.version_info)[:3]
@@ -294,7 +294,7 @@ class _ParseResultsWithOffset(object):
     def __getitem__(self,i):
         return self.tup[i]
     def __repr__(self):
-        return repr(self.tup)
+        return repr(self.tup[0])
     def setOffset(self,i):
         self.tup = (self.tup[0],i)
 
@@ -2491,6 +2491,67 @@ class CaselessKeyword(Keyword):
             return loc+self.matchLen, self.match
         raise ParseException(instring, loc, self.errmsg, self)
 
+class CloseMatch(Token):
+    """
+    A variation on L{Literal} which matches "close" matches, that is,
+    strings with at most 'n' mismatching characters. C{CloseMatch} takes parameters:
+     - C{match_string} - string to be matched
+     - C{maxMismatches} - (C{default=1}) maximum number of mismatches allowed to count as a match
+
+    The results from a successful parse will contain the matched text from the input string and the following named results:
+     - C{mismatches} - a list of the positions within the match_string where mismatches were found
+     - C{original} - the original match_string used to compare against the input string
+
+    If C{mismatches} is an empty list, then the match was an exact match.
+
+    Example::
+        patt = CloseMatch("ATCATCGAATGGA")
+        patt.parseString("ATCATCGAAXGGA")  # -> (['ATCATCGAAXGGA'], {'mismatches': [[9]], 'original': ['ATCATCGAATGGA']})
+        patt.parseString("ATCAXCGAAXGGA")  # -> Exception: Expected 'ATCATCGAATGGA' (with up to 1 mismatches) (at char 0), (line:1, col:1)
+
+        # exact match
+        patt.parseString("ATCATCGAATGGA")  # -> (['ATCATCGAATGGA'], {'mismatches': [[]], 'original': ['ATCATCGAATGGA']})
+
+        # close match allowing up to 2 mismatches
+        patt = CloseMatch("ATCATCGAATGGA", maxMismatches=2)
+        patt.parseString("ATCAXCGAAXGGA")  # -> (['ATCAXCGAAXGGA'], {'mismatches': [[4, 9]], 'original': ['ATCATCGAATGGA']})
+    """
+    def __init__(self, match_string, maxMismatches=1):
+        super(CloseMatch,self).__init__()
+        self.name = match_string
+        self.match_string = match_string
+        self.maxMismatches = maxMismatches
+        self.errmsg = "Expected %r (with up to %d mismatches)" % (self.match_string, self.maxMismatches)
+        self.mayIndexError = False
+        self.mayReturnEmpty = False
+
+    def parseImpl( self, instring, loc, doActions=True ):
+        start = loc
+        instrlen = len(instring)
+        maxloc = start + len(self.match_string)
+
+        if maxloc <= instrlen:  # only compare when the whole pattern fits in the remaining input
+            match_string = self.match_string
+            match_stringloc = 0
+            mismatches = []
+            maxMismatches = self.maxMismatches
+
+            for match_stringloc,s_m in enumerate(zip(instring[loc:maxloc], match_string)):
+                src,mat = s_m
+                if src != mat:
+                    mismatches.append(match_stringloc)
+                    if len(mismatches) > maxMismatches:
+                        break
+            else:  # loop finished without break: mismatch budget was not exceeded
+                loc = start + match_stringloc + 1  # match_stringloc is an offset from start, not a position in instring
+                results = ParseResults([instring[start:loc]])
+                results['original'] = match_string
+                results['mismatches'] = mismatches
+                return loc, results
+
+        raise ParseException(instring, loc, self.errmsg, self)
+
+
 class Word(Token):
     """
     Token for matching words composed of allowed character sets.
diff --git a/src/unitTests.py b/src/unitTests.py
index b80a9f4..6a46154 100644
--- a/src/unitTests.py
+++ b/src/unitTests.py
@@ -3100,7 +3100,35 @@ class InlineLiteralsUsingTest(ParseTestCase):
 
         result = date_str.parseString("1999/12/31")  # -> ['1999', '12', '31']
         assert result.asList() == ['1999', '12', '31'], "inlineLiteralsUsing(example 2) failed!"
+class CloseMatchTest(ParseTestCase):
+    def runTest(self):
+        import pyparsing as pp
+
+        searchseq = pp.CloseMatch("ATCATCGAATGGA", 2)
 
+        _, results = searchseq.runTests("""
+            ATCATCGAATGGA
+            XTCATCGAATGGX
+            ATCATCGAAXGGA
+            ATCAXXGAATGGA
+            ATCAXXGAATGXA
+            ATCAXXGAATGG
+            """)
+        expected = (
+            [],
+            [0,12],
+            [9],
+            [4,5],
+            None,
+            None
+            )
+
+        for r,exp in zip(results, expected):
+            if exp is not None:
+                assert r[1].mismatches == exp, "fail CloseMatch between %r and %r" % (searchseq.match_string, r[0])
+            print(r[0], 'exc: %s' % r[1] if exp is None and isinstance(r[1], Exception)
+                                          else ("no match", "match")[r[1].mismatches == exp])
+
 class MiscellaneousParserTests(ParseTestCase):
     def runTest(self):
         import pyparsing
-- 
cgit v1.2.1