Fixed bug in CloseMatch where end location was incorrectly computed; and updated partial_gene_match.py example.

author: Paul McGuire <ptmcg@austin.rr.com> 2019-08-05 23:58:25 -0500
committer: Paul McGuire <ptmcg@austin.rr.com> 2019-08-05 23:58:25 -0500
commit: 123e83037b5de90fb964e96267bd60e90c70db19 (patch)
tree: 5695330948a33d42790b54d2049671e670729c87
parent: 9ca5931db90a487085851fd9f847066fd48ae55b (diff)
download: pyparsing-git-123e83037b5de90fb964e96267bd60e90c70db19.tar.gz
3 files changed, 56 insertions, 90 deletions
diff --git a/CHANGES b/CHANGES
index 612de6c..68503c7 100644
--- a/CHANGES
+++ b/CHANGES
@@ -38,6 +38,9 @@ Version 2.5.0a1
   suppression. As part of resolution to a question posted by John
   Greene on StackOverflow.
 
+- Fixed bug in CloseMatch where end location was incorrectly
+  computed; and updated partial_gene_match.py example.
+
 - BigQueryViewParser.py added to examples directory, PR submitted
   by Michael Smedberg, nice work!
 
diff --git a/examples/partial_gene_match.py b/examples/partial_gene_match.py
index e19cf8b..8ca9c44 100644
--- a/examples/partial_gene_match.py
+++ b/examples/partial_gene_match.py
@@ -1,88 +1,51 @@
-# parital_gene_match.py
-#
-#  Example showing how to create a customized pyparsing Token, in this case,
-#  one that is very much like Literal, but which tolerates up to 'n' character
-#  mismatches
-from pyparsing import *
-
-import urllib.request, urllib.parse, urllib.error
-
-# read in a bunch of genomic data
-datafile = urllib.request.urlopen("http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta")
-fastasrc = datafile.read()
-datafile.close()
-
-"""
-Sample header:
->NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
-"""
-integer = Word(nums).setParseAction(lambda t:int(t[0]))
-genebit = Group(">" + Word(alphanums.upper()+"-_") + "|" +
-                Word(printables)("id") + SkipTo("length=", include=True) +
-                integer("genelen") + LineEnd() +
-                Combine(OneOrMore(Word("ACGTN")),adjacent=False)("gene"))
-
-# read gene data from .fasta file - takes just a few seconds
-genedata = OneOrMore(genebit).parseString(fastasrc)
-
-
-class CloseMatch(Token):
-    """A special subclass of Token that does *close* matches. For each
-       close match of the given string, a tuple is returned giving the
-       found close match, and a list of mismatch positions."""
-    def __init__(self, seq, maxMismatches=1):
-        super(CloseMatch,self).__init__()
-        self.name = seq
-        self.sequence = seq
-        self.maxMismatches = maxMismatches
-        self.errmsg = "Expected " + self.sequence
-        self.mayIndexError = False
-        self.mayReturnEmpty = False
-
-    def parseImpl( self, instring, loc, doActions=True ):
-        start = loc
-        instrlen = len(instring)
-        maxloc = start + len(self.sequence)
-
-        if maxloc <= instrlen:
-            seq = self.sequence
-            seqloc = 0
-            mismatches = []
-            throwException = False
-            done = False
-            while loc < maxloc and not done:
-                if instring[loc] != seq[seqloc]:
-                    mismatches.append(seqloc)
-                    if len(mismatches) > self.maxMismatches:
-                        throwException = True
-                        done = True
-                loc += 1
-                seqloc += 1
-        else:
-            throwException = True
-
-        if throwException:
-            exc = self.myException
-            exc.loc = loc
-            exc.pstr = instring
-            raise exc
-
-        return loc, (instring[start:loc],mismatches)
-
-# using the genedata extracted above, look for close matches of a gene sequence
-searchseq = CloseMatch("TTAAATCTAGAAGAT", 3)
-for g in genedata:
-    print("%s (%d)" % (g.id, g.genelen))
-    print("-"*24)
-    for t,startLoc,endLoc in searchseq.scanString(g.gene, overlap=True):
-        matched, mismatches = t[0]
-        print("MATCH:", searchseq.sequence)
-        print("FOUND:", matched)
-        if mismatches:
-            print("      ", ''.join(' ' if i not in mismatches else '*'
-                            for i,c in enumerate(searchseq.sequence)))
-        else:
-            print("<exact match>")
-        print("at location", startLoc)
-        print()
-    print()
+# parital_gene_match.py
+#
+#  Example showing how to use the CloseMatch class, to find strings in a gene with up to 'n' mismatches
+#
+import pyparsing as pp
+
+import urllib.request, urllib.parse, urllib.error
+from contextlib import closing
+
+# read in a bunch of genomic data
+data_url = "http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta"
+with closing(urllib.request.urlopen(data_url)) as datafile:
+    fastasrc = datafile.read().decode()
+
+"""
+Sample header:
+>NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
+"""
+integer = pp.Word(pp.nums).setParseAction(lambda t:int(t[0]))
+genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id")
+                   + "|" + pp.Word(pp.printables)("organism")
+                   + "|" + pp.Word(pp.printables)("location")
+                   + "|" + "length=" + integer("gene_len")
+                   + pp.LineEnd()
+                   + pp.Word("ACGTN")[1, ...].addParseAction(''.join)("gene"))
+
+# read gene data from .fasta file - takes just a few seconds
+genedata = genebit[1, ...].parseString(fastasrc)
+
+# using the genedata extracted above, look for close matches of a gene sequence
+searchseq = pp.CloseMatch("TTAAATCTAGAAGAT", 3)
+
+for g in genedata:
+    show_header = True
+    for t, startLoc, endLoc in searchseq.scanString(g.gene, overlap=True):
+        if show_header:
+            print("%s/%s/%s (%d)" % (g.gene_id, g.organism, g.location, g.gene_len))
+            print("-" * 24)
+            show_header = False
+
+        matched = t[0]
+        mismatches = t['mismatches']
+        print("MATCH:", searchseq.match_string)
+        print("FOUND:", matched)
+        if mismatches:
+            print("      ", ''.join('*' if i in mismatches else ' '
+                                    for i, c in enumerate(searchseq.match_string)))
+        else:
+            print("<exact match>")
+        print("at location", startLoc)
+        print()
diff --git a/pyparsing.py b/pyparsing.py
index 92e2929..23b36dc 100644
--- a/pyparsing.py
+++ b/pyparsing.py
@@ -96,7 +96,7 @@ classes inherit from. Use the docstrings for examples of how to:
 """
 
 __version__ = "2.5.0a1"
-__versionTime__ = "06 Aug 2019 01:12 UTC"
+__versionTime__ = "06 Aug 2019 04:55 UTC"
 __author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"
 
 import string
@@ -2843,7 +2843,7 @@ class CloseMatch(Token):
                     if len(mismatches) > maxMismatches:
                         break
             else:
-                loc = match_stringloc + 1
+                loc = start + match_stringloc + 1
                 results = ParseResults([instring[start:loc]])
                 results['original'] = match_string
                 results['mismatches'] = mismatches
author	Paul McGuire <ptmcg@austin.rr.com>	2019-08-05 23:58:25 -0500
committer	Paul McGuire <ptmcg@austin.rr.com>	2019-08-05 23:58:25 -0500
commit	123e83037b5de90fb964e96267bd60e90c70db19 (patch)
tree	5695330948a33d42790b54d2049671e670729c87
parent	9ca5931db90a487085851fd9f847066fd48ae55b (diff)
download	pyparsing-git-123e83037b5de90fb964e96267bd60e90c70db19.tar.gz