summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul McGuire <ptmcg@austin.rr.com>2019-08-05 23:58:25 -0500
committerPaul McGuire <ptmcg@austin.rr.com>2019-08-05 23:58:25 -0500
commit123e83037b5de90fb964e96267bd60e90c70db19 (patch)
tree5695330948a33d42790b54d2049671e670729c87
parent9ca5931db90a487085851fd9f847066fd48ae55b (diff)
downloadpyparsing-git-123e83037b5de90fb964e96267bd60e90c70db19.tar.gz
Fixed bug in CloseMatch where end location was incorrectly computed; and updated partial_gene_match.py example.
-rw-r--r--CHANGES3
-rw-r--r--examples/partial_gene_match.py139
-rw-r--r--pyparsing.py4
3 files changed, 56 insertions, 90 deletions
diff --git a/CHANGES b/CHANGES
index 612de6c..68503c7 100644
--- a/CHANGES
+++ b/CHANGES
@@ -38,6 +38,9 @@ Version 2.5.0a1
suppression. As part of resolution to a question posted by John
Greene on StackOverflow.
+- Fixed bug in CloseMatch where end location was incorrectly
+ computed; and updated partial_gene_match.py example.
+
- BigQueryViewParser.py added to examples directory, PR submitted
by Michael Smedberg, nice work!
diff --git a/examples/partial_gene_match.py b/examples/partial_gene_match.py
index e19cf8b..8ca9c44 100644
--- a/examples/partial_gene_match.py
+++ b/examples/partial_gene_match.py
@@ -1,88 +1,51 @@
-# parital_gene_match.py
-#
-# Example showing how to create a customized pyparsing Token, in this case,
-# one that is very much like Literal, but which tolerates up to 'n' character
-# mismatches
-from pyparsing import *
-
-import urllib.request, urllib.parse, urllib.error
-
-# read in a bunch of genomic data
-datafile = urllib.request.urlopen("http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta")
-fastasrc = datafile.read()
-datafile.close()
-
-"""
-Sample header:
->NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
-"""
-integer = Word(nums).setParseAction(lambda t:int(t[0]))
-genebit = Group(">" + Word(alphanums.upper()+"-_") + "|" +
- Word(printables)("id") + SkipTo("length=", include=True) +
- integer("genelen") + LineEnd() +
- Combine(OneOrMore(Word("ACGTN")),adjacent=False)("gene"))
-
-# read gene data from .fasta file - takes just a few seconds
-genedata = OneOrMore(genebit).parseString(fastasrc)
-
-
-class CloseMatch(Token):
- """A special subclass of Token that does *close* matches. For each
- close match of the given string, a tuple is returned giving the
- found close match, and a list of mismatch positions."""
- def __init__(self, seq, maxMismatches=1):
- super(CloseMatch,self).__init__()
- self.name = seq
- self.sequence = seq
- self.maxMismatches = maxMismatches
- self.errmsg = "Expected " + self.sequence
- self.mayIndexError = False
- self.mayReturnEmpty = False
-
- def parseImpl( self, instring, loc, doActions=True ):
- start = loc
- instrlen = len(instring)
- maxloc = start + len(self.sequence)
-
- if maxloc <= instrlen:
- seq = self.sequence
- seqloc = 0
- mismatches = []
- throwException = False
- done = False
- while loc < maxloc and not done:
- if instring[loc] != seq[seqloc]:
- mismatches.append(seqloc)
- if len(mismatches) > self.maxMismatches:
- throwException = True
- done = True
- loc += 1
- seqloc += 1
- else:
- throwException = True
-
- if throwException:
- exc = self.myException
- exc.loc = loc
- exc.pstr = instring
- raise exc
-
- return loc, (instring[start:loc],mismatches)
-
-# using the genedata extracted above, look for close matches of a gene sequence
-searchseq = CloseMatch("TTAAATCTAGAAGAT", 3)
-for g in genedata:
- print("%s (%d)" % (g.id, g.genelen))
- print("-"*24)
- for t,startLoc,endLoc in searchseq.scanString(g.gene, overlap=True):
- matched, mismatches = t[0]
- print("MATCH:", searchseq.sequence)
- print("FOUND:", matched)
- if mismatches:
- print(" ", ''.join(' ' if i not in mismatches else '*'
- for i,c in enumerate(searchseq.sequence)))
- else:
- print("<exact match>")
- print("at location", startLoc)
- print()
- print()
+# parital_gene_match.py
+#
+# Example showing how to use the CloseMatch class, to find strings in a gene with up to 'n' mismatches
+#
+import pyparsing as pp
+
+import urllib.request, urllib.parse, urllib.error
+from contextlib import closing
+
+# read in a bunch of genomic data
+data_url = "http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta"
+with closing(urllib.request.urlopen(data_url)) as datafile:
+ fastasrc = datafile.read().decode()
+
+"""
+Sample header:
+>NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
+"""
+integer = pp.Word(pp.nums).setParseAction(lambda t:int(t[0]))
+genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id")
+ + "|" + pp.Word(pp.printables)("organism")
+ + "|" + pp.Word(pp.printables)("location")
+ + "|" + "length=" + integer("gene_len")
+ + pp.LineEnd()
+ + pp.Word("ACGTN")[1, ...].addParseAction(''.join)("gene"))
+
+# read gene data from .fasta file - takes just a few seconds
+genedata = genebit[1, ...].parseString(fastasrc)
+
+# using the genedata extracted above, look for close matches of a gene sequence
+searchseq = pp.CloseMatch("TTAAATCTAGAAGAT", 3)
+
+for g in genedata:
+ show_header = True
+ for t, startLoc, endLoc in searchseq.scanString(g.gene, overlap=True):
+ if show_header:
+ print("%s/%s/%s (%d)" % (g.gene_id, g.organism, g.location, g.gene_len))
+ print("-" * 24)
+ show_header = False
+
+ matched = t[0]
+ mismatches = t['mismatches']
+ print("MATCH:", searchseq.match_string)
+ print("FOUND:", matched)
+ if mismatches:
+ print(" ", ''.join('*' if i in mismatches else ' '
+ for i, c in enumerate(searchseq.match_string)))
+ else:
+ print("<exact match>")
+ print("at location", startLoc)
+ print()
diff --git a/pyparsing.py b/pyparsing.py
index 92e2929..23b36dc 100644
--- a/pyparsing.py
+++ b/pyparsing.py
@@ -96,7 +96,7 @@ classes inherit from. Use the docstrings for examples of how to:
"""
__version__ = "2.5.0a1"
-__versionTime__ = "06 Aug 2019 01:12 UTC"
+__versionTime__ = "06 Aug 2019 04:55 UTC"
__author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"
import string
@@ -2843,7 +2843,7 @@ class CloseMatch(Token):
if len(mismatches) > maxMismatches:
break
else:
- loc = match_stringloc + 1
+ loc = start + match_stringloc + 1
results = ParseResults([instring[start:loc]])
results['original'] = match_string
results['mismatches'] = mismatches