diff options
author | Paul McGuire <ptmcg@austin.rr.com> | 2019-08-05 23:58:25 -0500 |
---|---|---|
committer | Paul McGuire <ptmcg@austin.rr.com> | 2019-08-05 23:58:25 -0500 |
commit | 123e83037b5de90fb964e96267bd60e90c70db19 (patch) | |
tree | 5695330948a33d42790b54d2049671e670729c87 | |
parent | 9ca5931db90a487085851fd9f847066fd48ae55b (diff) | |
download | pyparsing-git-123e83037b5de90fb964e96267bd60e90c70db19.tar.gz |
Fixed bug in CloseMatch where end location was incorrectly computed; and updated partial_gene_match.py example.
-rw-r--r-- | CHANGES | 3 | ||||
-rw-r--r-- | examples/partial_gene_match.py | 139 | ||||
-rw-r--r-- | pyparsing.py | 4 |
3 files changed, 56 insertions, 90 deletions
@@ -38,6 +38,9 @@ Version 2.5.0a1 suppression. As part of resolution to a question posted by John Greene on StackOverflow. +- Fixed bug in CloseMatch where end location was incorrectly + computed; and updated partial_gene_match.py example. + - BigQueryViewParser.py added to examples directory, PR submitted by Michael Smedberg, nice work! diff --git a/examples/partial_gene_match.py b/examples/partial_gene_match.py index e19cf8b..8ca9c44 100644 --- a/examples/partial_gene_match.py +++ b/examples/partial_gene_match.py @@ -1,88 +1,51 @@ -# parital_gene_match.py
-#
-# Example showing how to create a customized pyparsing Token, in this case,
-# one that is very much like Literal, but which tolerates up to 'n' character
-# mismatches
-from pyparsing import *
-
-import urllib.request, urllib.parse, urllib.error
-
-# read in a bunch of genomic data
-datafile = urllib.request.urlopen("http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta")
-fastasrc = datafile.read()
-datafile.close()
-
-"""
-Sample header:
->NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
-"""
-integer = Word(nums).setParseAction(lambda t:int(t[0]))
-genebit = Group(">" + Word(alphanums.upper()+"-_") + "|" +
- Word(printables)("id") + SkipTo("length=", include=True) +
- integer("genelen") + LineEnd() +
- Combine(OneOrMore(Word("ACGTN")),adjacent=False)("gene"))
-
-# read gene data from .fasta file - takes just a few seconds
-genedata = OneOrMore(genebit).parseString(fastasrc)
-
-
-class CloseMatch(Token):
- """A special subclass of Token that does *close* matches. For each
- close match of the given string, a tuple is returned giving the
- found close match, and a list of mismatch positions."""
- def __init__(self, seq, maxMismatches=1):
- super(CloseMatch,self).__init__()
- self.name = seq
- self.sequence = seq
- self.maxMismatches = maxMismatches
- self.errmsg = "Expected " + self.sequence
- self.mayIndexError = False
- self.mayReturnEmpty = False
-
- def parseImpl( self, instring, loc, doActions=True ):
- start = loc
- instrlen = len(instring)
- maxloc = start + len(self.sequence)
-
- if maxloc <= instrlen:
- seq = self.sequence
- seqloc = 0
- mismatches = []
- throwException = False
- done = False
- while loc < maxloc and not done:
- if instring[loc] != seq[seqloc]:
- mismatches.append(seqloc)
- if len(mismatches) > self.maxMismatches:
- throwException = True
- done = True
- loc += 1
- seqloc += 1
- else:
- throwException = True
-
- if throwException:
- exc = self.myException
- exc.loc = loc
- exc.pstr = instring
- raise exc
-
- return loc, (instring[start:loc],mismatches)
-
-# using the genedata extracted above, look for close matches of a gene sequence
-searchseq = CloseMatch("TTAAATCTAGAAGAT", 3)
-for g in genedata:
- print("%s (%d)" % (g.id, g.genelen))
- print("-"*24)
- for t,startLoc,endLoc in searchseq.scanString(g.gene, overlap=True):
- matched, mismatches = t[0]
- print("MATCH:", searchseq.sequence)
- print("FOUND:", matched)
- if mismatches:
- print(" ", ''.join(' ' if i not in mismatches else '*'
- for i,c in enumerate(searchseq.sequence)))
- else:
- print("<exact match>")
- print("at location", startLoc)
- print()
- print()
+# parital_gene_match.py +# +# Example showing how to use the CloseMatch class, to find strings in a gene with up to 'n' mismatches +# +import pyparsing as pp + +import urllib.request, urllib.parse, urllib.error +from contextlib import closing + +# read in a bunch of genomic data +data_url = "http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicoplastORFsNAs_ToxoDB-6.0.fasta" +with closing(urllib.request.urlopen(data_url)) as datafile: + fastasrc = datafile.read().decode() + +""" +Sample header: +>NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201 +""" +integer = pp.Word(pp.nums).setParseAction(lambda t:int(t[0])) +genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id") + + "|" + pp.Word(pp.printables)("organism") + + "|" + pp.Word(pp.printables)("location") + + "|" + "length=" + integer("gene_len") + + pp.LineEnd() + + pp.Word("ACGTN")[1, ...].addParseAction(''.join)("gene")) + +# read gene data from .fasta file - takes just a few seconds +genedata = genebit[1, ...].parseString(fastasrc) + +# using the genedata extracted above, look for close matches of a gene sequence +searchseq = pp.CloseMatch("TTAAATCTAGAAGAT", 3) + +for g in genedata: + show_header = True + for t, startLoc, endLoc in searchseq.scanString(g.gene, overlap=True): + if show_header: + print("%s/%s/%s (%d)" % (g.gene_id, g.organism, g.location, g.gene_len)) + print("-" * 24) + show_header = False + + matched = t[0] + mismatches = t['mismatches'] + print("MATCH:", searchseq.match_string) + print("FOUND:", matched) + if mismatches: + print(" ", ''.join('*' if i in mismatches else ' ' + for i, c in enumerate(searchseq.match_string))) + else: + print("<exact match>") + print("at location", startLoc) + print() diff --git a/pyparsing.py b/pyparsing.py index 92e2929..23b36dc 100644 --- a/pyparsing.py +++ b/pyparsing.py @@ -96,7 +96,7 @@ classes inherit from. Use the docstrings for examples of how to: """ __version__ = "2.5.0a1" -__versionTime__ = "06 Aug 2019 01:12 UTC" +__versionTime__ = "06 Aug 2019 04:55 UTC" __author__ = "Paul McGuire <ptmcg@users.sourceforge.net>" import string @@ -2843,7 +2843,7 @@ class CloseMatch(Token): if len(mismatches) > maxMismatches: break else: - loc = match_stringloc + 1 + loc = start + match_stringloc + 1 results = ParseResults([instring[start:loc]]) results['original'] = match_string results['mismatches'] = mismatches |