diff options
Diffstat (limited to 'examples/partial_gene_match.py')
-rw-r--r-- | examples/partial_gene_match.py | 130 |
1 files changed, 65 insertions, 65 deletions
diff --git a/examples/partial_gene_match.py b/examples/partial_gene_match.py index 8bf5f7c..e19cf8b 100644 --- a/examples/partial_gene_match.py +++ b/examples/partial_gene_match.py @@ -1,6 +1,6 @@ # parital_gene_match.py
#
-# Example showing how to create a customized pyparsing Token, in this case,
+# Example showing how to create a customized pyparsing Token, in this case,
# one that is very much like Literal, but which tolerates up to 'n' character
# mismatches
from pyparsing import *
@@ -12,77 +12,77 @@ datafile = urllib.request.urlopen("http://toxodb.org/common/downloads/release-6. fastasrc = datafile.read()
datafile.close()
-"""
-Sample header:
+"""
+Sample header:
>NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
-"""
-integer = Word(nums).setParseAction(lambda t:int(t[0]))
-genebit = Group(">" + Word(alphanums.upper()+"-_") + "|" +
- Word(printables)("id") + SkipTo("length=", include=True) +
- integer("genelen") + LineEnd() +
- Combine(OneOrMore(Word("ACGTN")),adjacent=False)("gene"))
-
+"""
+integer = Word(nums).setParseAction(lambda t:int(t[0]))
+genebit = Group(">" + Word(alphanums.upper()+"-_") + "|" +
+ Word(printables)("id") + SkipTo("length=", include=True) +
+ integer("genelen") + LineEnd() +
+ Combine(OneOrMore(Word("ACGTN")),adjacent=False)("gene"))
+
# read gene data from .fasta file - takes just a few seconds
-genedata = OneOrMore(genebit).parseString(fastasrc)
+genedata = OneOrMore(genebit).parseString(fastasrc)
-class CloseMatch(Token):
+class CloseMatch(Token):
"""A special subclass of Token that does *close* matches. For each
close match of the given string, a tuple is returned giving the
found close match, and a list of mismatch positions."""
- def __init__(self, seq, maxMismatches=1):
- super(CloseMatch,self).__init__()
- self.name = seq
- self.sequence = seq
- self.maxMismatches = maxMismatches
- self.errmsg = "Expected " + self.sequence
- self.mayIndexError = False
- self.mayReturnEmpty = False
-
- def parseImpl( self, instring, loc, doActions=True ):
- start = loc
- instrlen = len(instring)
- maxloc = start + len(self.sequence)
-
- if maxloc <= instrlen:
- seq = self.sequence
- seqloc = 0
- mismatches = []
- throwException = False
- done = False
- while loc < maxloc and not done:
- if instring[loc] != seq[seqloc]:
- mismatches.append(seqloc)
- if len(mismatches) > self.maxMismatches:
- throwException = True
- done = True
- loc += 1
- seqloc += 1
- else:
- throwException = True
-
- if throwException:
- exc = self.myException
- exc.loc = loc
- exc.pstr = instring
- raise exc
-
- return loc, (instring[start:loc],mismatches)
+ def __init__(self, seq, maxMismatches=1):
+ super(CloseMatch,self).__init__()
+ self.name = seq
+ self.sequence = seq
+ self.maxMismatches = maxMismatches
+ self.errmsg = "Expected " + self.sequence
+ self.mayIndexError = False
+ self.mayReturnEmpty = False
+
+ def parseImpl( self, instring, loc, doActions=True ):
+ start = loc
+ instrlen = len(instring)
+ maxloc = start + len(self.sequence)
+
+ if maxloc <= instrlen:
+ seq = self.sequence
+ seqloc = 0
+ mismatches = []
+ throwException = False
+ done = False
+ while loc < maxloc and not done:
+ if instring[loc] != seq[seqloc]:
+ mismatches.append(seqloc)
+ if len(mismatches) > self.maxMismatches:
+ throwException = True
+ done = True
+ loc += 1
+ seqloc += 1
+ else:
+ throwException = True
+
+ if throwException:
+ exc = self.myException
+ exc.loc = loc
+ exc.pstr = instring
+ raise exc
+
+ return loc, (instring[start:loc],mismatches)
# using the genedata extracted above, look for close matches of a gene sequence
searchseq = CloseMatch("TTAAATCTAGAAGAT", 3)
-for g in genedata:
- print("%s (%d)" % (g.id, g.genelen))
- print("-"*24)
- for t,startLoc,endLoc in searchseq.scanString(g.gene, overlap=True):
- matched, mismatches = t[0]
- print("MATCH:", searchseq.sequence)
- print("FOUND:", matched)
- if mismatches:
- print(" ", ''.join(' ' if i not in mismatches else '*'
- for i,c in enumerate(searchseq.sequence)))
- else:
- print("<exact match>")
- print("at location", startLoc)
- print()
- print()
\ No newline at end of file +for g in genedata:
+ print("%s (%d)" % (g.id, g.genelen))
+ print("-"*24)
+ for t,startLoc,endLoc in searchseq.scanString(g.gene, overlap=True):
+ matched, mismatches = t[0]
+ print("MATCH:", searchseq.sequence)
+ print("FOUND:", matched)
+ if mismatches:
+ print(" ", ''.join(' ' if i not in mismatches else '*'
+ for i,c in enumerate(searchseq.sequence)))
+ else:
+ print("<exact match>")
+ print("at location", startLoc)
+ print()
+ print()
|