summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paul McGuire <ptmcg@austin.rr.com> 2019-08-07 06:37:43 -0500
committer Paul McGuire <ptmcg@austin.rr.com> 2019-08-07 06:37:43 -0500
commit c02db7427de3197d607e30ba42031884802a6f94 (patch)
tree 0f6a87bbfdede65d12565b3005308386125510c6
parent 123e83037b5de90fb964e96267bd60e90c70db19 (diff)
download pyparsing-git-c02db7427de3197d607e30ba42031884802a6f94.tar.gz
Typo and spelling cleanup, add helpful comments
-rw-r--r-- examples/partial_gene_match.py 13
1 files changed, 10 insertions, 3 deletions
diff --git a/examples/partial_gene_match.py b/examples/partial_gene_match.py
index 8ca9c44..3d48f9d 100644
--- a/examples/partial_gene_match.py
+++ b/examples/partial_gene_match.py
@@ -1,10 +1,10 @@
-# parital_gene_match.py
+# partial_gene_match.py
#
# Example showing how to use the CloseMatch class, to find strings in a gene with up to 'n' mismatches
#
import pyparsing as pp
-import urllib.request, urllib.parse, urllib.error
+import urllib.request
from contextlib import closing
# read in a bunch of genomic data
@@ -12,11 +12,12 @@ data_url = "http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicop
with closing(urllib.request.urlopen(data_url)) as datafile:
fastasrc = datafile.read().decode()
+# define parser to extract gene definitions
"""
Sample header:
>NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
"""
-integer = pp.Word(pp.nums).setParseAction(lambda t:int(t[0]))
+integer = pp.pyparsing_common.integer
genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id")
+ "|" + pp.Word(pp.printables)("organism")
+ "|" + pp.Word(pp.printables)("location")
@@ -25,6 +26,10 @@ genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id")
+ pp.Word("ACGTN")[1, ...].addParseAction(''.join)("gene"))
# read gene data from .fasta file - takes just a few seconds
+# An important aspect of this parsing process is the reassembly of all the separate lines of the
+# gene into a single scannable string. Just searching the raw .fasta file could overlook matches
+# if the match is broken up across separate lines. The parse action in the genebit parser does
+# this reassembly work.
genedata = genebit[1, ...].parseString(fastasrc)
# using the genedata extracted above, look for close matches of a gene sequence
@@ -32,8 +37,10 @@ searchseq = pp.CloseMatch("TTAAATCTAGAAGAT", 3)
for g in genedata:
show_header = True
+ # scan for close matches, list out found strings, and mark mismatch locations
for t, startLoc, endLoc in searchseq.scanString(g.gene, overlap=True):
if show_header:
+ # only need to show the header once
print("%s/%s/%s (%d)" % (g.gene_id, g.organism, g.location, g.gene_len))
print("-" * 24)
show_header = False