Typo and spelling cleanup, add helpful comments

author: Paul McGuire <ptmcg@austin.rr.com> 2019-08-07 06:37:43 -0500
committer: Paul McGuire <ptmcg@austin.rr.com> 2019-08-07 06:37:43 -0500
commit: c02db7427de3197d607e30ba42031884802a6f94 (patch)
tree: 0f6a87bbfdede65d12565b3005308386125510c6
parent: 123e83037b5de90fb964e96267bd60e90c70db19 (diff)
download: pyparsing-git-c02db7427de3197d607e30ba42031884802a6f94.tar.gz
1 file changed, 10 insertions, 3 deletions
diff --git a/examples/partial_gene_match.py b/examples/partial_gene_match.py
index 8ca9c44..3d48f9d 100644
--- a/examples/partial_gene_match.py
+++ b/examples/partial_gene_match.py
@@ -1,10 +1,10 @@
-# parital_gene_match.py
+# partial_gene_match.py
 #
 #  Example showing how to use the CloseMatch class, to find strings in a gene with up to 'n' mismatches
 #
 import pyparsing as pp
 
-import urllib.request, urllib.parse, urllib.error
+import urllib.request
 from contextlib import closing
 
 # read in a bunch of genomic data
@@ -12,11 +12,12 @@ data_url = "http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicop
 with closing(urllib.request.urlopen(data_url)) as datafile:
     fastasrc = datafile.read().decode()
 
+# define parser to extract gene definitions
 """
 Sample header:
 >NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201
 """
-integer = pp.Word(pp.nums).setParseAction(lambda t:int(t[0]))
+integer = pp.pyparsing_common.integer
 genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id")
                    + "|" + pp.Word(pp.printables)("organism")
                    + "|" + pp.Word(pp.printables)("location")
@@ -25,6 +26,10 @@ genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id")
                    + pp.Word("ACGTN")[1, ...].addParseAction(''.join)("gene"))
 
 # read gene data from .fasta file - takes just a few seconds
+# An important aspect of this parsing process is the reassembly of all the separate lines of the
+# gene into a single scannable string. Just searching the raw .fasta file could overlook matches
+# if the match is broken up across separate lines. The parse action in the genebit parser does
+# this reassembly work.
 genedata = genebit[1, ...].parseString(fastasrc)
 
 # using the genedata extracted above, look for close matches of a gene sequence
@@ -32,8 +37,10 @@ searchseq = pp.CloseMatch("TTAAATCTAGAAGAT", 3)
 
 for g in genedata:
     show_header = True
+    # scan for close matches, list out found strings, and mark mismatch locations
     for t, startLoc, endLoc in searchseq.scanString(g.gene, overlap=True):
         if show_header:
+            # only need to show the header once
             print("%s/%s/%s (%d)" % (g.gene_id, g.organism, g.location, g.gene_len))
             print("-" * 24)
             show_header = False
author	Paul McGuire <ptmcg@austin.rr.com>	2019-08-07 06:37:43 -0500
committer	Paul McGuire <ptmcg@austin.rr.com>	2019-08-07 06:37:43 -0500
commit	c02db7427de3197d607e30ba42031884802a6f94 (patch)
tree	0f6a87bbfdede65d12565b3005308386125510c6
parent	123e83037b5de90fb964e96267bd60e90c70db19 (diff)
download	pyparsing-git-c02db7427de3197d607e30ba42031884802a6f94.tar.gz