diff options
author | Paul McGuire <ptmcg@austin.rr.com> | 2019-08-07 06:37:43 -0500 |
---|---|---|
committer | Paul McGuire <ptmcg@austin.rr.com> | 2019-08-07 06:37:43 -0500 |
commit | c02db7427de3197d607e30ba42031884802a6f94 (patch) | |
tree | 0f6a87bbfdede65d12565b3005308386125510c6 | |
parent | 123e83037b5de90fb964e96267bd60e90c70db19 (diff) | |
download | pyparsing-git-c02db7427de3197d607e30ba42031884802a6f94.tar.gz |
Typo and spelling cleanup, add helpful comments
-rw-r--r-- | examples/partial_gene_match.py | 13 |
1 file changed, 10 insertions, 3 deletions
diff --git a/examples/partial_gene_match.py b/examples/partial_gene_match.py index 8ca9c44..3d48f9d 100644 --- a/examples/partial_gene_match.py +++ b/examples/partial_gene_match.py @@ -1,10 +1,10 @@ -# parital_gene_match.py +# partial_gene_match.py # # Example showing how to use the CloseMatch class, to find strings in a gene with up to 'n' mismatches # import pyparsing as pp -import urllib.request, urllib.parse, urllib.error +import urllib.request from contextlib import closing # read in a bunch of genomic data @@ -12,11 +12,12 @@ data_url = "http://toxodb.org/common/downloads/release-6.0/Tgondii/TgondiiApicop with closing(urllib.request.urlopen(data_url)) as datafile: fastasrc = datafile.read().decode() +# define parser to extract gene definitions """ Sample header: >NC_001799-6-2978-2778 | organism=Toxoplasma_gondii_RH | location=NC_001799:2778-2978(-) | length=201 """ -integer = pp.Word(pp.nums).setParseAction(lambda t:int(t[0])) +integer = pp.pyparsing_common.integer genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id") + "|" + pp.Word(pp.printables)("organism") + "|" + pp.Word(pp.printables)("location") @@ -25,6 +26,10 @@ genebit = pp.Group(">" + pp.Word(pp.alphanums.upper() + "-_")("gene_id") + pp.Word("ACGTN")[1, ...].addParseAction(''.join)("gene")) # read gene data from .fasta file - takes just a few seconds +# An important aspect of this parsing process is the reassembly of all the separate lines of the +# gene into a single scannable string. Just searching the raw .fasta file could overlook matches +# if the match is broken up across separate lines. The parse action in the genebit parser does +# this reassembly work. 
genedata = genebit[1, ...].parseString(fastasrc) # using the genedata extracted above, look for close matches of a gene sequence @@ -32,8 +37,10 @@ searchseq = pp.CloseMatch("TTAAATCTAGAAGAT", 3) for g in genedata: show_header = True + # scan for close matches, list out found strings, and mark mismatch locations for t, startLoc, endLoc in searchseq.scanString(g.gene, overlap=True): if show_header: + # only need to show the header once print("%s/%s/%s (%d)" % (g.gene_id, g.organism, g.location, g.gene_len)) print("-" * 24) show_header = False |