summary refs log tree commit diff
path: root/src/examples/sparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/examples/sparser.py')
-rw-r--r-- src/examples/sparser.py 365
1 files changed, 365 insertions, 0 deletions
diff --git a/src/examples/sparser.py b/src/examples/sparser.py
new file mode 100644
index 0000000..7217a11
--- /dev/null
+++ b/src/examples/sparser.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python
+
+"""
+NAME:
+ sparser.py
+
+SYNOPSIS:
+ sparser.py [options] filename
+
+DESCRIPTION:
+ The sparser.py script is a Specified PARSER. It is unique (as far as I can
+ tell) because it doesn't care about the delimiter(s). The user specifies
+ what is expected, and the order, for each line of text. All of the heavy
+ lifting is handled by pyparsing (http://pyparsing.sf.net).
+
+OPTIONS:
+ -h,--help this message
+ -v,--version version
+ -d,--debug turn on debug messages
+
+EXAMPLES:
+ 1. As standalone
+ sparser.py myfile
+ 2. As library
+ import sparser
+ ...
+
+#Copyright (C) 2006 Tim Cera timcera@earthlink.net
+#
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+
+#===imports======================
+import sys
+import os
+import getopt
+import re
+import gzip
+
+from pyparsing import *
+
+
+#===globals======================
+# Prefix used in the messages produced by debug() and fatal(), and in the
+# version banner printed by the command-line entry point.
+modname = "sparser"
+# Reported by the -v/--version and -h/--help options.
+__version__ = "0.1"
+
+
+#--option args--
+# Non-zero turns on debug() output; set to 1 by the -d/--debug option.
+debug_p = 0
+#opt_b=None #string arg, default is undefined
+
+
+#---positional args, default is empty---
+# Rebound to getopt's leftover (non-option) arguments when run as a script.
+pargs = []
+
+
+#---other---
+
+
+#===utilities====================
+def msg(txt):
+    """Write txt to standard output and flush it immediately."""
+    out = sys.stdout
+    out.write(txt)
+    out.flush()
+
+def debug(ftn, txt):
+    """Print a "module.function:text" line, but only when debug_p is set."""
+    if not debug_p:
+        return
+    line = "%s.%s:%s\n" % (modname, ftn, txt)
+    sys.stdout.write(line)
+    sys.stdout.flush()
+
+def fatal(ftn, txt):
+    """Abort the program because it cannot continue.
+
+    ftn is the name of the calling function and txt the explanation; they
+    are combined into "<modname>.<ftn>:FATAL:<txt>" followed by a newline.
+
+    Raises SystemExit carrying the formatted message.
+    """
+    # Named 'message' so the module-level msg() helper is not shadowed.
+    message = "%s.%s:FATAL:%s\n" % (modname, ftn, txt)
+    # Call syntax is valid on Python 2 and 3, unlike 'raise X, value'.
+    raise SystemExit(message)
+
+def usage():
+    """Show the module docstring, which doubles as the help text."""
+    print(__doc__)
+
+
+
+#====================================
+class ToInteger(TokenConverter):
+    """Token converter that turns the first matched token into an int."""
+
+    def postParse(self, instring, loc, tokenlist):
+        first_token = tokenlist[0]
+        return int(first_token)
+
+class ToFloat(TokenConverter):
+    """Token converter that turns the first matched token into a float."""
+
+    def postParse(self, instring, loc, tokenlist):
+        first_token = tokenlist[0]
+        return float(first_token)
+
+class ParseFileLineByLine:
+    """
+    Bring data from text files into a program, optionally parsing each line
+    according to specifications in a parse definition file.
+
+    ParseFileLineByLine instances can be used like normal file objects (i.e. by
+    calling readline(), readlines(), and write()), but can also be used as
+    sequences of lines in for-loops.
+
+    ParseFileLineByLine objects also handle compression transparently. i.e. it
+    is possible to read lines from a compressed text file as if it were not
+    compressed.  Compression is deduced from the file name suffixes '.Z'
+    (compress/uncompress), '.gz' (gzip/gunzip), and '.bz2' (bzip2).
+
+    The parse definition file name is developed based on the input file name.
+    If the input file name is 'basename.ext', then the definition file is
+    'basename_def.ext'.  If a definition file specific to the input file is not
+    found, then the program searches for the file 'sparse.def' which would be
+    the definition file for all files in that directory without a file specific
+    definition file.
+
+    When a definition file is in effect, readline(), readlines() and iteration
+    return one dictionary per line (field name -> converted value) and lines
+    that do not match the definition are skipped.  Without a definition file
+    the raw lines are returned.
+
+    Finally, ParseFileLineByLine objects accept file names that start with '~'
+    or '~user' to indicate a home directory, as well as URLs (for reading
+    only; no definition file is looked up for a URL, so URL data is never
+    parsed).
+
+    Constructor:
+    ParseFileLineByLine(|filename|, |mode|='"r"'), where |filename| is the name
+    of the file (or a URL) and |mode| is one of '"r"' (read), '"w"' (write) or
+    '"a"' (append, not supported for .Z files).
+    """
+
+    def __init__(self, filename, mode = 'r'):
+        """Opens input file, and if available the definition file.  If the
+        definition file is available __init__ will then create the pyparsing
+        grammar used by readline().
+
+        Raises IOError for an illegal mode, for a missing file in read or
+        append mode, for a non-read mode on a URL, and for append mode on
+        '.Z' or '.bz2' files.
+        """
+        # Defined up front so close() and __del__ stay safe even when the
+        # constructor raises before a file has been opened.
+        self.file = None
+        self.grammar = None
+        self.parsedef = None
+
+        if mode not in ['r', 'w', 'a']:
+            raise IOError(0, 'Illegal mode: ' + repr(mode))
+
+        if filename.find(':/') > 1: # URL
+            if mode != 'r':
+                raise IOError("can't write to a URL")
+            try:
+                from urllib.request import urlopen  # Python 3
+            except ImportError:
+                from urllib import urlopen  # Python 2
+            self.file = urlopen(filename)
+            # A URL cannot be probed with os.path.exists(), so there is no
+            # definition file to look for and the data is returned unparsed.
+            return
+
+        filename = os.path.expanduser(filename)
+        if mode in ('r', 'a') and not os.path.exists(filename):
+            raise IOError(2, 'No such file or directory: ' + filename)
+        filen, file_extension = os.path.splitext(filename)
+        self.file = self._open_local(filename, file_extension, mode)
+
+        # Try to find a parse ('*_def.ext') definition file.  First try to find
+        # a file specific parse definition file, then look for 'sparse.def'
+        # that would be the definition file for all files within the directory.
+        definition_file_one = filen + "_def" + file_extension
+        # os.path.join keeps the lookup in the data file's own directory; a
+        # bare file name must not turn into '/sparse.def'.
+        definition_file_two = os.path.join(os.path.dirname(filen),
+                                           "sparse.def")
+        if os.path.exists(definition_file_one):
+            self.parsedef = definition_file_one
+        elif os.path.exists(definition_file_two):
+            self.parsedef = definition_file_two
+        else:
+            return
+
+        self.grammar = self._build_grammar(self.parsedef)
+
+    def _open_local(self, filename, file_extension, mode):
+        """Return an open file object for a local path.
+
+        The suffix selects the opener: gzip for '.gz', the external
+        compress/uncompress and bzip2 programs for '.Z' and '.bz2', and the
+        builtin open() for everything else.
+        """
+        if file_extension == '.gz':
+            return gzip.GzipFile(filename, mode + 'b')
+        if file_extension in ('.Z', '.bz2'):
+            if mode == 'a':
+                raise IOError(0,
+                              "Can't append to %s files" % file_extension)
+            if file_extension == '.Z':
+                commands = {'r': 'uncompress -c ', 'w': 'compress > '}
+            else:
+                commands = {'r': 'bzip2 -dc ', 'w': 'bzip2 > '}
+            # NOTE(review): the file name is passed to a shell unquoted, so
+            # names with spaces or shell metacharacters are not safe here.
+            return os.popen(commands[mode] + filename, mode)
+        return open(filename, mode)
+
+    def _build_grammar(self, parsedef):
+        """Execute the definition file and return the per-line grammar.
+
+        The definition file is pure Python.  The one variable that needs to
+        be specified is 'parse'.  The 'parse' variable is a list of tuples
+        defining the name, type, and because it is a list, the order of
+        variables on each line in the data file.  The variable name is a
+        string, the type variable is defined as integer, real, and qString.
+
+        parse = [
+                 ('year', integer),
+                 ('month', integer),
+                 ('day', integer),
+                 ('value', real),
+                ]
+
+        Raises NameError when the definition file does not assign 'parse'.
+        """
+        # Create some handy pyparsing constructs.  I kept 'decimal_sep' so that
+        # could easily change to parse if the decimal separator is a ",".
+        decimal_sep = "."
+        sign = oneOf("+ -")
+        # part of printables without decimal_sep, +, -
+        special_chars = '!"#$%&\'()*,./:;<=>?@[\\]^_`{|}~'.replace(
+            decimal_sep, "")
+
+        def make_integer(prefix):
+            return ToInteger(
+                Combine(prefix +
+                        Word(nums))).setName("integer")
+
+        def make_real(prefix):
+            # The exponent may carry its own sign, e.g. '1.5e-3'.
+            return ToFloat(
+                Combine(prefix +
+                        Word(nums) +
+                        decimal_sep +
+                        Optional(Word(nums)) +
+                        Optional(oneOf("E e") +
+                                 Optional(sign) +
+                                 Word(nums)))).setName("real")
+
+        integer = make_integer(Optional(sign))
+        positive_integer = make_integer(Optional("+"))
+        negative_integer = make_integer("-")
+        real = make_real(Optional(sign))
+        positive_real = make_real(Optional("+"))
+        negative_real = make_real("-")
+        qString = ( sglQuotedString | dblQuotedString ).setName("qString")
+
+        # add other characters we should skip over between interesting fields
+        integer_junk = Optional(
+                           Suppress(
+                              Word(alphas +
+                                   special_chars +
+                                   decimal_sep))).setName("integer_junk")
+        real_junk = Optional(
+                        Suppress(
+                           Word(alphas +
+                                special_chars))).setName("real_junk")
+        qString_junk = SkipTo(qString).setName("qString_junk")
+
+        # The definition file sees the module's names (including everything
+        # star-imported from pyparsing) plus the helpers built above.  It runs
+        # in a copy so that it cannot overwrite this module's globals.
+        namespace = dict(globals())
+        namespace.update({
+            'decimal_sep': decimal_sep,
+            'sign': sign,
+            'special_chars': special_chars,
+            'integer': integer,
+            'positive_integer': positive_integer,
+            'negative_integer': negative_integer,
+            'real': real,
+            'positive_real': positive_real,
+            'negative_real': negative_real,
+            'qString': qString,
+            'integer_junk': integer_junk,
+            'real_junk': real_junk,
+            'qString_junk': qString_junk,
+        })
+
+        # NOTE: the definition file is executed as Python code, so it must
+        # come from a trusted source.
+        handle = open(parsedef)
+        try:
+            source = handle.read()
+        finally:
+            handle.close()
+        exec(compile(source, parsedef, 'exec'), namespace)
+
+        if 'parse' not in namespace:
+            raise NameError("name 'parse' is not defined in " + parsedef)
+
+        # Build the grammar, combination of the 'integer', 'real, 'qString',
+        # and '*_junk' variables assigned above in the order specified in the
+        # definition file.
+        grammar = []
+        for nam, expr in namespace['parse']:
+            # Each field is preceded by the junk matcher named after its
+            # type; looking it up in the namespace also honours '*_junk'
+            # names assigned by the definition file itself.
+            grammar.append( namespace[expr.name + "_junk"] )
+            grammar.append( expr.setResultsName(nam) )
+        # The junk matcher in front of the very first field is dropped.
+        return And( grammar[1:] + [restOfLine] )
+
+    def __del__(self):
+        """Delete (close) the file wrapper."""
+        self.close()
+
+    def __iter__(self):
+        """Yield lines (optionally parsed) until the end of the file."""
+        while 1:
+            line = self.readline()
+            if not line:
+                break
+            yield line
+
+    def __getitem__(self, item):
+        """Used in 'for line in fp:' idiom."""
+        line = self.readline()
+        if not line:
+            raise IndexError
+        return line
+
+    def readline(self):
+        """Reads (and optionally parses) a single line.
+
+        Lines that do not match the grammar are skipped.  Returns the raw
+        (empty) line at end of file.
+        """
+        # A loop rather than recursion, so a long run of unparseable lines
+        # cannot exhaust the recursion limit.
+        while 1:
+            line = self.file.readline()
+            if not (self.grammar and line):
+                return line
+            try:
+                return self.grammar.parseString(line).asDict()
+            except ParseException:
+                continue
+
+    def readlines(self):
+        """Returns a list of all lines (optionally parsed) in the file."""
+        if not self.grammar:
+            return self.file.readlines()
+        tot = []
+        # Used this way instead of a 'for' loop against
+        # self.file.readlines() so that there wasn't two copies of the file
+        # in memory.  Going through self.readline() is what parses each line.
+        while 1:
+            line = self.readline()
+            if not line:
+                break
+            tot.append(line)
+        return tot
+
+    def write(self, data):
+        """Write to a file."""
+        self.file.write(data)
+
+    def writelines(self, list):
+        """Write a list to a file. Each item in the list is a line in the
+        file.
+        """
+        for line in list:
+            self.file.write(line)
+
+    def close(self):
+        """Close the file, if one was opened."""
+        handle = getattr(self, 'file', None)
+        if handle is not None:
+            handle.close()
+
+    def flush(self):
+        """Flush in memory contents to file."""
+        self.file.flush()
+
+
+#=============================
+def main(pargs):
+    """This should only be used for testing. The primary mode of operation is
+    as an imported library.
+
+    pargs is the list of positional command-line arguments; the first one
+    names the input file, whose lines (parsed when a definition file is
+    found) are printed one per line.  With no positional argument the usage
+    text is printed and the program exits with status 1.
+    """
+    if not pargs:
+        usage()
+        sys.exit(1)
+    # Use the positional arguments rather than sys.argv[1], which is an
+    # option flag whenever the script is started with e.g. '-d'.
+    input_file = pargs[0]
+    fp = ParseFileLineByLine(input_file)
+    try:
+        for i in fp:
+            print(i)
+    finally:
+        fp.close()
+
+
+#-------------------------
+if __name__ == '__main__':
+    ftn = "main"
+    try:
+        opts, pargs = getopt.getopt(sys.argv[1:], 'hvd',
+                                    ['help', 'version', 'debug', 'bb='])
+    except getopt.GetoptError:
+        # Report an unknown option or a missing option argument as a fatal
+        # message instead of a traceback.  sys.exc_info() keeps this valid
+        # on both Python 2 and Python 3.
+        fatal(ftn, str(sys.exc_info()[1]))
+    for flag, value in opts:
+        if flag in ('-h', '--help'):
+            print(modname + ": version=" + __version__)
+            usage()
+            sys.exit(0)
+        elif flag in ('-v', '--version'):
+            print(modname + ": version=" + __version__)
+            sys.exit(0)
+        elif flag in ('-d', '--debug'):
+            debug_p = 1
+        elif flag == '--bb':
+            opt_b = value
+
+    #---make the object and run it---
+    main(pargs)
+
+#===Revision Log===
+#Created by mkpythonproj:
+#2006-02-06 Tim Cera
+#