diff options
| -rwxr-xr-x | sandbox/blais/rstlime/rstlime.py | 404 | ||||
| -rw-r--r-- | sandbox/blais/rstlime/testinput.txt | 93 |
2 files changed, 497 insertions, 0 deletions
diff --git a/sandbox/blais/rstlime/rstlime.py b/sandbox/blais/rstlime/rstlime.py new file mode 100755 index 000000000..52b93aed9 --- /dev/null +++ b/sandbox/blais/rstlime/rstlime.py @@ -0,0 +1,404 @@ +#!/usr/bin/env python +""" +Extract data entries like this from a set of reStructuredText documents like +this:: + + [book] + :title: Probability Theory + :subtitle: The Logic of Science + :authors: E.T. Jaynes, G. Larry Bretthorst + :isbn: 978-0521592710 + +and store them in an existing database and/or infer a new database model. This +allows you to very easily embed data within a text file. + +(The ideas for this project originate from Nabu (http://furius.ca/nabu/). It is +an attempt at providing the basic functionality without all the complications, +setup and customization that Nabu requires.) +""" +__author__ = 'Martin Blais <blais@furius.ca>' + +## FIXME: TODO: +## How do we deal with subtypes? e.g. :p/cell: +## What do we do when we have multiple values for the same key? +## Should we join the lines for values? +## Add document ids. + + +# stdlib imports +import sys, os, re, getpass +from types import ClassType +from StringIO import StringIO +from collections import defaultdict +from operator import attrgetter + +# psyco imports +import psycopg2 as dbapi + +# docutils imports +from docutils import core, nodes, io + + +#------------------------------------------------------------------------------- +# Parsing and values extraction code + +# Note: this is generic, perhaps should be part of docutils.nodes. +def find_first(nodetype, node, document): + """ Find the first node under 'node' that is of the same type as 'nodetype'.""" + + assert isinstance(nodetype, ClassType) + found = [] + + class FindFirst(nodes.SparseNodeVisitor): + def visit(self, node): + found.append(node) + return True # stop + + setattr(FindFirst, 'visit_%s' % nodetype.__name__, + FindFirst.visit) + + vis = FindFirst(document) + node.walk(vis) + + return found[0] if found else None + +def get_file_entries(fn): + """ Parse a file and extract entries from it. """ + text = open(fn).read() + document = core.publish_doctree( + open(fn), + source_class=io.FileInput, + ## source_path=fn, + reader_name='standalone', + parser_name='restructuredtext', + settings_overrides={'report_level': 'error'}, + ) + + # Extract the unique document id. + docid = None + docinfo = find_first(nodes.docinfo, document, document) + if docinfo: + fields = extract_fields(docinfo, document) + docid = dict(fields).get('Id', None) + if docid is None: + docid = basename(fn) + + # Obtain all the data from the document. + v = FindData(docid, document) + document.walk(v) + return docid, v.entries + + +class Entry(object): + """ A data entry read from a file. """ + + def __init__(self, source, table, values): + self.source = source + self.table = table + self.values = values + + def __str__(self): + s = StringIO() + s.write('[%s] (%s)\n' % (self.table, self.source)) + for x in self.values: + s.write(' :%s: %s\n' % x) + return s.getvalue() + +class FindData(nodes.SparseNodeVisitor): + """ A visitor that finds all the definition_list_item which match our + desired tagging for format.""" + + # Regexp for the definition tag. + tagre = re.compile('\[([a-zA-Z0-9_]+)\]\s*$') + + def __init__(self, docid, *args): + nodes.SparseNodeVisitor.__init__(self, *args) + self.docid = docid + self.entries = [] + + def visit_term(self, node): + if len(node.children) != 1: + return + mo = self.tagre.match(node.astext()) + if not mo: + return + + table = str2table(mo.group(1)) + dlitem = node.parent + if len(dlitem.children) != 2: + return + + defn = dlitem.children[1] + if not isinstance(defn, nodes.definition) or len(defn.children) != 1: + return + flist = defn.children[0] + if not isinstance(flist, nodes.field_list): + return + + fields = extract_fields(flist, self.document) + e = Entry(self.docid, table, fields) + self.entries.append(e) + + raise nodes.SkipNode() + +def str2table(s): + "Convert a string to a valid table name." + return s.lower().replace(' ', '_') + + +def extract_fields(node, document): + "Return a list of (key, value) pairs from all underlying field_list's." + v = ExtractFields(document) + node.walk(v) + return list(v) + +class ExtractFields(nodes.SparseNodeVisitor, list): + """ A visitor for a field_list that extracts all the name/value pairs. """ + + def visit_field_name(self, node): + self.key = node.astext() + + def visit_field_body(self, node): + self.append( (self.key, node.astext()) ) + self.key = None + + + +#------------------------------------------------------------------------------- +# Table definition inference code. + +# Note: this is generic utils code. +def seq2dict(seq, classify_fun): + """Given a sequence of objects and a function to classify them, return a dict of + (key, sublist of objects) whereby 'key' is computed by calling + 'classify_fun' on objects.""" + assert isinstance(seq, (list, tuple)), seq + r = defaultdict(list) + for e in seq: + try: + r[classify_fun(e)].append(e) + except Exception: + pass + return r + +def infer_tables(entries): + """ Given a list of entries, infer some database models from it. """ + + table_entries = seq2dict(entries, attrgetter('table')) + return dict((table, infer_table(entries)) + for table, entries in table_entries.iteritems()) + +intre = re.compile('[0-9]+$') +floatre = re.compile('[0-9\.]+$') + +def infer_table(entries): + """ Given a list of entries from the same table, infer a table description. + This returns a dict of 'table-name' to a sorted list of (column-name, type) + pairs.""" + + coldata = defaultdict(list) + sortorder = defaultdict(int) + for e in entries: + for i, (key, value) in enumerate(e.values): + coldata[key].append(value) + sortorder[key] += i + + coldefs = {} + for colname, values in coldata.iteritems(): + if all(intre.match(x) for x in values): + ctype = int + elif all(floatre.match(x) for x in values): + ctype = float + else: + ctype = unicode + coldefs[colname] = ctype + + return sorted(coldefs.items(), key=lambda x: sortorder[x[0]]) + +sqltypes = {int: 'INTEGER', + float: 'FLOAT', + unicode: 'TEXT'} + +def sqlcol(colname): + "Sanitize columns names for SQL." + return colname.strip().lower().replace(' ', '_').replace('-', '_') + +def table2sql(table, tabledef): + """Generate SQL table definition code given the table name and columns + definition.""" + lines = ['CREATE TABLE %s (' % table] + for colname, ctype in tabledef: + lines.append(' %s %s,' % (sqlcol(colname), sqltypes[ctype])) + lines[-1] = lines[-1][:-1] + + lines.append(');') + lines.append('') + return os.linesep.join(lines) + + +#------------------------------------------------------------------------------- +# Database introspection. + +def db_get_tables(conn): + "List all the tables of the database." + curs = conn.cursor() + curs.execute(""" + SELECT table_name FROM information_schema.tables + WHERE table_schema = 'public'; + """) + return [x[0] for x in curs] + +def db_get_table_columns(conn, table): + "List all the columns of a table in the database." + curs = conn.cursor() + curs.execute(""" + SELECT column_name, data_type FROM information_schema.columns + WHERE table_schema = 'public' AND + table_name = %s + """, (table,)) + return list(curs) + +def db_get_model(conn): + "Obtain the definition of database tables and columns." + dbmodel = {} + for table in db_get_tables(conn): + dbmodel[table] = db_get_table_columns(conn, table) + return dbmodel + + + +#------------------------------------------------------------------------------- +# Filling up the database. + +def store_entries(entries_list, dbmodel, conn): + """ Given a list of entries to be stored, try to store as much data as + possible in the given database model.""" + + curs = conn.cursor() + + dbmodel = dict((k, dict(v)) for (k,v) in dbmodel.iteritems()) + for e in entries_list: + try: + cols = dbmodel[e.table] + except KeyError: + pass # Table for available. + + scols = [] + svalues = [] + colset = set() + for cname, cvalue in e.values: + if cname in colset: + continue # Cannot store two of the same key. + else: + colset.add(cname) + + dtype = cols[sqlcol(cname)] + if dtype == 'text': + value = unicode(cvalue) + else: + raise NotImplementedError("Unsupported type.") + + scols.append(sqlcol(cname)) + svalues.append(value) + + if svalues: + curs.execute(""" + INSERT INTO %s (%s) VALUES (%s) + """ % (e.table, + ','.join(scols), + ','.join(['%s'] * len(svalues))), + svalues) + + conn.commit() + + +#------------------------------------------------------------------------------- +# Main program. + +def parse_dburi(dburi): + """ Parse the database connection URI. """ + + user, passwd, host, dbname = [None] * 4 + mo = re.match('(db|postgres|postgresql)://(?:([^:@]+)' + '(?::([^:@]+))?@)?([a-z0-9]+)/([a-z0-9]+)/?$', dburi) + if mo: + user, passwd, host, dbname = mo.group(2, 3, 4, 5) + elif re.match('[a-z]+', dburi): + dbname = dburi + else: + parser.error("Invalid database connection string.") + + if user is None: + user = getpass.getuser() + if passwd is None: + passwd = getpass.getpass() + if host is None: + host = 'localhost' + r = (user, passwd, host, dbname) + assert None not in r, r + return r + + +def main(): + import optparse + parser = optparse.OptionParser(__doc__.strip()) + + parser.add_option('-c', '--infer-definition', action='store', metavar="FILE", + help="Infer the definition of tables from the data and " + "store in the given filename.") + + parser.add_option('-s', '--store', action='store', metavar="CONNSTR", + default="postgres://localhost/test", + help="If present, store the contents to a given database. " + "You must provide a database connection URI.") + + opts, args = parser.parse_args() + + if not args: + parser.error("You must specific a list of filenames to process.") + + # Disable the conversion of system messages into text. + nodes.system_message.astext = lambda *args: u'' + + # Process each input file. + entries_by_document = {} + entries_list = [] + for fn in args: + docid, entries = get_file_entries(fn) + entries_by_document[docid] = entries + entries_list.extend(entries) + + # Infer the definition of the database into CREATE TABLE statements. + if opts.infer_definition: + f = open(opts.infer_definition, 'w') + defs = infer_tables(entries) + for table, tabledef in defs.iteritems(): + f.write(table2sql(table, tabledef)) + f.write('\n') +## FIXME: we need to try to create the tables. +## FIXME: add an option to drop and recreate the tables. + + # Open a connection to the database. + if opts.store: + user, passwd, host, dbname = parse_dburi(opts.store) + conn = dbapi.connect(host=host, + user=user, + password=passwd, + database=dbname) + + # Open the database and inspect the model. + dbmodel = db_get_model(conn) + store_entries(entries_list, dbmodel, conn) + + + + + + +if __name__ == '__main__': + ## inspect_db() + main() + + + + diff --git a/sandbox/blais/rstlime/testinput.txt b/sandbox/blais/rstlime/testinput.txt new file mode 100644 index 000000000..bbefb9d21 --- /dev/null +++ b/sandbox/blais/rstlime/testinput.txt @@ -0,0 +1,93 @@ +.. -*- coding: utf-8 -*- +===================================================== + Some test input for automatic object extraction +===================================================== +:Id: 69f40ba7-4068-46ef-ac07-7f381aba4f2b +:Tags: Reading +:Date: $Date: 2005/08/16 02:10:50 $ + + +Definitions in the Database +=========================== + +[book] + :title: pdf2table: A Method to Extract Table Information from PDF Files + :authors: Burcu Yildiz, Katharina Kaiser, and Silvia Miksch + :institution: Institute of Software Technology & Interactive + Systems, Vienna University of Technology, Vienna, Austria + :url: ? + :comments: + + A paper that describes heuristics to implement the automatic + extraction of tables from PDF files. The input consists in a list + of (text, x, y, width, height, font) entries for all the parcels + of text that are present in a PDF file. This input is generated by + an existing Java tool. I read this paper in order to implement it + as an exercise for programming in Haskell. + + +[book] + :title: Options, Futures and Other Derivatives + :edition: 6th + :author: John C. Hull + :hardcover: 816 pages + :publisher: Prentice Hall; 6 edition (June 10, 2005) + :isbn-13: 978-0131499089 + :comments: + + This is considered the basic reference textbook in finance. I + found it to be very clearly written and to obtain a wide breadth + of material. + + + +[book] + :title: Probability Theory + :subtitle: The Logic of Science + :authors: E.T. Jaynes, G. Larry Bretthorst + :isbn: 978-0521592710 + :comments: + + Ref.ed by traders at GS/Japan. Ref. awesome book on statistic, + with an admitted bent on the “statistics should be Bayesian” idea. + + +Other definitions, which should not be part of the database. +============================================================ + +:gambit + 1. An opening in chess in which a minor piece, or pieces, usually a pawn, + is offered in exchange for a favorable position. + 2. A maneuver, stratagem, or ploy, especially one used at an initial + stage. + 3. A remark intended to open a conversation. + +:specious + 1. Having the ring of truth or plausibility but actually fallacious: a + specious argument. + 2. Deceptively attractive. + + +Some other type of object +========================= + +[addr] + :n: La Croix Bleue (Assurances) - Blue Cross (Insurance) + :e: info@qc.croixbleue.ca + :p: +1.514.286.8403 Information Et Ventes Directes, + Assurance individuelle santé et voyage + :p: +1.514.286.8411, +1.800.361.6068 + (urgence, et ventes quand fermé, a frais virés, pas de problème) + :f: +1.514.286.8358 + :p: (other) +1.514.286.7682, +1.877.286.7682, +1.800.361.5706 + :x: print + +[addr] + :n: Australian Taxation Office + :p: General - 13 2861 + :p: GST enquiries - 13 6140 + :p: Australian Business Number enquiries - 13 2478 + :w: http://www.ato.gov.au/ + :w: http://www.taxinstitute.com.au/ + + |
