summary | refs | log | tree | commit | diff
path: root/sandbox/blais
diff options
context:
space:
mode:
author blais <blais@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2008-10-03 18:28:06 +0000
committer blais <blais@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2008-10-03 18:28:06 +0000
commit f39a047ebb5ac06915f71b258ea921f56e8151f5 (patch)
tree 4fb05397bc1a3b3bc2516dca668ff8c0a8c2f747 /sandbox/blais
parent 827188fdb8e1cb59ac40e94c0659689b8bf7bc44 (diff)
download docutils-f39a047ebb5ac06915f71b258ea921f56e8151f5.tar.gz
Prototype version of rstlime, to extract DB entries from rest files.
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@5660 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
Diffstat (limited to 'sandbox/blais')
-rwxr-xr-x sandbox/blais/rstlime/rstlime.py 404
-rw-r--r-- sandbox/blais/rstlime/testinput.txt 93
2 files changed, 497 insertions, 0 deletions
diff --git a/sandbox/blais/rstlime/rstlime.py b/sandbox/blais/rstlime/rstlime.py
new file mode 100755
index 000000000..52b93aed9
--- /dev/null
+++ b/sandbox/blais/rstlime/rstlime.py
@@ -0,0 +1,404 @@
+#!/usr/bin/env python
+"""
+Extract data entries like this from a set of reStructuredText documents like
+this::
+
+ [book]
+ :title: Probability Theory
+ :subtitle: The Logic of Science
+ :authors: E.T. Jaynes, G. Larry Bretthorst
+ :isbn: 978-0521592710
+
+and store them in an existing database and/or infer a new database model. This
+allows you to very easily embed data within a text file.
+
+(The ideas for this project originate from Nabu (http://furius.ca/nabu/). It is
+an attempt at providing the basic functionality without all the complications,
+setup and customization that Nabu requires.)
+"""
+__author__ = 'Martin Blais <blais@furius.ca>'
+
+## FIXME: TODO:
+## How do we deal with subtypes? e.g. :p/cell:
+## What do we do when we have multiple values for the same key?
+## Should we join the lines for values?
+## Add document ids.
+
+
+# stdlib imports
+import sys, os, re, getpass
+from types import ClassType
+from StringIO import StringIO
+from collections import defaultdict
+from operator import attrgetter
+
+# psyco imports
+import psycopg2 as dbapi
+
+# docutils imports
+from docutils import core, nodes, io
+
+
+#-------------------------------------------------------------------------------
+# Parsing and values extraction code
+
+# Note: this is generic, perhaps should be part of docutils.nodes.
+def find_first(nodetype, node, document):
+ """ Find the first node under 'node' that is of the same type as 'nodetype'."""
+
+ assert isinstance(nodetype, ClassType)
+ found = []
+
+ class FindFirst(nodes.SparseNodeVisitor):
+ def visit(self, node):
+ found.append(node)
+ return True # stop
+
+ setattr(FindFirst, 'visit_%s' % nodetype.__name__,
+ FindFirst.visit)
+
+ vis = FindFirst(document)
+ node.walk(vis)
+
+ return found[0] if found else None
+
+def get_file_entries(fn):
+ """ Parse a file and extract entries from it. """
+ text = open(fn).read()
+ document = core.publish_doctree(
+ open(fn),
+ source_class=io.FileInput,
+ ## source_path=fn,
+ reader_name='standalone',
+ parser_name='restructuredtext',
+ settings_overrides={'report_level': 'error'},
+ )
+
+ # Extract the unique document id.
+ docid = None
+ docinfo = find_first(nodes.docinfo, document, document)
+ if docinfo:
+ fields = extract_fields(docinfo, document)
+ docid = dict(fields).get('Id', None)
+ if docid is None:
+ docid = basename(fn)
+
+ # Obtain all the data from the document.
+ v = FindData(docid, document)
+ document.walk(v)
+ return docid, v.entries
+
+
+class Entry(object):
+ """ A data entry read from a file. """
+
+ def __init__(self, source, table, values):
+ self.source = source
+ self.table = table
+ self.values = values
+
+ def __str__(self):
+ s = StringIO()
+ s.write('[%s] (%s)\n' % (self.table, self.source))
+ for x in self.values:
+ s.write(' :%s: %s\n' % x)
+ return s.getvalue()
+
+class FindData(nodes.SparseNodeVisitor):
+ """ A visitor that finds all the definition_list_item which match our
+ desired tagging for format."""
+
+ # Regexp for the definition tag.
+ tagre = re.compile('\[([a-zA-Z0-9_]+)\]\s*$')
+
+ def __init__(self, docid, *args):
+ nodes.SparseNodeVisitor.__init__(self, *args)
+ self.docid = docid
+ self.entries = []
+
+ def visit_term(self, node):
+ if len(node.children) != 1:
+ return
+ mo = self.tagre.match(node.astext())
+ if not mo:
+ return
+
+ table = str2table(mo.group(1))
+ dlitem = node.parent
+ if len(dlitem.children) != 2:
+ return
+
+ defn = dlitem.children[1]
+ if not isinstance(defn, nodes.definition) or len(defn.children) != 1:
+ return
+ flist = defn.children[0]
+ if not isinstance(flist, nodes.field_list):
+ return
+
+ fields = extract_fields(flist, self.document)
+ e = Entry(self.docid, table, fields)
+ self.entries.append(e)
+
+ raise nodes.SkipNode()
+
+def str2table(s):
+ "Convert a string to a valid table name."
+ return s.lower().replace(' ', '_')
+
+
+def extract_fields(node, document):
+ "Return a list of (key, value) pairs from all underlying field_list's."
+ v = ExtractFields(document)
+ node.walk(v)
+ return list(v)
+
+class ExtractFields(nodes.SparseNodeVisitor, list):
+ """ A visitor for a field_list that extracts all the name/value pairs. """
+
+ def visit_field_name(self, node):
+ self.key = node.astext()
+
+ def visit_field_body(self, node):
+ self.append( (self.key, node.astext()) )
+ self.key = None
+
+
+
+#-------------------------------------------------------------------------------
+# Table definition inference code.
+
+# Note: this is generic utils code.
+def seq2dict(seq, classify_fun):
+ """Given a sequence of objects and a function to classify them, return a dict of
+ (key, sublist of objects) whereby 'key' is computed by calling
+ 'classify_fun' on objects."""
+ assert isinstance(seq, (list, tuple)), seq
+ r = defaultdict(list)
+ for e in seq:
+ try:
+ r[classify_fun(e)].append(e)
+ except Exception:
+ pass
+ return r
+
+def infer_tables(entries):
+ """ Given a list of entries, infer some database models from it. """
+
+ table_entries = seq2dict(entries, attrgetter('table'))
+ return dict((table, infer_table(entries))
+ for table, entries in table_entries.iteritems())
+
+intre = re.compile('[0-9]+$')
+floatre = re.compile('[0-9\.]+$')
+
+def infer_table(entries):
+ """ Given a list of entries from the same table, infer a table description.
+ This returns a dict of 'table-name' to a sorted list of (column-name, type)
+ pairs."""
+
+ coldata = defaultdict(list)
+ sortorder = defaultdict(int)
+ for e in entries:
+ for i, (key, value) in enumerate(e.values):
+ coldata[key].append(value)
+ sortorder[key] += i
+
+ coldefs = {}
+ for colname, values in coldata.iteritems():
+ if all(intre.match(x) for x in values):
+ ctype = int
+ elif all(floatre.match(x) for x in values):
+ ctype = float
+ else:
+ ctype = unicode
+ coldefs[colname] = ctype
+
+ return sorted(coldefs.items(), key=lambda x: sortorder[x[0]])
+
+sqltypes = {int: 'INTEGER',
+ float: 'FLOAT',
+ unicode: 'TEXT'}
+
+def sqlcol(colname):
+ "Sanitize columns names for SQL."
+ return colname.strip().lower().replace(' ', '_').replace('-', '_')
+
+def table2sql(table, tabledef):
+ """Generate SQL table definition code given the table name and columns
+ definition."""
+ lines = ['CREATE TABLE %s (' % table]
+ for colname, ctype in tabledef:
+ lines.append(' %s %s,' % (sqlcol(colname), sqltypes[ctype]))
+ lines[-1] = lines[-1][:-1]
+
+ lines.append(');')
+ lines.append('')
+ return os.linesep.join(lines)
+
+
+#-------------------------------------------------------------------------------
+# Database introspection.
+
+def db_get_tables(conn):
+ "List all the tables of the database."
+ curs = conn.cursor()
+ curs.execute("""
+ SELECT table_name FROM information_schema.tables
+ WHERE table_schema = 'public';
+ """)
+ return [x[0] for x in curs]
+
+def db_get_table_columns(conn, table):
+ "List all the columns of a table in the database."
+ curs = conn.cursor()
+ curs.execute("""
+ SELECT column_name, data_type FROM information_schema.columns
+ WHERE table_schema = 'public' AND
+ table_name = %s
+ """, (table,))
+ return list(curs)
+
+def db_get_model(conn):
+ "Obtain the definition of database tables and columns."
+ dbmodel = {}
+ for table in db_get_tables(conn):
+ dbmodel[table] = db_get_table_columns(conn, table)
+ return dbmodel
+
+
+
+#-------------------------------------------------------------------------------
+# Filling up the database.
+
+def store_entries(entries_list, dbmodel, conn):
+ """ Given a list of entries to be stored, try to store as much data as
+ possible in the given database model."""
+
+ curs = conn.cursor()
+
+ dbmodel = dict((k, dict(v)) for (k,v) in dbmodel.iteritems())
+ for e in entries_list:
+ try:
+ cols = dbmodel[e.table]
+ except KeyError:
+ pass # Table for available.
+
+ scols = []
+ svalues = []
+ colset = set()
+ for cname, cvalue in e.values:
+ if cname in colset:
+ continue # Cannot store two of the same key.
+ else:
+ colset.add(cname)
+
+ dtype = cols[sqlcol(cname)]
+ if dtype == 'text':
+ value = unicode(cvalue)
+ else:
+ raise NotImplementedError("Unsupported type.")
+
+ scols.append(sqlcol(cname))
+ svalues.append(value)
+
+ if svalues:
+ curs.execute("""
+ INSERT INTO %s (%s) VALUES (%s)
+ """ % (e.table,
+ ','.join(scols),
+ ','.join(['%s'] * len(svalues))),
+ svalues)
+
+ conn.commit()
+
+
+#-------------------------------------------------------------------------------
+# Main program.
+
+def parse_dburi(dburi):
+ """ Parse the database connection URI. """
+
+ user, passwd, host, dbname = [None] * 4
+ mo = re.match('(db|postgres|postgresql)://(?:([^:@]+)'
+ '(?::([^:@]+))?@)?([a-z0-9]+)/([a-z0-9]+)/?$', dburi)
+ if mo:
+ user, passwd, host, dbname = mo.group(2, 3, 4, 5)
+ elif re.match('[a-z]+', dburi):
+ dbname = dburi
+ else:
+ parser.error("Invalid database connection string.")
+
+ if user is None:
+ user = getpass.getuser()
+ if passwd is None:
+ passwd = getpass.getpass()
+ if host is None:
+ host = 'localhost'
+ r = (user, passwd, host, dbname)
+ assert None not in r, r
+ return r
+
+
+def main():
+ import optparse
+ parser = optparse.OptionParser(__doc__.strip())
+
+ parser.add_option('-c', '--infer-definition', action='store', metavar="FILE",
+ help="Infer the definition of tables from the data and "
+ "store in the given filename.")
+
+ parser.add_option('-s', '--store', action='store', metavar="CONNSTR",
+ default="postgres://localhost/test",
+ help="If present, store the contents to a given database. "
+ "You must provide a database connection URI.")
+
+ opts, args = parser.parse_args()
+
+ if not args:
+ parser.error("You must specific a list of filenames to process.")
+
+ # Disable the conversion of system messages into text.
+ nodes.system_message.astext = lambda *args: u''
+
+ # Process each input file.
+ entries_by_document = {}
+ entries_list = []
+ for fn in args:
+ docid, entries = get_file_entries(fn)
+ entries_by_document[docid] = entries
+ entries_list.extend(entries)
+
+ # Infer the definition of the database into CREATE TABLE statements.
+ if opts.infer_definition:
+ f = open(opts.infer_definition, 'w')
+ defs = infer_tables(entries)
+ for table, tabledef in defs.iteritems():
+ f.write(table2sql(table, tabledef))
+ f.write('\n')
+## FIXME: we need to try to create the tables.
+## FIXME: add an option to drop and recreate the tables.
+
+ # Open a connection to the database.
+ if opts.store:
+ user, passwd, host, dbname = parse_dburi(opts.store)
+ conn = dbapi.connect(host=host,
+ user=user,
+ password=passwd,
+ database=dbname)
+
+ # Open the database and inspect the model.
+ dbmodel = db_get_model(conn)
+ store_entries(entries_list, dbmodel, conn)
+
+
+
+
+
+
+if __name__ == '__main__':
+ ## inspect_db()
+ main()
+
+
+
+
diff --git a/sandbox/blais/rstlime/testinput.txt b/sandbox/blais/rstlime/testinput.txt
new file mode 100644
index 000000000..bbefb9d21
--- /dev/null
+++ b/sandbox/blais/rstlime/testinput.txt
@@ -0,0 +1,93 @@
+.. -*- coding: utf-8 -*-
+=====================================================
+ Some test input for automatic object extraction
+=====================================================
+:Id: 69f40ba7-4068-46ef-ac07-7f381aba4f2b
+:Tags: Reading
+:Date: $Date: 2005/08/16 02:10:50 $
+
+
+Definitions in the Database
+===========================
+
+[book]
+ :title: pdf2table: A Method to Extract Table Information from PDF Files
+ :authors: Burcu Yildiz, Katharina Kaiser, and Silvia Miksch
+ :institution: Institute of Software Technology & Interactive
+ Systems, Vienna University of Technology, Vienna, Austria
+ :url: ?
+ :comments:
+
+ A paper that describes heuristics to implement the automatic
+ extraction of tables from PDF files. The input consists in a list
+ of (text, x, y, width, height, font) entries for all the parcels
+ of text that are present in a PDF file. This input is generated by
+ an existing Java tool. I read this paper in order to implement it
+ as an exercise for programming in Haskell.
+
+
+[book]
+ :title: Options, Futures and Other Derivatives
+ :edition: 6th
+ :author: John C. Hull
+ :hardcover: 816 pages
+ :publisher: Prentice Hall; 6 edition (June 10, 2005)
+ :isbn-13: 978-0131499089
+ :comments:
+
+ This is considered the basic reference textbook in finance. I
+ found it to be very clearly written and to obtain a wide breadth
+ of material.
+
+
+
+[book]
+ :title: Probability Theory
+ :subtitle: The Logic of Science
+ :authors: E.T. Jaynes, G. Larry Bretthorst
+ :isbn: 978-0521592710
+ :comments:
+
+ Ref.ed by traders at GS/Japan. Ref. awesome book on statistic,
+ with an admitted bent on the “statistics should be Bayesian” idea.
+
+
+Other definitions, which should not be part of the database.
+============================================================
+
+:gambit
+ 1. An opening in chess in which a minor piece, or pieces, usually a pawn,
+ is offered in exchange for a favorable position.
+ 2. A maneuver, stratagem, or ploy, especially one used at an initial
+ stage.
+ 3. A remark intended to open a conversation.
+
+:specious
+ 1. Having the ring of truth or plausibility but actually fallacious: a
+ specious argument.
+ 2. Deceptively attractive.
+
+
+Some other type of object
+=========================
+
+[addr]
+ :n: La Croix Bleue (Assurances) - Blue Cross (Insurance)
+ :e: info@qc.croixbleue.ca
+ :p: +1.514.286.8403 Information Et Ventes Directes,
+ Assurance individuelle santé et voyage
+ :p: +1.514.286.8411, +1.800.361.6068
+ (urgence, et ventes quand fermé, a frais virés, pas de problème)
+ :f: +1.514.286.8358
+ :p: (other) +1.514.286.7682, +1.877.286.7682, +1.800.361.5706
+ :x: print
+
+[addr]
+ :n: Australian Taxation Office
+ :p: General - 13 2861
+ :p: GST enquiries - 13 6140
+ :p: Australian Business Number enquiries - 13 2478
+ :w: http://www.ato.gov.au/
+ :w: http://www.taxinstitute.com.au/
+
+