# easydb.py
from operator import itemgetter
from collections import namedtuple  # for Python >= 2.6


def get_table_from_db(cursor, query_templ, query_args=(), ntuple=None):
    """Run a query and yield its result as a table of named tuples.

    The first record yielded is the header: it holds the column names taken
    from ``cursor.description``.  Every following record holds one row
    returned by ``cursor.fetchall()``.

    :param cursor: a DB API 2 cursor
    :param query_templ: the SQL text, possibly containing placeholders
    :param query_args: arguments for the placeholders; when empty the query
        is executed without arguments
    :param ntuple: an optional pre-defined namedtuple class used for the
        records; when omitted a ``DBTuple`` class is generated from the
        column names
    """
    if query_args:
        cursor.execute(query_templ, query_args)
    else:
        cursor.execute(query_templ)
    rows = cursor.fetchall()
    # The names are consumed twice (class creation and header row), so they
    # must be a real list: on Python 3 a bare map() is a one-shot iterator
    # and would leave the header row empty.
    fields = list(map(itemgetter(0), cursor.description))
    Ntuple = ntuple or namedtuple('DBTuple', fields)
    yield Ntuple(*fields)
    for row in rows:
        yield Ntuple(*row)


if __name__ == '__main__':  # test
    from sqlite3 import dbapi2
    conn = dbapi2.connect(':memory:')
    conn.execute('create table test(id integer, descr varchar)')
    conn.execute("insert into test values (1,'one')")
    conn.execute("insert into test values (2,'two')")
    for rec in get_table_from_db(conn.cursor(), 'select * from test'):
        print(rec)
--git a/artima/python/records2.py b/artima/python/records2.py new file mode 100644 index 0000000..281d492 --- /dev/null +++ b/artima/python/records2.py @@ -0,0 +1,294 @@ +# .. -*- coding: utf-8 -*- +""" +In the previous installment I discussed the namedtuple_ type which was +introduced in the standard library with Python 2.6 (if you are using +an older Python version you can just download the original +`Hettinger's recipe`_). In this installment I will use namedtuples to manage +the records coming from a database and I will give some advice on +how to process and how to display such records. + +How to dump a database table +------------------------------------------------------------------- + +The simplest approach to extract the content of a database table is +to convert it into a sequence of named tuples. We can do so by +defining a function +``get_table_from_db`` analogous to the ``get_table`` function we discussed +in the first installment of this series. I assume here a working familiarity +with the `DB API 2`_ (aka PEP 249), the standard way to interact +with a relational database from Python: + +$$easydb + +Notice in particular the line + +.. code-block:: python + + fields = map(itemgetter(0), cursor.description) + +Here, we are extracting the field names from the ``.description`` attribute +of the cursor, which returns a list of tuples. We just take the first +element of each tuple by using ``itemgetter``, a utility function +defined in the operator_ module. You could do the same with a list +comprehension ``fields = [x[0] for x in cursor.description]`` but +``itemgetter`` is the most idiomatic solution. + +The example here uses the SQLite_ database, since drivers for it are +included in the standard library starting from Python 2.5; however, +you can easily adapt the code to any other database.
+Finally, let me note that if you know the database schema in +advance, you can just pass a pre-defined namedtuple to +``get_table_from_db``: there is no need to autogenerate it +from the query. This is useful if you want to give aliases to +the field names, especially in case the name of a column +conflicts with a Python keyword. + +If you run the script you will get:: + + $ python easydb.py + DBTuple(id='id', descr='descr') + DBTuple(id=1, descr=u"one") + DBTuple(id=2, descr=u"two") + + +.. _DB API 2: http://www.python.org/dev/peps/pep-0249/ +.. _SQLite: http://www.sqlite.org/ + +A higher level approach +---------------------------------------------------------- + +Using the DB API 2 is a very low level approach; nowadays most people +prefer to use an Object Relational Mapper (ORM); the most powerful +one is SQLAlchemy_. Using SQLAlchemy my example can be +rewritten as + +.. include-code:: sa.py + +I am not a fan of ORMs. I find them too sophisticated +(*simple is better than complex*) +and they hide the SQL from the programmer (*explicit is better than implicit*). +That said, I am the first to say that there is a real need for an +official DB API 3, higher level than the DB API 2, without +being an ORM. In practice, I would like to have an equivalent +of the SQLAlchemy engine in the standard library, and I would like the *recordset* +returned by a query to be made of namedtuples, not of ordinary +tuples. + +.. _SQLAlchemy: http://www.sqlalchemy.org/ +.. _ORM: http://en.wikipedia.org/wiki/Object-relational_mapping + +Generare tabelle +----------------------------------------------------- + +Un lavoro comunissimo è quello di leggere dei dati da un database, +processarli e produrre come output una tabella di risultati. L'output +potrebbe essere un file CSV di numeri da usare per un grafico +oppure semplicemente una tabella HTML da +pubblicare nel sito aziendale.
Un workflow tipico è il +seguente, da leggere dall'alto verso il basso:: + + + | + | get_table + | + + | + | processor + | + + | + | processor + | + + | + | renderer + | + + +Il processore è un oggetto che prende +una tabella in ingresso e ritorna una tabella in uscita, eventualmente con +un numero di righe e/o di colonne diverso da quello in ingresso. +Siccome le tabelle sono degli oggetti iterabili, è naturale implementare +un processore in Python tramite un generatore che prende +un iterabile e ritorna un iterabile. +In generale, vi possono essere più processori che agiscono uno dopo l'altro e +quindi più tabelle intermedie. L'ultimo processore ritorna la tabella +finale che viene successivamente convertita in una stringa e +salvata in un file, in formato CSV, HTML, XML o altro. + +Per esempio, supponiamo di voler generare una tabella HTML. +In tal caso ci serve un (pre)processore che converte una tabella di +record astratti in una tabella di record concreti, che non sono +altro che sequenze di stringhe in cui i caratteri speciali +dell'HTML sono stati *escaped*; tale processore può essere +implementato come un semplice generatore: + +$$htmlescape + +Si noti che ``htmlescape`` è un processore del tutto generico che +non ha neppure bisogno che i record in ingresso siano delle namedtuple: +è sufficiente che siano delle sequenze generiche. + +Il renderer finale può essere implementato come segue: + +$$HtmlTable + +Notate che ``HtmlTable`` può essere interpretato anche come un processore, +visto che ``HtmlTable(table)`` è un oggetto iterabile che ritorna +blocchi di codice HTML. Il metodo ``.render`` può essere pensato +come il renderizzatore di default, ma è possibile usare dei renderizzatori +più sofisticati, in almeno due modi: + +1. tramite l'ereditarietà, ovvero derivando una sottoclasse di ``HtmlTable`` + e sovrascrivendo il metodo ``render``; + +2.
in maniera funzionale, usando ``HtmlTable`` come un processore e passando + il suo output ad un renderizzatore completamente indipendente. + +Entrambe le possibilità hanno dei pro e dei contro, ma +l'approccio funzionale è più indicato se lo scopo finale +è quello di disaccoppiare il codice. Inoltre, la composizione funzionale +è concettualmente più leggera di una gerarchia di ereditarietà. +Questo assicura semplicità e maggiore scalabilità a casi più complessi. + +È banale verificare che il tutto funziona con un semplice test: + +$$test + +In questo esempio ``get_test_table`` legge la tabella iniziale, ``htmlescape`` +è il processore e ``HtmlTable`` è il renderer. Eseguendo il test si ottiene +la tabella seguente: + +.. raw:: html + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ABCD
1234
5678
><&"
+
+ +È chiaro che l'approccio che ho delineato in questo articolo è del tutto +generale e si applica direttamente anche ad altri casi; lascio come +esercizio scrivere un processore/renderizzatore che converte +in formato XML, LaTeX o CSV. + +I lettori delle `Avventure di un Pythonista in Schemeland`_ avranno +riconosciuto l'influenza della programmazione funzionale. +Non si tratta di un caso fortuito: io +sono dell'idea che la conoscenza di linguaggi non-mainstream sia molto +utile anche quando si programma esclusivamente in linguaggi +mainstream. In particolare, la conoscenza dei linguaggi funzionali vi +permette di mettere in dubbio concetti che paiono dogmi indiscutibili +in certi ambienti (tipo la "bontà" della programmazione ad oggetti) e +di aprirvi a design alternativi. Non è un caso neppure il fatto che +Python (che fin dall'inizio non è mai stato un linguaggio a oggetti +bigotto alla Java) si stia muovendo sempre più verso soluzioni +funzionali, sia nel linguaggio core (*list comprehensions*, +*generator expressions*, *tuple unpacking*, ecc) che nelle librerie +(*itertools*, *namedtuple*, ecc). + +La miniserie non finisce qui: c'è ancora molto da dire sul +problema della visualizzazione di tabelle e a questo argomento +dedicheremo interamente la terza ed ultima parte. Ci vediamo +alla prossima, *happy hacking*! + +.. _Avventure di un Pythonista in Schemeland: http://stacktrace.it/articoli/2008/02/le-avventure-di-un-pythonista-schemeland-1/ +.. _operator: http://docs.python.org/library/operator.html +.. _namedtuple: http://docs.python.org/library/collections#collections.namedtuple +.. _Hettinger's recipe: http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/500261 +""" + +import os, cgi, easydb +from tabular_data import headtail + +class HtmlTable(object): + "Convert a sequence header+body into a HTML table" + # this is just a pedagogic implementation, in a real implementation + # you should not hard-code your css at the Python level.
+ name = "noname" + border = "1" + summary = "" + css = """\ + + """ + def __init__(self, header_plus_body): + self.header, self.body = headtail(header_plus_body) + + def render(self): + join = os.linesep.join + templ = '''\ + %s + + %%s +
''' % (self.css, self.name, self.border, self.summary) + head, tail = headtail(self) # post-processed head and tail + h = '\n%s\n\n' % join(head) + b = '\n%s\n\n' % join(join(r) for r in tail) + return templ % (h+b) + + def __iter__(self): + yield [''] + ['%s' % h for h in self.header] + [''] + for r, row in enumerate(self.body): + ls = ['' % ["even", "odd"][r % 2]] + for col in row: + ls.append('%s' % col) + ls.append('') + yield ls + +def htmlescape(table): + "Converts a table of records into a table of HTML-escaped records" + for rec in table: + yield [cgi.escape(str(field), quote=True) for field in rec] + +def test(): + page = """\ + + + + + %s + + + """ + def get_test_table(): + return 'ABCD', '1234', '5678', '><&"' + t = HtmlTable(htmlescape(get_test_table())) + print >> file('output.html', 'w'), page % t.render() + +if __name__ == '__main__': + test() diff --git a/artima/python/records3.py b/artima/python/records3.py new file mode 100644 index 0000000..1a74f29 --- /dev/null +++ b/artima/python/records3.py @@ -0,0 +1,328 @@ +r"""\ +In the first two installments of this series +I discussed how to read and process homogeneous records. +In this final installment I will discuss non-homogeneous records and +we will devise a small framework to convert text records into CSV, HTML, +XML or other formats. *En passant*, I will discuss various object oriented +techniques and patterns. + +.. figure:: http://www.phyast.pitt.edu/~micheles/python/patchwork1.jpg + :width: 300 + + Fig 1: object-oriented design + +A micro-framework to convert records into text +------------------------------------------------------------------- + +It is well known that I am not a framework lover and there are certainly +many Python programmers sharing this attitude, starting from Guido. +Actually, my dislike of frameworks is directly proportional to their +size: I hate the mega-frameworks, I tolerate the medium-sized frameworks +and I rather like the micro-frameworks.
In this installment I will define +a micro-framework to render non-homogeneous records into text. The +framework is based on the `template pattern`_: in order to define a +renderer class, the programmer inherits from a mother class ``RecordRendererABC`` +and fills in the rendering methods: then the framework will automatically +call them but without too much magic. + +This approach is acceptable only when the base class is simple: it is much +less acceptable when you start already from a deep hierarchy. For me a +hierarchy is deep if there are more than two levels: if looking at +mother and children is not enough, and I am forced to look even at +the grandparent classes, the framework is already too complex. + +Inheritance-based frameworks have the tendency to go out of control, +because it becomes natural to extend the hierarchy too much. In +traditional object-oriented languages it is quite natural to use +inheritance, but as I said elsewhere one should always keep in mind +that alternatives are always possible (a notable new language *without* +inheritance is Go). + +Anyway, one should not fight the language she is using: in Python the +`template pattern`_ is a perfectly reasonable approach. + +.. figure:: http://www.phyast.pitt.edu/~micheles/python/patchwork2.jpg + + Fig 2: the *template pattern* + +To convert into text a non-homogeneous +record with *N* fields requires in general *N+1* +functions: *N* functions to convert the fields and a function to convert +the full record. It is natural to group the needed functions as methods +of a renderer class: the *N* field-associated rendering functions will +be methods converting values into strings, whereas the *(N+1)*-th function +will be a ``.render`` method converting the record of strings so obtained +into a single string. We will use a base class called ``RecordRendererABC``, +where the ABC suffix means *Abstract Base Class*.
+ +I should point out that an Abstract Base Class in Python can provide +concrete methods to its subclasses and therefore the meaning of ABC +in Python is different than in C++/Java: a Python ABC is a mixin class, +which can provide implementation; it is not necessarily pure interface. + +For instance, suppose we want to convert an Article record + + ``Article = namedtuple("Article", "title author pubdate")`` + +into CSV format. + +How do we proceed? First of all we define a suitable subclass of +``RecordRendererABC``: + +$$CSVArticleRenderer + +Notice that ``CSVArticleRenderer`` defines a ``.schema`` class +attribute, a namedtuple containing the names of the rendering methods. + +In this example both title and author are converted by using the ``.str`` +method, inherited from the base class, whereas the publication date +is converted by using the ``.isodate`` method, which is defined +directly in the ``CSVArticleRenderer`` class. +The ``.render`` method is inherited and converts the input namedtuple +into a string by converting into strings the fields with the corresponding +methods and by joining the results, using a comma as separator. +Here is an example: + + >>> a = Article("test title", "test author", datetime(2008, 05, 15)) + >>> r = CSVArticleRenderer(a) + +The ``.render`` method works as expected: + + >>> print r.render() + test title,test author,2008-05-15 + +By default the separator (``delimiter``) is set to the empty string ''. +This is useful for implementing different renderers. For instance, +suppose we want to define a renderer converting the articles into HTML +format. Suppose we defined three CSS classes ``title``, ``author`` and +``pubdate`` to visualize the different fields in different ways, for +instance with different colors. 
We could define a renderer using the +CSS classes as follows: + +$$HTMLArticleRenderer + +Here is how the renderer works: + + >>> r = HTMLArticleRenderer(a) + >>> print r.render() + <span class="title">test title</span> + <span class="author">test author</span> + <span class="pubdate">2008-05-15</span> + +Design notes +-------------------------------------------------------------------- + +Having discussed the usage of the framework, it is now the time to +discuss the implementation of the base class and the reasons for the +design choices I made. +Here is the source code for ``RecordRendererABC``: + +$$RecordRendererABC + +Let me start from the constructor. The ``__init__`` method accepts as input +a single argument, a sequence with length equal to the length of the schema. +The input sequence *is not required to be a namedtuple*: there is no type check +such as ``isinstance(input, self.schema.__class__)``. +A type check here would be a design mistake, since it would restrict without +reason the field of applicability of the renderer and it would force +the users to perform type conversions without need. The only requirement for +the ``input`` object is that ``zip(self.schema, input)`` must not raise +an exception: in other words, it is enough that ``input`` has the right +length. + +Actually ``zip(self.schema, input)`` would not raise an error even if +``input`` had a different length. This is potentially dangerous. +For instance, imagine that for some reason (say a programmer error) +we passed a sequence of length zero: then ``zip(self.schema, input)`` +would silently return an empty list. Since *errors should never pass silently*, +I decided to add a check on the length: in this way if there is an error +we see it immediately, at instantiation time, and not too late, +when we start iterating on the renderer. It is always better to discover +errors early. + +On the other hand, it is best to not exaggerate with the checks.
For instance, +if ``.input`` is a list, it is theoretically possible for an evil programmer +to modify the list *after* instantiation, by adding or removing elements. +Then ``zip(self.schema, input)`` could behave in an unexpected way. +However, there is no way to protect against evil (or just bad) programmers. +Even if we replaced ``.input`` with a tuple, which is immutable, its +size could always be changed, simply by overriding the ``.input`` +attribute after instantiation. + +The Python way is to limit the checks to the ones dictated by common +sense, intended to limit accidental errors which are likely to happen: +for the rest, the attitude is to trust the user. Checks motivated by +paranoia and lack of trust in the user are not to be introduced, since +in a dynamic language the user can do whatever she wants anyway. +The attitude is borrowed from the `spirit of C`_ (*trust the programmer*). +According to this maxim I decided not to add additional checks. + +In special cases (for instance if you are implementing a subclass of +``RecordRendererABC`` which requires ``.input`` to be a record) +it may be sensible to introduce some additional check. For instance +you may want to ensure that ``.input`` is a record with the right +fields. However, even in this case it is best not to introduce a +type check like +``isinstance(input, self.schema.__class__)``; you can instead +use a lighter check like ``input._fields == self.schema._fields``: +in this way any object with the right fields would be accepted, +not just a namedtuple. The basic idea is to follow the +`duck typing`_ principle: don't be too picky and +accept as good anything with the needed +attributes. + +In this logic you may want to enlarge even more the field of +acceptable objects: for instance a dictionary-like object +with the right keys could act as a substitute for a record.
+We could implement such a feature by adding an ``if`` in the +``__init__`` method, by introducing a special case when the input object +is a dictionary. But that would be bad programming: the point of object +oriented programming is to avoid ``ifs`` and to replace them with methods. +In our example, we should remember that Python provides a *classmethod* +construct, whose *raison d'être* is exactly to manage this use case: +it allows the programmer to define alternate constructors, without +the need for complicating the ``__init__`` method. Using alternate +constructors is called the `factory method pattern`_ and it is one of +the basic techniques of OOP. The advantages are clear, especially +in terms of simplicity and ease of maintenance, but also from the +point of view of code reuse and extensibility. + +.. figure:: http://www.phyast.pitt.edu/~micheles/python/patchwork3.jpg + + Fig 3: the *factory method pattern* + +In our example dictionaries are rendered through the ``.frommap`` +classmethod: + + >>> r = CSVArticleRenderer.frommap(dict( + ... title="test title", author="test author", + ... pubdate=datetime(2008, 05, 15))) + +There is also a ``.fromobj`` classmethod accepting as input any +object with a set of attributes which is a superset (proper or +improper) of the schema's attributes. This is pure *duck typing*. +If the object lacks an attribute, we will get an ``AttributeError`` +at instantiation time, an absolutely clear and telling error message; +on the other hand, if the object has enough attributes, it will be +automatically converted into a namedtuple. + +The base class also defines the special methods ``__iter__`` and ``__len__``: +therefore each renderer instance is a sequence of fixed length and can be +passed as input to another renderer. In other words, renderers are +composable in the functional sense.
+ +Renderers are actually homogeneous records with fields which are strings +and can be passed to the ``HtmlTable`` object defined in the previous +installment. It is trivial to convert a renderer into a list of strings: +thanks to the ``__iter__`` method, ``list(renderer)`` works as expected +(idem for ``tuple(renderer)`` and ``len(renderer)``). ``list``, ``tuple`` and +``len`` are actually builtin generic functions which play well with +*duck typing* and are definable for any custom object. + +It was good to discuss what was implemented in ``RecordRendererABC``; +it is also interesting to discuss what was *not* implemented. +In particular, I did not implement the renderers as namedtuples. +I wanted to avoid the *blob* antipattern_, when you have a class which +is everything to everybody. I wanted to keep namedtuples simple, without +adding any methods to them: renderers are logically an independent concept, +even if they can be converted into namedtuples, being iterable. + +.. figure:: http://www.phyast.pitt.edu/~micheles/python/blob.jpg + :width: 350 + + Fig 4: the *blob antipattern* + +I did define ``CSVArticleRenderer`` and ``HTMLArticleRenderer`` as +subclasses of ``RecordRendererABC``. An alternate design could have +introduced different abstract intermediate subclasses, depending on the output +format: for instance ``CSVRecordRenderer``, ``HTMLRecordRenderer``, +``XMLRecordRenderer``, etc. However I have decided to follow strictly +the rule that *flat is better than nested*, and to keep the hierarchies +as short as possible. +Actually in Python 2.6+ one could define three abstract interfaces +``CSVRecordRenderer``, ``HTMLRecordRenderer`` and ``XMLRecordRenderer`` +and one could register her concrete classes with such interfaces: this +can be done without using inheritance and by keeping the hierarchy flat. + +.. _spirit of C: http://www.artima.com/cppsource/spiritofc.html +.. _duck typing: http://en.wikipedia.org/wiki/Duck_typing +..
import os
from collections import namedtuple
from datetime import datetime
from xml.sax.saxutils import escape as xml_escape

try:  # Python 3.2+ (cgi.escape was removed in Python 3.8)
    from html import escape as html_escape
except ImportError:  # Python 2
    from cgi import escape as html_escape


# in Python 2.6 use abstractmethod, abstractproperty instead
class notimplemented(object):
    "Descriptor raising a meaningful error message for nonoverridden attributes"

    def __init__(self, message):
        self.message = message

    def __get__(self, obj, objcls=None):
        raise NotImplementedError(self.message)


class RecordRendererABC(object):
    """Base class for renderers converting a record into text.

    A subclass sets ``schema`` to a namedtuple whose values are the names of
    the converter methods, one per field, and ``delimiter`` to the string
    used by ``render`` to join the converted fields.  A renderer instance is
    iterable (it yields the converted fields) and has a length (the number
    of fields in the schema).
    """
    schema = ()  # a namedtuple specifying the names of the converters
    delimiter = ''

    @classmethod
    def frommap(cls, kw):
        "Alternate constructor: build the record from a mapping name->value"
        return cls(cls.schema.__class__(**kw))

    @classmethod
    def fromobj(cls, obj):
        """Alternate constructor: build the record from the attributes of obj.

        Raises AttributeError if obj lacks one of the schema's fields.
        """
        Schema = cls.schema.__class__
        nt = Schema._make(getattr(obj, field) for field in Schema._fields)
        return cls(nt)

    def __init__(self, input):
        """Store the input sequence.

        Raises TypeError if its length differs from the schema's length, so
        that the mistake shows up at instantiation time and not later, when
        zip() would silently truncate.
        """
        li, ls = len(input), len(self)
        if li != ls:
            raise TypeError('%s has %d fields, expected %d' % (input, li, ls))
        self.input = input

    def __iter__(self):
        "Yield each input value converted by the method named in the schema"
        for convertername, value in zip(self.schema, self.input):
            yield getattr(self, convertername)(value)

    def __len__(self):
        return len(self.schema)

    def str(self, value):
        "Default converter: the plain string representation of the value"
        return str(value)

    def render(self):
        "Return the converted fields joined with the delimiter"
        return self.delimiter.join(self)


Article = namedtuple("Article", "title author pubdate")


class CSVArticleRenderer(RecordRendererABC):
    "Render an article as a comma-separated line"
    schema = Article("str", "str", "isodate")
    delimiter = ','

    def isodate(self, date):
        return date.isoformat()[:10]


class HTMLArticleRenderer(RecordRendererABC):
    "Render an article as three spans tagged with the CSS class of the field"
    schema = Article(title='title', author='author', pubdate="pubdate")
    delimiter = '\n'

    # quote=False keeps the behavior of the one-argument cgi.escape call:
    # the text goes into element content, so quotes need no escaping

    def title(self, title):
        return '<span class="title">%s</span>' % html_escape(
            title, quote=False)

    def author(self, author):
        return '<span class="author">%s</span>' % html_escape(
            author, quote=False)

    def pubdate(self, date):
        return '<span class="pubdate">%s</span>' % date.isoformat()[:10]


def to_xml(ntuple):
    """Convert a namedtuple into an XML fragment.

    The root element is named after the namedtuple class and contains one
    child element per field, one per line; the field values are converted
    to text and XML-escaped.
    """
    name = ntuple.__class__.__name__
    xml = ['<%s>' % name]
    for field, value in zip(ntuple._fields, ntuple):
        text = xml_escape('%s' % (value,))
        xml.append('<%s>%s</%s>' % (field, text, field))
    xml.append('</%s>' % name)
    return os.linesep.join(xml)


if __name__ == '__main__':
    import doctest
    doctest.testmod()
    HTMLArticleRenderer.fromobj(Article("a", 'b', datetime.today()))