diff options
author | wiemann <wiemann@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2006-01-09 20:44:25 +0000 |
---|---|---|
committer | wiemann <wiemann@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2006-01-09 20:44:25 +0000 |
commit | d77fdfef70e08114f57cbef5d91707df8717ea9f (patch) | |
tree | 49444e3486c0c333cb7b33dfa721296c08ee4ece /docutils/transforms/frontmatter.py | |
parent | 53cd16ca6ca5f638cbe5956988e88f9339e355cf (diff) | |
parent | 3993c4097756e9885bcfbd07cb1cc1e4e95e50e4 (diff) | |
download | docutils-0.4.tar.gz |
Release 0.4: tagging released revisiondocutils-0.4
git-svn-id: http://svn.code.sf.net/p/docutils/code/tags/docutils-0.4@4268 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
Diffstat (limited to 'docutils/transforms/frontmatter.py')
-rw-r--r-- | docutils/transforms/frontmatter.py | 514 |
1 files changed, 514 insertions, 0 deletions
diff --git a/docutils/transforms/frontmatter.py b/docutils/transforms/frontmatter.py new file mode 100644 index 000000000..14a3aa8cc --- /dev/null +++ b/docutils/transforms/frontmatter.py @@ -0,0 +1,514 @@ +# Authors: David Goodger, Ueli Schlaepfer +# Contact: goodger@users.sourceforge.net +# Revision: $Revision$ +# Date: $Date$ +# Copyright: This module has been placed in the public domain. + +""" +Transforms related to the front matter of a document or a section +(information found before the main text): + +- `DocTitle`: Used to transform a lone top level section's title to + the document title, promote a remaining lone top-level section's + title to the document subtitle, and determine the document's title + metadata (document['title']) based on the document title and/or the + "title" setting. + +- `SectionSubTitle`: Used to transform a lone subsection into a + subtitle. + +- `DocInfo`: Used to transform a bibliographic field list into docinfo + elements. +""" + +__docformat__ = 'reStructuredText' + +import re +from docutils import nodes, utils +from docutils.transforms import TransformError, Transform + + +class TitlePromoter(Transform): + + """ + Abstract base class for DocTitle and SectionSubTitle transforms. + """ + + def promote_title(self, node): + """ + Transform the following tree:: + + <node> + <section> + <title> + ... + + into :: + + <node> + <title> + ... + + `node` is normally a document. + """ + # `node` must not have a title yet. + assert not (len(node) and isinstance(node[0], nodes.title)) + section, index = self.candidate_index(node) + if index is None: + return None + # Transfer the section's attributes to the node: + node.attributes.update(section.attributes) + # setup_child is called automatically for all nodes. + node[:] = (section[:1] # section title + + node[:index] # everything that was in the + # node before the section + + section[1:]) # everything that was in the section + assert isinstance(node[0], nodes.title) + return 1 + + def promote_subtitle(self, node): + """ + Transform the following node tree:: + + <node> + <title> + <section> + <title> + ... + + into :: + + <node> + <title> + <subtitle> + ... + """ + subsection, index = self.candidate_index(node) + if index is None: + return None + subtitle = nodes.subtitle() + # Transfer the subsection's attributes to the new subtitle: + # This causes trouble with list attributes! To do: Write a + # test case which catches direct access to the `attributes` + # dictionary and/or write a test case which shows problems in + # this particular case. + subtitle.attributes.update(subsection.attributes) + # We're losing the subtitle's attributes here! To do: Write a + # test case which shows this behavior. + # Transfer the contents of the subsection's title to the + # subtitle: + subtitle[:] = subsection[0][:] + node[:] = (node[:1] # title + + [subtitle] + # everything that was before the section: + + node[1:index] + # everything that was in the subsection: + + subsection[1:]) + return 1 + + def candidate_index(self, node): + """ + Find and return the promotion candidate and its index. + + Return (None, None) if no valid candidate was found. + """ + index = node.first_child_not_matching_class( + nodes.PreBibliographic) + if index is None or len(node) > (index + 1) or \ + not isinstance(node[index], nodes.section): + return None, None + else: + return node[index], index + + +class DocTitle(TitlePromoter): + + """ + In reStructuredText_, there is no way to specify a document title + and subtitle explicitly. Instead, we can supply the document title + (and possibly the subtitle as well) implicitly, and use this + two-step transform to "raise" or "promote" the title(s) (and their + corresponding section contents) to the document level. + + 1. If the document contains a single top-level section as its + first non-comment element, the top-level section's title + becomes the document's title, and the top-level section's + contents become the document's immediate contents. The lone + top-level section header must be the first non-comment element + in the document. + + For example, take this input text:: + + ================= + Top-Level Title + ================= + + A paragraph. + + Once parsed, it looks like this:: + + <document> + <section names="top-level title"> + <title> + Top-Level Title + <paragraph> + A paragraph. + + After running the DocTitle transform, we have:: + + <document names="top-level title"> + <title> + Top-Level Title + <paragraph> + A paragraph. + + 2. If step 1 successfully determines the document title, we + continue by checking for a subtitle. + + If the lone top-level section itself contains a single + second-level section as its first non-comment element, that + section's title is promoted to the document's subtitle, and + that section's contents become the document's immediate + contents. Given this input text:: + + ================= + Top-Level Title + ================= + + Second-Level Title + ~~~~~~~~~~~~~~~~~~ + + A paragraph. + + After parsing and running the Section Promotion transform, the + result is:: + + <document names="top-level title"> + <title> + Top-Level Title + <subtitle names="second-level title"> + Second-Level Title + <paragraph> + A paragraph. + + (Note that the implicit hyperlink target generated by the + "Second-Level Title" is preserved on the "subtitle" element + itself.) + + Any comment elements occurring before the document title or + subtitle are accumulated and inserted as the first body elements + after the title(s). + + This transform also sets the document's metadata title + (document['title']). + + .. _reStructuredText: http://docutils.sf.net/rst.html + """ + + default_priority = 320 + + def set_metadata(self): + """ + Set document['title'] metadata title from the following + sources, listed in order of priority: + + * Existing document['title'] attribute. + * "title" setting. + * Document title node (as promoted by promote_title). + """ + if not self.document.hasattr('title'): + if self.document.settings.title is not None: + self.document['title'] = self.document.settings.title + elif len(self.document) and isinstance(self.document[0], nodes.title): + self.document['title'] = self.document[0].astext() + + def apply(self): + if getattr(self.document.settings, 'doctitle_xform', 1): + # promote_(sub)title defined in TitlePromoter base class. + if self.promote_title(self.document): + # If a title has been promoted, also try to promote a + # subtitle. + self.promote_subtitle(self.document) + # Set document['title']. + self.set_metadata() + + +class SectionSubTitle(TitlePromoter): + + """ + This works like document subtitles, but for sections. For example, :: + + <section> + <title> + Title + <section> + <title> + Subtitle + ... + + is transformed into :: + + <section> + <title> + Title + <subtitle> + Subtitle + ... + + For details refer to the docstring of DocTitle. + """ + + default_priority = 350 + + def apply(self): + if not getattr(self.document.settings, 'sectsubtitle_xform', 1): + return + for section in self.document.traverse(nodes.section): + # On our way through the node tree, we are deleting + # sections, but we call self.promote_subtitle for those + # sections nonetheless. To do: Write a test case which + # shows the problem and discuss on Docutils-develop. + self.promote_subtitle(section) + + +class DocInfo(Transform): + + """ + This transform is specific to the reStructuredText_ markup syntax; + see "Bibliographic Fields" in the `reStructuredText Markup + Specification`_ for a high-level description. This transform + should be run *after* the `DocTitle` transform. + + Given a field list as the first non-comment element after the + document title and subtitle (if present), registered bibliographic + field names are transformed to the corresponding DTD elements, + becoming child elements of the "docinfo" element (except for a + dedication and/or an abstract, which become "topic" elements after + "docinfo"). + + For example, given this document fragment after parsing:: + + <document> + <title> + Document Title + <field_list> + <field> + <field_name> + Author + <field_body> + <paragraph> + A. Name + <field> + <field_name> + Status + <field_body> + <paragraph> + $RCSfile$ + ... + + After running the bibliographic field list transform, the + resulting document tree would look like this:: + + <document> + <title> + Document Title + <docinfo> + <author> + A. Name + <status> + frontmatter.py + ... + + The "Status" field contained an expanded RCS keyword, which is + normally (but optionally) cleaned up by the transform. The sole + contents of the field body must be a paragraph containing an + expanded RCS keyword of the form "$keyword: expansion text $". Any + RCS keyword can be processed in any bibliographic field. The + dollar signs and leading RCS keyword name are removed. Extra + processing is done for the following RCS keywords: + + - "RCSfile" expands to the name of the file in the RCS or CVS + repository, which is the name of the source file with a ",v" + suffix appended. The transform will remove the ",v" suffix. + + - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC + time zone). The RCS Keywords transform will extract just the + date itself and transform it to an ISO 8601 format date, as in + "2000-12-31". + + (Since the source file for this text is itself stored under CVS, + we can't show an example of the "Date" RCS keyword because we + can't prevent any RCS keywords used in this explanation from + being expanded. Only the "RCSfile" keyword is stable; its + expansion text changes only if the file name changes.) + + .. _reStructuredText: http://docutils.sf.net/rst.html + .. _reStructuredText Markup Specification: + http://docutils.sf.net/docs/ref/rst/restructuredtext.html + """ + + default_priority = 340 + + biblio_nodes = { + 'author': nodes.author, + 'authors': nodes.authors, + 'organization': nodes.organization, + 'address': nodes.address, + 'contact': nodes.contact, + 'version': nodes.version, + 'revision': nodes.revision, + 'status': nodes.status, + 'date': nodes.date, + 'copyright': nodes.copyright, + 'dedication': nodes.topic, + 'abstract': nodes.topic} + """Canonical field name (lowcased) to node class name mapping for + bibliographic fields (field_list).""" + + def apply(self): + if not getattr(self.document.settings, 'docinfo_xform', 1): + return + document = self.document + index = document.first_child_not_matching_class( + nodes.PreBibliographic) + if index is None: + return + candidate = document[index] + if isinstance(candidate, nodes.field_list): + biblioindex = document.first_child_not_matching_class( + (nodes.Titular, nodes.Decorative)) + nodelist = self.extract_bibliographic(candidate) + del document[index] # untransformed field list (candidate) + document[biblioindex:biblioindex] = nodelist + + def extract_bibliographic(self, field_list): + docinfo = nodes.docinfo() + bibliofields = self.language.bibliographic_fields + labels = self.language.labels + topics = {'dedication': None, 'abstract': None} + for field in field_list: + try: + name = field[0][0].astext() + normedname = nodes.fully_normalize_name(name) + if not (len(field) == 2 and bibliofields.has_key(normedname) + and self.check_empty_biblio_field(field, name)): + raise TransformError + canonical = bibliofields[normedname] + biblioclass = self.biblio_nodes[canonical] + if issubclass(biblioclass, nodes.TextElement): + if not self.check_compound_biblio_field(field, name): + raise TransformError + utils.clean_rcs_keywords( + field[1][0], self.rcs_keyword_substitutions) + docinfo.append(biblioclass('', '', *field[1][0])) + elif issubclass(biblioclass, nodes.authors): + self.extract_authors(field, name, docinfo) + elif issubclass(biblioclass, nodes.topic): + if topics[canonical]: + field[-1] += self.document.reporter.warning( + 'There can only be one "%s" field.' % name, + base_node=field) + raise TransformError + title = nodes.title(name, labels[canonical]) + topics[canonical] = biblioclass( + '', title, classes=[canonical], *field[1].children) + else: + docinfo.append(biblioclass('', *field[1].children)) + except TransformError: + if len(field[-1]) == 1 \ + and isinstance(field[-1][0], nodes.paragraph): + utils.clean_rcs_keywords( + field[-1][0], self.rcs_keyword_substitutions) + docinfo.append(field) + nodelist = [] + if len(docinfo) != 0: + nodelist.append(docinfo) + for name in ('dedication', 'abstract'): + if topics[name]: + nodelist.append(topics[name]) + return nodelist + + def check_empty_biblio_field(self, field, name): + if len(field[-1]) < 1: + field[-1] += self.document.reporter.warning( + 'Cannot extract empty bibliographic field "%s".' % name, + base_node=field) + return None + return 1 + + def check_compound_biblio_field(self, field, name): + if len(field[-1]) > 1: + field[-1] += self.document.reporter.warning( + 'Cannot extract compound bibliographic field "%s".' % name, + base_node=field) + return None + if not isinstance(field[-1][0], nodes.paragraph): + field[-1] += self.document.reporter.warning( + 'Cannot extract bibliographic field "%s" containing ' + 'anything other than a single paragraph.' % name, + base_node=field) + return None + return 1 + + rcs_keyword_substitutions = [ + (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+' + r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'), + (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'), + (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1'),] + + def extract_authors(self, field, name, docinfo): + try: + if len(field[1]) == 1: + if isinstance(field[1][0], nodes.paragraph): + authors = self.authors_from_one_paragraph(field) + elif isinstance(field[1][0], nodes.bullet_list): + authors = self.authors_from_bullet_list(field) + else: + raise TransformError + else: + authors = self.authors_from_paragraphs(field) + authornodes = [nodes.author('', '', *author) + for author in authors if author] + if len(authornodes) >= 1: + docinfo.append(nodes.authors('', *authornodes)) + else: + raise TransformError + except TransformError: + field[-1] += self.document.reporter.warning( + 'Bibliographic field "%s" incompatible with extraction: ' + 'it must contain either a single paragraph (with authors ' + 'separated by one of "%s"), multiple paragraphs (one per ' + 'author), or a bullet list with one paragraph (one author) ' + 'per item.' + % (name, ''.join(self.language.author_separators)), + base_node=field) + raise + + def authors_from_one_paragraph(self, field): + text = field[1][0].astext().strip() + if not text: + raise TransformError + for authorsep in self.language.author_separators: + authornames = text.split(authorsep) + if len(authornames) > 1: + break + authornames = [author.strip() for author in authornames] + authors = [[nodes.Text(author)] for author in authornames if author] + return authors + + def authors_from_bullet_list(self, field): + authors = [] + for item in field[1][0]: + if len(item) != 1 or not isinstance(item[0], nodes.paragraph): + raise TransformError + authors.append(item[0].children) + if not authors: + raise TransformError + return authors + + def authors_from_paragraphs(self, field): + for item in field[1]: + if not isinstance(item, nodes.paragraph): + raise TransformError + authors = [item.children for item in field[1]] + return authors |