summaryrefslogtreecommitdiff
path: root/docutils/transforms/frontmatter.py
diff options
context:
space:
mode:
Diffstat (limited to 'docutils/transforms/frontmatter.py')
-rw-r--r--docutils/transforms/frontmatter.py514
1 files changed, 514 insertions, 0 deletions
diff --git a/docutils/transforms/frontmatter.py b/docutils/transforms/frontmatter.py
new file mode 100644
index 000000000..14a3aa8cc
--- /dev/null
+++ b/docutils/transforms/frontmatter.py
@@ -0,0 +1,514 @@
+# Authors: David Goodger, Ueli Schlaepfer
+# Contact: goodger@users.sourceforge.net
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This module has been placed in the public domain.
+
+"""
+Transforms related to the front matter of a document or a section
+(information found before the main text):
+
+- `DocTitle`: Used to transform a lone top level section's title to
+ the document title, promote a remaining lone top-level section's
+ title to the document subtitle, and determine the document's title
+ metadata (document['title']) based on the document title and/or the
+ "title" setting.
+
+- `SectionSubTitle`: Used to transform a lone subsection into a
+ subtitle.
+
+- `DocInfo`: Used to transform a bibliographic field list into docinfo
+ elements.
+"""
+
+__docformat__ = 'reStructuredText'
+
+import re
+from docutils import nodes, utils
+from docutils.transforms import TransformError, Transform
+
+
+class TitlePromoter(Transform):
+
+ """
+ Abstract base class for DocTitle and SectionSubTitle transforms.
+ """
+
+ def promote_title(self, node):
+ """
+ Transform the following tree::
+
+ <node>
+ <section>
+ <title>
+ ...
+
+ into ::
+
+ <node>
+ <title>
+ ...
+
+ `node` is normally a document.
+ """
+ # `node` must not have a title yet.
+ assert not (len(node) and isinstance(node[0], nodes.title))
+ section, index = self.candidate_index(node)
+ if index is None:
+ return None
+ # Transfer the section's attributes to the node:
+ node.attributes.update(section.attributes)
+ # setup_child is called automatically for all nodes.
+ node[:] = (section[:1] # section title
+ + node[:index] # everything that was in the
+ # node before the section
+ + section[1:]) # everything that was in the section
+ assert isinstance(node[0], nodes.title)
+ return 1
+
+ def promote_subtitle(self, node):
+ """
+ Transform the following node tree::
+
+ <node>
+ <title>
+ <section>
+ <title>
+ ...
+
+ into ::
+
+ <node>
+ <title>
+ <subtitle>
+ ...
+ """
+ subsection, index = self.candidate_index(node)
+ if index is None:
+ return None
+ subtitle = nodes.subtitle()
+ # Transfer the subsection's attributes to the new subtitle:
+ # This causes trouble with list attributes! To do: Write a
+ # test case which catches direct access to the `attributes`
+ # dictionary and/or write a test case which shows problems in
+ # this particular case.
+ subtitle.attributes.update(subsection.attributes)
+ # We're losing the subtitle's attributes here! To do: Write a
+ # test case which shows this behavior.
+ # Transfer the contents of the subsection's title to the
+ # subtitle:
+ subtitle[:] = subsection[0][:]
+ node[:] = (node[:1] # title
+ + [subtitle]
+ # everything that was before the section:
+ + node[1:index]
+ # everything that was in the subsection:
+ + subsection[1:])
+ return 1
+
+ def candidate_index(self, node):
+ """
+ Find and return the promotion candidate and its index.
+
+ Return (None, None) if no valid candidate was found.
+ """
+ index = node.first_child_not_matching_class(
+ nodes.PreBibliographic)
+ if index is None or len(node) > (index + 1) or \
+ not isinstance(node[index], nodes.section):
+ return None, None
+ else:
+ return node[index], index
+
+
+class DocTitle(TitlePromoter):
+
+ """
+ In reStructuredText_, there is no way to specify a document title
+ and subtitle explicitly. Instead, we can supply the document title
+ (and possibly the subtitle as well) implicitly, and use this
+ two-step transform to "raise" or "promote" the title(s) (and their
+ corresponding section contents) to the document level.
+
+ 1. If the document contains a single top-level section as its
+ first non-comment element, the top-level section's title
+ becomes the document's title, and the top-level section's
+ contents become the document's immediate contents. The lone
+ top-level section header must be the first non-comment element
+ in the document.
+
+ For example, take this input text::
+
+ =================
+ Top-Level Title
+ =================
+
+ A paragraph.
+
+ Once parsed, it looks like this::
+
+ <document>
+ <section names="top-level title">
+ <title>
+ Top-Level Title
+ <paragraph>
+ A paragraph.
+
+ After running the DocTitle transform, we have::
+
+ <document names="top-level title">
+ <title>
+ Top-Level Title
+ <paragraph>
+ A paragraph.
+
+ 2. If step 1 successfully determines the document title, we
+ continue by checking for a subtitle.
+
+ If the lone top-level section itself contains a single
+ second-level section as its first non-comment element, that
+ section's title is promoted to the document's subtitle, and
+ that section's contents become the document's immediate
+ contents. Given this input text::
+
+ =================
+ Top-Level Title
+ =================
+
+ Second-Level Title
+ ~~~~~~~~~~~~~~~~~~
+
+ A paragraph.
+
+ After parsing and running the Section Promotion transform, the
+ result is::
+
+ <document names="top-level title">
+ <title>
+ Top-Level Title
+ <subtitle names="second-level title">
+ Second-Level Title
+ <paragraph>
+ A paragraph.
+
+ (Note that the implicit hyperlink target generated by the
+ "Second-Level Title" is preserved on the "subtitle" element
+ itself.)
+
+ Any comment elements occurring before the document title or
+ subtitle are accumulated and inserted as the first body elements
+ after the title(s).
+
+ This transform also sets the document's metadata title
+ (document['title']).
+
+ .. _reStructuredText: http://docutils.sf.net/rst.html
+ """
+
+ default_priority = 320
+
+ def set_metadata(self):
+ """
+ Set document['title'] metadata title from the following
+ sources, listed in order of priority:
+
+ * Existing document['title'] attribute.
+ * "title" setting.
+ * Document title node (as promoted by promote_title).
+ """
+ if not self.document.hasattr('title'):
+ if self.document.settings.title is not None:
+ self.document['title'] = self.document.settings.title
+ elif len(self.document) and isinstance(self.document[0], nodes.title):
+ self.document['title'] = self.document[0].astext()
+
+ def apply(self):
+ if getattr(self.document.settings, 'doctitle_xform', 1):
+ # promote_(sub)title defined in TitlePromoter base class.
+ if self.promote_title(self.document):
+ # If a title has been promoted, also try to promote a
+ # subtitle.
+ self.promote_subtitle(self.document)
+ # Set document['title'].
+ self.set_metadata()
+
+
+class SectionSubTitle(TitlePromoter):
+
+ """
+ This works like document subtitles, but for sections. For example, ::
+
+ <section>
+ <title>
+ Title
+ <section>
+ <title>
+ Subtitle
+ ...
+
+ is transformed into ::
+
+ <section>
+ <title>
+ Title
+ <subtitle>
+ Subtitle
+ ...
+
+ For details refer to the docstring of DocTitle.
+ """
+
+ default_priority = 350
+
+ def apply(self):
+ if not getattr(self.document.settings, 'sectsubtitle_xform', 1):
+ return
+ for section in self.document.traverse(nodes.section):
+ # On our way through the node tree, we are deleting
+ # sections, but we call self.promote_subtitle for those
+ # sections nonetheless. To do: Write a test case which
+ # shows the problem and discuss on Docutils-develop.
+ self.promote_subtitle(section)
+
+
+class DocInfo(Transform):
+
+ """
+ This transform is specific to the reStructuredText_ markup syntax;
+ see "Bibliographic Fields" in the `reStructuredText Markup
+ Specification`_ for a high-level description. This transform
+ should be run *after* the `DocTitle` transform.
+
+ Given a field list as the first non-comment element after the
+ document title and subtitle (if present), registered bibliographic
+ field names are transformed to the corresponding DTD elements,
+ becoming child elements of the "docinfo" element (except for a
+ dedication and/or an abstract, which become "topic" elements after
+ "docinfo").
+
+ For example, given this document fragment after parsing::
+
+ <document>
+ <title>
+ Document Title
+ <field_list>
+ <field>
+ <field_name>
+ Author
+ <field_body>
+ <paragraph>
+ A. Name
+ <field>
+ <field_name>
+ Status
+ <field_body>
+ <paragraph>
+ $RCSfile$
+ ...
+
+ After running the bibliographic field list transform, the
+ resulting document tree would look like this::
+
+ <document>
+ <title>
+ Document Title
+ <docinfo>
+ <author>
+ A. Name
+ <status>
+ frontmatter.py
+ ...
+
+ The "Status" field contained an expanded RCS keyword, which is
+ normally (but optionally) cleaned up by the transform. The sole
+ contents of the field body must be a paragraph containing an
+ expanded RCS keyword of the form "$keyword: expansion text $". Any
+ RCS keyword can be processed in any bibliographic field. The
+ dollar signs and leading RCS keyword name are removed. Extra
+ processing is done for the following RCS keywords:
+
+ - "RCSfile" expands to the name of the file in the RCS or CVS
+ repository, which is the name of the source file with a ",v"
+ suffix appended. The transform will remove the ",v" suffix.
+
+ - "Date" expands to the format "YYYY/MM/DD hh:mm:ss" (in the UTC
+ time zone). The RCS Keywords transform will extract just the
+ date itself and transform it to an ISO 8601 format date, as in
+ "2000-12-31".
+
+ (Since the source file for this text is itself stored under CVS,
+ we can't show an example of the "Date" RCS keyword because we
+ can't prevent any RCS keywords used in this explanation from
+ being expanded. Only the "RCSfile" keyword is stable; its
+ expansion text changes only if the file name changes.)
+
+ .. _reStructuredText: http://docutils.sf.net/rst.html
+ .. _reStructuredText Markup Specification:
+ http://docutils.sf.net/docs/ref/rst/restructuredtext.html
+ """
+
+ default_priority = 340
+
+ biblio_nodes = {
+ 'author': nodes.author,
+ 'authors': nodes.authors,
+ 'organization': nodes.organization,
+ 'address': nodes.address,
+ 'contact': nodes.contact,
+ 'version': nodes.version,
+ 'revision': nodes.revision,
+ 'status': nodes.status,
+ 'date': nodes.date,
+ 'copyright': nodes.copyright,
+ 'dedication': nodes.topic,
+ 'abstract': nodes.topic}
+ """Canonical field name (lowcased) to node class name mapping for
+ bibliographic fields (field_list)."""
+
+ def apply(self):
+ if not getattr(self.document.settings, 'docinfo_xform', 1):
+ return
+ document = self.document
+ index = document.first_child_not_matching_class(
+ nodes.PreBibliographic)
+ if index is None:
+ return
+ candidate = document[index]
+ if isinstance(candidate, nodes.field_list):
+ biblioindex = document.first_child_not_matching_class(
+ (nodes.Titular, nodes.Decorative))
+ nodelist = self.extract_bibliographic(candidate)
+ del document[index] # untransformed field list (candidate)
+ document[biblioindex:biblioindex] = nodelist
+
+ def extract_bibliographic(self, field_list):
+ docinfo = nodes.docinfo()
+ bibliofields = self.language.bibliographic_fields
+ labels = self.language.labels
+ topics = {'dedication': None, 'abstract': None}
+ for field in field_list:
+ try:
+ name = field[0][0].astext()
+ normedname = nodes.fully_normalize_name(name)
+ if not (len(field) == 2 and bibliofields.has_key(normedname)
+ and self.check_empty_biblio_field(field, name)):
+ raise TransformError
+ canonical = bibliofields[normedname]
+ biblioclass = self.biblio_nodes[canonical]
+ if issubclass(biblioclass, nodes.TextElement):
+ if not self.check_compound_biblio_field(field, name):
+ raise TransformError
+ utils.clean_rcs_keywords(
+ field[1][0], self.rcs_keyword_substitutions)
+ docinfo.append(biblioclass('', '', *field[1][0]))
+ elif issubclass(biblioclass, nodes.authors):
+ self.extract_authors(field, name, docinfo)
+ elif issubclass(biblioclass, nodes.topic):
+ if topics[canonical]:
+ field[-1] += self.document.reporter.warning(
+ 'There can only be one "%s" field.' % name,
+ base_node=field)
+ raise TransformError
+ title = nodes.title(name, labels[canonical])
+ topics[canonical] = biblioclass(
+ '', title, classes=[canonical], *field[1].children)
+ else:
+ docinfo.append(biblioclass('', *field[1].children))
+ except TransformError:
+ if len(field[-1]) == 1 \
+ and isinstance(field[-1][0], nodes.paragraph):
+ utils.clean_rcs_keywords(
+ field[-1][0], self.rcs_keyword_substitutions)
+ docinfo.append(field)
+ nodelist = []
+ if len(docinfo) != 0:
+ nodelist.append(docinfo)
+ for name in ('dedication', 'abstract'):
+ if topics[name]:
+ nodelist.append(topics[name])
+ return nodelist
+
+ def check_empty_biblio_field(self, field, name):
+ if len(field[-1]) < 1:
+ field[-1] += self.document.reporter.warning(
+ 'Cannot extract empty bibliographic field "%s".' % name,
+ base_node=field)
+ return None
+ return 1
+
+ def check_compound_biblio_field(self, field, name):
+ if len(field[-1]) > 1:
+ field[-1] += self.document.reporter.warning(
+ 'Cannot extract compound bibliographic field "%s".' % name,
+ base_node=field)
+ return None
+ if not isinstance(field[-1][0], nodes.paragraph):
+ field[-1] += self.document.reporter.warning(
+ 'Cannot extract bibliographic field "%s" containing '
+ 'anything other than a single paragraph.' % name,
+ base_node=field)
+ return None
+ return 1
+
+ rcs_keyword_substitutions = [
+ (re.compile(r'\$' r'Date: (\d\d\d\d)[-/](\d\d)[-/](\d\d)[ T][\d:]+'
+ r'[^$]* \$', re.IGNORECASE), r'\1-\2-\3'),
+ (re.compile(r'\$' r'RCSfile: (.+),v \$', re.IGNORECASE), r'\1'),
+ (re.compile(r'\$[a-zA-Z]+: (.+) \$'), r'\1'),]
+
+ def extract_authors(self, field, name, docinfo):
+ try:
+ if len(field[1]) == 1:
+ if isinstance(field[1][0], nodes.paragraph):
+ authors = self.authors_from_one_paragraph(field)
+ elif isinstance(field[1][0], nodes.bullet_list):
+ authors = self.authors_from_bullet_list(field)
+ else:
+ raise TransformError
+ else:
+ authors = self.authors_from_paragraphs(field)
+ authornodes = [nodes.author('', '', *author)
+ for author in authors if author]
+ if len(authornodes) >= 1:
+ docinfo.append(nodes.authors('', *authornodes))
+ else:
+ raise TransformError
+ except TransformError:
+ field[-1] += self.document.reporter.warning(
+ 'Bibliographic field "%s" incompatible with extraction: '
+ 'it must contain either a single paragraph (with authors '
+ 'separated by one of "%s"), multiple paragraphs (one per '
+ 'author), or a bullet list with one paragraph (one author) '
+ 'per item.'
+ % (name, ''.join(self.language.author_separators)),
+ base_node=field)
+ raise
+
+ def authors_from_one_paragraph(self, field):
+ text = field[1][0].astext().strip()
+ if not text:
+ raise TransformError
+ for authorsep in self.language.author_separators:
+ authornames = text.split(authorsep)
+ if len(authornames) > 1:
+ break
+ authornames = [author.strip() for author in authornames]
+ authors = [[nodes.Text(author)] for author in authornames if author]
+ return authors
+
+ def authors_from_bullet_list(self, field):
+ authors = []
+ for item in field[1][0]:
+ if len(item) != 1 or not isinstance(item[0], nodes.paragraph):
+ raise TransformError
+ authors.append(item[0].children)
+ if not authors:
+ raise TransformError
+ return authors
+
+ def authors_from_paragraphs(self, field):
+ for item in field[1]:
+ if not isinstance(item, nodes.paragraph):
+ raise TransformError
+ authors = [item.children for item in field[1]]
+ return authors