summaryrefslogtreecommitdiff
path: root/sandbox/fwiemann/xhtml2rest/xhtml2rest.py
diff options
context:
space:
mode:
Diffstat (limited to 'sandbox/fwiemann/xhtml2rest/xhtml2rest.py')
-rwxr-xr-xsandbox/fwiemann/xhtml2rest/xhtml2rest.py551
1 files changed, 0 insertions, 551 deletions
diff --git a/sandbox/fwiemann/xhtml2rest/xhtml2rest.py b/sandbox/fwiemann/xhtml2rest/xhtml2rest.py
deleted file mode 100755
index 8c6b366db..000000000
--- a/sandbox/fwiemann/xhtml2rest/xhtml2rest.py
+++ /dev/null
@@ -1,551 +0,0 @@
-#!/usr/bin/python
-"""
-NAME
-====
-
-xhtml2rest - Convert xhtml to reStructuredText
-
-SYNOPSIS
-========
-
-xhtml2rest *xhtmlfile* > *restfile*
-
-DESCRIPTION
-===========
-
-``xhtml2rest``, which, far from being a decent and complete program, is
-only something to begin with, hopefully processes the given UTF-8
-xhtml file and produces reStructuredText "source code" in the standard
-output. If your input is html and/or not in UTF-8, you can convert it
-to UTF-8 xhtml using ``iconv`` and ``tidy``:
-
- iconv -f *source_encoding* -t utf-8 *source_html* > *html_utf8*
-
- tidy -utf8 -asxml -o *xhtmlfile* *html_utf8*
-
- xhtml2rest *xhtmlfile* > *restfile*
-
-Interestingly, since reStructuredText is not simple markup, but has
-very strict rules with the intention that the source is perfectly
-readable, it turns out that converting html to reStructuredText is
-actually *rendering*. ``xhtml2rest`` is a small rendering engine. Since
-I had no time to study how existing rendering engines work, I had to
-reinvent the wheel. So although the code is clean (I actually wrote it
-twice), I doubt that the core logic is adequate for future extensions.
-But it's better than nothing. There is some documentation in the code,
-but feel free to email me if you need more explanations.
-
-LIMITATIONS
-===========
-
-I created ``xhtml2rest`` for a very specific job. It does that job
-correctly, but for your web page it might not work. It should not be
-very hard, however, either to improve the code, or to determine what
-it is in your web page that confuses ``xhtml2rest`` and remove it.
-
-Other than that, there are the following limitations:
-
-* No indented tables
-
-* No multi-col or -row spans in tables
-
-* No support for \<br>
-
-* Not tested in nested tables (check http://www.w3m.org/story.html)
-
-* \<th> support is quick and dirty
-
-* If the same anchor text is met twice, the anchor is ignored
-
-* No indented \<pre> elements (but I'm not sure the HTML standard
- allows them)
-
-* Images are ignored
-
-* The word HARDWIRED in the code indicates a hardwired hack which is
- specific to the job I wanted ``xhtml2rest`` to do.
-
-META
-====
-
-``xhtml2rest`` was created by Antonios Christofides,
-anthony@itia.ntua.gr, May-June 2005.
-
-Revision: $Revision$
-
-The code and this text are hereby placed in the public domain.
-"""
-
-import xml.dom.minidom
-import re
-import sys
-import textwrap
-import math
-import UserList
-import warnings
-import codecs
-
-###############################################################################
-# Global variables. I know. I'm terribly sorry. Please get rid of them.
-
# 'unindent' carries the marker of the list item that is being rendered.
# A li item is indented as a whole, but its first line is "unindented" so
# that it can hold the number or bullet.  The li node cannot easily hand the
# marker down to its (possibly deeply nested) #text contents, so it stores
# the marker here (expected to be 4 characters wide, like " 1. ").  The
# first text that gets rendered uses it and then resets it to empty.
unindent = ''

# Text-target pairs found in "a href" elements.
hyperlinks = dict()
-###############################################################################
-
class Ditem:
    """A document item; usually a node, but can be a block of text
    resulting from processing adjacent inline items. If it is a node,
    it is usually the BlockDitem subclass; if it is text, it is
    normally a plain Ditem."""

    def __init__(self, text):
        self.text = text      # Contained text (empty for BlockDitem)
        self.type = ''        # tag for block node, empty for inline
        self.indentlevel = 0  # 0 - unindented; 1 - indented; etc.

    def __repr__(self):
        return self.__class__.__name__ + '("""' + self.text + '""")'

    def propagate_indents(self):
        """Propagate the indent level to children (none for plain text)."""
        pass

    def maxwidth(self):
        """Return the width occupied when rendered on infinite width.

        Normalises the white space of the stored text as a side effect.
        """
        self.remove_white_space()
        return len(self.text) + 4 * self.indentlevel

    def minwidth(self):
        """Return the width occupied when wrapped as much as possible.

        That is the longest word plus the indentation, or 0 when the text
        contains no words.
        """
        wordlens = [len(x) for x in self.text.split()]
        if wordlens:
            return max(wordlens) + 4 * self.indentlevel
        return 0

    def format(self, width):
        """Return the text wrapped so as not to exceed *width*, if possible.

        The pending list marker held in the module-level ``unindent`` is
        placed at the end of the first line's indentation and then cleared.
        Raises Exception when called on an item whose type is 'pre'.
        """
        global unindent
        if self.type == 'pre':
            raise Exception("What are we doing here?")
        self.remove_white_space()
        # Quick hack to fix a problem. Do we begin with '* ' or '- '?
        while (len(self.text) >= 2 and self.text[1] == ' '
               and self.text[0] in '*-'):
            # It may be mistaken for a bullet list. Strip it.
            self.text = self.text[2:]
        # Never go below the longest word, and never hand textwrap a
        # non-positive width (it rejects those).
        width = max(width, self.minwidth(), 1)
        # break_on_hyphens=False keeps textwrap from splitting hyphenated
        # words, without having to substitute the hyphens temporarily.
        wrapper = textwrap.TextWrapper(
            initial_indent=(4 * self.indentlevel - len(unindent)) * ' '
            + unindent,
            subsequent_indent=4 * self.indentlevel * ' ',
            width=width, break_long_words=False, break_on_hyphens=False)
        unindent = ''
        return wrapper.fill(self.text)

    def empty(self):
        """Return true if the item contains nothing."""
        return not self.text

    def remove_white_space(self):
        """Collapse every run of white space to one space and trim the ends."""
        self.text = re.sub(r'\s+', ' ', self.text).strip()

    def canmerge(self):
        """Tell whether this Ditem may be merged with adjacent ones."""
        return True

    def merge(self, aditem):
        """If possible, merge aditem, which should be an adjacent Ditem that
        comes after this one. Return True when the merge took place."""
        if not self.canmerge() or not aditem.canmerge():
            return False
        if len(self.text) > 0 and self.text[-1] == '_' \
                and len(aditem.text) > 0 \
                and aditem.text[0] not in """ \n\t:.,!=/|;"'?<>[]{}()""":
            # Leave space after link if not followed by punctuation
            self.text = self.text + ' ' + aditem.text
        else:
            self.text = self.text + aditem.text
        return True
-
class BlockDitem(Ditem):
    """A Ditem which contains other Ditems."""

    def __init__(self, type):
        Ditem.__init__(self, '')
        self.type = type
        self.children = []  # Contained Ditems

    def __repr__(self):
        return (self.__class__.__name__ + '("' + self.type
                + '"); children = ' + repr(self.children))

    def maxwidth(self):
        """Widest child when rendered on infinite width; 0 without children."""
        widths = [child.maxwidth() for child in self.children]
        return max(widths or [0])

    def minwidth(self):
        """Widest child when wrapped as much as possible; 0 without children."""
        widths = [child.minwidth() for child in self.children]
        return max(widths or [0])

    def propagate_indents(self):
        """Copy this item's indent level to every child, recursively."""
        for child in self.children:
            child.indentlevel = self.indentlevel
            child.propagate_indents()

    def format(self, width):
        """Render the children and join the non-empty results with a blank
        line between them."""
        if width < self.minwidth():
            width = self.minwidth()
        rendered = []
        for child in self.children:
            piece = child.format(width)
            if piece:
                rendered.append(piece)
        return "\n\n".join(rendered)

    def empty(self):
        return not self.children

    def canmerge(self):
        return False
-
class PreDitem(Ditem):
    """A Ditem representing a literal block."""

    def maxwidth(self):
        """Length of the longest line of the literal text."""
        return max(map(len, self.text.split('\n')))

    def minwidth(self):
        # Literal block; width's given
        return self.maxwidth()

    def remove_white_space(self):
        # White space of a literal block is left alone.
        pass

    def format(self, width):
        """Return the text as a '::' literal block followed by a '..' line;
        *width* is not used."""
        body = ''.join([' ' + line + '\n' for line in self.text.split('\n')])
        return '::\n\n' + body + '..\n\n'

    def canmerge(self):
        return False
-
class HeadingDitem(BlockDitem):
    """A Ditem representing an h1, h2, ..., h9."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def minwidth(self):
        # Headings don't wrap
        return self.maxwidth()

    def format(self, width):
        """Return the heading text underlined with the adornment character
        that corresponds to the heading level; *width* is not used."""
        assert len(self.children) == 1
        text = self.children[0].format(32767)
        # The second character of the tag name is the level.  int() is used
        # rather than eval() so that text from the input is never evaluated.
        level = int(self.type[1])
        underliner = "=-`'.~*+^"[level - 1]
        return text + '\n' + len(text) * underliner
-
class BlockQuoteDitem(BlockDitem):
    """A Ditem representing a blockquote."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def propagate_indents(self):
        """Indent one level deeper, then pass the new level to the children."""
        self.indentlevel += 1
        BlockDitem.propagate_indents(self)
-
class ListDitem(BlockDitem):
    """A Ditem representing an ol, ul, or dl."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def format(self, width):
        """Tell each list item child the list type and its 1-based position,
        then render like any other block."""
        items = [c for c in self.children if isinstance(c, ListItemDitem)]
        for position, item in enumerate(items):
            item.listtype = self.type
            item.order = position + 1
        return BlockDitem.format(self, width)
-
class ListItemDitem(BlockDitem):
    """A Ditem representing a li, dt, or dd."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)
        self.listtype = None  # Type of the containing list, set by ListDitem
        self.order = 0        # 1-based position in the list, set by ListDitem

    def minwidth(self):
        if self.type == 'dt':
            return self.maxwidth()  # Don't wrap dt
        return BlockDitem.minwidth(self)

    def propagate_indents(self):
        """Indent li and dd items one level deeper, then pass the level on."""
        if self.type in ('li', 'ol', 'dd'):
            self.indentlevel = self.indentlevel + 1
        BlockDitem.propagate_indents(self)

    def format(self, width):
        """Render the item, publishing its number or bullet through the
        module-level ``unindent`` so that the first rendered text picks it
        up."""
        global unindent
        if self.type == 'li' and self.listtype == 'ol':
            unindent = ('%d. ' % (self.order)).ljust(4)
        elif self.type == 'li' and self.listtype == 'ul':
            unindent = '* '
        result = BlockDitem.format(self, width)
        # If no text inside this item consumed the marker (for instance the
        # item holds only a literal block), drop it here so that it cannot
        # show up in front of unrelated text rendered later.
        unindent = ''
        return result
-
class RenderedColumn:
    """Width information about a column being rendered."""

    def __init__(self, minwidth, maxwidth):
        self.minwidth = minwidth  # Width when wrapped as much as possible
        self.maxwidth = maxwidth  # Width when not wrapped at all
        self.curwidth = maxwidth  # Width currently assigned to the column
        self.fixedwidth = 0       # 1 once the column is pinned to minwidth

    def logwidth(self):
        """Return the natural logarithm of maxwidth, or 0 when maxwidth is 0."""
        if self.maxwidth == 0:
            return 0
        return math.log(self.maxwidth)

    def update(self, minwidth, maxwidth):
        """Replace minwidth/maxwidth if the given values are greater, and
        reset the current width to the resulting maxwidth."""
        # max() rather than "a > b and a or b": the latter picks the wrong
        # operand when the greater value happens to be 0.
        self.minwidth = max(self.minwidth, minwidth)
        self.maxwidth = max(self.maxwidth, maxwidth)
        self.curwidth = self.maxwidth
-
try:
    from UserList import UserList as _UserListBase      # Python 2
except ImportError:
    from collections import UserList as _UserListBase   # Python 3


class RenderedColumns(_UserListBase):
    """A list of RenderedColumn."""

    def __init__(self, alist):
        self.data = alist

    def totalWidth(self):
        """Return total table width: the current column widths plus one
        separator per column plus one closing separator."""
        return sum([z.curwidth for z in self.data]) + len(self.data) + 1

    def sumLogWidth(self):
        """Return sum of logwidth for nonfixed columns."""
        return sum([x.logwidth() * (1 - x.fixedwidth) for x in self.data])

    def _reduction(self, column, excess):
        """Return the part of *excess* that *column* has to give up.

        The share is proportional to the column's logwidth.  When the
        nonfixed columns have no log-width left to share, nothing can be
        taken away and 0 is returned instead of dividing by zero.
        """
        total = self.sumLogWidth()
        if not total:
            return 0
        # Note: math.ceil is used because a suboptimal width (a few
        # characters less than the requested width) is preferred over
        # having to decide what to do with rounding.
        return math.ceil(excess * column.logwidth() / total)

    def distributeWidthDifference(self, width):
        """Step 4 of w3m table rendering algorithm: shrink the columns so
        that the table approaches *width*. Expects the table to be wider
        than *width* on entry."""
        w = self.totalWidth() - width
        assert w > 0
        repeat_distribution = 1
        while repeat_distribution:
            repeat_distribution = 0
            for x in self.data:
                if x.fixedwidth:
                    continue
                if x.curwidth - self._reduction(x, w) < x.minwidth:
                    # This column cannot take its share; pin it to its
                    # minimum width and start over with the new excess.
                    x.curwidth = x.minwidth
                    x.fixedwidth = 1
                    w = self.totalWidth() - width
                    repeat_distribution = 1
                    break
        # Now that we have finished finding which columns need to be fixed
        # to their minimum width, perform the distribution once again,
        # without checking, and actually change remaining column widths
        for x in self.data:
            if x.fixedwidth:
                continue
            x.curwidth = x.curwidth - self._reduction(x, w)
-
def tablehrule(colwidths, rule='-'):
    """Return a horizontal table separator for the given column widths.

    Each column contributes *rule* repeated to its width; columns are
    delimited by '+', and the line starts and ends with '+'.
    """
    segments = [rule * colwidth + '+' for colwidth in colwidths]
    return '+' + ''.join(segments)
-
class TableDitem(BlockDitem):
    """A Ditem representing a table; its children are the table rows."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def format(self, width):
        """Return the table rendered as a grid table, or '' when the table
        has no cells at all.

        Uses table rendering algorithm of w3m
        (http://www.w3m.org/story.html), but ignoring width attribute.
        """
        rows = self.children
        # The widest row decides the number of columns, so that no cell is
        # dropped when the first row is shorter than a later one.
        ncols = max([len(tr.children) for tr in rows] or [0])
        if ncols == 0:
            return ''
        # Step 1.  A column can't be smaller than 1 character.
        columns = RenderedColumns(
            [RenderedColumn(0, 1) for dummy in range(ncols)])
        for tr in rows:
            for i, cell in enumerate(tr.children):
                columns[i].update(cell.minwidth(), cell.maxwidth())
        # Step 2 (width attribute) ignored
        # Step 3 (already done - columns start out at maxwidth)
        # Step 4
        if columns.totalWidth() > width:
            columns.distributeWidthDifference(width)
        # OK, column widths are now calculated
        colwidths = [int(x.curwidth) for x in columns]
        pieces = [tablehrule(colwidths) + '\n']
        usedheadbodysep = False
        for tr in rows:
            pieces.append(tr.format(colwidths))
            rule = '-'
            # The first row that starts with a th cell and is not the last
            # row is closed with the '=' header separator.
            if (not usedheadbodysep and tr.children
                    and tr.children[0].type == 'th'
                    and tr is not rows[-1]):
                rule = '='
                usedheadbodysep = True
            pieces.append(tablehrule(colwidths, rule) + '\n')
        return ''.join(pieces)
-
class TrDitem(BlockDitem):
    """A Ditem representing a table row; its children are the cells."""

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def maxwidth(self):
        """Sum of the cells' maximum widths plus the vertical separators."""
        return (sum([x.maxwidth() for x in self.children])
                + len(self.children) + 1)

    def minwidth(self):
        """Sum of the cells' minimum widths plus the vertical separators."""
        return (sum([x.minwidth() for x in self.children])
                + len(self.children) + 1)

    def format(self, colwidths):
        """Return the row rendered as text lines, one cell per entry of
        *colwidths*; cells missing from the row are rendered blank."""
        columns = []  # List of lists of lines
        for i, colwidth in enumerate(colwidths):
            if i < len(self.children):
                lines = self.children[i].format(colwidth).split('\n')
            else:
                lines = ['']
            # Pad to col len
            columns.append([x.ljust(colwidth) for x in lines])
        # Num of lines in vertically largest column
        maxlinecount = max([len(lines) for lines in columns] or [0])
        # Pad vertically
        for lines, colwidth in zip(columns, colwidths):
            lines.extend([' ' * colwidth] * (maxlinecount - len(lines)))
        # Add vertical separators
        rendered = []
        for i in range(maxlinecount):
            cells = ''.join([lines[i] + '|' for lines in columns])
            rendered.append('|' + cells + '\n')
        return ''.join(rendered)
-
def handleNodeList(nodelist):
    """Process the given nodes, merging adjacent items where possible, and
    return the resulting list of ditems."""
    ditems = []
    curditem = Ditem('')
    for node in nodelist:
        aditem = handleNode(node)
        if curditem.merge(aditem):
            continue
        # A mergeable item holding nothing but white space (this includes
        # the empty item the loop starts with) renders to nothing, yet
        # formatting it would use up the pending list marker, and it would
        # count as an extra element for callers that expect a single one.
        # Unmergeable items are kept even when empty: an empty table cell
        # must keep its position in the row.
        if not (curditem.canmerge() and not curditem.text.strip()):
            ditems.append(curditem)
        curditem = aditem
    if not curditem.empty():
        ditems.append(curditem)
    return ditems
-
def handleNode(node):
    """Convert a DOM node to a ditem by dispatching on its kind and tag
    name.

    Elements without a dedicated handler are replaced by their contents:
    the single resulting ditem when there is exactly one, an empty Ditem
    when there is none, and otherwise a BlockDitem named after the element.
    """
    if node.nodeType == node.TEXT_NODE:
        return handleText(node)
    name = node.nodeName
    if name == 'a':
        return handleAnchor(node)
    # Only h1..h9 are headings; the pattern is anchored so that a longer
    # name that merely starts with "h" and a digit is not taken for one.
    if re.match(r'h[1-9]$', name):
        return handleHeading(node)
    if name == 'div' and node.getAttribute('class') == 'cit':  # HARDWIRED
        return handleBlockQuote(node)
    if name in ('body', 'div', 'p', 'td', 'th'):
        return handleGenericBlock(node)
    if name in ('em', 'i'):
        return handleEmphasis(node)
    if name in ('strong', 'b'):
        return handleStrong(node)
    if name in ('ol', 'ul', 'dl'):
        return handleList(node)
    if name in ('li', 'dd', 'dt'):
        return handleListItem(node)
    # Compared with ==: "in" against a bare string would be a substring
    # test and would accept any fragment of the tag name.
    if name == 'table':
        return handleTable(node)
    if name == 'tr':
        return handleTr(node)
    if name == 'pre':
        return handlePre(node)
    if node.hasChildNodes():
        contents = handleNodeList(node.childNodes)
        if len(contents) == 1:
            return contents[0]
        if len(contents) == 0:
            return Ditem('')
        result = BlockDitem(name)
        result.children = contents
        return result
    return Ditem('')
-
def processChildren(node):
    """Return the ditems for the children of node, or an empty tuple when
    node has no children."""
    if not node.hasChildNodes():
        return ()
    return handleNodeList(node.childNodes)
-
def mergeChildren(node):
    """Return the single ditem the children of node reduce to, or an empty
    Ditem when there is none. Raises Exception when the children produce
    more than one ditem."""
    contents = processChildren(node)
    count = len(contents)
    if count > 1:
        raise Exception('Unexpected block elements')
    if count == 0:
        return Ditem('')
    return contents[0]
-
def handleText(node):
    """Wrap the character data of a text node in a plain Ditem."""
    text = node.data
    return Ditem(text)
-
def handleAnchor(node):
    """Convert an "a" element to a ditem.

    An anchor with text and an external target becomes a named hyperlink
    reference, and the target is recorded in the module-level
    ``hyperlinks``. Anchors without text, without a target, or with an
    internal target are returned as plain text, and so is an anchor whose
    text was already recorded with a different target.
    """
    result = mergeChildren(node)
    result.type = node.nodeName
    result.text = result.text.strip()
    if result.text == '':
        return result
    target = node.getAttribute('href').strip()
    if target == "" or target[0] == '#':
        return result  # Ignore internal links
    result.text = re.sub(r'\s+', ' ', result.text)
    key = result.text.lower()
    if key in hyperlinks and hyperlinks[key] != target:
        # The following try-except is a quick hack to ensure that the
        # program will not stop because of problems in the warning
        # mechanism. One such specific problem is a UnicodeEncodeError
        # when result.text contains difficult characters.
        try:
            warnings.warn("Ignoring second appearance of anchor '"
                          + result.text + "' with different target")
        except Exception:
            pass
        return result
    hyperlinks[key] = target
    result.text = '`' + result.text + '`_'
    return result
-
def handleHeading(node):
    """Convert a heading element to a HeadingDitem holding its merged
    contents; a heading without contents yields the empty contents item."""
    contents = mergeChildren(node)
    if contents.empty():
        return contents
    heading = HeadingDitem(node.nodeName)
    heading.children.append(contents)
    return heading
-
def handleEmphasis(node):
    """Convert an emphasis element: non-empty text is wrapped in single
    asterisks."""
    ditem = mergeChildren(node)
    ditem.type = node.nodeName
    if not ditem.text:
        return ditem
    ditem.text = ''.join(['*', ditem.text, '*'])
    return ditem
-
def handleStrong(node):
    """Convert a strong element: non-empty text is wrapped in double
    asterisks."""
    ditem = mergeChildren(node)
    ditem.type = node.nodeName
    if not ditem.text:
        return ditem
    ditem.text = ''.join(['**', ditem.text, '**'])
    return ditem
-
def handleGenericBlock(node):
    """Convert a block element to a BlockDitem holding its processed
    children."""
    block = BlockDitem(node.nodeName)
    block.children = processChildren(node)
    return block
-
def handleBlockQuote(node):
    """Convert an element to a BlockQuoteDitem holding its processed
    children."""
    quote = BlockQuoteDitem(node.nodeName)
    quote.children = processChildren(node)
    return quote
-
def handleList(node):
    """Convert a list element to a ListDitem holding its processed
    children."""
    alist = ListDitem(node.nodeName)
    alist.children = processChildren(node)
    return alist
-
def handleListItem(node):
    """Convert a list item element to a ListItemDitem holding its processed
    children."""
    item = ListItemDitem(node.nodeName)
    item.children = processChildren(node)
    return item
-
def handleTable(node):
    """Convert a table element to a TableDitem whose children are its rows.

    Rows are taken from the table itself and from thead, tbody and tfoot
    groups inside it, in document order. Any other table contents are
    ignored.
    """
    result = TableDitem(node.nodeName)
    rows = []
    for child in processChildren(node):
        if child.type == 'tr':
            rows.append(child)
        elif child.type in ('thead', 'tbody', 'tfoot'):
            # A row group with several rows arrives as a block item named
            # after the group; its rows are one level further down.
            rows.extend([x for x in child.children if x.type == 'tr'])
    result.children = rows
    return result
-
def handleTr(node):
    """Convert a tr element to a TrDitem whose children are its th and td
    cells; any other contents are ignored."""
    row = TrDitem(node.nodeName)
    cells = []
    for child in processChildren(node):
        if child.type in ('th', 'td'):
            cells.append(child)
    row.children = cells
    return row
-
def handlePre(node):
    """Convert a pre element to a PreDitem carrying its merged text."""
    merged = mergeChildren(node)
    return PreDitem(merged.text)
-
# Script body: convert the xhtml file named on the command line and write
# the reStructuredText, encoded as UTF-8, to standard output.
if len(sys.argv) != 2:
    sys.stderr.write('usage: xhtml2rest xhtmlfile > restfile\n')
    sys.exit(2)
dom1 = xml.dom.minidom.parse(sys.argv[1])
ditem = handleNode(dom1.getElementsByTagName("body")[0])
ditem.propagate_indents()
# Encode explicitly; where standard output exposes an underlying byte
# stream, the encoded bytes are written to that stream.
outf = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))
outf.write(ditem.format(79) + '\n')
# Sorted, so that the link targets come out in the same order on every run
for h in sorted(hyperlinks):
    outf.write('\n.. _`' + h + '`:\n ' + hyperlinks[h] + '\n')