xml2po/modes/docbook.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201

# -*- coding: utf-8 -*-
# Copyright (c) 2004 Danilo Segan <danilo@kvota.net>.
#
# This file is part of xml2po.
#
# xml2po is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# xml2po is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with xml2po; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

# This implements special instructions for handling DocBook XML documents
# in a better way.
#
#  This means:
#   — better handling of nested complicated tags (i.e. definitions of
#     ignored-tags and final-tags)
#   — support for merging translator-credits back into DocBook articles
#   — support for setting a language
#

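# We use "currentXmlMode" class name for all modes
#  -- it might be better to have it named docbookXmlMode, but it will make loading harder;
#     it is also not necessary until we start supporting extracting strings from more
#     than one document type at the same time
#
# NOTE(review): the class defined below is named docbookXmlMode, not
# currentXmlMode, so the comment above looks out of date -- confirm against
# the code that loads the modes.
#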
import re
import libxml2
import os
import md5
import sys

class docbookXmlMode:
    """Special handling of DocBook document types.

    The class provides the tag classifications (ignored, final and
    space-preserving tags), emits one extra message per referenced image
    so that changed images are noticed by translators, and after
    translation sets the ``lang`` attribute on the article/book element
    and adds translators as copyright holders to articleinfo/bookinfo.
    """

    def __init__(self):
        # List-like container elements.
        self.lists = ['itemizedlist', 'orderedlist', 'variablelist',
                      'segmentedlist', 'simplelist', 'calloutlist', 'varlistentry' ]
        # Figure/media container elements.
        self.objects = [ 'figure', 'textobject', 'imageobject', 'mediaobject',
                         'screenshot' ]

    def getIgnoredTags(self):
        "Returns array of tags to be ignored."
        return self.objects + self.lists

    def getFinalTags(self):
        "Returns array of tags to be considered 'final'."
        return ['para', 'formalpara', 'simpara',
                'releaseinfo', 'revnumber', 'title',
                'date', 'term', 'programlisting'] + self.objects + self.lists

    def getSpacePreserveTags(self):
        "Returns array of tags in which spaces are to be preserved."
        return [
            'classsynopsisinfo',
            'computeroutput',
            'funcsynopsisinfo',
            'literallayout',
            'programlisting',
            'screen',
            'synopsis',
            'userinput'
            ]

    def getStringForTranslators(self):
        """Returns string which will be used to credit translators."""
        return "translator-credits"

    def getCommentForTranslators(self):
        """Returns a comment to be added next to string for crediting translators."""
        return """Put one translator per line, in the form of NAME <EMAIL>, YEAR1, YEAR2."""

    def _find_articleinfo(self, node):
        """Return the first 'articleinfo' or 'bookinfo' node found in a
        depth-first search starting at (and including) node, or None."""
        if node.name in ('articleinfo', 'bookinfo'):
            return node
        child = node.children
        while child:
            found = self._find_articleinfo(child)
            if found:
                return found
            child = child.next
        return None

    def _find_lastcopyright(self, node):
        """Return the last 'copyright' child of node.

        When node has children but none of them is a 'copyright', its
        last child is returned instead; when node has no children at all,
        None is returned.
        """
        if not node.children:
            return None
        last = node.lastChild()
        # Walk backwards so that the *last* copyright element wins.
        candidate = last
        while candidate:
            if candidate.name == "copyright":
                return candidate
            candidate = candidate.prev
        return last

    def _md5_for_file(self, filename):
        """Return the hexadecimal MD5 digest of the contents of filename.

        The file is read in 4096-byte chunks and is closed even when
        reading fails.
        """
        try:
            # hashlib supersedes the md5 module, which is deprecated since
            # Python 2.5; fall back to md5 on older interpreters.
            import hashlib
            digest = hashlib.md5()
        except ImportError:
            digest = md5.new()
        stream = open(filename, "rb")
        try:
            chunk = stream.read(4096)
            while chunk:
                digest.update(chunk)
                chunk = stream.read(4096)
        finally:
            stream.close()
        return digest.hexdigest()

    def _output_images(self, node, msg):
        """Emit an "@@image" message for every 'imagedata' element at or
        below node.

        The message carries the element's 'fileref' value and the MD5
        digest of the referenced file, resolved relative to the directory
        of msg.filename.  If the file does not exist, a warning is written
        to stderr and a placeholder is used instead of the digest.
        """
        if node and node.type == 'element' and node.name == 'imagedata':
            # Use .fileref to construct new message
            fileref = node.prop("fileref")
            if fileref:
                base_dir = os.path.dirname(msg.filename)
                fullpath = os.path.join(base_dir, fileref)
                if os.path.exists(fullpath):
                    digest = self._md5_for_file(fullpath)
                else:
                    digest = "THIS FILE DOESN'T EXIST"
                    sys.stderr.write("Warning: image file '%s' not found.\n" % fullpath)

                msg.outputMessage("@@image: '%s'; md5=%s" % (fileref, digest), node.lineNo(),
                                  "When image changes, this message will be marked fuzzy or untranslated for you.\n"+
                                  "It doesn't matter what you translate it to: it's not used at all.")
        elif node and node.children:
            child = node.children
            while child:
                self._output_images(child, msg)
                child = child.next

    def preProcessXml(self, doc, msg):
        """Add additional messages of interest here."""
        root = doc.getRootElement()
        self._output_images(root, msg)

    def postProcessXmlTranslation(self, doc, language, translators):
        """Sets a language and translators in "doc" tree.

        "translators" is a string consisted of "Name <email>, years" pairs
        of each translator, separated by newlines.  Lines which do not
        match that form are skipped.  Nothing is added when "translators"
        is still the untranslated credits string, or when the document has
        no article/book element or no articleinfo/bookinfo."""

        root = doc.getRootElement()
        # DocBook documents can be something other than article, handle that as well in the future
        while root and root.name != 'article' and root.name != 'book':
            root = root.next
        if not root:
            return
        root.setProp('lang', language)

        # An untranslated credits string means there is nobody to credit.
        if translators == self.getStringForTranslators():
            return

        # Now, lets find 'articleinfo' (it can be something else, but this goes along with 'article')
        ai = self._find_articleinfo(root)
        if not ai:
            return

        # Groups: 1 = name, 2 = optional e-mail, 3 = years (may be empty).
        credit_re = re.compile(r"^([^<,]+)\s*(?:<([^>,]+)>)?,\s*(.*)$")

        # Now, lets do one translator at a time
        for line in translators.split("\n"):
            match = credit_re.match(line.strip())
            if not match:
                continue
            name, email, years = match.group(1), match.group(2), match.group(3)

            # Looked up on every iteration on purpose: the copyright added
            # for the previous translator is now the last one, so entries
            # keep the order in which translators are listed.
            last = self._find_lastcopyright(ai)
            copyright_node = libxml2.newNode("copyright")
            if last:
                copyright_node = last.addNextSibling(copyright_node)
            else:
                ai.addChild(copyright_node)

            # NOTE(review): .encode('utf-8') presumably expects unicode
            # input here -- confirm against the caller.
            if years:
                copyright_node.newChild(None, "year", years.encode('utf-8'))
            # The name group always holds at least one character when the
            # pattern matches, so only the e-mail part is optional.
            if email:
                holder = name + "(%s)" % email
            else:
                holder = name
            copyright_node.newChild(None, "holder", holder.encode('utf-8'))

# Perform some tests when run standalone
if __name__ == '__main__':
    # Smoke test: print the static configuration of the DocBook mode.
    # Each print takes a single parenthesised string, so the output is the
    # same whether print is a statement or a function.
    test = docbookXmlMode()
    print("Ignored tags       : " + repr(test.getIgnoredTags()))
    print("Final tags         : " + repr(test.getFinalTags()))
    print("Space-preserve tags: " + repr(test.getSpacePreserveTags()))

    print("Credits from string: '%s'" % test.getStringForTranslators())
    print("Explanation for credits:\n\t'%s'" % test.getCommentForTranslators())