# giscanner/docbookdescription.py


# Opening markers of the two regions that get special treatment.  The
# programlisting marker deliberately lacks its closing '>' so that an opening
# tag carrying attributes is matched as well.
TAG_PROGRAM_LISTING = '<programlisting'
TAG_CDATA = '<![CDATA['

# Every marker the scanner looks for: the two openers above plus their
# respective closers.
TAGS = {
    TAG_PROGRAM_LISTING,
    TAG_CDATA,
    ']]>',
    '</programlisting>',
}

def _format_text_segment(text, inside_tags):
    """Prepare one run of text lying between two markers for output.

    text is the raw slice of the description; inside_tags is the stack of
    currently open markers (TAG_PROGRAM_LISTING / TAG_CDATA entries).

    Outside of any open marker, blank lines ('\\n\\n') become paragraph
    breaks.  Unless a CDATA section is open, stray '&', '<' and '>' are
    escaped.  Returns the processed text.
    """
    if not inside_tags:
        text = "\n</para>\n<para>\n".join(text.split('\n\n'))
    if TAG_CDATA not in inside_tags:
        text = _escape_non_cdata_section(text)
    return text


def get_formatted_description(description):
    """Turn a description string into paragraph-wrapped, escaped markup.

    The '|[' and ']|' shorthands are expanded to
    <informalexample><programlisting> and its closing counterpart, the whole
    text is wrapped in <para>...</para>, and the result is then walked marker
    by marker (see TAGS): text between markers is paragraph-split and escaped
    as appropriate for the region it sits in.

    An opening programlisting tag is always emitted as a bare
    '<programlisting>'; anything between the tag name and its '>' in the
    input is not copied to the output.

    A closing marker with no matching opener is reported on standard output
    and otherwise ignored.

    Returns the formatted string.
    """
    desc = description.replace("|[", "<informalexample><programlisting>") \
                      .replace("]|", "</programlisting></informalexample>")

    desc = "<para>%s</para>" % desc

    # TODO: '#include <xxxxx>' still needs dedicated handling.  The reference
    # substitution (Perl syntax) is:
    #    $text =~ s/#include(\s+)<([^>]+)>/#include$1&lt;$2&gt;/g;

    # Collect the output in a list and join once at the end.
    pieces = []

    inside_tags = []
    last_offset = 0
    for start, end, tag in _find_xml_tag_matches(desc):
        pieces.append(_format_text_segment(desc[last_offset:start],
                                           inside_tags))
        pieces.append(tag)
        if tag == TAG_PROGRAM_LISTING:
            # The marker constant has no closing bracket of its own.
            pieces.append('>')

        if tag in (TAG_CDATA, TAG_PROGRAM_LISTING):
            inside_tags.append(tag)
        elif inside_tags:
            inside_tags.pop()
        else:
            # Single-argument call form: prints the same text whether print
            # is a statement (Python 2) or a function (Python 3).
            print("Error: mismatched tag: %s" % tag)
        last_offset = end

    # The text after the last marker (or the whole description when it holds
    # no marker at all) gets the same treatment as text between markers, so
    # blank lines there become paragraph breaks too.
    pieces.append(_format_text_segment(desc[last_offset:], inside_tags))
    return "".join(pieces)

def _find_xml_tag_matches(string):
    """Yield (start, end, tag) for each marker of TAGS found in string.

    Matches are produced left to right and never overlap.  start and end are
    offsets into string (end is exclusive) and tag is the member of TAGS that
    matched.  For TAG_PROGRAM_LISTING the match extends up to and including
    the next '>', so attributes of the opening tag are part of the span; if
    there is no '>' at all, the span covers just the marker text.
    """
    offset = 0
    while True:
        # Pick the marker occurring earliest at or after the current offset.
        first = -1
        tag = None
        for candidate in TAGS:
            pos = string.find(candidate, offset)
            if pos != -1 and (first == -1 or pos < first):
                first, tag = pos, candidate

        if tag is None:
            return

        end = first + len(tag)
        if tag == TAG_PROGRAM_LISTING:
            closing = string.find('>', end)
            # Without this check a missing '>' would give end == 0, resetting
            # the offset and yielding the same match forever.
            if closing != -1:
                end = closing + 1

        offset = end
        yield first, end, tag

def _escape_non_cdata_section(string):
    """Escape stray '&', '<' and '>' in string, in that order.

    Ampersands are handled first so that the '&lt;' / '&gt;' produced by the
    later steps are not escaped a second time.
    """
    escaped = string
    for escape_step in (_escape_ampersand_not_in_entity,
                        _escape_lt_not_in_xml_tag,
                        _escape_gt_not_in_xml_tag):
        escaped = escape_step(escaped)
    return escaped

def _escape_ampersand_not_in_entity(string):
    parts = string.split('&')

    output = parts[0]
    for part in parts[1:]:
        end = part.find(';')
        if end == -1 or not part[:end].isalpha():
            output += "&amp;"
        else:
            output += "&"
        output += part

    return output

def _is_valid_xml_tag_name(name):
    if len(name) < 1:
        return False
    elif name.isalpha() or (name[0].isalpha() and name[1:].isalnum()):
        return True

def _is_valid_xml_tag(string):
    """Return True if string looks like the inside of an XML tag.

    string is the text between '<' and '>', without the brackets.  Accepted
    shapes are an end tag ('/name'), a bare start tag ('name'), and a start
    tag followed by attributes of the form arg="value"; each may carry one
    trailing '/' (self-closing form).  Attribute values must be enclosed in
    double quotes and may contain spaces.

    Empty or malformed input yields False rather than raising.
    """
    # handle case where line end is between tag name and first argument.
    # ie. <link\nlinkend="link-id">My Link</link>
    string = string.replace('\n', ' ')

    # Ignore the slash of a self-closing tag.
    if string.endswith('/'):
        string = string[:-1]

    # Nothing left to inspect, e.g. for '<>' or '</>' in the source text.
    if not string:
        return False

    if string[0] == '/' and _is_valid_xml_tag_name(string[1:]):
        # valid end tag
        return True
    if _is_valid_xml_tag_name(string):
        # valid start tag without attributes
        return True
    if " " not in string:
        return False

    # we are looking for: <tagname arg="value" arg2="value2">
    tagname, rest = string.split(" ", 1)
    if not _is_valid_xml_tag_name(tagname):
        return False

    # Consume one arg="value" pair per iteration until only whitespace is
    # left.
    while rest.strip():
        rest = rest.lstrip()

        if '=' not in rest:
            return False
        argname, rest = rest.split('=', 1)
        if not _is_valid_xml_tag_name(argname):
            return False
        # Also covers the case where nothing follows the '='.
        if not rest.startswith('"'):
            return False
        closing_quote = rest.find('"', 1)
        if closing_quote == -1:
            # Unterminated attribute value.
            return False
        rest = rest[closing_quote + 1:]

    return True

def _escape_lt_not_in_xml_tag(string):
    parts = string.split('<')

    output = parts[0]
    for part in parts[1:]:
        end = part.find('>')
        if end == -1 or not _is_valid_xml_tag(part[:end]):
            output += "&lt;"
        else:
            output += "<"
        output += part

    return output

def _escape_gt_not_in_xml_tag(string):
    parts = string.split('>')

    output = parts[0]
    for part in parts[1:]:
        start = output.rfind('<')
        if start == -1 or not _is_valid_xml_tag(output[start+1:]):
            output += "&gt;"
        else:
            output += ">"
        output += part

    return output


def test():
    """Self-checks for the tag-recognition and '<'-escaping helpers.

    Raises AssertionError on the first expectation that does not hold.
    """
    # (candidate name, whether it must be accepted)
    name_cases = [
        ('a', True),
        ('refsect1', True),
        ('1refsect', False),
        ('1', False),
    ]
    for candidate, accepted in name_cases:
        assert bool(_is_valid_xml_tag_name(candidate)) == accepted

    # (tag interior without the angle brackets, whether it must be accepted)
    tag_cases = [
        ('/a', True),
        ('/refsect1', True),
        ('/1', False),
        ('link', True),
        ('link linkend="value"', True),
        ('link  linkend="value"', True),
        ('link/', True),
        ('link linkend="value"/', True),
        ('link linkend="value" arg23="anothervalue"', True),
        ('link linkend="value" arg23="anothervalue with spaces"', True),
        ('link linkend="value arg23="anothervalue with spaces"', False),
        ('link linkend', False),
        ('link\nlinkend="link-id"', True),
        ('xref linkend="gtkstylecontext-classes"/', True),
        ('a href="http://www.gtk.org" title="&lt;i&gt;Our&lt;/i&gt; website"',
         True),
        ('ulink \nurl="http://www.freedesktop.org/Standards/wm-spec"', True),
    ]
    for candidate, accepted in tag_cases:
        assert bool(_is_valid_xml_tag(candidate)) == accepted

    # A string whose only '<' characters open or close tags comes back
    # unchanged.
    markup_call = (
        'gtk_label_set_markup (label, "Go to the <a href="http://www.gtk.org" '
        'title="&lt;i&gt;Our&lt;/i&gt; website">GTK+ website</a> for more...");'
    )
    assert _escape_lt_not_in_xml_tag(markup_call) == markup_call

# Run the self-checks when this file is executed directly as a script.
if __name__ == '__main__':
    test()