1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
|
# Opening markers of the two regions that get special treatment.  The
# program-listing marker has no closing '>' here; the scanner looks for
# the '>' separately once the marker has been found.
TAG_PROGRAM_LISTING = '<programlisting'
TAG_CDATA = '<![CDATA['

# Every marker the scanner searches for: the two openers above plus the
# matching closers.
TAGS = {
    TAG_PROGRAM_LISTING,
    TAG_CDATA,
    ']]>',
    '</programlisting>',
}
def get_formatted_description(description):
    """Turn a raw description string into paragraph/listing XML markup.

    Steps performed:

    * ``|[`` and ``]|`` are replaced by an opening and closing
      ``<informalexample><programlisting>`` pair.
    * The whole text is wrapped in ``<para>...</para>``.
    * Outside of program listings and CDATA sections, a blank line
      (two consecutive newlines) becomes a paragraph break.
    * Outside of CDATA sections, text is passed through
      ``_escape_non_cdata_section``.

    Any attributes on a ``<programlisting ...>`` opening tag are dropped:
    the tag is always emitted as a bare ``<programlisting>``.

    A closing marker with no open marker left on the stack prints
    "Error: mismatched tag: ..." to standard output and processing
    continues.

    Returns the formatted string.
    """
    paragraph_break = "\n</para>\n<para>\n"

    desc = description.replace("|[", "<informalexample><programlisting>")
    desc = desc.replace("]|", "</programlisting></informalexample>")
    desc = "<para>%s</para>" % desc

    # TODO: "#include <xxxxx>" is not handled yet.  The Perl substitution
    # this should eventually mirror is:
    #     $text =~ s/#include(\s+)<([^>]+)>/#include$1&lt;$2&gt;/g;

    pieces = []
    inside_tags = []  # stack of currently open CDATA / programlisting markers
    last_offset = 0
    for start, end, tag in _find_xml_tag_matches(desc):
        chunk = desc[last_offset:start]
        if not inside_tags:
            # Plain text: blank lines separate paragraphs.
            chunk = paragraph_break.join(chunk.split('\n\n'))
        if TAG_CDATA not in inside_tags:
            chunk = _escape_non_cdata_section(chunk)
        pieces.append(chunk)

        pieces.append(tag)
        if tag == TAG_PROGRAM_LISTING:
            # The marker constant lacks the closing bracket.
            pieces.append('>')

        if tag in (TAG_CDATA, TAG_PROGRAM_LISTING):
            inside_tags.append(tag)
        else:
            try:
                inside_tags.pop()
            except IndexError:
                # Written so that it prints the same text on Python 2 and 3.
                print("Error: mismatched tag: %s" % tag)
        last_offset = end

    # Text after the last marker (or the whole text when there was no
    # marker at all) is plain text too and needs the same paragraph
    # splitting as the chunks handled in the loop.
    tail = desc[last_offset:]
    if not inside_tags:
        tail = paragraph_break.join(tail.split('\n\n'))
    pieces.append(_escape_non_cdata_section(tail))
    return "".join(pieces)
def _find_xml_tag_matches(string):
    """Yield ``(start, end, tag)`` for each marker from TAGS, left to right.

    ``start`` is the index where the marker begins and ``end`` the index
    just past it.  For the program-listing opener, ``end`` is just past
    the next ``>`` so that anything between the marker and the bracket is
    skipped.  If no ``>`` follows, the opener is treated as running to
    the end of ``string``; previously ``end`` became 0 in that case, which
    reset the scan position and made the generator loop forever.
    """
    offset = 0
    while True:
        found = []
        for tag in TAGS:
            pos = string.find(tag, offset)
            if pos != -1:
                found.append((pos, tag))
        if not found:
            return

        # The markers begin with different character sequences, so two of
        # them never share a position and the smallest pair is simply the
        # earliest match.
        first, tag = min(found)
        if tag == TAG_PROGRAM_LISTING:
            closing = string.find('>', first + len(tag))
            if closing == -1:
                end = len(string)
            else:
                end = closing + 1
        else:
            end = first + len(tag)
        offset = end
        yield first, end, tag
def _escape_non_cdata_section(string):
    """Run the three escaping passes over ``string`` and return the result.

    The passes are applied in this order: ampersands, then '<', then '>'.
    """
    passes = (
        _escape_ampersand_not_in_entity,
        _escape_lt_not_in_xml_tag,
        _escape_gt_not_in_xml_tag,
    )
    for escape in passes:
        string = escape(string)
    return string
def _escape_ampersand_not_in_entity(string):
parts = string.split('&')
output = parts[0]
for part in parts[1:]:
end = part.find(';')
if end == -1 or not part[:end].isalpha():
output += "&"
else:
output += "&"
output += part
return output
def _is_valid_xml_tag_name(name):
if len(name) < 1:
return False
elif name.isalpha() or (name[0].isalpha() and name[1:].isalnum()):
return True
def _is_valid_xml_tag(string):
    """Return True if ``string`` looks like the inside of an XML tag.

    ``string`` is the text between '<' and '>', without the brackets.
    Accepted forms are an end tag (``/name``), a start tag without
    attributes (``name``) and a start tag with double-quoted attributes
    (``name arg="value" arg2="value2"``), each optionally followed by a
    trailing '/'.  Names are checked with ``_is_valid_xml_tag_name``.

    Malformed input (empty string, a lone '/', an attribute without a
    value, an unterminated quoted value) yields False instead of raising.
    Always returns a bool.
    """
    # A line break may separate the tag name from its first attribute,
    # e.g. <link\nlinkend="link-id">My Link</link>
    string = string.replace('\n', ' ')

    # Ignore the trailing slash of a self-closing tag.
    if string.endswith('/'):
        string = string[:-1]
    if not string:
        return False

    if string.startswith('/') and _is_valid_xml_tag_name(string[1:]):
        return True  # valid end tag
    if _is_valid_xml_tag_name(string):
        return True  # valid start tag without attributes
    if " " not in string:
        return False

    # From here on we expect: tagname arg="value" arg2="value2"
    tagname, rest = string.split(" ", 1)
    if not _is_valid_xml_tag_name(tagname):
        return False

    rest = rest.lstrip()
    while rest:
        equals = rest.find('=')
        if equals == -1:
            return False
        argname, rest = rest[:equals], rest[equals + 1:]
        if not _is_valid_xml_tag_name(argname):
            return False
        if not rest.startswith('"'):
            return False
        closing_quote = rest.find('"', 1)
        if closing_quote == -1:
            # Attribute value is never terminated.
            return False
        rest = rest[closing_quote + 1:].lstrip()
    return True
def _escape_lt_not_in_xml_tag(string):
    """Replace every '<' that does not open a valid XML tag with ``&lt;``.

    A '<' is kept only when a '>' follows somewhere after it and the text
    between the two is accepted by ``_is_valid_xml_tag``.  A '<' with no
    following '>' and an empty ``<>`` are both escaped.

    Returns the escaped string.
    """
    pieces = string.split('<')
    output = [pieces[0]]
    for piece in pieces[1:]:
        end = piece.find('>')
        # end == -1: no closing bracket; end == 0: nothing between the
        # brackets.  Neither can be a tag.
        if end > 0 and _is_valid_xml_tag(piece[:end]):
            output.append('<')
        else:
            output.append('&lt;')
        output.append(piece)
    return ''.join(output)
def _escape_gt_not_in_xml_tag(string):
    """Replace every '>' that does not close a valid XML tag with ``&gt;``.

    For each '>', the text between the most recent '<' in the output
    built so far and the '>' itself is checked with
    ``_is_valid_xml_tag``.  The '>' is kept only when such a '<' exists
    and the text after it is a non-empty, valid tag body.

    Returns the escaped string.
    """
    pieces = string.split('>')
    # Kept as a string (not a list) because each step searches backwards
    # through everything emitted so far.
    output = pieces[0]
    for piece in pieces[1:]:
        start = output.rfind('<')
        if start == -1:
            candidate = ''
        else:
            candidate = output[start + 1:]
        if candidate and _is_valid_xml_tag(candidate):
            output += '>'
        else:
            output += '&gt;'
        output += piece
    return output
def test():
    """Self-checks for the tag-name, tag and '<' escaping helpers.

    Raises AssertionError on the first failing check.
    """
    # Tag names: a letter followed by letters or digits.
    assert _is_valid_xml_tag_name('a')
    assert _is_valid_xml_tag_name('refsect1')
    assert not _is_valid_xml_tag_name('1refsect')
    assert not _is_valid_xml_tag_name('1')

    # End tags.
    assert _is_valid_xml_tag('/a')
    assert _is_valid_xml_tag('/refsect1')
    assert not _is_valid_xml_tag('/1')

    # Start tags, with and without attributes / trailing slash.
    assert _is_valid_xml_tag('link')
    assert _is_valid_xml_tag('link linkend="value"')
    assert _is_valid_xml_tag('link/')
    assert _is_valid_xml_tag('link linkend="value"/')
    assert _is_valid_xml_tag('link linkend="value" arg23="anothervalue"')
    assert _is_valid_xml_tag('link linkend="value" arg23="anothervalue with spaces"')
    assert not _is_valid_xml_tag('link linkend="value arg23="anothervalue with spaces"')
    assert not _is_valid_xml_tag('link linkend')
    assert _is_valid_xml_tag('link\nlinkend="link-id"')
    assert _is_valid_xml_tag('xref linkend="gtkstylecontext-classes"/')
    # Markup inside an attribute value has to be entity-escaped; a raw
    # '<' is not allowed there in well-formed XML.
    assert _is_valid_xml_tag('a href="http://www.gtk.org" '
                             'title="&lt;i&gt;Our&lt;/i&gt; website"')
    assert _is_valid_xml_tag('ulink \nurl="http://www.freedesktop.org/Standards/wm-spec"')

    # A string whose only '<' characters open valid tags must come back
    # unchanged.
    string = 'gtk_label_set_markup (label, "Go to the <a href="http://www.gtk.org" ' \
        + 'title="&lt;i&gt;Our&lt;/i&gt; website">GTK+ website</a> for more...");'
    assert _escape_lt_not_in_xml_tag(string) == string
# Run the self-checks when this file is executed as a script.
if __name__ == '__main__':
    test()
|