# Markers that switch the escaping rules while a description is scanned.
# TAG_PROGRAM_LISTING deliberately omits the closing '>' so that an opening
# tag carrying attributes is still recognised.
# NOTE(review): these literals (and the DocBook markup literals further down)
# were restored from what the surrounding logic requires, because the markup
# had been stripped from this copy of the file -- confirm against the
# canonical source.
TAG_PROGRAM_LISTING = '<programlisting'
TAG_CDATA = '<![CDATA['
TAGS = {TAG_PROGRAM_LISTING, TAG_CDATA, ']]>', '</programlisting>'}


def get_formatted_description(description):
    """Convert a gtk-doc style description into DocBook markup.

    "|[" and "]|" become an informalexample/programlisting pair and the
    whole text is wrapped in a <para> element.  Outside of CDATA sections,
    every '&', '<' and '>' that is not part of an entity or of a
    well-formed tag is escaped.  Blank lines found outside of program
    listings and CDATA sections start a new paragraph.

    A closing marker without a matching opening marker is reported on
    stdout and otherwise ignored.

    Returns the formatted description as a string.
    """
    desc = description.replace("|[", "<informalexample><programlisting>") \
                      .replace("]|", "</programlisting></informalexample>")
    desc = "<para>%s</para>" % desc

    # TODO: '#include <header.h>' lines are not special-cased yet; the
    # reference implementation does:
    #    $text =~ s/#include(\s+)<([^>]+)>/#include$1&lt;$2&gt;/g;

    pieces = []
    inside_tags = []
    last_offset = 0
    for start, end, tag in _find_xml_tag_matches(desc):
        text = desc[last_offset:start]
        if not inside_tags:
            # Only top-level text is split into paragraphs.
            text = "\n</para>\n<para>\n".join(text.split('\n\n'))
        if TAG_CDATA not in inside_tags:
            text = _escape_non_cdata_section(text)

        pieces.append(text)
        pieces.append(tag)
        if tag == TAG_PROGRAM_LISTING:
            # The match ran up to the tag's '>', so any attributes are
            # dropped and the tag is closed again here.
            pieces.append('>')

        if tag in (TAG_CDATA, TAG_PROGRAM_LISTING):
            inside_tags.append(tag)
        else:
            try:
                inside_tags.pop()
            except IndexError:
                # Same text as the former print statement, but valid on
                # both Python 2 and Python 3.
                print("Error: mismatched tag: %s" % tag)
        last_offset = end

    # NOTE(review): the text after the last marker is escaped but never
    # split into paragraphs -- confirm whether that is intended.
    pieces.append(_escape_non_cdata_section(desc[last_offset:]))
    return "".join(pieces)


def _find_xml_tag_matches(string):
    """Yield (start, end, tag) for each marker from TAGS, in text order.

    For TAG_PROGRAM_LISTING, `end` points just past the '>' that closes
    the opening tag; for every other marker it points just past the
    marker itself.
    """
    offset = 0
    while True:
        hits = []
        for tag in TAGS:
            pos = string.find(tag, offset)
            if pos != -1:
                hits.append((pos, tag))
        if not hits:
            return

        first, tag = min(hits, key=lambda hit: hit[0])
        if tag == TAG_PROGRAM_LISTING:
            close = string.find('>', first + len(tag))
            # An unterminated opening tag consumes the rest of the text;
            # using find()'s -1 here would reset the offset and loop forever.
            end = close + 1 if close != -1 else len(string)
        else:
            end = first + len(tag)
        offset = end
        yield first, end, tag


def _escape_non_cdata_section(string):
    """Escape stray '&', '<' and '>' characters, in that order."""
    string = _escape_ampersand_not_in_entity(string)
    string = _escape_lt_not_in_xml_tag(string)
    return _escape_gt_not_in_xml_tag(string)


def _escape_ampersand_not_in_entity(string):
    """Replace '&' with '&amp;' unless it starts a named entity.

    A named entity is '&', one or more letters, then ';'.  Numeric
    references such as '&#160;' do not qualify and are escaped as well.
    """
    parts = string.split('&')
    output = [parts[0]]
    for part in parts[1:]:
        end = part.find(';')
        if end == -1 or not part[:end].isalpha():
            output.append("&amp;")
        else:
            output.append("&")
        output.append(part)
    return "".join(output)


def _is_valid_xml_tag_name(name):
    """Return True if `name` is a letter followed only by letters/digits."""
    if not name:
        return False
    return name.isalpha() or (name[0].isalpha() and name[1:].isalnum())


def _is_valid_xml_tag(string):
    """Return True if `string` is the inside of a well-formed tag.

    `string` is the text between '<' and '>', without those characters.
    Accepted forms are 'name', '/name' and 'name attr="value" ...', each
    with an optional trailing '/'.  Anything else, including an empty
    string, yields False.
    """
    # A line break may separate the tag name from its first attribute,
    # i.e. <link\nlinkend="link-id">My Link</link>
    string = string.replace('\n', ' ')

    # Ignore the trailing slash of a self-closing tag.
    if string.endswith('/'):
        string = string[:-1]
    if not string:
        return False

    if string[0] == '/' and _is_valid_xml_tag_name(string[1:]):
        # valid end tag
        return True
    if _is_valid_xml_tag_name(string):
        # valid start tag without attributes
        return True
    if " " not in string:
        return False

    # We are looking for: tagname arg="value" arg2="value2"
    tagname, rest = string.split(" ", 1)
    if not _is_valid_xml_tag_name(tagname):
        return False

    rest = rest.lstrip()
    while rest:
        if '=' not in rest:
            return False
        argname, rest = rest.split('=', 1)
        if not _is_valid_xml_tag_name(argname):
            return False
        # The value must be wrapped in double quotes; a missing opening or
        # closing quote makes the tag invalid instead of raising.
        if not rest.startswith('"') or '"' not in rest[1:]:
            return False
        rest = rest[1:].split('"', 1)[1].lstrip()
    return True


def _escape_lt_not_in_xml_tag(string):
    """Replace '<' with '&lt;' unless it opens a well-formed tag."""
    parts = string.split('<')
    output = [parts[0]]
    for part in parts[1:]:
        end = part.find('>')
        if end == -1 or not _is_valid_xml_tag(part[:end]):
            output.append("&lt;")
        else:
            output.append("<")
        output.append(part)
    return "".join(output)


def _escape_gt_not_in_xml_tag(string):
    """Replace '>' with '&gt;' unless it closes a well-formed tag."""
    parts = string.split('>')
    output = parts[0]
    for part in parts[1:]:
        # The candidate tag is whatever follows the most recent '<' in the
        # text produced so far, so the output has to stay a single string.
        start = output.rfind('<')
        if start == -1 or not _is_valid_xml_tag(output[start + 1:]):
            output += "&gt;"
        else:
            output += ">"
        output += part
    return output


def test():
    """Self-checks for the tag validation and escaping helpers."""
    assert _is_valid_xml_tag_name('a')
    assert _is_valid_xml_tag_name('refsect1')
    assert not _is_valid_xml_tag_name('1refsect')
    assert not _is_valid_xml_tag_name('1')
    assert not _is_valid_xml_tag_name('')

    assert _is_valid_xml_tag('/a')
    assert _is_valid_xml_tag('/refsect1')
    assert not _is_valid_xml_tag('/1')
    assert _is_valid_xml_tag('link')
    assert _is_valid_xml_tag('link linkend="value"')
    assert _is_valid_xml_tag('link  linkend="value"')
    assert _is_valid_xml_tag('link/')
    assert _is_valid_xml_tag('link linkend="value"/')
    assert _is_valid_xml_tag('link linkend="value" arg23="anothervalue"')
    assert _is_valid_xml_tag(
        'link linkend="value" arg23="anothervalue with spaces"')
    assert not _is_valid_xml_tag(
        'link linkend="value arg23="anothervalue with spaces"')
    assert not _is_valid_xml_tag('link \nlinkend')
    assert _is_valid_xml_tag('link\nlinkend="link-id"')
    assert _is_valid_xml_tag('xref linkend="gtkstylecontext-classes"/')
    assert _is_valid_xml_tag(
        'a href="http://www.gtk.org" title="<i>Our</i> website"')
    assert _is_valid_xml_tag(
        'ulink \nurl="http://www.freedesktop.org/Standards/wm-spec"')

    # Malformed input must be rejected, not raise.
    assert not _is_valid_xml_tag('')
    assert not _is_valid_xml_tag('/')
    assert not _is_valid_xml_tag('a b=')
    assert not _is_valid_xml_tag('a b="c')

    string = ('gtk_label_set_markup (label, '
              '"Go to the GTK+ website for more...");')
    assert _escape_lt_not_in_xml_tag(string) == string

    string = ('gtk_label_set_markup (label, "Go to the '
              '<a href="http://www.gtk.org" '
              'title="&lt;i&gt;Our&lt;/i&gt; website">GTK+ website</a> '
              'for more...");')
    assert _escape_lt_not_in_xml_tag(string) == string

    assert _escape_ampersand_not_in_entity('a & b &amp; c') == \
        'a &amp; b &amp; c'
    assert _escape_lt_not_in_xml_tag('a < b') == 'a &lt; b'
    assert _escape_gt_not_in_xml_tag('a > b') == 'a &gt; b'


if __name__ == '__main__':
    test()