# -*- Mode: Python; py-indent-offset: 4 -*- '''Simple module for extracting GNOME style doc comments from C sources, so I can use them for other purposes.''' import sys, os, re # Used to tell if the "Since: ..." portion of the gtkdoc function description # should be omitted. This is useful for some C++ modules such as gstreamermm # that wrap C API which is still unstable and including this information would # not be useful. # This variable is modified from docextract_to_xml based on the --no-since # option being specified. no_since = False # Used to tell if extract() shall collect information from subdirectories. # This variable is modified from docextract_to_xml based on the --no-recursion # option being specified. no_recursion = False __all__ = ['extract'] class GtkDoc: def __init__(self): self.name = None # The block type ('function', 'signal', property', 'section', 'enum') self.block_type = '' self.params = [] self.annotations = [] self.description = '' self.ret = ('', []) # (return, annotations) def set_name(self, name): self.name = name def set_type(self, block_type): self.block_type = block_type def get_type(self): return self.block_type def add_param(self, name, description, annotations=[]): if name == '...': name = 'Varargs' self.params.append((name, description, annotations)) def append_to_last_param(self, extra): self.params[-1] = (self.params[-1][0], self.params[-1][1] + extra, self.params[-1][2]) def append_to_named_param(self, name, extra): for i in range(len(self.params)): if self.params[i][0] == name: self.params[i] = (name, self.params[i][1] + extra, self.params[i][2]) return # fall through to adding extra parameter ... self.add_param(name, extra) def add_annotation(self, annotation): self.annotations.append(annotation) def get_annotations(self): return self.annotations def append_to_description(self, extra): self.description = self.description + extra def get_description(self): return self.description def add_return(self, first_line, annotations=[]): self.ret = (first_line, annotations) def append_to_return(self, extra): self.ret = (self.ret[0] + extra, self.ret[1]) comment_start_pattern = re.compile(r'^\s*/\*\*\s') comment_end_pattern = re.compile(r'^\s*\*+/') comment_line_lead_pattern = re.compile(r'^\s*\*\s*') comment_empty_line_pattern = re.compile(r'^\s*\**\s*$') function_name_pattern = re.compile(r'^([a-z]\w*)\s*:?(\s*\(.*\)\s*){0,2}\s*$') signal_name_pattern = re.compile(r'^([A-Z]\w+::[a-z0-9-]+)\s*:?(\s*\(.*\)\s*){0,2}\s*$') property_name_pattern = re.compile(r'^([A-Z]\w+:[a-z0-9-]+)\s*:?(\s*\(.*\)\s*){0,2}\s*$') enum_name_pattern = re.compile(r'^([A-Z]\w+)\s*:?(\s*\(.*\)\s*){0,2}\s*$') section_name_pattern = re.compile(r'^(SECTION:[a-z0-9_-]+)\s*:?(\s*\(.*\)\s*){0,2}\s*$') return_pattern = re.compile(r'^@?(returns:|return\s+value:)(.*\n?)$', re.IGNORECASE) deprecated_pattern = re.compile(r'^(deprecated\s*:\s*.*\n?)$', re.IGNORECASE) rename_to_pattern = re.compile(r'^(rename\s+to)\s*:\s*(.*\n?)$', re.IGNORECASE) param_pattern = re.compile(r'^@(\S+)\s*:(.*\n?)$') # Used to extract the annotations in the parameter and return descriptions # extracted using above [param|return]_pattern patterns. annotations_pattern = re.compile(r'^(?:(\s*\(.*\)\s*)*:)') # Used to construct the annotation lists. annotation_lead_pattern = re.compile(r'^\s*\(\s*(.*?)\s*\)\s*') # These patterns determine the identifier of the current comment block. They # are grouped in a list for easy determination of block identifiers (in # skip_to_identifier). # The function_name_pattern shall be tested for last, because it always matches # signal and property identifiers. # The property_name_pattern shall be tested for after the section_name_pattern, # because the property_name_pattern matches most section identifiers. identifier_patterns = [ signal_name_pattern, section_name_pattern, property_name_pattern, enum_name_pattern, function_name_pattern ] # This pattern is to match return sections that forget to have a colon (':') # after the initial 'Return' phrase. It is not included by default in the list # of final sections below because a lot of function descriptions begin with # 'Returns ...' and the process_description() function would stop right at that # first line, thinking it is a return section. no_colon_return_pattern = re.compile(r'^@?(returns|return\s+value)\s*(.*\n?)$', re.IGNORECASE) since_pattern = re.compile(r'^(since\s*:\s*.*\n?)$', re.IGNORECASE) # This pattern is to match since sections that forget to have a colon (':') # after the initial 'Since' phrase. It is not included by default in the list # of final sections below because some function descriptions contain # 'Since ...' and the process_description() function would stop at that # line, thinking it is a since section. no_colon_since_pattern = re.compile(r'^Since\s+[.0-9]+\n?$') # These patterns normally will be encountered after the description. Knowing # the order of their appearance is difficult so this list is used to test when # one begins and the other ends when processing the rest of the sections after # the description. final_section_patterns = [ return_pattern, since_pattern, deprecated_pattern, rename_to_pattern ] def parse_file(fp, doc_dict): line = fp.readline() while line: cur_doc = GtkDoc() line = skip_to_comment_block(fp, line) line = skip_to_identifier(fp, line, cur_doc) # See if the identifier is found (stored in the current GtkDoc by # skip_to_identifier). If so, continue reading the rest of the comment # block. if cur_doc.name: line = process_params(fp, line, cur_doc) line = process_description(fp, line, cur_doc) line = process_final_sections(fp, line, cur_doc) # Add the current doc block to the dictionary of doc blocks. If # this block was initially recognized as an 'enum' block in # skip_to_identifier(), the process_params() function will mark the # block as invalid if enum members are not all caps. In that case # do not add the block. if cur_doc.get_type() != 'invalid': doc_dict[cur_doc.name] = cur_doc # Given a list of annotations as string of the form # '(annotation1) (annotation2) ...' return a list of annotations of the form # [ (name1, value1), (name2, value2) ... ]. Not all annotations have values so # the values in the list of tuples could be empty (''). def get_annotation_list(annotations): annotation_list = [] while annotations: match = annotation_lead_pattern.match(annotations) if match: annotation_contents = match.group(1) name, split, value = annotation_contents.strip().partition(' ') annotation_list.append((name, value)) # Remove first occurrence to continue processing. annotations = annotation_lead_pattern.sub('', annotations) else: break return annotation_list # Given a currently read line, test that line and continue reading until the # beginning of a comment block is found or eof is reached. Return the last # read line. def skip_to_comment_block(fp, line): while line: if comment_start_pattern.match(line): break line = fp.readline() return line # Given the current line in a comment block, continue skipping lines until a # non-blank line in the comment block is found or until the end of the block # (or eof) is reached. Returns the line where reading stopped. def skip_to_nonblank(fp, line): while line: if not comment_empty_line_pattern.match(line): break line = fp.readline() # Stop processing if eof or end of comment block is reached. if not line or comment_end_pattern.match(line): break return line # Given the first line of a comment block (the '/**'), see if the next # non-blank line is the identifier of the comment block. Stop processing if # the end of the block or eof is reached. Store the identifier (if there is # one) and its type ('function', 'signal' or 'property', 'enum') in the given # GtkDoc. Return the line where the identifier is found or the line that stops # the processing (if eof or the end of the comment block is found first). def skip_to_identifier(fp, line, cur_doc): # Skip the initial comment block line ('/**') if not eof. if line: line = fp.readline() # Now skip empty lines. line = skip_to_nonblank(fp, line) # See if the first non-blank line is the identifier. if line and not comment_end_pattern.match(line): # Remove the initial ' * ' in comment block line and see if there is an # identifier. line = comment_line_lead_pattern.sub('', line) for pattern in identifier_patterns: match = pattern.match(line) if match: # Set the GtkDoc name. cur_doc.set_name(match.group(1)) # Get annotations and add them to the GtkDoc. annotations = get_annotation_list(match.group(2)) for annotation in annotations: cur_doc.add_annotation(annotation) # Set the GtkDoc type. if pattern == signal_name_pattern: cur_doc.set_type('signal') elif pattern == section_name_pattern: cur_doc.set_type('section') elif pattern == property_name_pattern: cur_doc.set_type('property') elif pattern == enum_name_pattern: cur_doc.set_type('enum') elif pattern == function_name_pattern: cur_doc.set_type('function') return line return line # Given a currently read line (presumably the identifier line), read the next # lines, testing to see if the lines are part of parameter descriptions. If # so, store the parameter descriptions in the given doc block. Stop on eof and # return the last line that stops the processing. def process_params(fp, line, cur_doc): # Skip the identifier line if not eof. Also skip any blank lines in the # comment block. Return if eof or the end of the comment block are # encountered. if line: line = fp.readline() line = skip_to_nonblank(fp, line) if not line or comment_end_pattern.match(line): # If there are no parameters and this block has been recognized as an # enum block, then mark it as invalid. if cur_doc.get_type() == 'enum': cur_doc.set_type('invalid') return line # Remove initial ' * ' in first non-empty comment block line. line = comment_line_lead_pattern.sub('', line) # Now process possible parameters as long as no eof or the end of the # param section is not reached (which could be triggered by anything that # doesn't match a '@param:..." line, even the end of the comment block). match = param_pattern.match(line) # Mark the cur_doc type as invalid if no parameters are found. if not match and cur_doc.get_type() == 'enum': cur_doc.set_type('invalid') while line and match: name = match.group(1) description = match.group(2) # Set the cur_doc type as 'invalid' if the enum member names are not # all caps. if not name.isupper() and cur_doc.get_type() == 'enum': cur_doc.set_type('invalid') # First extract the annotations from the description and save them. annotations = [] annotation_match = annotations_pattern.match(description) if annotation_match: annotations = get_annotation_list(annotation_match.group(1)) # Remove the annotations from the description description = annotations_pattern.sub('', description) # Default to appending lines to current parameter. append_func = cur_doc.append_to_last_param # See if the return has been included as part of the parameter # section and make sure that lines are added to the GtkDoc return if # so. if name.lower() == "returns": cur_doc.add_return(description, annotations) append_func = cur_doc.append_to_return # If not, just add it as a regular parameter. else: cur_doc.add_param(name, description, annotations) # Now read lines and append them until next parameter, beginning of # description (an empty line), the end of the comment block or eof. line = fp.readline() while line: # Stop processing if end of comment block or a blank comment line # is encountered. if comment_empty_line_pattern.match(line) or \ comment_end_pattern.match(line): break # Remove initial ' * ' in comment block line. line = comment_line_lead_pattern.sub('', line) # Break from current param processing if a new one is # encountered. if param_pattern.match(line): break; # Otherwise, just append the current line and get the next line. append_func(line) line = fp.readline() # Re-evaluate match for while condition match = param_pattern.match(line) # End by returning the current line. return line # Having processed parameters, read the following lines into the description of # the current doc block until the end of the comment block, the end of file or # a return section is encountered. def process_description(fp, line, cur_doc): # First skip empty lines returning on eof or end of comment block. line = skip_to_nonblank(fp, line) if not line or comment_end_pattern.match(line): return line # Remove initial ' * ' in non-empty comment block line. line = comment_line_lead_pattern.sub('', line) # Also remove possible 'Description:' prefix. if line[:12] == 'Description:': line = line[12:] # Used to tell if the previous line was blank and a return section # uncommonly marked with 'Returns ...' instead of 'Returns: ...' has # started (assume it is non-empty to begin with). prev_line = 'non-empty' # Now read lines until a new section (like a return or a since section) is # encountered. while line: # See if the description section has ended (if the line begins with # 'Returns ...' and the previous line was empty -- this loop replaces # empty lines with a newline). if no_colon_return_pattern.match(line) and prev_line == '\n': return line # Or if one of the patterns of the final sections match for pattern in final_section_patterns: if pattern.match(line): return line # If not, append lines to description in the doc comment block. # But if --no-since is specified, skip a no_colon_since_pattern line. if no_since and \ no_colon_since_pattern.match(line) and prev_line == '\n': pass else: cur_doc.append_to_description(line) prev_line = line line = fp.readline() # Stop processing on eof or at the end of comment block. if not line or comment_end_pattern.match(line): return line # Remove initial ' * ' in line so that the text can be appended to the # description of the comment block and make sure that if the line is # empty it be interpreted as a newline. line = comment_line_lead_pattern.sub('', line) if not line: line = '\n' # Given the line that ended the description (the first line of one of the final # sections) process the final sections ('Returns:', 'Since:', etc.) until the # end of the comment block or eof. Return the line that ends the processing. def process_final_sections(fp, line, cur_doc): # Temporarily append the no_colon_since_pattern to the final section # patterns now that the description has been processed. It will be # removed at the end of this function so that future descriptions # that begin with 'Since ...' are not interpreted as a since section. final_section_patterns.append(no_colon_since_pattern) while line and not comment_end_pattern.match(line): # Remove leading ' * ' from current non-empty comment line. line = comment_line_lead_pattern.sub('', line) # Temporarily append the no colon return pattern to the final section # patterns now that the description has been processed. It will be # removed after the for loop below executes so that future descriptions # that begin with 'Returns ...' are not interpreted as a return # section. final_section_patterns.append(no_colon_return_pattern) for pattern in final_section_patterns: match = pattern.match(line) if match: if pattern == return_pattern or \ pattern == no_colon_return_pattern: # Dealing with a 'Returns:' so first extract the # annotations from the description and save them. description = match.group(2) annotations = [] annotation_match = \ annotations_pattern.match(description) if annotation_match: annotations = \ get_annotation_list(annotation_match.group(1)) # Remove the annotations from the description description = annotations_pattern.sub('', description) # Now add the return. cur_doc.add_return(description, annotations) # In case more lines need to be appended. append_func = cur_doc.append_to_return elif pattern == rename_to_pattern: # Dealing with a 'Rename to:' section (GObjectIntrospection # annotation) so no further lines will be appended but this # single one (and only to the annotations). append_func = None cur_doc.add_annotation((match.group(1), match.group(2))) else: # For all others ('Since:', 'Since ' and 'Deprecated:') # just append the line to the description for now. # But if --no-since is specified, skip a Since line. if no_since and (pattern == since_pattern or \ pattern == no_colon_since_pattern): pass else: cur_doc.append_to_description(line) # In case more lines need to be appended. append_func = cur_doc.append_to_description # Stop final section pattern matching for loop since a match # has already been found. break # Remove the no colon return pattern (which was temporarily added in # the just executed loop) from the list of final section patterns. final_section_patterns.pop() line = fp.readline() # Now continue appending lines to current section until a new one is # found or an eof or the end of the comment block is encountered. finished = False while not finished and line and \ not comment_end_pattern.match(line): # Remove leading ' * ' from line and make sure that if it is empty, # it be interpreted as a newline. line = comment_line_lead_pattern.sub('', line) if not line: line = '\n' for pattern in final_section_patterns: if pattern.match(line): finished = True break # Break out of loop if a new section is found (determined in above # inner loop). if finished: break # Now it's safe to append line. if append_func: append_func(line) # Get the next line to continue processing. line = fp.readline() # Remove the no_colon_since_pattern (which was temporarily added at # the beginning of this function) from the list of final section patterns. final_section_patterns.pop() return line def parse_dir(dir, exclude_files, doc_dict): for file in os.listdir(dir): if file in ('.', '..'): continue path = os.path.join(dir, file) if path in exclude_files: continue if os.path.isdir(path): if not no_recursion: parse_dir(path, exclude_files, doc_dict) elif len(file) > 2 and file[-2:] in ('.c', '.h'): sys.stderr.write("Processing " + path + '\n') parse_file(open(path, 'r'), doc_dict) def extract(dirs, exclude_files, doc_dict=None): if not doc_dict: doc_dict = {} for dir in dirs: parse_dir(dir, exclude_files, doc_dict) return doc_dict tmpl_section_pattern = re.compile(r'^$') def parse_tmpl(fp, doc_dict): cur_doc = None line = fp.readline() while line: match = tmpl_section_pattern.match(line) if match: cur_doc = None # new input shouldn't affect the old doc dict sect_type = match.group(1) sect_name = match.group(2) if sect_type == 'FUNCTION': cur_doc = doc_dict.get(sect_name) if not cur_doc: cur_doc = GtkDoc() cur_doc.set_name(sect_name) doc_dict[sect_name] = cur_doc elif line == '\n': cur_doc = None # don't worry about unused params. elif cur_doc: if line[:10] == '@Returns: ': if line[10:].strip(): cur_doc.append_to_return(line[10:]) elif line[0] == '@': pos = line.find(':') if pos >= 0: cur_doc.append_to_named_param(line[1:pos], line[pos+1:]) else: cur_doc.append_to_description(line) else: cur_doc.append_to_description(line) line = fp.readline() def extract_tmpl(dirs, exclude_files, doc_dict=None): if not doc_dict: doc_dict = {} for dir in dirs: for file in os.listdir(dir): if file in ('.', '..'): continue path = os.path.join(dir, file) if path in exclude_files: continue if os.path.isdir(path): continue if len(file) > 5 and file[-5:] == '.sgml': parse_tmpl(open(path, 'r'), doc_dict) return doc_dict