diff options
author | Kjell Ahlstedt <kjellahlstedt@gmail.com> | 2021-03-17 16:44:47 +0100 |
---|---|---|
committer | Kjell Ahlstedt <kjellahlstedt@gmail.com> | 2021-03-17 16:44:47 +0100 |
commit | 91c81c6e7ad8883f314d8d4c011b256d94468a38 (patch) | |
tree | 04a0cf42a283990fd5f9384405b3901fd30cf42e | |
parent | 3ec8bdad019572c518d2fee17040b46a161d6f40 (diff) | |
download | glibmm-91c81c6e7ad8883f314d8d4c011b256d94468a38.tar.gz |
Add tools/defs_gen/enumextract.py
A Python script that can replace the Perl script tools/enum.pl.
Why? Just because I've got fond of Python.
-rw-r--r-- | tools/Makefile.am | 1 | ||||
-rwxr-xr-x | tools/defs_gen/enumextract.py | 466 | ||||
-rwxr-xr-x | tools/enum.pl | 4 | ||||
-rwxr-xr-x | tools/gen_scripts/init_generate.sh | 2 |
4 files changed, 472 insertions, 1 deletions
diff --git a/tools/Makefile.am b/tools/Makefile.am index 2f37b02c..66d91ffb 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -55,6 +55,7 @@ AM_CXXFLAGS = $(GLIBMM_WXXFLAGS) EXTRA_DIST = defs_gen/definitions.py \ defs_gen/defsparser.py \ + defs_gen/enumextract.py \ defs_gen/h2def.py \ defs_gen/scmexpr.py \ defs_gen/docextract.py \ diff --git a/tools/defs_gen/enumextract.py b/tools/defs_gen/enumextract.py new file mode 100755 index 00000000..eb1e879d --- /dev/null +++ b/tools/defs_gen/enumextract.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 + +# Extracts enum definitions from C header files and writes a .defs file +# that gmmproc can read. + +# enumextract.py [--module modname] [--omit-deprecated] header_files... + +import os +import sys +import re + +# Globals + +# dictionary with enum names and values. +tokens = {} + +# A long warning is printed at most once. +has_warned_unknown_token = False + +# Part of a regular expression. +optional_cast = r'(?:\([a-z ]+\)\s*)?' + +# Compiled regular expressions. +comment_begin = re.compile(r'^(.*)/\*') +comment_end = re.compile(r'\*/(.*)') +deprecate_if_begin = re.compile(r'^\s*#\s*(:?if\s*!\s*defined|ifndef)\s*\(?\s*[A-Z_]+_DISABLE_DEPRECATED\s*\)?') +if_begin = re.compile(r'^\s*#\s*if') +if_end = re.compile(r'^\s*#\s*endif') +pp_directive = re.compile(r'^\s*#') +single_line_comment_c = re.compile(r'/\*.*?\*/') +single_line_comment_cpp = re.compile(r'//.*$') +enum_begin = re.compile(r'^\s*typedef\s+enum') +deprecated_type = re.compile(r'[A-Z]+_DEPRECATED_TYPE') +extract_enum_name = re.compile(r'^.*?(\w+)') +white_spaces = re.compile(r'\s+') +opening_bracket = re.compile(r'\s*{\s*') +extract_module_name = re.compile(r'^([A-Z][a-z]*)') +deprecated_enumerator = re.compile(r'[A-Z]+_DEPRECATED_ENUMERATOR') +dep_or_avail_enumerator = re.compile(r'^\s*(\w+)\s+\w+?_(:?DEPRECATED|AVAILABLE)_ENUMERATOR\w*(:?\s*\(.*?\))?') +parenthesis_value = re.compile(r"^\s*\S+\s*=\s*'[\(\)]'\s*$") +only_name = re.compile(r'^\w+$') +name_and_value1 = re.compile(r'^(\w+)\s*=?\s*(0x[0-9a-fA-F]+[\s0-9a-fx<-]*)$') +name_and_value2 = re.compile(r'^(\w+)\s*=?\s*(-?\s*[0-9]+)$') +name_and_value3 = re.compile(r'^(\w+)\s*=?\s*(' + optional_cast + r'\(?1[uU]?\s*<<\s*[0-9]+\s*\)?[\s0-9a-fx<-]*)$') +cast_or_unsigned1 = re.compile(optional_cast + r'(\(?1)[uU]') +cast_or_unsigned2 = re.compile(optional_cast + r'\(?1[uU]?\s*<<') +name_with_other_name = re.compile(r'^(\w+)\s*=?\s*(-?[ _x0-9a-fA-Z|()<~,]+)$') +other_name = re.compile(r'([A-Z][_A-Z0-9]+)') +name_with_char = re.compile(r"^(\w+)\s*=\s*'(.)'$") +comma_or_rbrace = re.compile(r'^(\w+)\s*=\s*(\%\%[A-Z]+\%\%)$') + +def parse(filepath, module, omit): + '''parse enums in a C file''' + + with open(filepath, mode='r') as file: + # if we are inside enum. + in_enum = False + # 1 or more, if we are inside deprecated lines. + in_deprecated = 0 + # if we are inside multiline comment. + in_comment = False + # line containing whole enum preprocessed definition to be processed. + line = '' + # line containing whole enum raw definition. + raw_line = '' + # if we already printed comment about basename of header file containing enums. + printed_from = False + # if only right bracket was found, not name. + rbracket_only = False + + for current_rawline in file: + current_line = current_rawline + if in_enum: + raw_line += ';; ' + current_rawline + if in_comment: + # end of multiline comment. + is_comment_end = comment_end.search(current_line) + if is_comment_end: + in_comment = False + if in_enum: + line += is_comment_end.group(1) + continue + + # omit deprecated stuff. + if omit and deprecate_if_begin.search(current_line): + in_deprecated += 1 + continue + if in_deprecated: + if if_begin.search(current_line): + in_deprecated += 1 + elif if_end.search(current_line): + in_deprecated -= 1 + continue + + # discard any preprocessor directives inside enums. + if pp_directive.search(current_line): + continue + + # filter single-line comments. + current_line = single_line_comment_c.sub('', current_line) + current_line = single_line_comment_cpp.sub('', current_line, 1) + + # beginning of multiline comment. + is_comment_begin = comment_begin.search(current_line) + if is_comment_begin: + in_comment = True + if in_enum: + line += is_comment_begin.group(1) + '\n' + continue + + # Replace the enumerator values ',' and '}' by strings that won't confuse + # process(). They are reset to the original strings when they are written + # to the output file. + # typedef enum { V1 = ',', V2 = '}' } E1; // is a legal definition. + current_line = current_line.replace("','", r'\%\%COMMA\%\%', 1) + current_line = current_line.replace("'}'", r'\%\%RBRACE\%\%', 1) + + # we have found an enum. + if enum_begin.search(current_line): + basename = os.path.basename(filepath) + if not printed_from: + print(';; From', basename, end='\n\n') + printed_from = True + in_enum = True + raw_line += ';; ' + current_rawline + continue + + # we have found the end of an enum. + if (in_enum and ('}' in current_line)) or rbracket_only: + # if the same line also contains ';' - that means there is a typedef name + # between '}' and ';'. + if ';' in current_line: + if not (omit and deprecated_type.search(current_line)): + enum_def = '} ' if rbracket_only else '' + enum_def += current_line + print(';; Original typedef:') + print(raw_line) + process(line, enum_def, module, omit) + in_enum = False + line = '' + raw_line = '' + rbracket_only = False + # we assume there is no such definition formed like this: + # typedef enum + # { + # ... + # } MyTypedef + # ; + # that would be stupid. + else: + rbracket_only = True + # don't append useless lines to $line. + continue + + if in_enum: + line += current_line + +def process(line, enum_def, module, omit): + '''convert enums to lisp''' + + global tokens, has_warned_unknown_token + # The name is the first word after the closing bracket. + # The name can be followed by *_DEPRECATED_TYPE* or *_AVAILABLE_TYPE* + # before the semicolon. + is_enum_name = extract_enum_name.search(enum_def) + if is_enum_name: + enum_def = is_enum_name.group(1) + c_name = enum_def + # replace all excessive whitespaces with one space. + line = white_spaces.sub(' ', line) + # get rid of any comments. + line = single_line_comment_c.sub('', line) + # get rid of opening bracket. + line = opening_bracket.sub('', line, 1) + # lets employ some heuristics. :) + perhaps_enum = 0 + perhaps_flags = 0 + # c_name = module + enum_def. + if not module: + is_module_name = extract_module_name.search(c_name) + if is_module_name: + module = is_module_name.group(1) + else: + module = '' + enum_def = enum_def.replace(module, '') + # names and their values. + c_names = [] + values = [] + # val - default value for enum, gets incremented after every value processed. + val = 0 + # these are just for case when enum value is equal to some sort of unknown + # value - preprocessor define or other enum. + unknown_flag = False + unknown_val = '' + unknown_base = '' + unknown_increment = 0 + + lines = line.split(',') + iter = 0 + while iter < len(lines): + # The enumerator name can be followed by *_DEPRECATED_ENUMERATOR*, + # *_DEPRECATED_ENUMERATOR*_FOR(*) or *_AVAILABLE_ENUMERATOR* + # before the equal sign or comma. + omit_enumerator = omit and deprecated_enumerator.search(lines[iter]) + lines[iter] = dep_or_avail_enumerator.sub(r'\1', lines[iter], 1) + + brackets_count = 0 + begin = iter + + # ignore ',' inside () brackets + # except '(' and ')' enum values + if parenthesis_value.search(lines[iter]): + iter += 1 + else: + first = True + while first or (iter < len(lines) and brackets_count != 0): + first = False + brackets_count += lines[iter].count('(') + brackets_count -= lines[iter].count(')') + iter += 1 + + if omit_enumerator: + continue + + # join with comma and remove leading and trailing spaces. + # also remove backslashes as some people like to add them before newlines... + i = ','.join(lines[begin:iter]).strip().replace('\\', '') + + # if only name exists [like MY_ENUM_VALUE]. + if only_name.search(i): + c_names.append(i) + if unknown_flag: + values.append(unknown_val) + tokens[i] = unknown_val + else: + values.append(str(val)) + tokens[i] = val + perhaps_enum += 1 + # if name with value exists [like MY_FLAG_VALUE = 0x2 or 0x5 << 22 + # or 42 or -13 (in this case entity is still enum, not flags) + # or 1 << 2 or (1 << 4) or (1 << 5) - 1]. + else: + m = name_and_value1.search(i) or name_and_value2.search(i) or name_and_value3.search(i) + if m: + tmp1 = m.group(1) + tmp2 = m.group(2) + c_names.append(tmp1) + # I do not know who thought that writing '- 1' as enum value is grrreat + # idea - strip whitespaces between unary minus and a digit. + if tmp2.startswith('- '): + tmp2 = re.sub(r'^-\s+', '', tmp2) + tmp3 = tmp2 + # Python does not understand C-style cast or the u suffix for unsigned. + tmp3 = cast_or_unsigned1.sub(r'\1', tmp3) + val = eval(tmp3) + if cast_or_unsigned2.search(tmp2): + perhaps_flags += 10 + elif tmp2.startswith('0x'): + perhaps_flags += 1 + else: + perhaps_enum += 1 + values.append(tmp2) + tokens[tmp1] = val + unknown_flag = False + else: + # if name with other name exists [like MY_FLAG_VALUE = MY_PREV_FLAG_VALUE + # or ~(MY_PREV_FLAG_VALUE | MY_EARLIER_VALUE | (1 << 5) - 1 | 0x200)]. + # [MY_FLAG MY_OTHER_FLAG is also supported - note lack of equal char.] + # [SOME_DEFINITION([X, [Y, [...]]]) definition is also supported.] + m = name_with_other_name.search(i) + if m: + tmp1 = m.group(1) + tmp2 = m.group(2) + c_names.append(tmp1) + # split r-values on "logical or" and for each splitted r-value check its + # numeric value and replace a name with it if possible. + tmps = tmp2.split('|') + # dont_eval is True if unknown token is found, so whole value is copied + # verbatim, without evaling. + dont_eval = False + if len(tmps) > 1: + perhaps_flags += 1 + else: + perhaps_enum += 1 + + for tmpval in tmps: + # if r-value is something like MY_FLAG or MY_DEFINE_VALUE3. + m = other_name.search(tmpval) + if m: + tmp3 = m.group(1) + if tmp3 not in tokens: + dont_eval = True + print('WARNING:', tmp3, 'value of', tmp1, "element in '", c_name, + "' enum is an unknown token.", file=sys.stderr) + if not has_warned_unknown_token: + has_warned_unknown_token = True + print("It probably is one of:", + " - preprocessor value - make sure that header defining this value is included in sources wrapping the enum.", + " - enum value from other header or module - see 'preprocessor value'.", + " - typo (happens rarely) - send a patch fixing this to maintainer of this module.", + sep='\n', file=sys.stderr) + # unknown value often makes a flag. + perhaps_flags += 1 + else: + tmp2 = tmp2.replace(tmp3, str(tokens[tmp3])) + else: + # else is a numeric value, so we do not do anything. + pass + + # check if there are still some non-numerical values. + if re.search(r'[_A-Z]+', tmp2): + dont_eval = True + + if not dont_eval: + val = eval(tmp2) + values.append(val) + tokens[tmp1] = val + unknown_flag = False + else: + values.append(tmp2) + unknown_flag = True + # wrapping in safety parens. + unknown_base = '(' + tmp2 + ')' + unknown_increment = 0 + tokens[tmp1] = unknown_base + + # if name with char exists (like MY_ENUM_VALUE = 'a'). + else: + m = name_with_char.search(i) + if m: + c_names.append(m.group(1)) + values.append("'" + m.group(2) + "'") + val = ord(m.group(2)) + tokens[m.group(1)] = val + unknown_flag = False + perhaps_enum += 1 + + # if it's one of the char values that were replaced by + # %%COMMA%% or %%RBRACE%%. + else: + m = comma_or_rbrace.search(i) + if m: + c_names.append(m.group(1)) + if m.group(2) == r'%%COMMA%%': + values.append("','") + val = ord(',') + elif m.group(2) == r'%%RBRACE%%': + values.append("'}'") + val = ord('}') + else: + values.append(m.group(2)) + tokens[m.group(1)] = val + unknown_flag = False + perhaps_enum += 1 + + # it should not get here, + # except if the last enumerator is followed by a comma. + elif not(not i and iter == len(lines)): + print("WARNING: I do not know how to parse it: '", i, "' in '", c_name, "'.", + sep='', file=sys.stderr) + + if unknown_flag: + unknown_increment += 1 + unknown_val = unknown_base + ' + ', + unknown_increment + else: + val += 1 + + entity = 'flags' if c_name.endswith('Flags') or perhaps_flags >= perhaps_enum else 'enum' + # get nick names. + ref_names = form_names(c_name, c_names) + # set format - decimal for enums, hexadecimal for flags. + vformat = '{0:d}' if entity == 'enum' else '{0:#x}' + # evaluate any unevaluated values and format them properly, if applicable. + for j in range(len(values)): + # if values[j] is a string that can be interpreted as a decimal integer, + # convert it to an integer, so the format (decimal or hexadecimal) + # can be selected by vformat. + if isinstance(values[j], str): + try: + values[j] = int(values[j]) + except ValueError: + pass + if isinstance(values[j], int): + values[j] = vformat.format(values[j]) + + # print the defs. + print('(define-', entity, '-extended ', enum_def, sep='') + print(' (in-module "', module, '")', sep='') + print(' (c-name "', c_name, '")', sep='') + print(' (values') + for j in range(len(c_names)): + value = '' + if values[j]: + value = ' "' + values[j] + '"' + print(' \'("', ref_names[j], '" "', c_names[j], '"', value, ')', sep='') + print(' )') + print(')\n') + +def form_names(c_name, c_names): + '''form nick names from C names''' + + names = [] + # no values in enum means no names. + if not c_names: + return names + + # search for length of a prefix. + leng = len(c_names[0]) - 1 + # if there is more than one value in enum, search for a common part. + if len(c_names) > 1: + for j in range(len(c_names)-1): + while c_names[j][leng-1] != '_' or c_names[j][0:leng] != c_names[j+1][0:leng]: + leng -= 1 + if leng <= 0: + break + if leng <= 0: + break + # if there is only one value in enum, we have to use name of the enum. + else: + subvals = c_names[0].split('_') + for j in range(len(subvals)): + subvals[j] = subvals[j].capitalize() + false_c_name = ''.join(subvals) + while leng > 0 and c_name[0:leng] != false_c_name[0:leng]: + leng -= 1 + tmpleng = leng + for subval in subvals: + leng += 1 + l = len(subval) + if tmpleng <= l: + break + tmpleng -= l + + # get prefix with given length. + prefix = c_names[0][0:leng] + # generate names. + for c_n in c_names: + if c_n[0:len(prefix)] == prefix: + # remove prefix. + c_n = c_n[len(prefix):] + c_n = c_n.lower().replace('_', '-') + names.append(c_n) + + return names + +# ----- Main ----- +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser( + description='Extract enum definitions from C/C++ header files and write a .defs file.') + parser.add_argument('--module', help='module name') + parser.add_argument('--omit-deprecated', action='store_true', dest='omit', + help='omit deprecated enums and enum values') + parser.add_argument('header_files', nargs='+', help='header file(s) to parse') + args = parser.parse_args() + + exitcode = 0 + for filepath in args.header_files: + try: + parse(filepath, args.module, args.omit) + except FileNotFoundError as err: + exitcode = 1 + print(err, file=sys.stderr) + + sys.exit(exitcode) diff --git a/tools/enum.pl b/tools/enum.pl index 3091f113..10d12830 100755 --- a/tools/enum.pl +++ b/tools/enum.pl @@ -1,5 +1,9 @@ #! /usr/bin/perl +# glibmm/tools/defs_gen/enumextract.py is a Python script with almost +# the same functionality as this Perl script. +# enumextract.py is newer and recommended. + # The lisp definitions for flags does not include order. # thus we must extract it ourselves. # Usage: ./enum.pl /gnome/head/cvs/gconf/gconf/*.h > gconf_enums.defs diff --git a/tools/gen_scripts/init_generate.sh b/tools/gen_scripts/init_generate.sh index 5ce3895e..97a695a0 100755 --- a/tools/gen_scripts/init_generate.sh +++ b/tools/gen_scripts/init_generate.sh @@ -32,7 +32,7 @@ fi # Scripts in glibmm. These are source files. gen_docs="$GMMPROC_GEN_SOURCE_DIR/glibmm/tools/defs_gen/docextract_to_xml.py" gen_methods="$GMMPROC_GEN_SOURCE_DIR/glibmm/tools/defs_gen/h2def.py" -gen_enums="$GMMPROC_GEN_SOURCE_DIR/glibmm/tools/enum.pl" +gen_enums="$GMMPROC_GEN_SOURCE_DIR/glibmm/tools/defs_gen/enumextract.py" # Where to find executables that generate extra defs (signals and properties). # glibmm is built with autotools. |