Add tools/defs_gen/enumextract.py

A Python script that can replace the Perl script tools/enum.pl. Why? Simply because I've grown fond of Python.
author: Kjell Ahlstedt <kjellahlstedt@gmail.com> 2021-03-17 16:44:47 +0100
committer: Kjell Ahlstedt <kjellahlstedt@gmail.com> 2021-03-17 16:44:47 +0100
commit: 91c81c6e7ad8883f314d8d4c011b256d94468a38 (patch)
tree: 04a0cf42a283990fd5f9384405b3901fd30cf42e /tools
parent: 3ec8bdad019572c518d2fee17040b46a161d6f40 (diff)
download: glibmm-91c81c6e7ad8883f314d8d4c011b256d94468a38.tar.gz
4 files changed, 472 insertions, 1 deletions
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 2f37b02c..66d91ffb 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -55,6 +55,7 @@ AM_CXXFLAGS = $(GLIBMM_WXXFLAGS)
 
 EXTRA_DIST = defs_gen/definitions.py \
   defs_gen/defsparser.py \
+  defs_gen/enumextract.py \
   defs_gen/h2def.py \
   defs_gen/scmexpr.py \
   defs_gen/docextract.py \
diff --git a/tools/defs_gen/enumextract.py b/tools/defs_gen/enumextract.py
new file mode 100755
index 00000000..eb1e879d
--- /dev/null
+++ b/tools/defs_gen/enumextract.py
@@ -0,0 +1,466 @@
+#!/usr/bin/env python3
+
+# Extracts enum definitions from C header files and writes a .defs file
+# that gmmproc can read.
+
+# enumextract.py  [--module modname] [--omit-deprecated] header_files...
+
+import os
+import sys
+import re
+
+# Globals
+
+# maps each enumerator name seen so far to its value; filled in by process().
+tokens = {}
+
+# A long warning is printed at most once.
+has_warned_unknown_token = False
+
+# Part of a regular expression: an optional C-style cast such as '(guint) '.
+optional_cast = r'(?:\([a-z ]+\)\s*)?'
+
+# Compiled regular expressions, used by parse() and process().
+comment_begin = re.compile(r'^(.*)/\*')
+comment_end = re.compile(r'\*/(.*)')
+deprecate_if_begin = re.compile(r'^\s*#\s*(?:if\s*!\s*defined|ifndef)\s*\(?\s*[A-Z_]+_DISABLE_DEPRECATED\s*\)?')
+if_begin = re.compile(r'^\s*#\s*if')
+if_end = re.compile(r'^\s*#\s*endif')
+pp_directive = re.compile(r'^\s*#')
+single_line_comment_c = re.compile(r'/\*.*?\*/')
+single_line_comment_cpp = re.compile(r'//.*$')
+enum_begin = re.compile(r'^\s*typedef\s+enum')
+deprecated_type = re.compile(r'[A-Z]+_DEPRECATED_TYPE')
+extract_enum_name = re.compile(r'^.*?(\w+)')
+white_spaces = re.compile(r'\s+')
+opening_bracket = re.compile(r'\s*{\s*')
+extract_module_name = re.compile(r'^([A-Z][a-z]*)')
+deprecated_enumerator = re.compile(r'[A-Z]+_DEPRECATED_ENUMERATOR')
+dep_or_avail_enumerator = re.compile(r'^\s*(\w+)\s+\w+?_(?:DEPRECATED|AVAILABLE)_ENUMERATOR\w*(?:\s*\(.*?\))?')
+parenthesis_value = re.compile(r"^\s*\S+\s*=\s*'[\(\)]'\s*$")
+only_name = re.compile(r'^\w+$')
+name_and_value1 = re.compile(r'^(\w+)\s*=?\s*(0x[0-9a-fA-F]+[\s0-9a-fx<-]*)$')
+name_and_value2 = re.compile(r'^(\w+)\s*=?\s*(-?\s*[0-9]+)$')
+name_and_value3 = re.compile(r'^(\w+)\s*=?\s*(' + optional_cast + r'\(?1[uU]?\s*<<\s*[0-9]+\s*\)?[\s0-9a-fx<-]*)$')
+cast_or_unsigned1 = re.compile(optional_cast + r'(\(?1)[uU]?')  # suffix optional: a bare cast must be stripped too
+cast_or_unsigned2 = re.compile(optional_cast + r'\(?1[uU]?\s*<<')
+name_with_other_name = re.compile(r'^(\w+)\s*=?\s*(-?[ _x0-9a-fA-Z|()<~,]+)$')
+other_name = re.compile(r'([A-Z][_A-Z0-9]+)')
+name_with_char = re.compile(r"^(\w+)\s*=\s*'(.)'$")
+comma_or_rbrace = re.compile(r'^(\w+)\s*=\s*(\%\%[A-Z]+\%\%)$')
+
+def parse(filepath, module, omit):
+  '''Scan the C header file filepath and print each 'typedef enum' found in it.
+  If omit is true, deprecated code is skipped; module is passed on to process().'''
+  with open(filepath, mode='r') as file:
+    # True while the lines of a 'typedef enum' body are being collected.
+    in_enum = False
+    # nesting depth of #if blocks inside a *_DISABLE_DEPRECATED guard; 0 when outside.
+    in_deprecated = 0
+    # True while inside a multiline /* ... */ comment.
+    in_comment = False
+    # accumulated enum body with comments removed; handed to process().
+    line = ''
+    # accumulated raw enum definition, each line prefixed with ';; ' for the output.
+    raw_line = ''
+    # True once the ';; From <basename>' header was printed for this file.
+    printed_from = False
+    # True if the closing '}' was seen but the line with ';' is still to come.
+    rbracket_only = False
+
+    for current_rawline in file:
+      current_line = current_rawline
+      if in_enum:
+        raw_line += ';; ' + current_rawline
+      if in_comment:
+        # end of multiline comment; keep only the text that follows '*/'.
+        is_comment_end = comment_end.search(current_line)
+        if is_comment_end:
+          in_comment = False
+          if in_enum:
+            line += is_comment_end.group(1)
+        continue
+
+      # omit deprecated stuff: skip everything up to the matching #endif.
+      if omit and deprecate_if_begin.search(current_line):
+        in_deprecated += 1
+        continue
+      if in_deprecated:
+        if if_begin.search(current_line):
+          in_deprecated += 1
+        elif if_end.search(current_line):
+          in_deprecated -= 1
+        continue
+
+      # discard all other preprocessor directives, inside or outside enums.
+      if pp_directive.search(current_line):
+        continue
+
+      # filter comments that begin and end on this line.
+      current_line = single_line_comment_c.sub('', current_line)
+      current_line = single_line_comment_cpp.sub('', current_line, 1)
+
+      # beginning of multiline comment; keep only the text that precedes '/*'.
+      is_comment_begin = comment_begin.search(current_line)
+      if is_comment_begin:
+        in_comment = True
+        if in_enum:
+          line += is_comment_begin.group(1) + '\n'
+        continue
+
+      # Replace the enumerator values ',' and '}' by strings that won't confuse
+      # process(). They are reset to the original strings when they are written
+      # to the output file.
+      # typedef enum { V1 = ',', V2 = '}' } E1; // is a legal definition.
+      current_line = current_line.replace("','", r'\%\%COMMA\%\%', 1)
+      current_line = current_line.replace("'}'", r'\%\%RBRACE\%\%', 1)
+
+      # we have found an enum; the rest of this line is not added to line.
+      if enum_begin.search(current_line):
+        basename = os.path.basename(filepath)
+        if not printed_from:
+          print(';; From', basename, end='\n\n')
+          printed_from = True
+        in_enum = True
+        raw_line += ';; ' + current_rawline
+        continue
+
+      # we have found the end of an enum.
+      if (in_enum and ('}' in current_line)) or rbracket_only:
+        # if the same line also contains ';' - that means there is a typedef name
+        # between '}' and ';'.
+        if ';' in current_line:
+          if not (omit and deprecated_type.search(current_line)):
+            enum_def = '} ' if rbracket_only else ''
+            enum_def += current_line
+            print(';; Original typedef:')
+            print(raw_line)
+            process(line, enum_def, module, omit)
+          in_enum = False
+          line = ''
+          raw_line = ''
+          rbracket_only = False
+        # we assume there is no such definition formed like this:
+        # typedef enum
+        # {
+        # ...
+        # } MyTypedef
+        # ;
+        # the typedef name would be lost, because the '}' line is not kept.
+        else:
+          rbracket_only = True
+          # don't append useless lines to line.
+          continue
+
+      if in_enum:
+        line += current_line
+
+def process(line, enum_def, module, omit):
+  '''Print the enum in line (its enumerators) and enum_def (text after '}') as a .defs entry.
+  If module is empty it is guessed from the enum name; omit skips deprecated enumerators.'''
+  global tokens, has_warned_unknown_token
+  # The name is the first word after the closing bracket.
+  # The name can be followed by *_DEPRECATED_TYPE* or *_AVAILABLE_TYPE*
+  # before the semicolon.
+  is_enum_name = extract_enum_name.search(enum_def)
+  if is_enum_name:
+    enum_def = is_enum_name.group(1)
+  c_name = enum_def
+  # replace all excessive whitespaces with one space.
+  line = white_spaces.sub(' ', line)
+  # get rid of any comments.
+  line = single_line_comment_c.sub('', line)
+  # get rid of opening bracket.
+  line = opening_bracket.sub('', line, 1)
+  # lets employ some heuristics. :)
+  perhaps_enum = 0
+  perhaps_flags = 0
+  # c_name = module + enum_def.
+  if not module:
+    is_module_name = extract_module_name.search(c_name)
+    if is_module_name:
+      module = is_module_name.group(1)
+    else:
+      module = ''
+  enum_def = enum_def.replace(module, '', 1)  # strip only the first occurrence of the module name.
+  # names and their values.
+  c_names = []
+  values = []
+  # val - default value for enum, gets incremented after every value processed.
+  val = 0
+  # these are just for case when enum value is equal to some sort of unknown
+  # value - preprocessor define or other enum.
+  unknown_flag = False
+  unknown_val = ''
+  unknown_base = ''
+  unknown_increment = 0
+
+  lines = line.split(',')
+  idx = 0
+  while idx < len(lines):
+    # The enumerator name can be followed by *_DEPRECATED_ENUMERATOR*,
+    # *_DEPRECATED_ENUMERATOR*_FOR(*) or *_AVAILABLE_ENUMERATOR*
+    # before the equal sign or comma.
+    omit_enumerator = omit and deprecated_enumerator.search(lines[idx])
+    lines[idx] = dep_or_avail_enumerator.sub(r'\1', lines[idx], 1)
+
+    brackets_count = 0
+    begin = idx
+
+    # ignore ',' inside () brackets
+    # except '(' and ')' enum values
+    if parenthesis_value.search(lines[idx]):
+      idx += 1
+    else:
+      first = True
+      while first or (idx < len(lines) and brackets_count != 0):
+        first = False
+        brackets_count += lines[idx].count('(')
+        brackets_count -= lines[idx].count(')')
+        idx += 1
+
+    if omit_enumerator:
+      continue
+
+    # join with comma and remove leading and trailing spaces.
+    # also remove backslashes as some people like to add them before newlines...
+    i = ','.join(lines[begin:idx]).strip().replace('\\', '')
+
+    # if only name exists [like MY_ENUM_VALUE].
+    if only_name.search(i):
+      c_names.append(i)
+      if unknown_flag:
+        values.append(unknown_val)
+        tokens[i] = unknown_val
+      else:
+        values.append(str(val))
+        tokens[i] = val
+      perhaps_enum += 1
+    # if name with value exists [like MY_FLAG_VALUE = 0x2 or 0x5 << 22
+    # or 42 or -13 (in this case entity is still enum, not flags)
+    # or 1 << 2 or (1 << 4) or (1 << 5) - 1].
+    else:
+      m = name_and_value1.search(i) or name_and_value2.search(i) or name_and_value3.search(i)
+      if m:
+        tmp1 = m.group(1)
+        tmp2 = m.group(2)
+        c_names.append(tmp1)
+        # I do not know who thought that writing '- 1' as enum value is grrreat
+        # idea - strip whitespaces between unary minus and a digit (keep the minus).
+        if tmp2.startswith('- '):
+          tmp2 = re.sub(r'^-\s+', '-', tmp2)
+        tmp3 = tmp2
+        # Python does not understand C-style cast or the u suffix for unsigned.
+        tmp3 = cast_or_unsigned1.sub(r'\1', tmp3)
+        val = eval(tmp3)  # NOTE: evaluates text taken from the header; use on trusted headers only.
+        if cast_or_unsigned2.search(tmp2):
+          perhaps_flags += 10
+        elif tmp2.startswith('0x'):
+          perhaps_flags += 1
+        else:
+          perhaps_enum += 1
+        values.append(tmp2)
+        tokens[tmp1] = val
+        unknown_flag = False
+      else:
+        # if name with other name exists [like MY_FLAG_VALUE = MY_PREV_FLAG_VALUE
+        # or ~(MY_PREV_FLAG_VALUE | MY_EARLIER_VALUE | (1 << 5) - 1 | 0x200)].
+        # [MY_FLAG MY_OTHER_FLAG is also supported - note lack of equal char.]
+        # [SOME_DEFINITION([X, [Y, [...]]]) definition is also supported.]
+        m = name_with_other_name.search(i)
+        if m:
+          tmp1 = m.group(1)
+          tmp2 = m.group(2)
+          c_names.append(tmp1)
+          # split r-values on "logical or" and for each splitted r-value check its
+          # numeric value and replace a name with it if possible.
+          tmps = tmp2.split('|')
+          # dont_eval is True if unknown token is found, so whole value is copied
+          # verbatim, without evaling.
+          dont_eval = False
+          if len(tmps) > 1:
+            perhaps_flags += 1
+          else:
+            perhaps_enum += 1
+
+          for tmpval in tmps:
+            # if r-value is something like MY_FLAG or MY_DEFINE_VALUE3.
+            m = other_name.search(tmpval)
+            if m:
+              tmp3 = m.group(1)
+              if tmp3 not in tokens:
+                dont_eval = True
+                print('WARNING:', tmp3, 'value of', tmp1, "element in '",  c_name,
+                      "' enum is an unknown token.", file=sys.stderr)
+                if not has_warned_unknown_token:
+                  has_warned_unknown_token = True
+                  print("It probably is one of:",
+                        "  - preprocessor value - make sure that header defining this value is included in sources wrapping the enum.",
+                        "  - enum value from other header or module - see 'preprocessor value'.",
+                        "  - typo (happens rarely) - send a patch fixing this to maintainer of this module.",
+                        sep='\n', file=sys.stderr)
+                # unknown value often makes a flag.
+                perhaps_flags += 1
+              else:
+                tmp2 = tmp2.replace(tmp3, str(tokens[tmp3]))
+            else:
+              # else is a numeric value, so we do not do anything.
+              pass
+
+          # check if there are still some non-numerical values.
+          if re.search(r'[_A-Z]+', tmp2):
+            dont_eval = True
+
+          if not dont_eval:
+            val = eval(tmp2)  # NOTE: evaluates text taken from the header; use on trusted headers only.
+            values.append(val)
+            tokens[tmp1] = val
+            unknown_flag = False
+          else:
+            values.append(tmp2)
+            unknown_flag = True
+            # wrapping in safety parens.
+            unknown_base = '(' + tmp2 + ')'
+            unknown_increment = 0
+            tokens[tmp1] = unknown_base
+
+        # if name with char exists (like MY_ENUM_VALUE = 'a').
+        else:
+          m = name_with_char.search(i)
+          if m:
+            c_names.append(m.group(1))
+            values.append("'" + m.group(2) + "'")
+            val = ord(m.group(2))
+            tokens[m.group(1)] = val
+            unknown_flag = False
+            perhaps_enum += 1
+
+          # if it's one of the char values that were replaced by
+          # %%COMMA%% or %%RBRACE%%.
+          else:
+            m = comma_or_rbrace.search(i)
+            if m:
+              c_names.append(m.group(1))
+              if m.group(2) == r'%%COMMA%%':
+                values.append("','")
+                val = ord(',')
+              elif m.group(2) == r'%%RBRACE%%':
+                values.append("'}'")
+                val = ord('}')
+              else:
+                values.append(m.group(2))
+              tokens[m.group(1)] = val
+              unknown_flag = False
+              perhaps_enum += 1
+
+            # it should not get here,
+            # except if the last enumerator is followed by a comma.
+            elif not(not i and idx == len(lines)):
+              print("WARNING: I do not know how to parse it: '", i, "' in '", c_name, "'.",
+                    sep='', file=sys.stderr)
+
+    if unknown_flag:
+      unknown_increment += 1
+      unknown_val = unknown_base + ' + ' + str(unknown_increment)  # a string such as '(FOO) + 1'.
+    else:
+      val += 1
+
+  entity = 'flags' if c_name.endswith('Flags') or perhaps_flags >= perhaps_enum else 'enum'
+  # get nick names.
+  ref_names = form_names(c_name, c_names)
+  # set format - decimal for enums, hexadecimal for flags.
+  vformat = '{0:d}' if entity == 'enum' else '{0:#x}'
+  # evaluate any unevaluated values and format them properly, if applicable.
+  for j in range(len(values)):
+    # if values[j] is a string that can be interpreted as a decimal integer,
+    # convert it to an integer, so the format (decimal or hexadecimal)
+    # can be selected by vformat.
+    if isinstance(values[j], str):
+      try:
+        values[j] = int(values[j])
+      except ValueError:
+        pass
+    if isinstance(values[j], int):
+      values[j] = vformat.format(values[j])
+
+  # print the defs.
+  print('(define-', entity, '-extended ', enum_def, sep='')
+  print('  (in-module "', module, '")', sep='')
+  print('  (c-name "', c_name, '")', sep='')
+  print('  (values')
+  for j in range(len(c_names)):
+    value = ''
+    if values[j]:
+      value = ' "' + values[j] + '"'
+    print('    \'("', ref_names[j], '" "', c_names[j], '"', value, ')', sep='')
+  print('  )')
+  print(')\n')
+
+def form_names(c_name, c_names):
+  '''Return the nick names derived from the enumerator names in c_names.
+
+  The prefix shared by the enumerators is removed, the rest is converted
+  to lower case and every '_' becomes '-'. c_name, the name of the enum,
+  is only consulted when there is a single enumerator.'''
+
+  # an enum without enumerators has no nick names.
+  if not c_names:
+    return []
+
+  first = c_names[0]
+  # candidate length of the prefix shared by the enumerators.
+  cut = len(first) - 1
+  if len(c_names) > 1:
+    # shrink the candidate until it ends right after a '_' and every pair of
+    # neighbouring enumerators agrees on it.
+    for here, there in zip(c_names, c_names[1:]):
+      while here[cut-1] != '_' or here[:cut] != there[:cut]:
+        cut -= 1
+        if cut <= 0:
+          break
+      if cut <= 0:
+        break
+  else:
+    # a lone enumerator has nothing to be compared with, so it is matched
+    # against the name of the enum: FOO_BAR_BAZ is turned into FooBarBaz.
+    words = [word.capitalize() for word in first.split('_')]
+    camel = ''.join(words)
+    while cut > 0 and c_name[:cut] != camel[:cut]:
+      cut -= 1
+    # convert the matching CamelCase length into a length in the C name:
+    # one is added for every word visited, for the '_' that follows it.
+    remaining = cut
+    for word in words:
+      cut += 1
+      if remaining <= len(word):
+        break
+      remaining -= len(word)
+
+  # strip the prefix where present, then lower-case and use '-' for '_'.
+  prefix = first[:cut]
+  names = []
+  for c_n in c_names:
+    names.append((c_n[len(prefix):] if c_n.startswith(prefix) else c_n).lower().replace('_', '-'))
+  return names
+
+# ----- Main -----
+if __name__ == '__main__':
+  import argparse
+
+  parser = argparse.ArgumentParser(
+    description='Extract enum definitions from C/C++ header files and write a .defs file.')
+  parser.add_argument('--module', help='module name')
+  parser.add_argument('--omit-deprecated', action='store_true', dest='omit',
+    help='omit deprecated enums and enum values')
+  parser.add_argument('header_files', nargs='+', help='header file(s) to parse')
+  args = parser.parse_args()
+  # a header that cannot be opened is reported, the remaining ones are still parsed,
+  # and the failure is reflected in the exit status.
+  exitcode = 0
+  for filepath in args.header_files:
+    try:
+      parse(filepath, args.module, args.omit)
+    except OSError as err:  # also covers FileNotFoundError, PermissionError, IsADirectoryError.
+      exitcode = 1
+      print(err, file=sys.stderr)
+
+  sys.exit(exitcode)
diff --git a/tools/enum.pl b/tools/enum.pl
index 3091f113..10d12830 100755
--- a/tools/enum.pl
+++ b/tools/enum.pl
@@ -1,5 +1,9 @@
 #! /usr/bin/perl
 
+# glibmm/tools/defs_gen/enumextract.py is a Python script with almost
+# the same functionality as this Perl script.
+# enumextract.py is newer and recommended.
+
 # The lisp definitions for flags does not include order.
 # thus we must extract it ourselves.
 # Usage: ./enum.pl /gnome/head/cvs/gconf/gconf/*.h > gconf_enums.defs
diff --git a/tools/gen_scripts/init_generate.sh b/tools/gen_scripts/init_generate.sh
index 5ce3895e..97a695a0 100755
--- a/tools/gen_scripts/init_generate.sh
+++ b/tools/gen_scripts/init_generate.sh
@@ -32,7 +32,7 @@ fi
 # Scripts in glibmm. These are source files.
 gen_docs="$GMMPROC_GEN_SOURCE_DIR/glibmm/tools/defs_gen/docextract_to_xml.py"
 gen_methods="$GMMPROC_GEN_SOURCE_DIR/glibmm/tools/defs_gen/h2def.py"
-gen_enums="$GMMPROC_GEN_SOURCE_DIR/glibmm/tools/enum.pl"
+gen_enums="$GMMPROC_GEN_SOURCE_DIR/glibmm/tools/defs_gen/enumextract.py"
 
 # Where to find executables that generate extra defs (signals and properties).
 # glibmm is built with autotools.
author	Kjell Ahlstedt <kjellahlstedt@gmail.com>	2021-03-17 16:44:47 +0100
committer	Kjell Ahlstedt <kjellahlstedt@gmail.com>	2021-03-17 16:44:47 +0100
commit	91c81c6e7ad8883f314d8d4c011b256d94468a38 (patch)
tree	04a0cf42a283990fd5f9384405b3901fd30cf42e /tools
parent	3ec8bdad019572c518d2fee17040b46a161d6f40 (diff)
download	glibmm-91c81c6e7ad8883f314d8d4c011b256d94468a38.tar.gz