#!/usr/bin/env python3

# Extracts enum definitions from C header files and writes a .defs file
# that gmmproc can read.

# enumextract.py  [--module modname] [--omit-deprecated] header_files...

import os
import sys
import re

# Globals

# dictionary with enum names and values.
tokens = {}

# A long warning is printed at most once.
has_warned_unknown_token = False

# Part of a regular expression.
optional_cast = r'(?:\([a-z ]+\)\s*)?'

# Compiled regular expressions.
comment_begin = re.compile(r'^(.*)/\*')
comment_end = re.compile(r'\*/(.*)')
deprecate_if_begin = re.compile(r'^\s*#\s*(:?if\s*!\s*defined|ifndef)\s*\(?\s*[A-Z_]+_DISABLE_DEPRECATED\s*\)?')
if_begin = re.compile(r'^\s*#\s*if')
if_end = re.compile(r'^\s*#\s*endif')
pp_directive = re.compile(r'^\s*#')
single_line_comment_c = re.compile(r'/\*.*?\*/')
single_line_comment_cpp = re.compile(r'//.*$')
enum_begin = re.compile(r'^\s*typedef\s+enum')
deprecated_type = re.compile(r'[A-Z]+_DEPRECATED_TYPE')
extract_enum_name = re.compile(r'^.*?(\w+)')
white_spaces = re.compile(r'\s+')
opening_bracket = re.compile(r'\s*{\s*')
extract_module_name = re.compile(r'^([A-Z][a-z]*)')
deprecated_enumerator = re.compile(r'[A-Z]+_DEPRECATED_ENUMERATOR')
dep_or_avail_enumerator = re.compile(r'^\s*(\w+)\s+\w+?_(:?DEPRECATED|AVAILABLE)_ENUMERATOR\w*(:?\s*\(.*?\))?')
parenthesis_value = re.compile(r"^\s*\S+\s*=\s*'[\(\)]'\s*$")
only_name = re.compile(r'^\w+$')
name_and_value1 = re.compile(r'^(\w+)\s*=?\s*(0x[0-9a-fA-F]+[\s0-9a-fx<-]*)$')
name_and_value2 = re.compile(r'^(\w+)\s*=?\s*(-?\s*[0-9]+)$')
name_and_value3 = re.compile(r'^(\w+)\s*=?\s*(' + optional_cast + r'\(?1[uU]?\s*<<\s*[0-9]+\s*\)?[\s0-9a-fx<-]*)$')
cast_or_unsigned1 = re.compile(optional_cast + r'(\(?1)[uU]')
cast_or_unsigned2 = re.compile(optional_cast + r'\(?1[uU]?\s*<<')
name_with_other_name = re.compile(r'^(\w+)\s*=?\s*(-?[ _x0-9a-fA-Z|()<~,]+)$')
other_name = re.compile(r'([A-Z][_A-Z0-9]+)')
name_with_char = re.compile(r"^(\w+)\s*=\s*'(.)'$")
comma_or_rbrace = re.compile(r'^(\w+)\s*=\s*(\%\%[A-Z]+\%\%)$')

def parse(filepath, module, omit):
  '''parse enums in a C file'''

  with open(filepath, mode='r') as file:
    # if we are inside enum.
    in_enum = False
    # 1 or more, if we are inside deprecated lines.
    in_deprecated = 0
    # if we are inside multiline comment.
    in_comment = False
    # line containing whole enum preprocessed definition to be processed.
    line = ''
    # line containing whole enum raw definition.
    raw_line = ''
    # if we already printed comment about basename of header file containing enums.
    printed_from = False
    # if only right bracket was found, not name.
    rbracket_only = False

    for current_rawline in file:
      current_line = current_rawline
      if in_enum:
        raw_line += ';; ' + current_rawline
      if in_comment:
        # end of multiline comment.
        is_comment_end = comment_end.search(current_line)
        if is_comment_end:
          in_comment = False
          if in_enum:
            line += is_comment_end.group(1)
        continue

      # omit deprecated stuff.
      if omit and deprecate_if_begin.search(current_line):
        in_deprecated += 1
        continue
      if in_deprecated:
        if if_begin.search(current_line):
          in_deprecated += 1
        elif if_end.search(current_line):
          in_deprecated -= 1
        continue

      # discard any preprocessor directives inside enums.
      if pp_directive.search(current_line):
        continue

      # filter single-line comments.
      current_line = single_line_comment_c.sub('', current_line)
      current_line = single_line_comment_cpp.sub('', current_line, 1)

      # beginning of multiline comment.
      is_comment_begin = comment_begin.search(current_line)
      if is_comment_begin:
        in_comment = True
        if in_enum:
          line += is_comment_begin.group(1) + '\n'
        continue

      # Replace the enumerator values ',' and '}' by strings that won't confuse
      # process(). They are reset to the original strings when they are written
      # to the output file.
      # typedef enum { V1 = ',', V2 = '}' } E1; // is a legal definition.
      current_line = current_line.replace("','", r'\%\%COMMA\%\%', 1)
      current_line = current_line.replace("'}'", r'\%\%RBRACE\%\%', 1)

      # we have found an enum.
      if enum_begin.search(current_line):
        basename = os.path.basename(filepath)
        if not printed_from:
          print(';; From', basename, end='\n\n')
          printed_from = True
        in_enum = True
        raw_line += ';; ' + current_rawline
        continue

      # we have found the end of an enum.
      if (in_enum and ('}' in current_line)) or rbracket_only:
        # if the same line also contains ';' - that means there is a typedef name
        # between '}' and ';'.
        if ';' in current_line:
          if not (omit and deprecated_type.search(current_line)):
            enum_def = '} ' if rbracket_only else ''
            enum_def += current_line
            print(';; Original typedef:')
            print(raw_line)
            process(line, enum_def, module, omit)
          in_enum = False
          line = ''
          raw_line = ''
          rbracket_only = False
        # we assume there is no such definition formed like this:
        # typedef enum
        # {
        # ...
        # } MyTypedef
        # ;
        # that would be stupid.
        else:
          rbracket_only = True
          # don't append useless lines to $line.
          continue

      if in_enum:
        line += current_line

def process(line, enum_def, module, omit):
  '''convert enums to lisp'''

  global tokens, has_warned_unknown_token
  # The name is the first word after the closing bracket.
  # The name can be followed by *_DEPRECATED_TYPE* or *_AVAILABLE_TYPE*
  # before the semicolon.
  is_enum_name = extract_enum_name.search(enum_def)
  if is_enum_name:
    enum_def = is_enum_name.group(1)
  c_name = enum_def
  # replace all excessive whitespaces with one space.
  line = white_spaces.sub(' ', line)
  # get rid of any comments.
  line = single_line_comment_c.sub('', line)
  # get rid of opening bracket.
  line = opening_bracket.sub('', line, 1)
  # lets employ some heuristics. :)
  perhaps_enum = 0
  perhaps_flags = 0
  # c_name = module + enum_def.
  if not module:
    is_module_name = extract_module_name.search(c_name)
    if is_module_name:
      module = is_module_name.group(1)
    else:
      module = ''
  enum_def = enum_def.replace(module, '')
  # names and their values.
  c_names = []
  values = []
  # val - default value for enum, gets incremented after every value processed.
  val = 0
  # these are just for case when enum value is equal to some sort of unknown
  # value - preprocessor define or other enum.
  unknown_flag = False
  unknown_val = ''
  unknown_base = ''
  unknown_increment = 0

  lines = line.split(',')
  iter = 0
  while iter < len(lines):
    # The enumerator name can be followed by *_DEPRECATED_ENUMERATOR*,
    # *_DEPRECATED_ENUMERATOR*_FOR(*) or *_AVAILABLE_ENUMERATOR*
    # before the equal sign or comma.
    omit_enumerator = omit and deprecated_enumerator.search(lines[iter])
    lines[iter] = dep_or_avail_enumerator.sub(r'\1', lines[iter], 1)

    brackets_count = 0
    begin = iter

    # ignore ',' inside () brackets
    # except '(' and ')' enum values
    if parenthesis_value.search(lines[iter]):
      iter += 1
    else:
      first = True
      while first or (iter < len(lines) and brackets_count != 0):
        first = False
        brackets_count += lines[iter].count('(')
        brackets_count -= lines[iter].count(')')
        iter += 1

    if omit_enumerator:
      continue

    # join with comma and remove leading and trailing spaces.
    # also remove backslashes as some people like to add them before newlines...
    i = ','.join(lines[begin:iter]).strip().replace('\\', '')

    # if only name exists [like MY_ENUM_VALUE].
    if only_name.search(i):
      c_names.append(i)
      if unknown_flag:
        values.append(unknown_val)
        tokens[i] = unknown_val
      else:
        values.append(str(val))
        tokens[i] = val
      perhaps_enum += 1
    # if name with value exists [like MY_FLAG_VALUE = 0x2 or 0x5 << 22
    # or 42 or -13 (in this case entity is still enum, not flags)
    # or 1 << 2 or (1 << 4) or (1 << 5) - 1].
    else:
      m = name_and_value1.search(i) or name_and_value2.search(i) or name_and_value3.search(i)
      if m:
        tmp1 = m.group(1)
        tmp2 = m.group(2)
        c_names.append(tmp1)
        # I do not know who thought that writing '- 1' as enum value is grrreat
        # idea - strip whitespaces between unary minus and a digit.
        if tmp2.startswith('- '):
          tmp2 = re.sub(r'^-\s+', '', tmp2)
        tmp3 = tmp2
        # Python does not understand C-style cast or the u suffix for unsigned.
        tmp3 = cast_or_unsigned1.sub(r'\1', tmp3)
        val = eval(tmp3)
        if cast_or_unsigned2.search(tmp2):
          perhaps_flags += 10
        elif tmp2.startswith('0x'):
          perhaps_flags += 1
        else:
          perhaps_enum += 1
        values.append(tmp2)
        tokens[tmp1] = val
        unknown_flag = False
      else:
        # if name with other name exists [like MY_FLAG_VALUE = MY_PREV_FLAG_VALUE
        # or ~(MY_PREV_FLAG_VALUE | MY_EARLIER_VALUE | (1 << 5) - 1 | 0x200)].
        # [MY_FLAG MY_OTHER_FLAG is also supported - note lack of equal char.]
        # [SOME_DEFINITION([X, [Y, [...]]]) definition is also supported.]
        m = name_with_other_name.search(i)
        if m:
          tmp1 = m.group(1)
          tmp2 = m.group(2)
          c_names.append(tmp1)
          # split r-values on "logical or" and for each splitted r-value check its
          # numeric value and replace a name with it if possible.
          tmps = tmp2.split('|')
          # dont_eval is True if unknown token is found, so whole value is copied
          # verbatim, without evaling.
          dont_eval = False
          if len(tmps) > 1:
            perhaps_flags += 1
          else:
            perhaps_enum += 1

          for tmpval in tmps:
            # if r-value is something like MY_FLAG or MY_DEFINE_VALUE3.
            m = other_name.search(tmpval)
            if m:
              tmp3 = m.group(1)
              if tmp3 not in tokens:
                dont_eval = True
                print('WARNING:', tmp3, 'value of', tmp1, "element in '",  c_name,
                      "' enum is an unknown token.", file=sys.stderr)
                if not has_warned_unknown_token:
                  has_warned_unknown_token = True
                  print("It probably is one of:",
                        "  - preprocessor value - make sure that header defining this value is included in sources wrapping the enum.",
                        "  - enum value from other header or module - see 'preprocessor value'.",
                        "  - typo (happens rarely) - send a patch fixing this to maintainer of this module.",
                        sep='\n', file=sys.stderr)
                # unknown value often makes a flag.
                perhaps_flags += 1
              else:
                tmp2 = tmp2.replace(tmp3, str(tokens[tmp3]))
            else:
              # else is a numeric value, so we do not do anything.
              pass

          # check if there are still some non-numerical values.
          if re.search(r'[_A-Z]+', tmp2):
            dont_eval = True

          if not dont_eval:
            val = eval(tmp2)
            values.append(val)
            tokens[tmp1] = val
            unknown_flag = False
          else:
            values.append(tmp2)
            unknown_flag = True
            # wrapping in safety parens.
            unknown_base = '(' + tmp2 + ')'
            unknown_increment = 0
            tokens[tmp1] = unknown_base

        # if name with char exists (like MY_ENUM_VALUE = 'a').
        else:
          m = name_with_char.search(i)
          if m:
            c_names.append(m.group(1))
            values.append("'" + m.group(2) + "'")
            val = ord(m.group(2))
            tokens[m.group(1)] = val
            unknown_flag = False
            perhaps_enum += 1

          # if it's one of the char values that were replaced by
          # %%COMMA%% or %%RBRACE%%.
          else:
            m = comma_or_rbrace.search(i)
            if m:
              c_names.append(m.group(1))
              if m.group(2) == r'%%COMMA%%':
                values.append("','")
                val = ord(',')
              elif m.group(2) == r'%%RBRACE%%':
                values.append("'}'")
                val = ord('}')
              else:
                values.append(m.group(2))
              tokens[m.group(1)] = val
              unknown_flag = False
              perhaps_enum += 1

            # it should not get here,
            # except if the last enumerator is followed by a comma.
            elif not(not i and iter == len(lines)):
              print("WARNING: I do not know how to parse it: '", i, "' in '", c_name, "'.",
                    sep='', file=sys.stderr)

    if unknown_flag:
      unknown_increment += 1
      unknown_val = unknown_base + ' + ', + unknown_increment
    else:
      val += 1

  entity = 'flags' if c_name.endswith('Flags') or perhaps_flags >= perhaps_enum else 'enum'
  # get nick names.
  ref_names = form_names(c_name, c_names)
  # set format - decimal for enums, hexadecimal for flags.
  vformat = '{0:d}' if entity == 'enum' else '{0:#x}'
  # evaluate any unevaluated values and format them properly, if applicable.
  for j in range(len(values)):
    # if values[j] is a string that can be interpreted as a decimal integer,
    # convert it to an integer, so the format (decimal or hexadecimal)
    # can be selected by vformat.
    if isinstance(values[j], str):
      try:
        values[j] = int(values[j])
      except ValueError:
        pass    
    if isinstance(values[j], int):
      values[j] = vformat.format(values[j])

  # print the defs.
  print('(define-', entity, '-extended ', enum_def, sep='')
  print('  (in-module "', module, '")', sep='')
  print('  (c-name "', c_name, '")', sep='')
  print('  (values')
  for j in range(len(c_names)):
    value = ''
    if values[j]:
      value = ' "' + values[j] + '"'
    print('    \'("', ref_names[j], '" "', c_names[j], '"', value, ')', sep='')
  print('  )')
  print(')\n')

def form_names(c_name, c_names):
  '''form nick names from C names'''

  names = []
  # no values in enum means no names.
  if not c_names:
    return names
    
  # search for length of a prefix.
  leng = len(c_names[0]) - 1
  # if there is more than one value in enum, search for a common part.
  if len(c_names) > 1:
    for j in range(len(c_names)-1):
      while c_names[j][leng-1] != '_' or c_names[j][0:leng] != c_names[j+1][0:leng]:
        leng -= 1
        if leng <= 0:
          break
      if leng <= 0:
        break
  # if there is only one value in enum, we have to use name of the enum.
  else:
    subvals = c_names[0].split('_')
    for j in range(len(subvals)):
      subvals[j] = subvals[j].capitalize()
    false_c_name = ''.join(subvals)
    while leng > 0 and c_name[0:leng] != false_c_name[0:leng]:
      leng -= 1
    tmpleng = leng
    for subval in subvals:
      leng += 1
      l = len(subval)
      if tmpleng <= l:
        break
      tmpleng -= l

  # get prefix with given length.
  prefix = c_names[0][0:leng]
  # generate names.
  for c_n in c_names:
    if c_n[0:len(prefix)] == prefix:
      # remove prefix.
      c_n = c_n[len(prefix):]
    c_n = c_n.lower().replace('_', '-')
    names.append(c_n)

  return names

# ----- Main -----
if __name__ == '__main__':
  import argparse

  parser = argparse.ArgumentParser(
    description='Extract enum definitions from C/C++ header files and write a .defs file.')
  parser.add_argument('--module', help='module name')
  parser.add_argument('--omit-deprecated', action='store_true', dest='omit',
    help='omit deprecated enums and enum values')
  parser.add_argument('header_files', nargs='+', help='header file(s) to parse')
  args = parser.parse_args()

  exitcode = 0
  for filepath in args.header_files:
    try:
      parse(filepath, args.module, args.omit)
    except FileNotFoundError as err:
      exitcode = 1
      print(err, file=sys.stderr)

  sys.exit(exitcode)