tools/db2md.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212

#!/usr/bin/env python3
# -*- python; coding: utf-8 -*-
#
# gtk-doc - GTK DocBook documentation generator.
# Copyright (C) 2017  Stefan Sauer
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#

"""Migrate from inline docbook markup to markdown.

The tool converts markup in comments for the given source file(s). If --dry-run
is given it would only report that docbook tags were found with exit code 1.
To convert iteratively one would make a copy of the docs/xml dir, run the
migration tool for some sources, rebuild the docs and compare the new xml.
If it looks the same (or similar enough), submit the changes and repeat for more
files.

Examples:
python3 tools/db2md.py --dry-run tests/*/src/*.{c,h} | sed -e 's/^ *//' | sort | uniq -c | sort -g
"""

import argparse
import logging
import os
import re
import sys
import xml.etree.ElementTree as ET


def print_xml(node, depth=0):
    """Print tag and attributes of all elements below node.

    Each element is printed on its own line, indented by two spaces per
    nesting level (starting at depth). The node itself is not printed and
    neither is any text or tail content.
    """
    # Walk the tree with an explicit stack. Children are pushed in reverse so
    # that they get popped (and printed) in document order.
    pending = [(element, depth) for element in reversed(list(node))]
    while pending:
        element, level = pending.pop()
        print('  ' * level, '<%s %s>' % (element.tag, element.attrib))
        pending.extend((sub, level + 1) for sub in reversed(list(element)))


def convert_block(dry_run, filename, lines, beg, end):
    """Scan one comment block for docbook tags.

    The lines in range(beg, end) are checked for characters that are not
    markdown compatible and for xml tags. Lines inside |[ ... ]| code blocks
    are not checked. If a tag was found, the whole block is parsed as xml and
    in dry_run mode the location and the element tree are printed.

    Args:
      dry_run: if True print the elements that were found, otherwise nothing
               is done with them yet (the conversion is not implemented).
      filename: name of the source file, used for logging and reporting.
      lines: the lines of the source file.
      beg: index of the first comment line to scan.
      end: index after the last comment line to scan.

    Returns: 1 if docbook tags were found and the block could be parsed as xml
             with at least one element, 0 otherwise
    """
    logging.debug("%s: scan block %d..%d", filename, beg, end)

    # get indentation: the comment text starts after the first '* '
    indent = lines[beg].find('* ')
    if indent == -1:
        logging.warning("%s:%d: missing '*' in comment?", filename, beg)
        return 0

    indent += 2

    found_docbook = 0
    end_skip = None
    stripped = []
    for ix in range(beg, end):
        line = lines[ix]
        # every line becomes part of the xml content, even those that are
        # not checked for tags below
        stripped.append(line[indent:])

        if not re.search(r'^\s*\*', line):
            logging.warning("%s:%d: missing '*' in comment?", filename, ix)
            continue

        line = line[indent:]

        # skip |[ ... ]| blocks
        # TODO: also skip <![CDATA[ ...  ]]> blocks (end marker: r'\]\]>')
        if end_skip:
            if re.search(end_skip, line):
                logging.debug("%s:%d: skip code block end", filename, ix)
                end_skip = None
            continue
        if re.search(r'\|\[', line):
            logging.debug("%s:%d: skip code block start", filename, ix)
            end_skip = r'\]\|'
            continue

        # TODO: skip `...` blocks
        # check for historic non markdown compatible chars
        # (only leading '*' and '_' are reported, trailing ones are not)
        if re.search(r'\s\*\w+[\s.]', line):
            logging.warning("%s:%d: leading '*' needs escaping: '%s'", filename, ix, line)
        if re.search(r'\s_\w+[\s.]', line):
            logging.warning("%s:%d: leading '_' needs escaping: '%s'", filename, ix, line)

        # look for docbook: anything in <...> whose first word is a valid xml
        # element name (optionally with a leading or trailing '/')
        if any(re.search(r'^/?[a-z_:][a-z0-9_:.-]*/?$', m.group(1).split(' ')[0], re.I)
               for m in re.finditer(r'<([^>]*)>', line)):
            found_docbook = 1

    if not found_docbook:
        return 0

    # Add a fake root. The lines are joined with newlines so that markup that
    # spans a line break (e.g. a tag name on one line and its attributes on
    # the next) does not get fused into different, possibly invalid markup.
    # TODO: protect |[ ... ]| sections, use CDATA?
    content = '<gtkdoc>' + '\n'.join(stripped) + '</gtkdoc>'
    try:
        root = ET.fromstring(content)
    except ET.ParseError as err:
        # comments regularly contain text that is not well-formed xml
        logging.debug("%s:%d: comment block is not well-formed xml: %s", filename, beg, err)
        return 0

    # Nothing to report if there are no elements below the fake root. Check
    # the child count explicitly, testing the truth value of an Element is
    # deprecated.
    if len(root) == 0:
        return 0

    if dry_run:
        # ix is the index of the last line that was scanned
        print('%s:%d:' % (filename, ix))
        print_xml(root)
    # TODO: convert_tags() when not in dry_run mode

    return found_docbook


def convert_file(dry_run, filename):
    """Scan a single file.

    The file is searched for multi-line comment blocks that start with '/**'.
    The lines between the opening and the closing line of each block are
    passed to convert_block().

    Args:
      dry_run: passed on to convert_block().
      filename: path of the source file, it is read as utf-8.

    Returns: 0 if no docbook was found
    """
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    logging.debug("%s: read file with %d lines", filename, len(lines))

    found_docbook = 0
    beg = -1  # index of the line that opened the current comment, or -1
    for ix, line in enumerate(lines):
        if beg == -1:
            # comments that are closed on the same line have no body to scan
            if re.search(r'^\s*/\*.*\*/', line):
                continue
            if re.search(r'^\s*/\*\*(\s|$)', line):
                logging.debug("%s:%d: comment start", filename, ix)
                beg = ix
            continue

        if not re.search(r'^\s*\*+/', line):
            continue

        logging.debug("%s:%d: comment end", filename, ix)
        # The body is everything between the opening and the closing line.
        # convert_block() scans range(beg, end), so the index of the closing
        # line is passed as the (exclusive) end. Passing the index of the last
        # body line instead would leave that line unscanned.
        first = beg + 1
        if first < ix:
            found_docbook |= convert_block(dry_run, filename, lines, first, ix)
        beg = -1

    return found_docbook


def main(dry_run, files):
    """Scan the given files for docbook tags in comments.

    Each file is handed to convert_file() together with the dry_run flag and
    the results are or-ed together, so every file gets processed even after
    docbook was found in an earlier one.

    Returns: 0 if no docbook was found
    """
    status = 0
    for source in files:
        status |= convert_file(dry_run, source)
    return status


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(
        description='db2md - convert docbook in comment to markdown')
    arg_parser.add_argument('--dry-run', action='store_true', default=False,
                            help='Only print files with docbook comments.')
    arg_parser.add_argument('sources', nargs='*')
    args = arg_parser.parse_args()
    if not args.sources:
        sys.exit('Too few arguments')

    # Logging is only set up if GTKDOC_TRACE is in the environment. Its value
    # names the log level, an empty value selects INFO.
    trace = os.environ.get('GTKDOC_TRACE')
    if trace is not None:
        logging.basicConfig(
            stream=sys.stdout,
            level=logging.getLevelName((trace or 'INFO').upper()),
            format='%(asctime)s:%(filename)s:%(funcName)s:%(lineno)d:%(levelname)s:%(message)s')

    # the exit code is 0 if no docbook was found
    sys.exit(main(args.dry_run, args.sources))