summaryrefslogtreecommitdiff
path: root/genChRanges.py
blob: 8ff470e477a17ee5d8a6c08bdbf65b69e1149287 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
#!/usr/bin/python -u
#
# Portions of this script have been (shamelessly) stolen from the
# prior work of Daniel Veillard (genUnicode.py)
#
# I, however, take full credit for any bugs, errors or difficulties :-)
#
# William Brack
# October 2003
#

import sys
import string
import time

#
# A little routine to assign a 'meaningful' name to a range
#
def rangename( intvl ):
    (start, end) = intvl
    rname = "r" + hex(start)[2:] + "x" + hex(end)[2:]
    return rname

#
# A routine to take a list of yes/no (1, 0) values and turn it
# into a list of ranges.  This will later be used to determine whether
# to generate single-byte lookup tables, or inline comparisons
#
def makeRange(lst):
    ret = []
    pos = 0
    while pos < len(lst):
	try:		# index generates exception if not present
	    s = lst[pos:].index(1)	# look for start of next range
	except:
	    break			# if no more, finished
	pos += s			# pointer to start of possible range
	try:
	    e = lst[pos:].index(0)	# look for end of range
	    e += pos
	except:				# if no end, set to end of list
	    e = len(lst)
	ret.append((pos, e-1))		# append range tuple to list
	pos = e + 1			# ready to check for next range
    return ret

sources = "chvalid.def"			# input filename

# minTableSize gives the minimum number of ranges which must be present
# before a 256-byte lookup table is produced.  If there are less than this
# number, a macro with inline comparisons is generated
minTableSize = 6

# dictionary of ranges, key=range, element contains list of funcs using it
Ranges = {}

# dictionary of functions, key=name, element contains char-map and range-list
Functs = {}

state = 0

try:
    defines = open("chvalid.def", "r")
except:
    print "Missing chvalid.def, aborting ..."
    sys.exit(1)

#
# The lines in the .def file have three types:-
#   name:   Defines a new function block
#   ur:	    Defines individual or ranges of unicode values
#   end:    Indicates the end of the function block
#
# These lines are processed below.
#
for line in defines.readlines():
    # ignore blank lines, or lines beginning with '#'
    if line[0] == '#':
        continue
    line = string.strip(line)
    if line == '':
        continue
    # split line into space-separated fields, then split on type
    try:
        fields = string.split(line, ' ')
	#
	# name line:
	#   validate any previous function block already ended
	#   validate this function not already defined
	#   initialize an entry in the function dicitonary
	#	including a mask table with no values yet defined
	#
	if fields[0] == 'name':
	    name = fields[1]
	    if state != 0:
		print "'name' %s found before previous name" \
		      "completed" % (fields[1])
		continue
	    state = 1
	    if Functs.has_key(name):
		print "name '%s' already present - may give" \
		      " wrong results" % (name)
	    else:
		# dict entry with two list elements (chdata, rangedata)
		Functs[name] = [ [], [] ]
		for v in range(256):
		    Functs[name][0].append(0)
	#
	# end line:
	#   validate there was a preceding function name line
	#   set state to show no current function active
	#
	elif fields[0] == 'end':
	    if state == 0:
		print "'end' found outside of function block"
		continue
	    state = 0

	#
	# ur line:
	#   validate function has been defined
	#   process remaining fields on the line, which may be either
	#	individual unicode values or ranges of values
	#
	elif fields[0] == 'ur':
	    if state != 1:
		raise ValidationError, "'ur' found outside of 'name' block"
	    for el in fields[1:]:
		pos = string.find(el, '..')
		# pos <=0 means not a range, so must be individual value
		if pos <= 0:
		    # cheap handling of hex or decimal values
		    if el[0:2] == '0x':
		        value = int(el[2:],16)
		    elif el[0] == "'":
			value = ord(el[1])
		    else:
			value = int(el)
		    if ((value < 0) | (value > 0x1fffff)):
			raise ValidationError, 'Illegal value (%s) in ch for'\
				' name %s' % (el,name)
		    # for ur we have only ranges (makes things simpler),
		    # so convert val to range
		    currange = (value, value)
		# pos > 0 means this is a range, so isolate/validate
		# the interval
		else:
		    # split the range into it's first-val, last-val
		    (first, last) = string.split(el, "..")
		    # convert values from text into binary
		    if first[0:2] == '0x':	
			start = int(first[2:],16)
		    elif first[0] == "'":
			start = ord(first[1])
		    else:
			start = int(first)
		    if last[0:2] == '0x':
			end = int(last[2:],16)
		    elif last[0] == "'":
			end = ord(last[1])
		    else:
			end = int(last)
		    if (start < 0) | (end > 0x1fffff) | (start > end):
			raise ValidationError, "Invalid range '%s'" % el
		    currange = (start, end)
		# common path - 'currange' has the range, now take care of it
		# We split on single-byte values vs. multibyte
		if currange[1] < 0x100:	# single-byte
		    for ch in range(currange[0],currange[1]+1):
			# validate that value not previously defined
			if Functs[name][0][ch]:
			    msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
			    raise ValidationError, msg
			Functs[name][0][ch] = 1
		else:			# multi-byte
		    if Ranges.has_key(currange):
			Ranges[currange].append(name)
		    else:
			Ranges[currange] = [ name ]
		    if currange in Functs[name][1]:
			raise ValidationError, "range already defined in" \
				" function"
		    else:
			Functs[name][1].append(currange)

    except:
	print "Failed to process line: %s" % (line)
	raise
#
# At this point, the entire definition file has been processed.  Now we
# enter the output phase, where we generate the two files chvalid.c and'
# chvalid.h
#
# To do this, we first output the 'static' data (heading, fixed
# definitions, etc.), then output the 'dynamic' data (the results
# of the above processing), and finally output closing 'static' data
# (e.g. the subroutine to process the ranges)
#

#
# Generate the headings:
#
try:
    header = open("include/libxml/chvalid.h", "w")
except:
    print "Failed to open include/libxml/chvalid.h"
    sys.exit(1)

try:
    output = open("chvalid.c", "w")
except:
    print "Failed to open chvalid.c"
    sys.exit(1)

date = time.asctime(time.localtime(time.time()))

header.write(
"""/*
 * chvalid.h: this header exports interfaces for the character
 *		 range validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

#ifndef __XML_CHVALID_H__
#define __XML_CHVALID_H__

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Define our typedefs and structures
 *
 */
typedef struct _xmlChSRange xmlChSRange;
typedef xmlChSRange *xmlChSRangePtr;
struct _xmlChSRange {
    unsigned short	low;
    unsigned short	high;
};

typedef struct _xmlChLRange xmlChLRange;
typedef xmlChLRange *xmlChLRangePtr;
struct _xmlChLRange {
    unsigned		low;
    unsigned		high;
};

typedef struct _xmlChRangeGroup xmlChRangeGroup;
typedef xmlChRangeGroup *xmlChRangeGroupPtr;
struct _xmlChRangeGroup {
    int			nbShortRange;
    int			nbLongRange;
    xmlChSRangePtr	shortRange;	/* points to an array of ranges */
    xmlChLRangePtr	longRange;
};

/* Range checking routine */
int xmlCharInRange(unsigned int val, const xmlChRangeGroupPtr group);

""" % (date, sources));
output.write(
"""/*
 * chvalid.c:	this module implements the character range
 *		validation APIs
 *
 * This file is automatically generated from the cvs source
 * definition files using the genChRanges.py Python script
 *
 * Generation date: %s
 * Sources: %s
 * William Brack <wbrack@mmm.com.hk>
 */

#include <libxml/chvalid.h>

/*
 * The initial tables ({func_name}_tab) are used to validate whether a
 * single-byte character is within the specified group.  Each table
 * contains 256 bytes, with each byte representing one of the 256
 * possible characters.  If the table byte is set, the character is
 * allowed.
 *
 */
""" % (date, sources));

#
# Now output the generated data.
# We try to produce the best execution times.  Tests have shown that validation
# with direct table lookup is, when there are a "small" number of valid items,
# still not as fast as a sequence of inline compares.  So, if the single-byte
# portion of a range has a "small" number of ranges, we output a macro for inline
# compares, otherwise we output a 256-byte table and a macro to use it.
#

fkeys = Functs.keys()	# Dictionary of all defined functions
fkeys.sort()		# Put some order to our output

for f in fkeys:

# First we convert the specified single-byte values into a group of ranges.
# If the total number of such ranges is less than minTableSize, we generate
# an inline macro for direct comparisons; if greater, we generate a lookup
# table.
    if max(Functs[f][0]) > 0:	# only check if at least one entry
        rangeTable = makeRange(Functs[f][0])
	numRanges = len(rangeTable)
	if numRanges >= minTableSize:	# table is worthwhile
	    header.write("extern unsigned char %s_tab[256];\n" % f)
	    header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))

	    # write the constant data to the code file
	    output.write("unsigned char %s_tab[256] = {\n" % f)
	    pline = "   "
	    for n in range(255):
		pline += " 0x%02x," % Functs[f][0][n]
		if len(pline) > 72:
		    output.write(pline + "\n")
		    pline = "   "
	    output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])

	else:		# inline check is used
	    # first another little optimisation - if space is present,
	    # put it at the front of the list so it is checked first
	    try:
		ix = rangeTable.remove((0x20, 0x20))
		rangeTable.insert(0, (0x20, 0x20))
	    except:
		pass
	    pline = "#define %s_ch(c)\t( " % f
	    firstFlag = 1
	    for rg in rangeTable:
		if not firstFlag:
		    pline += " || \\\n\t\t\t"
		else:
		    firstFlag = 0
		if rg[0] == rg[1]:		# single value - check equal
		    pline += "((c) == " + hex(rg[0]) + ")"
		else:			# value range
		    pline += "((" + hex(rg[0]) + "<= (c)) &&"
		    pline += " ((c) <= " + hex(rg[1]) + "))"
	    pline += ")\n"
	    header.write(pline)

    header.write("#define %s(c)\t(((c) < 0x100) ? \\\n\t\t\t\t" % f)
    if max(Functs[f][0]) > 0:
	header.write("%s_ch((c)) :" % f)
    else:
	header.write("0 :")

    # if no ranges defined, value invalid if >= 0x100
    if len(Functs[f][1]) == 0:
	header.write(" 0)\n\n")
    else:
	header.write(" \\\n\t\t\t\txmlCharInRange((c), &%sGroup))\n\n"  % f)

    if len(Functs[f][1]) > 0:
	header.write("extern xmlChRangeGroup %sGroup;\n" % f)


#
# Next we do the unicode ranges
#

for f in fkeys:
    if len(Functs[f][1]) > 0:	# only generate if unicode ranges present
	rangeTable = Functs[f][1]
	rangeTable.sort()	# ascending tuple sequence
	numShort = 0
	numLong  = 0
	for rg in rangeTable:
	    if rg[1] < 0x10000:	# if short value
		if numShort == 0:	# first occurence
		    pline = "static xmlChSRange %s_srng[] = { " % f
		else:
		    pline += ", "
		numShort += 1
		if len(pline) > 60:
		    output.write(pline + "\n")
		    pline = "    "
		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
	    else:		# if long value
		if numLong == 0:	# first occurence
		    if numShort > 0:	# if there were shorts, finish them off
			output.write(pline + "};\n")
		    pline = "static xmlChLRange %s_lrng[] = { " % f
		else:
		    pline += ", "
		numLong += 1
		if len(pline) > 60:
		    output.write(pline + "\n")
		    pline = "    "
		pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
	output.write(pline + "};\n")	# finish off last group

	pline = "xmlChRangeGroup %sGroup = {%d, %d, " % (f, numShort, numLong)
	if numShort > 0:
	    pline += "%s_srng" % f
	if numLong > 0:
	    pline += ", %s_lrng" % f
	
	output.write(pline + "};\n\n")
#
# Run complete - write trailers and close the output files
#

header.write("""
#ifdef __cplusplus
}
#endif
#endif /* __XML_CHVALID_H__ */
""");

header.close()

output.write(
"""
int
xmlCharInRange (unsigned int val, xmlChRangeGroupPtr rptr) {
    int low, high, mid;
    xmlChSRangePtr sptr;
    xmlChLRangePtr lptr;
    if (val < 0x10000) {	/* is val in 'short' or 'long'  array? */
	if (rptr->nbShortRange == 0)
	    return 0;
	low = 0;
	high = rptr->nbShortRange;
	sptr = rptr->shortRange;
	while (low <= high) {
	    mid = (low + high) / 2;
	    if ((unsigned short) val < sptr[mid].low)
		high = mid - 1;
	    else if ((unsigned short) val > sptr[mid].high)
		low = mid + 1;
	    else
		return 1;
	}
    } else {
	if (rptr->nbLongRange == 0)
	    return 0;
	low = 0;
	high = rptr->nbLongRange;
	lptr = rptr->longRange;
	while (low <= high) {
	    mid = (low + high) / 2;
	    if (val < lptr[mid].low)
		high = mid - 1;
	    else if (val > lptr[mid].high)
		low = mid + 1;
	    else
		return 1;
	}
    }
    return 0;
}

""");

output.close()