1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
#! /usr/bin/env python
"""
Analysis of the effect of the re.UNICODE flag on whitespace recognition
by regular expressions.
"""
# Running this program produces this output:
"""
Regular expressions:
1. '\\s'
2. '\\s', re.UNICODE
3. u'(?![\xa0\u202f])\\s', re.UNICODE
=== ========= ======= ========================= =======
Cat Codepoint Decimal Name/Description Regexps
=== ========= ======= ========================= =======
Cc U+0009 9 (HT) TAB \t 1 2 3
Cc U+000a 10 (LF) LINE FEED \n 1 2 3
Cc U+000b 11 (VT) VERTICAL TAB \v 1 2 3
Cc U+000c 12 (FF) FORM FEED \f 1 2 3
Cc U+000d 13 (CR) CARRIAGE RETURN \r 1 2 3
Cc U+001c 28 (FS) FILE SEPARATOR 2 3
Cc U+001d 29 (GS) GROUP SEPARATOR 2 3
Cc U+001e 30 (RS) RECORD SEPARATOR 2 3
Cc U+001f 31 (US) UNIT SEPARATOR 2 3
Zs U+0020 32 SPACE 1 2 3
Cc U+0085 133 (NEL) NEXT LINE 2 3
Zs U+00a0 160 NO-BREAK SPACE 2
Zs U+1680 5760 OGHAM SPACE MARK 2 3
Zs U+2000 8192 EN QUAD 2 3
Zs U+2001 8193 EM QUAD 2 3
Zs U+2002 8194 EN SPACE 2 3
Zs U+2003 8195 EM SPACE 2 3
Zs U+2004 8196 THREE-PER-EM SPACE 2 3
Zs U+2005 8197 FOUR-PER-EM SPACE 2 3
Zs U+2006 8198 SIX-PER-EM SPACE 2 3
Zs U+2007 8199 FIGURE SPACE 2 3
Zs U+2008 8200 PUNCTUATION SPACE 2 3
Zs U+2009 8201 THIN SPACE 2 3
Zs U+200a 8202 HAIR SPACE 2 3
Zs U+200b 8203 ZERO WIDTH SPACE 2 3
Zl U+2028 8232 LINE SEPARATOR 2 3
Zp U+2029 8233 PARAGRAPH SEPARATOR 2 3
Zs U+202f 8239 NARROW NO-BREAK SPACE 2
Zs U+205f 8287 MEDIUM MATHEMATICAL SPACE 2 3
Zs U+3000 12288 IDEOGRAPHIC SPACE 2 3
=== ========= ======= ========================= =======
"""
# For Unicode category information, see
# http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
"""
======== ====================
Category Description
======== ====================
Zs Separator, Space
Zl Separator, Line
Zp Separator, Paragraph
Cc Other, Control
Cf Other, Format
======== ====================
"""
import re
import unicodedata
# Fallback descriptions for control characters, keyed by code point.
# They are passed as the default to ``unicodedata.name()``, which is
# consulted first; these labels are used only when it has no name.
charnames = {
    0x09: '(HT) TAB \\t',
    0x0a: '(LF) LINE FEED \\n',
    0x0b: '(VT) VERTICAL TAB \\v',
    0x0c: '(FF) FORM FEED \\f',
    0x0d: '(CR) CARRIAGE RETURN \\r',
    0x1c: '(FS) FILE SEPARATOR',
    0x1d: '(GS) GROUP SEPARATOR',
    0x1e: '(RS) RECORD SEPARATOR',
    0x1f: '(US) UNIT SEPARATOR',
    0x85: '(NEL) NEXT LINE',
}
# The whitespace matchers being compared.  A pattern's 1-based position
# in this list is the number printed for it in the "Regexps" column.
pats = [
    # 1. Plain whitespace class, no flags.
    re.compile(r'\s'),
    # 2. The same class compiled with the re.UNICODE flag.
    re.compile(r'\s', flags=re.UNICODE),
    # 3. As 2, but a negative lookahead rejects U+00A0 (NO-BREAK SPACE)
    #    and U+202F (NARROW NO-BREAK SPACE).
    re.compile(u'(?![\u00a0\u202f])\\s', flags=re.UNICODE),
]
# Rule and column titles of the report table.
border = '=== ========= ======= ========================= ======='
header = 'Cat Codepoint Decimal Name/Description Regexps'

# Preamble: list every pattern with its 1-based number, appending the
# flag name for patterns whose compiled flags include re.UNICODE.
print('Regular expressions:\n')
for i, pat in enumerate(pats):
    flag = ', re.UNICODE' if pat.flags & re.UNICODE else ''
    print('%s. %r%s' % (i + 1, pat.pattern, flag))
print('')

# Table heading: rule, column titles, rule.
print('\n'.join([border, header, border]))
# Scan every code point U+0000-U+FFFF and print one table row for each
# character that is a separator (category Z*) or that is matched by at
# least one of the patterns in ``pats``.  The reported characters are
# also collected in ``chars``.
chars = []
for u in range(0x10000):
    c = unichr(u)
    category = unicodedata.category(c)
    # Pre-filter on the major category.  Z: whitespace; C: controls.
    # This must test the first letter, ``category[0]``: the slice
    # ``category[:0]`` is the empty string, which is "in" every string,
    # so it let all 65536 code points through to the regexp tests.
    if category[0] not in 'ZC':
        continue
    # One column per pattern: its number if it matches, else a blank.
    parts = []
    respace = 0  # how many of the patterns match this character
    for i, pat in enumerate(pats):
        if pat.search(c):
            parts.append(str(i + 1))
            respace += 1
        else:
            parts.append(' ')
    # Separators are always listed; C* characters only when matched.
    if category.startswith('Z') or respace:
        # Prefer the Unicode name; fall back to the local description
        # in ``charnames``, and finally to the repr of the character.
        print ('%s U+%04x %5s %-25s %s'
               % (category, u, u,
                  unicodedata.name(c, charnames.get(u, repr(c))),
                  ' '.join(parts)))
        chars.append(c)
print(border)
|