diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-08-25 11:36:15 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-08-25 11:36:15 +0000 |
commit | 39f01fd0ae3cc442ec0114e31b6a256172e8288c (patch) | |
tree | a1a59625f17f928e67d56ba2c747fab4c4e62cd7 /pcre_tables.c | |
parent | 1e22d8f74de1ebf5bea6c8b07a3b79f457fcf419 (diff) | |
download | pcre-39f01fd0ae3cc442ec0114e31b6a256172e8288c.tar.gz |
Upgrade \X to match an extended grapheme cluster
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1011 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_tables.c')
-rw-r--r-- | pcre_tables.c | 55 |
1 files changed, 55 insertions, 0 deletions
diff --git a/pcre_tables.c b/pcre_tables.c index 7ac2d89..4145046 100644 --- a/pcre_tables.c +++ b/pcre_tables.c @@ -109,6 +109,61 @@ const int PRIV(ucp_gentype)[] = { ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ }; +/* This byte table encodes the rules for finding the end of an extended +grapheme cluster. It could be done with bits instead of bytes, but the saving +in memory would be small and there would be more computation at runtime. + +Every code point has a grapheme break property which is one of the ucp_gbXX +values defined in ucp.h. The number of such properties is ucp_gbCount. The +2-dimensional table is indexed by the properties of two adjacent code points. +The value is non-zero if a grapheme break is NOT permitted between the relevant +two code points. The breaking rules are as follows: + +1. Break at the start and end of text (pretty obviously). + +2. Do not break between a CR and LF: (0,1) is set; otherwise, break before and + after controls: (x,0), (x,1), (x,2), (0,x), (1,x), and (2,x) are not set, + except for (0,1). + +3. Do not break Hangul syllable sequences: (6,6), (6,7), (6,9), (6,10), + (7,7), (7,8), (8,8), (9,7), (9,8), and (10,8) are set. The rules for Hangul + sequences are: + + L may be followed by L, V, LV or LVT + LV or V may be followed by V or T + LVT or T may be followed by T + +4. Do not break before extending characters: (x,3) is set except for (0,3), + (1,3), and (2,3). + +The next two rules are only for extended grapheme clusters (but that's what we +are implementing). + +5. Do not break before SpacingMarks: (x,5) is set except for (0,5), (1,5), + and (2,5). + +6. Do not break after Prepend characters: (4,x) is set except for (4,0), (4,1), + and (4,2). + +7. Otherwise, break everywhere. 
+*/ + +const pcre_uint8 PRIV(ucp_gbtable[]) = { /* Flat 12x12 matrix stored row by row: each row is the break property of the first code point of an adjacent pair (named in the comment at the end of the row), each column is the property of the code point that follows it, and a 1 means no grapheme break is allowed between the two. NOTE(review): a lookup is presumably [first * ucp_gbCount + second]; confirm against the caller. */ +/* 0 1 2 3 4 5 6 7 8 9 10 11 */ + 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 CR */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1 LF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2 Control */ + 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, /* 3 Extend */ + 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4 Prepend */ + 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, /* 5 SpacingMark */ + 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, /* 6 L */ + 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, /* 7 V */ + 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, /* 8 T */ + 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, /* 9 LV */ + 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, /* 10 LVT */ + 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0 /* 11 Other */ +}; + #ifdef SUPPORT_JIT /* This table reverses PRIV(ucp_gentype). We can save the cost of a memory load. */ |