diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-08-26 16:07:14 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-08-26 16:07:14 +0000 |
commit | adb57cee9416b491993c440adf20534d4e571a50 (patch) | |
tree | a3332256ba1405a7ac2da7cf84402845afa999c7 /pcre_tables.c | |
parent | ab058a28a025fcc86eb93d0166cb248ccdbfce3a (diff) | |
download | pcre-adb57cee9416b491993c440adf20534d4e571a50.tar.gz |
Improve extended grapheme clusters using a bit table.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1015 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_tables.c')
-rw-r--r-- | pcre_tables.c | 74 |
1 files changed, 39 insertions, 35 deletions
diff --git a/pcre_tables.c b/pcre_tables.c index 4145046..b705613 100644 --- a/pcre_tables.c +++ b/pcre_tables.c @@ -109,61 +109,65 @@ const int PRIV(ucp_gentype)[] = { ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ }; -/* This byte table encodes the rules for finding the end of an extended -grapheme cluster. It could be done with bits instead of bytes, but the saving -in memory would be small and there would be more computation at runtime. +/* This table encodes the rules for finding the end of an extended grapheme +cluster. Every code point has a grapheme break property which is one of the +ucp_gbXX values defined in ucp.h. The 2-dimensional table is indexed by the +properties of two adjacent code points. The left property selects a word from +the table, and the right property selects a bit from that word like this: + + ucp_gbtable[left-property] & (1 << right-property) -Every code point has a grapheme break property which is one of the ucp_gbXX -values defined in ucp.h. The number of such properties is ucp_gbCount. The -2-dimensional table is indexed by the properties of two adjacent code points. The value is non-zero if a grapheme break is NOT permitted between the relevant two code points. The breaking rules are as follows: 1. Break at the start and end of text (pretty obviously). -2. Do not break between a CR and LF: (0,1) is set; otherwise, break before and - after controls: (x,0), (x,1), (x,2), (0,x), (1,x), and (2,x) are not set, - except for (0,1). +2. Do not break between a CR and LF; otherwise, break before and after + controls. -3. Do not break Hangul syllable sequences: (6,6), (6,7), (6,9), (6,10), - (7,7), (7,8), (8,8), (9,7), (9,8), and (10,8) are set. The rules for Hangul - sequences are: +3. Do not break Hangul syllable sequences, the rules for which are: L may be followed by L, V, LV or LVT LV or V may be followed by V or T LVT or T may be followed by T -4. Do not break before extending characters: (x,3) is set except for (0,3), - (1,3), and (2,3). +4. Do not break before extending characters. The next two rules are only for extended grapheme clusters (but that's what we are implementing). -5. Do not break before SpacingMarks: (x,5) is set except for (0,5), (1,5), - and (2,5). +5. Do not break before SpacingMarks. -6. Do not break after Prepend characters: (4,x) is set except for (4,0), (4,1), - and (4,2). +6. Do not break after Prepend characters. -8. Otherwise, break everywhere. +7. Otherwise, break everywhere. */ -const pcre_uint8 PRIV(ucp_gbtable[]) = { -/* 0 1 2 3 4 5 6 7 8 9 10 11 */ - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 CR */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1 LF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2 Control */ - 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, /* 3 Extend */ - 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4 Prepend */ - 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, /* 5 SpacingMark */ - 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, /* 6 L */ - 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, /* 7 V */ - 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, /* 8 T */ - 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, /* 9 LV */ - 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, /* 10 LVT */ - 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0 /* 11 Other */ -}; - +const pcre_uint32 PRIV(ucp_gbtable[]) = { + (1<<ucp_gbLF), /* 0 CR */ + 0, /* 1 LF */ + 0, /* 2 Control */ + (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark), /* 3 Extend */ + (1<<ucp_gbExtend)|(1<<ucp_gbPrepend)| /* 4 Prepend */ + (1<<ucp_gbSpacingMark)|(1<<ucp_gbL)| + (1<<ucp_gbV)|(1<<ucp_gbT)|(1<<ucp_gbLV)| + (1<<ucp_gbLVT)|(1<<ucp_gbOther), + + (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark), /* 5 SpacingMark */ + (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbL)| /* 6 L */ + (1<<ucp_gbL)|(1<<ucp_gbV)|(1<<ucp_gbLV)|(1<<ucp_gbLVT), + + (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbV)| /* 7 V */ + (1<<ucp_gbT), + + (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbT), /* 8 T */ + (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbV)| /* 9 LV */ + (1<<ucp_gbT), + + (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbT), /* 10 LVT */ + (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark) /* 11 Other */ +}; + #ifdef SUPPORT_JIT /* This table reverses PRIV(ucp_gentype). We can save the cost of a memory load. */ |