summaryrefslogtreecommitdiff
path: root/pcre_tables.c
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-08-25 11:36:15 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-08-25 11:36:15 +0000
commit 39f01fd0ae3cc442ec0114e31b6a256172e8288c (patch)
tree a1a59625f17f928e67d56ba2c747fab4c4e62cd7 /pcre_tables.c
parent 1e22d8f74de1ebf5bea6c8b07a3b79f457fcf419 (diff)
download pcre-39f01fd0ae3cc442ec0114e31b6a256172e8288c.tar.gz
Upgrade \X to match an extended grapheme cluster
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1011 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_tables.c')
-rw-r--r-- pcre_tables.c 55
1 files changed, 55 insertions, 0 deletions
diff --git a/pcre_tables.c b/pcre_tables.c
index 7ac2d89..4145046 100644
--- a/pcre_tables.c
+++ b/pcre_tables.c
@@ -109,6 +109,61 @@ const int PRIV(ucp_gentype)[] = {
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
};
+/* This byte table encodes the rules for finding the end of an extended
+grapheme cluster. It could be done with bits instead of bytes, but the saving
+in memory would be small and there would be more computation at runtime.
+
+Every code point has a grapheme break property which is one of the ucp_gbXX
+values defined in ucp.h. The number of such properties is ucp_gbCount. The
+2-dimensional table is indexed by the properties of two adjacent code points:
+entry (a,b) is non-zero if a grapheme break is NOT permitted between a code
+point with property a and a following one with property b. The rules are:
+
+1. Break at the start and end of text (pretty obviously).
+
+2. Do not break between a CR and LF: (0,1) is set; otherwise, break before and
+ after controls: (x,0), (x,1), (x,2), (0,x), (1,x), and (2,x) are not set,
+ except for (0,1).
+
+3. Do not break Hangul syllable sequences: (6,6), (6,7), (6,9), (6,10),
+ (7,7), (7,8), (8,8), (9,7), (9,8), and (10,8) are set. The rules for Hangul
+ sequences are:
+
+ L may be followed by L, V, LV or LVT
+ LV or V may be followed by V or T
+ LVT or T may be followed by T
+
+4. Do not break before extending characters: (x,3) is set except for (0,3),
+ (1,3), and (2,3).
+
+The next two rules are only for extended grapheme clusters (but that's what we
+are implementing).
+
+5. Do not break before SpacingMarks: (x,5) is set except for (0,5), (1,5),
+ and (2,5).
+
+6. Do not break after Prepend characters: (4,x) is set except for (4,0), (4,1),
+ and (4,2).
+
+7. Otherwise, break everywhere.
+*/
+
+const pcre_uint8 PRIV(ucp_gbtable[]) = {
+/* 0 1 2 3 4 5 6 7 8 9 10 11 (column = 2nd code point, row = 1st) */
+ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 CR */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1 LF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2 Control */
+ 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, /* 3 Extend */
+ 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4 Prepend */
+ 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, /* 5 SpacingMark */
+ 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, /* 6 L */
+ 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, /* 7 V */
+ 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, /* 8 T */
+ 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, /* 9 LV */
+ 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, /* 10 LVT */
+ 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0 /* 11 Other */
+};
+
#ifdef SUPPORT_JIT
/* This table reverses PRIV(ucp_gentype). We can save the cost
of a memory load. */