diff options
author | Rafael Garcia-Suarez <rgs@consttype.org> | 2009-12-12 09:33:51 +0100 |
---|---|---|
committer | Rafael Garcia-Suarez <rgs@consttype.org> | 2009-12-12 09:33:51 +0100 |
commit | 5b79a2243fad631bde9802bdfa8903ea90aca08d (patch) | |
tree | 6b2fd432ba283abc9a59f21564bca1ac966e9f09 | |
parent | 3a42395c067611139a8730e53bb1d25a2864e85a (diff) | |
parent | 37e2e78edfe0a224b8a615820f46db879584f523 (diff) | |
download | perl-5b79a2243fad631bde9802bdfa8903ea90aca08d.tar.gz |
Merge commit 'khwilliamson/x' into blead
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | embed.fnc | 10 | ||||
-rw-r--r-- | embed.h | 20 | ||||
-rw-r--r-- | embedvar.h | 20 | ||||
-rw-r--r-- | intrpvar.h | 10 | ||||
-rw-r--r-- | lib/unicore/README.perl | 61 | ||||
-rw-r--r-- | lib/unicore/auxiliary/GCBTest.txt | 311 | ||||
-rw-r--r-- | lib/unicore/mktables | 428 | ||||
-rw-r--r-- | perlapi.h | 20 | ||||
-rw-r--r-- | pod/perl5113delta.pod | 20 | ||||
-rw-r--r-- | proto.h | 60 | ||||
-rw-r--r-- | regexec.c | 244 | ||||
-rw-r--r-- | sv.c | 10 | ||||
-rw-r--r-- | utf8.c | 123 | ||||
-rw-r--r-- | utf8.h | 26 |
15 files changed, 1221 insertions, 143 deletions
@@ -3593,6 +3593,7 @@ lib/Unicode/README Explanation what happened to lib/unicode. lib/Unicode/UCD.pm Unicode character database lib/Unicode/UCD.t See if Unicode character database works lib/unicore/ArabicShaping.txt Unicode character database +lib/unicore/auxiliary/GCBTest.txt Unicode character database lib/unicore/auxiliary/GraphemeBreakProperty.txt Unicode character database lib/unicore/auxiliary/SentenceBreakProperty.txt Unicode character database lib/unicore/auxiliary/WordBreakProperty.txt Unicode character database @@ -502,6 +502,16 @@ ApR |bool |is_utf8_print |NN const U8 *p ApR |bool |is_utf8_punct |NN const U8 *p ApR |bool |is_utf8_xdigit |NN const U8 *p ApR |bool |is_utf8_mark |NN const U8 *p +pR |bool |is_utf8_X_begin |NN const U8 *p +pR |bool |is_utf8_X_extend |NN const U8 *p +pR |bool |is_utf8_X_prepend |NN const U8 *p +pR |bool |is_utf8_X_non_hangul |NN const U8 *p +pR |bool |is_utf8_X_L |NN const U8 *p +pR |bool |is_utf8_X_LV |NN const U8 *p +pR |bool |is_utf8_X_LVT |NN const U8 *p +pR |bool |is_utf8_X_LV_LVT_V |NN const U8 *p +pR |bool |is_utf8_X_T |NN const U8 *p +pR |bool |is_utf8_X_V |NN const U8 *p : Used in perly.y p |OP* |jmaybe |NN OP *o : Used in pp.c @@ -392,6 +392,16 @@ #define is_utf8_xdigit Perl_is_utf8_xdigit #define is_utf8_mark Perl_is_utf8_mark #ifdef PERL_CORE +#define is_utf8_X_begin Perl_is_utf8_X_begin +#define is_utf8_X_extend Perl_is_utf8_X_extend +#define is_utf8_X_prepend Perl_is_utf8_X_prepend +#define is_utf8_X_non_hangul Perl_is_utf8_X_non_hangul +#define is_utf8_X_L Perl_is_utf8_X_L +#define is_utf8_X_LV Perl_is_utf8_X_LV +#define is_utf8_X_LVT Perl_is_utf8_X_LVT +#define is_utf8_X_LV_LVT_V Perl_is_utf8_X_LV_LVT_V +#define is_utf8_X_T Perl_is_utf8_X_T +#define is_utf8_X_V Perl_is_utf8_X_V #define jmaybe Perl_jmaybe #define keyword Perl_keyword #endif @@ -2785,6 +2795,16 @@ #define is_utf8_xdigit(a) Perl_is_utf8_xdigit(aTHX_ a) #define is_utf8_mark(a) Perl_is_utf8_mark(aTHX_ a) #ifdef PERL_CORE +#define is_utf8_X_begin(a) Perl_is_utf8_X_begin(aTHX_ a) +#define is_utf8_X_extend(a) Perl_is_utf8_X_extend(aTHX_ a) +#define is_utf8_X_prepend(a) Perl_is_utf8_X_prepend(aTHX_ a) +#define is_utf8_X_non_hangul(a) Perl_is_utf8_X_non_hangul(aTHX_ a) +#define is_utf8_X_L(a) Perl_is_utf8_X_L(aTHX_ a) +#define is_utf8_X_LV(a) Perl_is_utf8_X_LV(aTHX_ a) +#define is_utf8_X_LVT(a) Perl_is_utf8_X_LVT(aTHX_ a) +#define is_utf8_X_LV_LVT_V(a) Perl_is_utf8_X_LV_LVT_V(aTHX_ a) +#define is_utf8_X_T(a) Perl_is_utf8_X_T(aTHX_ a) +#define is_utf8_X_V(a) Perl_is_utf8_X_V(aTHX_ a) #define jmaybe(a) Perl_jmaybe(aTHX_ a) #define keyword(a,b,c) Perl_keyword(aTHX_ a,b,c) #endif diff --git a/embedvar.h b/embedvar.h index e805a79822..63ed46ee2a 100644 --- a/embedvar.h +++ b/embedvar.h @@ -324,6 +324,16 @@ #define PL_unitcheckav_save (vTHX->Iunitcheckav_save) #define PL_unlockhook (vTHX->Iunlockhook) #define PL_unsafe (vTHX->Iunsafe) +#define PL_utf8_X_L (vTHX->Iutf8_X_L) +#define PL_utf8_X_LV (vTHX->Iutf8_X_LV) +#define PL_utf8_X_LVT (vTHX->Iutf8_X_LVT) +#define PL_utf8_X_LV_LVT_V (vTHX->Iutf8_X_LV_LVT_V) +#define PL_utf8_X_T (vTHX->Iutf8_X_T) +#define PL_utf8_X_V (vTHX->Iutf8_X_V) +#define PL_utf8_X_begin (vTHX->Iutf8_X_begin) +#define PL_utf8_X_extend (vTHX->Iutf8_X_extend) +#define PL_utf8_X_non_hangul (vTHX->Iutf8_X_non_hangul) +#define PL_utf8_X_prepend (vTHX->Iutf8_X_prepend) #define PL_utf8_alnum (vTHX->Iutf8_alnum) #define PL_utf8_alpha (vTHX->Iutf8_alpha) #define PL_utf8_ascii (vTHX->Iutf8_ascii) @@ -641,6 +651,16 @@ #define PL_Iunitcheckav_save PL_unitcheckav_save #define PL_Iunlockhook PL_unlockhook #define PL_Iunsafe PL_unsafe +#define PL_Iutf8_X_L PL_utf8_X_L +#define PL_Iutf8_X_LV PL_utf8_X_LV +#define PL_Iutf8_X_LVT PL_utf8_X_LVT +#define PL_Iutf8_X_LV_LVT_V PL_utf8_X_LV_LVT_V +#define PL_Iutf8_X_T PL_utf8_X_T +#define PL_Iutf8_X_V PL_utf8_X_V +#define PL_Iutf8_X_begin PL_utf8_X_begin +#define PL_Iutf8_X_extend PL_utf8_X_extend +#define PL_Iutf8_X_non_hangul PL_utf8_X_non_hangul +#define PL_Iutf8_X_prepend PL_utf8_X_prepend #define PL_Iutf8_alnum PL_utf8_alnum #define PL_Iutf8_alpha PL_utf8_alpha #define PL_Iutf8_ascii PL_utf8_ascii diff --git a/intrpvar.h b/intrpvar.h index 650eb62c8e..8fe641c29c 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -531,6 +531,16 @@ PERLVAR(Iutf8_print, SV *) PERLVAR(Iutf8_punct, SV *) PERLVAR(Iutf8_xdigit, SV *) PERLVAR(Iutf8_mark, SV *) +PERLVAR(Iutf8_X_begin, SV *) +PERLVAR(Iutf8_X_extend, SV *) +PERLVAR(Iutf8_X_prepend, SV *) +PERLVAR(Iutf8_X_non_hangul, SV *) +PERLVAR(Iutf8_X_L, SV *) +PERLVAR(Iutf8_X_LV, SV *) +PERLVAR(Iutf8_X_LVT, SV *) +PERLVAR(Iutf8_X_T, SV *) +PERLVAR(Iutf8_X_V, SV *) +PERLVAR(Iutf8_X_LV_LVT_V, SV *) PERLVAR(Iutf8_toupper, SV *) PERLVAR(Iutf8_totitle, SV *) PERLVAR(Iutf8_tolower, SV *) diff --git a/lib/unicore/README.perl b/lib/unicore/README.perl index 7515825c6f..59d66a8669 100644 --- a/lib/unicore/README.perl +++ b/lib/unicore/README.perl @@ -5,16 +5,17 @@ The *.txt files were copied from with subdirectories 'extracted' and 'auxiliary' The Unihan files were not included due to space considerations. Also NOT -included were any *.html files and *Test.txt files. It is possible to add the -Unihan files, and edit mktables (see instructions near its beginning) to look -at them. +included were any *.html files. It is possible to add the Unihan files, and +edit mktables (see instructions near its beginning) to look at them. The file 'version' should exist and be a single line with the Unicode version, like: 5.2.0 To be 8.3 filesystem friendly, the names of some of the input files have been -changed from the values that are in the Unicode DB: +changed from the values that are in the Unicode DB. Not all of the Test files +are currently used, so may not be present, so some of the mv's can fail. The +.html Test files are not touched. mv PropertyValueAliases.txt PropValueAliases.txt mv NamedSequencesProv.txt NamedSqProv.txt @@ -33,6 +34,11 @@ mv extracted/DerivedLineBreak.txt extracted/DLineBreak.txt mv extracted/DerivedNumericType.txt extracted/DNumType.txt mv extracted/DerivedNumericValues.txt extracted/DNumValues.txt +mv auxiliary/GraphemeBreakTest.txt auxiliary/GCBTest.txt +mv auxiliary/LineBreakTest.txt auxiliary/LBTest.txt +mv auxiliary/SentenceBreakTest.txt auxiliary/SBTest.txt +mv auxiliary/WordBreakTest.txt auxiliary/WBTest.txt + If you have the Unihan database (5.2 and above), you should also do the following: @@ -45,9 +51,9 @@ mv Unihan_RadicalStrokeCounts.txt UnihanRadicalStrokeCounts.txt mv Unihan_Readings.txt UnihanReadings.txt mv Unihan_Variants.txt UnihanVariants.txt -If you download everything, the names of files, such as test files, that are -not used by mktables are not changed by the above, and will not work correctly -as-is on 8.3 filesystems. +If you download everything, the names of files that are not used by mktables +are not changed by the above, and will not work correctly as-is on 8.3 +filesystems. mktables is used to generate the tables used by the rest of Perl. It will warn you about any *.txt files in the directory substructure that it doesn't know @@ -58,17 +64,12 @@ its lists to process. You can run to have it try to process these tables generically. -If any files are added, deleted, or their names change, you must run - - mktables -makelist - -to generate a new list of all the files. - FOR PUMPKINS The files are inter-related. If you take the latest UnicodeData.txt, for example, but leave the older versions of other files, there can be subtle -problems. +problems. So get everything available from Unicode, and delete those which +aren't needed. When moving to a new version of Unicode, you need to update 'version' by hand @@ -85,27 +86,19 @@ mktables can continue to be used for earlier Unicode versions. When putting out a new Perl release, think about if any of the Deprecated properties should be moved to Suppressed. -The *.pl files are generated from the *.txt files by the mktables script, -more recently done during the Perl build process, but if you want to try -the old manual way: - - cd lib/unicore - p4 edit *.pl */*.pl */*/*.pl - perl ./mktables -P ../../pod -T ../../t/re/uniprops.t -makelist - p4 revert -a - cd ../.. - perl Porting/manicheck - -If any new (or deleted, unlikely but not impossible) *.pl files are indicated: - - cd lib/unicore - p4 add ... - p4 delete ... - cd ../... - p4 edit MANIFEST - ... +The code in regexec.c for the \X match construct is intimately tied to the +regular expression in UAX #29 (http://www.unicode.org/reports/tr29/). You +should see if it has changed, and if so regexec.c should be modified. The +current one is +( CRLF +| Prepend* ( Hangul-syllable | !Control ) + ( Grapheme_Extend | Spacing_Mark)* +| . ) + +mktables has many checks to warn you if there are unexpected or novel things +that it doesn't know how to handle. -And finally: +Finally: p4 submit diff --git a/lib/unicore/auxiliary/GCBTest.txt b/lib/unicore/auxiliary/GCBTest.txt new file mode 100644 index 0000000000..7932e4d759 --- /dev/null +++ b/lib/unicore/auxiliary/GCBTest.txt @@ -0,0 +1,311 @@ +# GraphemeBreakTest-5.2.0.txt +# Date: 2009-09-19, 00:42:12 GMT [MD] +# +# Unicode Character Database +# Copyright (c) 1991-2009 Unicode, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# For documentation, see http://www.unicode.org/reports/tr44/ +# +# Default Grapheme Break Test +# +# Format: +# <string> (# <comment>)? +# <string> contains hex Unicode code points, with +# ÷ wherever there is a break opportunity, and +# × wherever there is not. +# <comment> the format can change, but currently it shows: +# - the sample character name +# - (x) the Grapheme_Break property* for the sample character +# - [x] the rule that determines whether there is a break or not +# +# These samples may be extended or changed in the future. +# +÷ 0020 ÷ 0020 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0020 × 0308 ÷ 0020 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0020 ÷ 000D ÷ # ÷ [0.2] SPACE (Other) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0020 × 0308 ÷ 000D ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0020 ÷ 000A ÷ # ÷ [0.2] SPACE (Other) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0020 × 0308 ÷ 000A ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0020 ÷ 0001 ÷ # ÷ [0.2] SPACE (Other) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0020 × 0308 ÷ 0001 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0020 × 0300 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0020 × 0308 × 0300 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0020 ÷ 0E40 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0020 × 0308 ÷ 0E40 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0020 × 0903 ÷ # ÷ [0.2] SPACE (Other) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0020 × 0308 × 0903 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0020 ÷ 1100 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0020 × 0308 ÷ 1100 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0020 ÷ 1160 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0020 × 0308 ÷ 1160 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0020 ÷ 11A8 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0020 × 0308 ÷ 11A8 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0020 ÷ AC00 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0020 × 0308 ÷ AC00 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0020 ÷ AC01 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0020 × 0308 ÷ AC01 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 000D ÷ 0020 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] SPACE (Other) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 0020 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 000D ÷ 000D ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 000D × 000A ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) × [3.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 000D ÷ 0001 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 0001 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 000D ÷ 0300 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 000D ÷ 0308 × 0300 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 000D ÷ 0E40 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 0E40 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 000D ÷ 0903 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 000D ÷ 0308 × 0903 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 000D ÷ 1100 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 1100 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 000D ÷ 1160 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 1160 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 000D ÷ 11A8 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 11A8 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 000D ÷ AC00 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 000D ÷ 0308 ÷ AC00 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 000D ÷ AC01 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 000D ÷ 0308 ÷ AC01 ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 000A ÷ 0020 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] SPACE (Other) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 0020 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 000A ÷ 000D ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 000A ÷ 000A ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 000A ÷ 0001 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 0001 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 000A ÷ 0300 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 000A ÷ 0308 × 0300 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 000A ÷ 0E40 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 0E40 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 000A ÷ 0903 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 000A ÷ 0308 × 0903 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 000A ÷ 1100 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 1100 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 000A ÷ 1160 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 1160 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 000A ÷ 11A8 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 11A8 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 000A ÷ AC00 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 000A ÷ 0308 ÷ AC00 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 000A ÷ AC01 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 000A ÷ 0308 ÷ AC01 ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0001 ÷ 0020 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] SPACE (Other) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 0020 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0001 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0001 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0001 ÷ 0001 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 0001 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0001 ÷ 0300 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0001 ÷ 0308 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0001 ÷ 0E40 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 0E40 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0001 ÷ 0903 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0001 ÷ 0308 × 0903 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0001 ÷ 1100 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 1100 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0001 ÷ 1160 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 1160 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0001 ÷ 11A8 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 11A8 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0001 ÷ AC00 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ AC00 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0001 ÷ AC01 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ AC01 ÷ # ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0300 ÷ 0020 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0300 × 0308 ÷ 0020 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0300 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0300 × 0308 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0300 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0300 × 0308 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0300 ÷ 0001 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0300 × 0308 ÷ 0001 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0300 × 0300 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0300 × 0308 × 0300 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0300 ÷ 0E40 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0300 × 0308 ÷ 0E40 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0300 × 0903 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0300 × 0308 × 0903 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0300 ÷ 1100 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0300 × 0308 ÷ 1100 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0300 ÷ 1160 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0300 × 0308 ÷ 1160 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0300 ÷ 11A8 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0300 × 0308 ÷ 11A8 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0300 ÷ AC00 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0300 × 0308 ÷ AC00 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0300 ÷ AC01 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0300 × 0308 ÷ AC01 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0E40 × 0020 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] SPACE (Other) ÷ [0.3] +÷ 0E40 × 0308 ÷ 0020 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0E40 ÷ 000D ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0E40 × 0308 ÷ 000D ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0E40 ÷ 000A ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0E40 × 0308 ÷ 000A ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0E40 ÷ 0001 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0E40 × 0308 ÷ 0001 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0E40 × 0300 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0E40 × 0308 × 0300 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0E40 × 0E40 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0E40 × 0308 ÷ 0E40 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0E40 × 0903 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0E40 × 0308 × 0903 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0E40 × 1100 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0E40 × 0308 ÷ 1100 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0E40 × 1160 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0E40 × 0308 ÷ 1160 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0E40 × 11A8 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0E40 × 0308 ÷ 11A8 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0E40 × AC00 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0E40 × 0308 ÷ AC00 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0E40 × AC01 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0E40 × 0308 ÷ AC01 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0903 ÷ 0020 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0903 × 0308 ÷ 0020 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0903 ÷ 000D ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0903 × 0308 ÷ 000D ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 0903 ÷ 000A ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0903 × 0308 ÷ 000A ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 0903 ÷ 0001 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0903 × 0308 ÷ 0001 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 0903 × 0300 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0903 × 0308 × 0300 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0903 ÷ 0E40 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0903 × 0308 ÷ 0E40 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0903 × 0903 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0903 × 0308 × 0903 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0903 ÷ 1100 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0903 × 0308 ÷ 1100 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0903 ÷ 1160 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0903 × 0308 ÷ 1160 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0903 ÷ 11A8 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0903 × 0308 ÷ 11A8 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0903 ÷ AC00 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0903 × 0308 ÷ AC00 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0903 ÷ AC01 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0903 × 0308 ÷ AC01 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 1100 ÷ 0020 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 1100 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 1100 ÷ 000D ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 1100 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 1100 ÷ 000A ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 1100 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 1100 ÷ 0001 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 1100 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 1100 × 0300 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 1100 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 1100 ÷ 0E40 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 1100 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 1100 × 0903 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 1100 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 1100 × 1100 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 1100 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 1100 × 1160 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 1100 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 1100 ÷ 11A8 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 1100 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 1100 × AC00 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 1100 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 1100 × AC01 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 1100 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 1160 ÷ 0020 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 1160 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 1160 ÷ 000D ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 1160 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 1160 ÷ 000A ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 1160 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 1160 ÷ 0001 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 1160 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 1160 × 0300 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 1160 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 1160 ÷ 0E40 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 1160 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 1160 × 0903 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 1160 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 1160 ÷ 1100 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 1160 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 1160 × 1160 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [7.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 1160 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 1160 × 11A8 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [7.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 1160 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 1160 ÷ AC00 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 1160 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 1160 ÷ AC01 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 1160 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 11A8 ÷ 0020 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 11A8 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 11A8 ÷ 000D ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 11A8 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ 11A8 ÷ 000A ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 11A8 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ 11A8 ÷ 0001 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 11A8 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ 11A8 × 0300 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 11A8 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 11A8 ÷ 0E40 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 11A8 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 11A8 × 0903 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 11A8 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 11A8 ÷ 1100 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 11A8 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 11A8 ÷ 1160 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 11A8 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 11A8 × 11A8 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [8.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 11A8 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 11A8 ÷ AC00 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 11A8 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 11A8 ÷ AC01 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 11A8 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ AC00 ÷ 0020 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ AC00 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ AC00 ÷ 000D ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ AC00 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ AC00 ÷ 000A ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ AC00 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ AC00 ÷ 0001 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ AC00 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ AC00 × 0300 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ AC00 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ AC00 ÷ 0E40 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ AC00 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ AC00 × 0903 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ AC00 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ AC00 ÷ 1100 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ AC00 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ AC00 × 1160 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [7.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ AC00 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ AC00 × 11A8 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [7.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ AC00 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ AC00 ÷ AC00 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ AC00 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ AC00 ÷ AC01 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ AC00 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ AC01 ÷ 0020 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ AC01 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ AC01 ÷ 000D ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ AC01 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3] +÷ AC01 ÷ 000A ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ AC01 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3] +÷ AC01 ÷ 0001 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ AC01 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3] +÷ AC01 × 0300 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ AC01 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ AC01 ÷ 0E40 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ AC01 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ AC01 × 0903 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ AC01 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ AC01 ÷ 1100 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ AC01 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ AC01 ÷ 1160 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ AC01 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ AC01 × 11A8 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [8.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ AC01 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ AC01 ÷ AC00 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ AC01 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ AC01 ÷ AC01 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ AC01 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +# Lines: 288 diff --git a/lib/unicore/mktables b/lib/unicore/mktables index f39466abff..c61a3f4709 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -998,6 +998,7 @@ my $MULTIPLE = 4; # Don't replace, but add a duplicate record if my $NORMAL = ""; my $SUPPRESSED = 'z'; # The character should never actually be seen, since # it is suppressed +my $PLACEHOLDER = 'P'; # Implies no pod entry generated my $DEPRECATED = 'D'; my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; @@ -1019,7 +1020,7 @@ my %status_past_participles = ( $SUPPRESSED => 'should never be generated', $STABILIZED => 'stabilized', $OBSOLETE => 'obsolete', - $DEPRECATED => 'deprecated' + $DEPRECATED => 'deprecated', ); # The format of the values of the map tables: @@ -1074,6 +1075,7 @@ my %Jamo_L; # Leading consonants my %Jamo_V; # Vowels my %Jamo_T; # Trailing consonants +my @backslash_X_tests; # List of tests read in for testing \X my @unhandled_properties; # Will contain a list of properties found in # the input that we didn't process. my @match_properties; # Properties that have match tables, to be @@ -1604,6 +1606,15 @@ sub trace { return main::trace(@_); } # processed when you set the $debug_skip global. main::set_access('non_skip', \%non_skip, 'c'); + my %skip; + # This is used to skip processing of this input file semi-permanently. + # It is used for files that we aren't planning to process anytime soon, + # but want to allow to be in the directory and not raise a message that we + # are not handling. Mostly for test files. This is in contrast to the + # non_skip element, which is supposed to be used very temporarily for + # debugging. Sets 'optional' to 1 + main::set_access('skip', \%skip, 'c'); + my %each_line_handler; # list of subroutines to look at and filter each non-comment line in the # file. defaults to none. The subroutines are called in order, each is @@ -1667,6 +1678,7 @@ sub trace { return main::trace(@_); } # Set defaults $handler{$addr} = \&main::process_generic_property_file; $non_skip{$addr} = 0; + $skip{$addr} = 0; $has_missings_defaults{$addr} = $NO_DEFAULTS; $handle{$addr} = undef; $added_lines{$addr} = [ ]; @@ -1723,6 +1735,8 @@ sub trace { return main::trace(@_); } print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n"; } + $optional{$addr} = 1 if $skip{$addr}; + return $self; } @@ -1771,7 +1785,7 @@ sub trace { return main::trace(@_); } } # File could be optional - if ($optional{$addr}){ + if ($optional{$addr}) { return unless -e $file; my $result = eval $optional{$addr}; if (! defined $result) { @@ -1804,7 +1818,8 @@ sub trace { return main::trace(@_); } } else { - # Here, the file exists + # Here, the file exists. Some platforms may change the case of + # its name if ($seen_non_extracted_non_age) { if ($file =~ /$EXTRACTED/i) { Carp::my_carp_bug(join_lines(<<END @@ -1837,6 +1852,13 @@ END ! $expecting && ! defined $handle{$addr}; + # Having deleted from expected files, we can quit if not to do + # anything. Don't print progress unless really want verbosity + if ($skip{$addr}) { + print "Skipping $file.\n" if $verbosity >= $VERBOSE; + return; + } + # Open the file, converting the slashes used in this program # into the proper form for the OS my $file_handle; @@ -3846,14 +3868,12 @@ sub trace { return main::trace(@_); } # For non-ASCII, we shun the characters that don't have Perl encoding- # independent symbols for them. 'A' is such a symbol, so is "\n". - # Note, this program hopefully will work on 5.8 Perls, and \v is not - # such a symbol in them. return $try_hard if $non_ASCII && $code <= 0xFF && ($code >= 0x7F || ($code >= 0x0E && $code <= 0x1F) || ($code >= 0x01 && $code <= 0x06) - || $code == 0x0B); # \v introduced after 5.8 + || $code == 0x0B); # shun null. I'm (khw) not sure why this was done, but NULL would be # the character very frequently used. @@ -4075,7 +4095,6 @@ sub trace { return main::trace(@_); } my $complete_name = $complete_name{$addr} = delete $args{'Complete_Name'}; $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0; - $perl_extension{$addr} = delete $args{'Perl_Extension'} || 0; $property{$addr} = delete $args{'_Property'}; $range_list{$addr} = delete $args{'_Range_List'}; $status{$addr} = delete $args{'Status'} || $NORMAL; @@ -4087,6 +4106,7 @@ sub trace { return main::trace(@_); } my $loose_match = delete $args{'Fuzzy'}; my $note = delete $args{'Note'}; my $make_pod_entry = delete $args{'Pod_Entry'}; + my $perl_extension = delete $args{'Perl_Extension'}; # Shouldn't have any left over Carp::carp_extra_args(\%args) if main::DEBUG && %args; @@ -4105,11 +4125,20 @@ sub trace { return main::trace(@_); } push @{$description{$addr}}, $description if $description; push @{$note{$addr}}, $note if $note; - # If hasn't set its status already, see if it is on one of the lists - # of properties or tables that have particular statuses; if not, is - # normal. The lists are prioritized so the most serious ones are - # checked first - if (! $status{$addr}) { + if ($status{$addr} eq $PLACEHOLDER) { + + # A placeholder table doesn't get documented, is a perl extension, + # and quite likely will be empty + $make_pod_entry = 0 if ! defined $make_pod_entry; + $perl_extension = 1 if ! defined $perl_extension; + push @tables_that_may_be_empty, $complete_name{$addr}; + } + elsif (! $status{$addr}) { + + # If hasn't set its status already, see if it is on one of the + # lists of properties or tables that have particular statuses; if + # not, is normal. The lists are prioritized so the most serious + # ones are checked first if (exists $why_suppressed{$complete_name}) { $status{$addr} = $SUPPRESSED; } @@ -4145,6 +4174,8 @@ sub trace { return main::trace(@_); } } } + $perl_extension{$addr} = $perl_extension || 0; + # By convention what typically gets printed only or first is what's # first in the list, so put the full name there for good output # clarity. Other routines rely on the full name being first on the @@ -6204,7 +6235,17 @@ END my $flag = $property->status || $table->status || $table_alias_object->status; - $flags{$flag} = $status_past_participles{$flag} if $flag; + if ($flag) { + if ($flag ne $PLACEHOLDER) { + $flags{$flag} = $status_past_participles{$flag}; + } else { + $flags{$flag} = <<END; +a placeholder because it is not in Version $string_version of Unicode, but is +needed by the Perl core to work gracefully. Because it is not in this version +of Unicode, it will not be listed in $pod_file.pod +END + } + } $loose_count++; @@ -6221,6 +6262,9 @@ END push @note, $table->note; push @conflicting, $table->conflicting; + # And this for output after all the tables. + push @global_comments, $table->comment; + # Compute an alternate compound name using the final property # synonym and the first table synonym with a colon instead of # the equal sign used elsewhere. @@ -6306,8 +6350,10 @@ END if (%flags) { foreach my $flag (sort keys %flags) { $comment .= <<END; -'$flag' below means that this form is $flags{$flag}. Consult $pod_file.pod +'$flag' below means that this form is $flags{$flag}. END + next if $flag eq $PLACEHOLDER; + $comment .= "Consult $pod_file.pod\n"; } $comment .= "\n"; } @@ -6317,7 +6363,7 @@ This file returns the $code_points in Unicode Version $string_version that $match$synonyms: $matches_comment -$pod_file.pod should be consulted for the rules on using $any_of_these, +$pod_file.pod should be consulted for the syntax rules for $any_of_these, including if adding or subtracting white space, underscore, and hyphen characters matters or doesn't matter, and other permissible syntactic variants. Upper/lower case distinctions never matter. @@ -6346,7 +6392,9 @@ END # And append any comment(s) from the actual tables. They are all # gathered here, so may not read all that well. - $comment .= "\n" . join "\n\n", @global_comments if @global_comments; + if (@global_comments) { + $comment .= "\n" . join("\n\n", @global_comments) . "\n"; + } if ($count) { # The format differs if no code points, and needs no # explanation in that case @@ -9503,6 +9551,18 @@ END } } # End closure for UnicodeData +sub process_GCB_test { + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + while ($file->next_line) { + push @backslash_X_tests, $_; + } + + return; +} + sub process_NamedSequences { # NamedSequences.txt entries are just added to an array. Because these # don't look like the other tables, they have their own handler. @@ -10795,21 +10855,78 @@ sub compile_perl() { } # These are used in Unicode's definition of \X + my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1); + my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1); + my $gcb = property_ref('Grapheme_Cluster_Break'); - #my $extend = $perl->add_match_table('_X_Extend'); - my $extend = $perl->add_match_table('_GCB_Extend'); - # XXX until decide what todo my $begin = $perl->add_match_table('_X_Begin'); - if (defined $gcb) { - $extend += $gcb->table('Extend') + $gcb->table('SpacingMark') - #$begin += ~ ($gcb->table('Control') - # + $gcb->table('CR') - # + $gcb->table('LF')); + + # The 'extended' grapheme cluster came in 5.1. The non-extended + # definition differs too much from the traditional Perl one to use. + if (defined $gcb && defined $gcb->table('SpacingMark')) { + + # Note that assumes HST is defined; it came in an earlier release than + # GCB. In the line below, two negatives means: yes hangul + $begin += ~ property_ref('Hangul_Syllable_Type') + ->table('Not_Applicable') + + ~ ($gcb->table('Control') + + $gcb->table('CR') + + $gcb->table('LF')); + $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control'); + + $extend += $gcb->table('Extend') + $gcb->table('SpacingMark'); + $extend->add_comment('For use in \X; matches: Extend | SpacingMark'); } else { # Old definition, used on early releases. $extend += $gc->table('Mark') - + 0x200C # ZWNJ - + 0x200D; # ZWJ - #$begin += ~ $extend; + + 0x200C # ZWNJ + + 0x200D; # ZWJ + $begin += ~ $extend; + + # Here we may have a release that has the regular grapheme cluster + # defined, or a release that doesn't have anything defined. + # We set things up so the Perl core degrades gracefully, possibly with + # placeholders that match nothing. + + if (! defined $gcb) { + $gcb = Property->new('GCB', Status => $PLACEHOLDER); + } + my $hst = property_ref('HST'); + if (!defined $hst) { + $hst = Property->new('HST', Status => $PLACEHOLDER); + $hst->add_match_table('Not_Applicable', + Initialize => $Any, + Matches_All => 1); + } + + # On some releases, here we may not have the needed tables for the + # perl core, in some releases we may. + foreach my $name (qw{ L LV LVT T V prepend }) { + my $table = $gcb->table($name); + if (! defined $table) { + $table = $gcb->add_match_table($name); + push @tables_that_may_be_empty, $table->complete_name; + } + + # The HST property predates the GCB one, and has identical tables + # for some of them, so use it if we can. + if ($table->is_empty + && defined $hst + && defined $hst->table($name)) + { + $table += $hst->table($name); + } + } + } + + # More GCB. If we found some hangul syllables, populate a combined + # table. + my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V'); + my $LV = $gcb->table('LV'); + if ($LV->is_empty) { + push @tables_that_may_be_empty, $lv_lvt_v->complete_name; + } else { + $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V'); + $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V'); } # Create a new property specially located that is a combination of the @@ -13231,6 +13348,11 @@ sub make_property_test_script() { } } } + + foreach my $test (@backslash_X_tests) { + print $OUT "Test_X('$test');\n"; + } + print $OUT "Finished();\n"; close $OUT; return; @@ -13380,6 +13502,9 @@ my @input_file_objects = ( Input_file->new('BidiMirroring.txt', v3.0.1, Property => 'Bidi_Mirroring_Glyph', ), + Input_file->new("NormalizationTest.txt", v3.0.1, + Skip => 1, + ), Input_file->new('CaseFolding.txt', v3.0.1, Pre_Handler => \&setup_case_folding, Each_Line_Handler => @@ -13417,6 +13542,18 @@ my @input_file_objects = ( Property => 'Grapheme_Cluster_Break', Has_Missings_Defaults => $NOT_IGNORED, ), + Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, + Handler => \&process_GCB_test, + ), + Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0, + Skip => 1, + ), + Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, + Skip => 1, + ), + Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, + Skip => 1, + ), Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, Property => 'Sentence_Break', Has_Missings_Defaults => $NOT_IGNORED, @@ -13427,6 +13564,9 @@ my @input_file_objects = ( Input_file->new('NameAliases.txt', v5.0.0, Property => 'Name_Alias', ), + Input_file->new("BidiTest.txt", v5.2.0, + Skip => 1, + ), Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, Optional => 1, Each_Line_Handler => \&filter_unihan_line, @@ -13474,18 +13614,16 @@ END # Put into %potential_files a list of all the files in the directory structure # that could be inputs to this program, excluding those that we should ignore. -# Also don't consider test files. Use absolute file names because it makes it -# easier across machine types. +# Use absolute file names because it makes it easier across machine types. my @ignored_files_full_names = map { File::Spec->rel2abs( internal_file_to_platform($_)) } keys %ignored_files; File::Find::find({ wanted=>sub { - return unless /\.txt$/i; - return if /Test\.txt$/i; + return unless /\.txt$/i; # Some platforms change the name's case my $full = lc(File::Spec->rel2abs($_)); $potential_files{$full} = 1 - if ! grep { $full eq lc($_) } @ignored_files_full_names; + if ! grep { $full eq lc($_) } @ignored_files_full_names; return; } }, File::Spec->curdir()); @@ -13584,7 +13722,7 @@ if ($glob_list) { && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) { splice @input_file_objects, $i, 0, - Input_file->new($file, v0); + Input_file->new($file, v0); last; } } @@ -13758,28 +13896,53 @@ __DATA__ use strict; use warnings; -# Test the \p{} regular expression constructs. This file is constructed by -# mktables from the tables it generates, so if mktables is buggy, this won't -# necessarily catch those bugs. Tests are generated for all feasible -# properties; a few aren't currently feasible; see is_code_point_usable() -# in mktables for details. +# Test qr/\X/ and the \p{} regular expression constructs. This file is +# constructed by mktables from the tables it generates, so if mktables is +# buggy, this won't necessarily catch those bugs. Tests are generated for all +# feasible properties; a few aren't currently feasible; see +# is_code_point_usable() in mktables for details. # Standard test packages are not used because this manipulates SIG_WARN. It # exits 0 if every non-skipped test succeeded; -1 if any failed. my $Tests = 0; my $Fails = 0; -my $Skips = 0; my $non_ASCII = (ord('A') != 65); -# The first 127 ASCII characters in ordinal order, with the ones that don't -# have Perl names (as of 5.8) replaced by dots. The 127th is used as the -# string delimiter -my $ascii_to_ebcdic = "\0......\a\b\t\n.\f\r.................. !\"#\$\%&'()*+,-./0123456789:;<=>?\@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; -#for my $i (0..126) { -# print $i, ": ", substr($ascii_to_ebcdic, $i, 1), "\n"; -#} +# The 256 8-bit characters in ASCII ordinal order, with the ones that don't +# have Perl names replaced by -1 +my @ascii_ordered_chars = ( + "\0", + (-1) x 6, + "\a", "\b", "\t", "\n", + -1, # No Vt + "\f", "\r", + (-1) x 18, + " ", "!", "\"", "#", '$', "%", "&", "'", + "(", ")", "*", "+", ",", "-", ".", "/", + "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + ":", ";", "<", "=", ">", "?", "@", + "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", + "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", + "[", "\\", "]", "^", "_", "`", + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", + "{", "|", "}", "~", + (-1) x 129 +); + +sub ASCII_ord_to_native ($) { + # Converts input ordinal number to the native one, if can be done easily. + # Returns -1 otherwise. + + my $ord = shift; + + return $ord if $ord > 255 || ! $non_ASCII; + my $result = $ascii_ordered_chars[$ord]; + return $result if $result eq '-1'; + return ord($result); +} sub Expect($$$$) { my $expected = shift; @@ -13789,38 +13952,24 @@ sub Expect($$$$) { # or empty if none my $line = (caller)[2]; + # Convert the non-ASCII code points expressible as characters to their + # ASCII equivalents, and skip the others. + $ord = ASCII_ord_to_native($ord); + if ($ord < 0) { + $Tests++; + print "ok $Tests - " + . sprintf("\"\\x{%04X}\"", $ord) + . " =~ $regex # Skipped: non-ASCII\n"; + return; + } + # Convert the code point to hex form my $string = sprintf "\"\\x{%04X}\"", $ord; - # Convert the non-ASCII code points expressible as characters in Perl 5.8 - # to their ASCII equivalents, and skip the others. - if ($non_ASCII && $ord < 255) { - - # Dots are used as place holders in the conversion string for the - # non-convertible ones, so check for it first. - if ($ord == 0x2E) { - $ord = ord('.'); - } - elsif ($ord < 0x7F - # Any dots returned are non-convertible. - && ((my $char = substr($ascii_to_ebcdic, $ord, 1)) ne '.')) - { - #print STDERR "$ord, $char, \n"; - $ord = ord($char); - } - else { - $Tests++; - $Skips++; - print "ok $Tests - $string =~ $regex # Skipped: non-ASCII\n"; - return; - } - } - - # The first time through, use all warnings. my @tests = ""; - # If the input should generate a warning, add another time through with - # them turned off + # The first time through, use all warnings. If the input should generate + # a warning, add another time through with them turned off push @tests, "no warnings '$warning_type';" if $warning_type; foreach my $no_warnings (@tests) { @@ -13880,9 +14029,142 @@ sub Error($) { return; } +# GCBTest.txt character that separates grapheme clusters +my $breakable_utf8 = my $breakable = chr(0xF7); +utf8::upgrade($breakable_utf8); + +# GCBTest.txt character that indicates that the adjoining code points are part +# of the same grapheme cluster +my $nobreak_utf8 = my $nobreak = chr(0xD7); +utf8::upgrade($nobreak_utf8); + +sub Test_X($) { + # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt + # Each such line is a sequence of code points given by their hex numbers, + # separated by the two characters defined just before this subroutine that + # indicate that either there can or cannot be a break between the adjacent + # code points. If there isn't a break, that means the sequence forms an + # extended grapheme cluster, which means that \X should match the whole + # thing. If there is a break, \X should stop there. This is all + # converted by this routine into a match: + # $string =~ /(\X)/, + # Each \X should match the next cluster; and that is what is checked. + + my $template = shift; + + my $line = (caller)[2]; + + # The line contains characters above the ASCII range, but in Latin1. It + # may or may not be in utf8, and if it is, it may or may not know it. So, + # convert these characters to 8 bits. If knows is in utf8, simply + # downgrade. + if (utf8::is_utf8($template)) { + utf8::downgrade($template); + } else { + + # Otherwise, if it is in utf8, but doesn't know it, the next lines + # convert the two problematic characters to their 8-bit equivalents. + # If it isn't in utf8, they don't harm anything. + use bytes; + $template =~ s/$nobreak_utf8/$nobreak/g; + $template =~ s/$breakable_utf8/$breakable/g; + } + + # Get rid of the leading and trailing breakables + $template =~ s/^ \s* $breakable \s* //x; + $template =~ s/ \s* $breakable \s* $ //x; + + # And no-breaks become just a space. + $template =~ s/ \s* $nobreak \s* / /xg; + + # Split the input into segments that are breakable between them. + my @segments = split /\s*$breakable\s*/, $template; + + my $string = ""; + my $display_string = ""; + my @should_match; + my @should_display; + + # Convert the code point sequence in each segment into a Perl string of + # characters + foreach my $segment (@segments) { + my @code_points = split /\s+/, $segment; + my $this_string = ""; + my $this_display = ""; + foreach my $code_point (@code_points) { + my $ord = ASCII_ord_to_native(hex $code_point); + if ($ord < 0) { + $Tests++; + print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n"; + return; + } + $this_string .= chr $ord; + $this_display .= "\\x{$code_point}"; + } + + # The next cluster should match the string in this segment. + push @should_match, $this_string; + push @should_display, $this_display; + $string .= $this_string; + $display_string .= $this_display; + } + + # If a string can be represented in both non-ut8 and utf8, test both cases + UPGRADE: + for my $to_upgrade (0 .. 1) { + + if ($to_upgrade) { + + # If already in utf8, would just be a repeat + next UPGRADE if utf8::is_utf8($string); + + utf8::upgrade($string); + } + + # Finally, do the \X match. + my @matches = $string =~ /(\X)/g; + + # Look through each matched cluster to verify that it matches what we + # expect. + my $min = (@matches < @should_match) ? @matches : @should_match; + for my $i (0 .. $min - 1) { + $Tests++; + if ($matches[$i] eq $should_match[$i]) { + print "ok $Tests - "; + if ($i == 0) { + print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; + } else { + print "And \\X #", $i + 1, + } + print " correctly matched $should_display[$i]; line $line\n"; + } else { + $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ } + unpack("U*", $matches[$i])); + print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #", + $i + 1, + " should have matched $should_display[$i]", + " but instead matched $matches[$i]", + ". Abandoning rest of line $line\n"; + next UPGRADE; + } + } + + # And the number of matches should equal the number of expected matches. + $Tests++; + if (@matches == @should_match) { + print "ok $Tests - Nothing was left over; line $line\n"; + } else { + print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n"; + } + } + + return; +} + sub Finished() { print "1..$Tests\n"; exit($Fails ? -1 : 0); } Error('\p{Script=InGreek}'); # Bug #69018 +Test_X("1100 $nobreak 1161"); # Bug #70940 @@ -684,6 +684,26 @@ END_EXTERN_C #define PL_unlockhook (*Perl_Iunlockhook_ptr(aTHX)) #undef PL_unsafe #define PL_unsafe (*Perl_Iunsafe_ptr(aTHX)) +#undef PL_utf8_X_L +#define PL_utf8_X_L (*Perl_Iutf8_X_L_ptr(aTHX)) +#undef PL_utf8_X_LV +#define PL_utf8_X_LV (*Perl_Iutf8_X_LV_ptr(aTHX)) +#undef PL_utf8_X_LVT +#define PL_utf8_X_LVT (*Perl_Iutf8_X_LVT_ptr(aTHX)) +#undef PL_utf8_X_LV_LVT_V +#define PL_utf8_X_LV_LVT_V (*Perl_Iutf8_X_LV_LVT_V_ptr(aTHX)) +#undef PL_utf8_X_T +#define PL_utf8_X_T (*Perl_Iutf8_X_T_ptr(aTHX)) +#undef PL_utf8_X_V +#define PL_utf8_X_V (*Perl_Iutf8_X_V_ptr(aTHX)) +#undef PL_utf8_X_begin +#define PL_utf8_X_begin (*Perl_Iutf8_X_begin_ptr(aTHX)) +#undef PL_utf8_X_extend +#define PL_utf8_X_extend (*Perl_Iutf8_X_extend_ptr(aTHX)) +#undef PL_utf8_X_non_hangul +#define PL_utf8_X_non_hangul (*Perl_Iutf8_X_non_hangul_ptr(aTHX)) +#undef PL_utf8_X_prepend +#define PL_utf8_X_prepend (*Perl_Iutf8_X_prepend_ptr(aTHX)) #undef PL_utf8_alnum #define PL_utf8_alnum (*Perl_Iutf8_alnum_ptr(aTHX)) #undef PL_utf8_alpha diff --git a/pod/perl5113delta.pod b/pod/perl5113delta.pod index 2e1ddf8772..ec2443c13a 100644 --- a/pod/perl5113delta.pod +++ b/pod/perl5113delta.pod @@ -38,6 +38,12 @@ Perl is shipped with the latest Unicode version, 5.2, October 2009. See L<http://www.unicode.org/versions/Unicode5.2.0> for details about this release of Unicode. +But, an installation can now fairly easily change Perl to operate on any +Unicode release. Perl is shipped with the latest official release, but +an installation can download and install any prior release from Unicode, and +cause Perl to work with that (or even multiple releases). Instructions are in +L<perlunicode>. + =head2 Unicode properties Perl can now handle every Unicode character property. A new pod, @@ -58,6 +64,15 @@ underscores between digits of numbers. All the Unicode-defined synonyms for properties and property values are now accepted. +C<qr/\X/>, which matches a Unicode logical character, has been expanded to work +better with various Asian languages. It now is defined as an C<extended +grapheme cluster>. (See L<http://www.unicode.org/reports/tr29/>). One change +due to this is that C<\X> will match the whole sequence C<S<CR LF>>. Another +change is that C<\X> will match an isolated mark. Marks generally come after a +base character, but it is possible in Unicode to have them in isolation, and +C<\X> will now handle that case. Otherwise, this change should be transparent +for the non-affected languages. + C<\p{...}> matches using the Canonical_Combining_Class property were completely broken in previous Perls. This is now fixed. @@ -120,11 +135,6 @@ Other_Default_Ignorable_Code_Point, Other_Grapheme_Extend, Other_ID_Continue, Other_ID_Start, Other_Lowercase, Other_Math, and Other_Uppercase. -An installation can now fairly easily change Perl to operate on any -Unicode release. Perl is shipped with the latest official release, but -an installation can now download any prior release, and Perl will work -with that. Instructions are in L<perlunicode>. - An installation can now fairly easily change which Unicode properties Perl understands. As mentioned above, certain properties are by default turned off. These include all the Unihan properties (which should be @@ -1414,6 +1414,66 @@ PERL_CALLCONV bool Perl_is_utf8_mark(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT_IS_UTF8_MARK \ assert(p) +PERL_CALLCONV bool Perl_is_utf8_X_begin(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_extend(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_prepend(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_L(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_L \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_LV(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_LV \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_LVT(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_LVT \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_T(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_T \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_V(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_V \ + assert(p) + PERL_CALLCONV OP* Perl_jmaybe(pTHX_ OP *o) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_JMAYBE \ @@ -120,11 +120,31 @@ /* these are unrolled below in the CCC_TRY_XXX defined */ #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END + +/* Doesn't do an assert to verify that is correct */ +#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ + if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END + #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a") #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0") #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ") -#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86") +#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ + LOAD_UTF8_CHARCLASS(X_begin, " "), \ + LOAD_UTF8_CHARCLASS(X_non_hangul, "A"), \ + /* These are utf8 constants, and not utf-ebcdic constants, so the \ + * assert should likely and hopefully fail on an EBCDIC machine */ \ + LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"), /* U+0300 */ \ + \ + /* No asserts are done for these, in case called on an early \ + * Unicode version in which they map to nothing */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend),/* U+0E40 "\xe0\xb9\x80" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_L), /* U+1100 "\xe1\x84\x80" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV), /* U+AC00 "\xea\xb0\x80" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT), /* U+AC01 "\xea\xb0\x81" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V),/* U+AC01 "\xea\xb0\x81" */\ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_T), /* U+11A8 "\xe1\x86\xa8" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */ /* We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test @@ -3521,22 +3541,216 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); - case CLUMP: + case CLUMP: /* Match \X: logical Unicode character. This is defined as + a Unicode extended Grapheme Cluster */ + /* From http://www.unicode.org/reports/tr29 (5.2 version). An + extended Grapheme Cluster is: + + CR LF + | Prepend* Begin Extend* + | . + + Begin is (Hangul-syllable | ! Control) + Extend is (Grapheme_Extend | Spacing_Mark) + Control is [ GCB_Control CR LF ] + + The discussion below shows how the code for CLUMP is derived + from this regex. Note that most of these concepts are from + property values of the Grapheme Cluster Boundary (GCB) property. + No code point can have multiple property values for a given + property. Thus a code point in Prepend can't be in Control, but + it must be in !Control. This is why Control above includes + GCB_Control plus CR plus LF. The latter two are used in the GCB + property separately, and so can't be in GCB_Control, even though + they logically are controls. Control is not the same as gc=cc, + but includes format and other characters as well. + + The Unicode definition of Hangul-syllable is: + L+ + | (L* ( ( V | LV ) V* | LVT ) T*) + | T+ + ) + Each of these is a value for the GCB property, and hence must be + disjoint, so the order they are tested is immaterial, so the + above can safely be changed to + T+ + | L+ + | (L* ( LVT | ( V | LV ) V*) T*) + + The last two terms can be combined like this: + L* ( L + | (( LVT | ( V | LV ) V*) T*)) + + And refactored into this: + L* (L | LVT T* | V V* T* | LV V* T*) + + That means that if we have seen any L's at all we can quit + there, but if the next character is a LVT, a V or and LV we + should keep going. + + There is a subtlety with Prepend* which showed up in testing. + Note that the Begin, and only the Begin is required in: + | Prepend* Begin Extend* + Also, Begin contains '! Control'. A Prepend must be a '! + Control', which means it must be a Begin. What it comes down to + is that if we match Prepend* and then find no suitable Begin + afterwards, that if we backtrack the last Prepend, that one will + be a suitable Begin. + */ + if (locinput >= PL_regeol) sayNO; - if (do_utf8) { - LOAD_UTF8_CHARCLASS_MARK(); - if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) - sayNO; - locinput += PL_utf8skip[nextchr]; - while (locinput < PL_regeol && - swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) - locinput += UTF8SKIP(locinput); - if (locinput > PL_regeol) - sayNO; - } - else - locinput++; + if (! do_utf8) { + + /* Match either CR LF or '.', as all the other possibilities + * require utf8 */ + locinput++; /* Match the . or CR */ + if (nextchr == '\r' + && locinput < PL_regeol + && UCHARAT(locinput) == '\n') locinput++; + } + else { + + /* Utf8: See if is ( CR LF ); already know that locinput < + * PL_regeol, so locinput+1 is in bounds */ + if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') { + locinput += 2; + } + else { + /* In case have to backtrack to beginning, then match '.' */ + char *starting = locinput; + + /* In case have to backtrack the last prepend */ + char *previous_prepend = 0; + + LOAD_UTF8_CHARCLASS_GCB(); + + /* Match (prepend)* */ + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_prepend, + (U8*)locinput, do_utf8)) + { + previous_prepend = locinput; + locinput += UTF8SKIP(locinput); + } + + /* As noted above, if we matched a prepend character, but + * the next thing won't match, back off the last prepend we + * matched, as it is guaranteed to match the begin */ + if (previous_prepend + && (locinput >= PL_regeol + || ! swash_fetch(PL_utf8_X_begin, + (U8*)locinput, do_utf8))) + { + locinput = previous_prepend; + } + + /* Note that here we know PL_regeol > locinput, as we + * tested that upon input to this switch case, and if we + * moved locinput forward, we tested the result just above + * and it either passed, or we backed off so that it will + * now pass */ + if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) { + + /* Here did not match the required 'Begin' in the + * second term. So just match the very first + * character, the '.' of the final term of the regex */ + locinput = starting + UTF8SKIP(starting); + } else { + + /* Here is the beginning of a character that can have + * an extender. It is either a hangul syllable, or a + * non-control */ + if (swash_fetch(PL_utf8_X_non_hangul, + (U8*)locinput, do_utf8)) + { + + /* Here not a Hangul syllable, must be a + * ('! * Control') */ + locinput += UTF8SKIP(locinput); + } else { + + /* Here is a Hangul syllable. It can be composed + * of several individual characters. One + * possibility is T+ */ + if (swash_fetch(PL_utf8_X_T, + (U8*)locinput, do_utf8)) + { + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_T, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + } else { + + /* Here, not T+, but is a Hangul. That means + * it is one of the others: L, LV, LVT or V, + * and matches: + * L* (L | LVT T* | V V* T* | LV V* T*) */ + + /* Match L* */ + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_L, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + + /* Here, have exhausted L*. If the next + * character is not an LV, LVT nor V, it means + * we had to have at least one L, so matches L+ + * in the original equation, we have a complete + * hangul syllable. Are done. */ + + if (locinput < PL_regeol + && swash_fetch(PL_utf8_X_LV_LVT_V, + (U8*)locinput, do_utf8)) + { + + /* Otherwise keep going. Must be LV, LVT + * or V. See if LVT */ + if (swash_fetch(PL_utf8_X_LVT, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } else { + + /* Must be V or LV. Take it, then + * match V* */ + locinput += UTF8SKIP(locinput); + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_V, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + } + + /* And any of LV, LVT, or V can be followed + * by T* */ + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_T, + (U8*)locinput, + do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + } + } + } + + /* Match any extender */ + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_extend, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + } + } + if (locinput > PL_regeol) sayNO; + } nextchr = UCHARAT(locinput); break; @@ -12214,6 +12214,16 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_utf8_punct = sv_dup_inc(proto_perl->Iutf8_punct, param); PL_utf8_xdigit = sv_dup_inc(proto_perl->Iutf8_xdigit, param); PL_utf8_mark = sv_dup_inc(proto_perl->Iutf8_mark, param); + PL_utf8_X_begin = sv_dup_inc(proto_perl->Iutf8_X_begin, param); + PL_utf8_X_extend = sv_dup_inc(proto_perl->Iutf8_X_extend, param); + PL_utf8_X_prepend = sv_dup_inc(proto_perl->Iutf8_X_prepend, param); + PL_utf8_X_non_hangul = sv_dup_inc(proto_perl->Iutf8_X_non_hangul, param); + PL_utf8_X_L = sv_dup_inc(proto_perl->Iutf8_X_L, param); + PL_utf8_X_LV = sv_dup_inc(proto_perl->Iutf8_X_LV, param); + PL_utf8_X_LVT = sv_dup_inc(proto_perl->Iutf8_X_LVT, param); + PL_utf8_X_T = sv_dup_inc(proto_perl->Iutf8_X_T, param); + PL_utf8_X_V = sv_dup_inc(proto_perl->Iutf8_X_V, param); + PL_utf8_X_LV_LVT_V = sv_dup_inc(proto_perl->Iutf8_X_LV_LVT_V, param); PL_utf8_toupper = sv_dup_inc(proto_perl->Iutf8_toupper, param); PL_utf8_totitle = sv_dup_inc(proto_perl->Iutf8_totitle, param); PL_utf8_tolower = sv_dup_inc(proto_perl->Iutf8_tolower, param); @@ -1488,6 +1488,106 @@ Perl_is_utf8_mark(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_mark, "IsM"); } +bool +Perl_is_utf8_X_begin(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN; + + return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin"); +} + +bool +Perl_is_utf8_X_extend(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND; + + return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend"); +} + +bool +Perl_is_utf8_X_prepend(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND; + + return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend"); +} + +bool +Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL; + + return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable"); +} + +bool +Perl_is_utf8_X_L(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_L; + + return is_utf8_common(p, &PL_utf8_X_L, "GCB=L"); +} + +bool +Perl_is_utf8_X_LV(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_LV; + + return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV"); +} + +bool +Perl_is_utf8_X_LVT(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_LVT; + + return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT"); +} + +bool +Perl_is_utf8_X_T(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_T; + + return is_utf8_common(p, &PL_utf8_X_T, "GCB=T"); +} + +bool +Perl_is_utf8_X_V(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_V; + + return is_utf8_common(p, &PL_utf8_X_V, "GCB=V"); +} + +bool +Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V; + + return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V"); +} + /* =for apidoc to_utf8_case @@ -1532,6 +1632,22 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, if (!*swashp) /* load on-demand */ *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); + /* This is the beginnings of a skeleton of code to read the info section + * that is in all the swashes in case we ever want to do that, so one can + * read things whose maps aren't code points, and whose default if missing + * is not to the code point itself. This was just to see if it actually + * worked. Details on what the possibilities are are in perluniprops.pod + HV * const hv = get_hv("utf8::SwashInfo", 0); + if (hv) { + SV **svp; + svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE); + const char *s; + + HV * const this_hash = SvRV(*svp); + svp = hv_fetch(this_hash, "type", strlen("type"), FALSE); + s = SvPV_const(*svp, len); + } + }*/ /* The 0xDF is the only special casing Unicode code point below 0x100. */ if (special && (uv1 == 0xDF || uv1 > 0xFF)) { @@ -1594,7 +1710,8 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, } } - if (!len) /* Neither: just copy. */ + if (!len) /* Neither: just copy. In other words, there was no mapping + defined, which means that the code point maps to itself */ len = uvchr_to_utf8(ustrp, uv0) - ustrp; if (lenp) @@ -1809,7 +1926,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) ptr = tmputf8; } /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ - * then the "swatch" is a vec() for al the chars which start + * then the "swatch" is a vec() for all the chars which start * with 0xAA..0xYY * So the key in the hash (klen) is length of encoded char -1 */ @@ -1817,7 +1934,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) off = ptr[klen]; if (klen == 0) { - /* If char in invariant then swatch is for all the invariant chars + /* If char is invariant then swatch is for all the invariant chars * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK */ needents = UTF_CONTINUATION_MARK; @@ -73,21 +73,20 @@ END_EXTERN_C U+0000..U+007F 00..7F U+0080..U+07FF C2..DF 80..BF - U+0800..U+0FFF E0 A0..BF 80..BF + U+0800..U+0FFF E0 * A0..BF 80..BF U+1000..U+CFFF E1..EC 80..BF 80..BF - U+D000..U+D7FF ED 80..9F 80..BF - U+D800..U+DFFF ******* ill-formed ******* + U+D000..U+D7FF ED * 80..9F 80..BF + U+D800..U+DFFF +++++++ utf16 surrogates, not legal utf8 +++++++ U+E000..U+FFFF EE..EF 80..BF 80..BF - U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF U+100000..U+10FFFF F4 80..8F 80..BF 80..BF -Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000...U+D7FF, -the 90..BF in U+10000..U+3FFFF, and the 80...8F in U+100000..U+10FFFF. -The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings: -it is technically possible to UTF-8-encode a single code point in different -ways, but that is explicitly forbidden, and the shortest possible encoding -should always be used (and that is what Perl does). +Note the gaps before the 2nd Byte entries above marked by '*'. These are +caused by legal UTF-8 avoiding non-shortest encodings: it is technically +possible to UTF-8-encode a single code point in different ways, but that is +explicitly forbidden, and the shortest possible encoding should always be used +(and that is what Perl does). */ @@ -213,11 +212,12 @@ encoded character. #define UTF8_ALLOW_EMPTY 0x0001 #define UTF8_ALLOW_CONTINUATION 0x0002 #define UTF8_ALLOW_NON_CONTINUATION 0x0004 -#define UTF8_ALLOW_FE_FF 0x0008 /* Allow above 0x7fffFFFF */ -#define UTF8_ALLOW_SHORT 0x0010 +#define UTF8_ALLOW_FE_FF 0x0008 /* Allow FE or FF start bytes, \ + yields above 0x7fffFFFF */ +#define UTF8_ALLOW_SHORT 0x0010 /* expecting more bytes */ #define UTF8_ALLOW_SURROGATE 0x0020 #define UTF8_ALLOW_FFFF 0x0040 /* Allow UNICODE_ILLEGAL */ -#define UTF8_ALLOW_LONG 0x0080 +#define UTF8_ALLOW_LONG 0x0080 /* expecting fewer bytes */ #define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\ UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) #define UTF8_ALLOW_ANY 0x00FF |