diff options
author | Karl Williamson <khw@cpan.org> | 2016-01-18 14:25:02 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-01-19 15:08:59 -0700 |
commit | 6b659339f976d014a1a53731d86cedd01f5921ec (patch) | |
tree | 852b02830e8b19bf700af95791214485d1f4e2e8 /charclass_invlists.h | |
parent | ca8226cfa2cc0ddcc50f60505c42078df8e3b766 (diff) | |
download | perl-6b659339f976d014a1a53731d86cedd01f5921ec.tar.gz |
Add qr/\b{lb}/
This adds the final Unicode boundary type previously missing from core
Perl: the LineBreak one. This feature is already available in the
Unicode::LineBreak module, but I've been told that there are portability
and some other issues with that module. What's added here is a
light-weight version that is lacking the customizable features of the
module.
This implements the default Line Breaking algorithm, but with the
customizations that Unicode is expecting everybody to add, as their
test file tests for them. In other words, this passes Unicode's fairly
extensive furnished tests, but wouldn't if it didn't include certain
customizations specified by Unicode beyond the basic algorithm.
The implementation uses a look-up table of the characters surrounding a
boundary to see if it is a suitable place to break a line. In a few
cases, context needs to be taken into account, so there is code in
addition to the lookup table to handle those.
This should meet the needs for line breaking of many applications,
without having to load the module.
The algorithm is somewhat independent of the Unicode version, just like
the other boundary types. Only if new rules are added, or existing ones
modified is there need to go in and change this code. Otherwise,
running regen/mk_invlists.pl should be sufficient when a new Unicode
release is done to keep it up-to-date, again like the other Unicode
boundary types.
Diffstat (limited to 'charclass_invlists.h')
-rw-r--r-- | charclass_invlists.h | 60 |
1 files changed, 58 insertions, 2 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index 478d98ccb3..e4d43cde01 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -87733,6 +87733,62 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC 037 */ #endif /* EBCDIC 037 */ +#if defined(PERL_IN_REGEXEC_C) + +#define LB_NOBREAK 0 +#define LB_BREAKABLE 1 +#define LB_NOBREAK_EVEN_WITH_SP_BETWEEN 2 +#define LB_CM_foo 3 +#define LB_SP_foo 6 +#define LB_PR_or_PO_then_OP_or_HY 9 +#define LB_SY_or_IS_then_various 11 +#define LB_HY_or_BA_then_foo 13 +#define LB_various_then_PO_or_PR 16 + +static const U8 LB_table[36][36] = { + +/* 'ed' stands for 'edge' */ +/* AL BA BB B2 SY CR CP CL CM CB EX GL H2 H3 HL HY ID IS IN JL JT JV LF BK NL NS NU OP PO PR QU RI SP WJ ZW ed */ +/* AL */ 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, +/* BA */ 14, 0, 14, 14, 2, 0, 2, 2, 0, 1, 2, 14, 14, 14, 14, 0, 14, 2, 14, 14, 14, 14, 0, 0, 0, 0, 14, 14, 14, 14, 0, 14, 0, 0, 0, 1, +/* BB */ 0, 0, 0, 0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, +/* B2 */ 1, 0, 1, 2, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, +/* SY */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 12, 1, 17, 17, 0, 1, 0, 0, 0, 1, +/* CR */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* CP */ 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 2, 0, 1, 17, 17, 0, 1, 0, 0, 0, 1, +/* CL */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 2, 1, 1, 17, 17, 0, 1, 0, 0, 0, 1, +/* CM */ 3, 3, 3, 3, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 1, +/* CB */ 1, 1, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, +/* EX */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, +/* GL */ 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, +/* H2 */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, +/* H3 */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, +/* HL */ 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, +/* HY */ 14, 0, 14, 14, 2, 0, 2, 2, 0, 1, 2, 14, 14, 14, 14, 0, 14, 2, 14, 14, 14, 14, 0, 0, 0, 0, 13, 14, 14, 14, 0, 14, 0, 0, 0, 1, +/* ID */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, +/* IS */ 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 12, 1, 17, 17, 0, 1, 0, 0, 0, 1, +/* IN */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, +/* JL */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, +/* JT */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, +/* JV */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, +/* LF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* BK */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* NL */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* NS */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, +/* NU */ 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 2, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, +/* OP */ 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 1, +/* PO */ 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 10, 1, 1, 0, 1, 0, 0, 0, 1, +/* PR */ 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 10, 1, 1, 0, 1, 0, 0, 0, 1, +/* QU */ 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, +/* RI */ 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, +/* SP */ 7, 7, 7, 7, 8, 0, 8, 8, 7, 7, 8, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 0, 8, 0, 1, +/* WJ */ 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, +/* ZW */ 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, +/* ed */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +#endif /* defined(PERL_IN_REGEXEC_C) */ + /* Generated from: * cb3170dd603ad12ba0299440e99e8f50a8afde60ade2ffcbf1ff4a8a53854b90 lib/Unicode/UCD.pm * ae98bec7e4f0564758eed81eca5015481ba32581f8a735a825b71b3bba714450 lib/unicore/ArabicShaping.txt @@ -87776,8 +87832,8 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC 037 */ * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt - * 3589fcc6852823b96c60209bc615ad8cb96455d5047194ec3545e30ecc9d464b lib/unicore/mktables + * 648ee6791150b7163b4a7b61db070d5d19e9d4461a3ba20b7de536d820335e3e lib/unicore/mktables * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl - * 9209c2d6979d09bd75a4636e7a278cea1857b9b6143c75f6bf02ec1f91399f2a regen/mk_invlists.pl + * e9526360724d821ce2a46cdd11b9ddd9aac93e4edb653bf6af8f493bf42da1e6 regen/mk_invlists.pl * ex: set ro: */ |