diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2005-01-22 02:20:12 +0200 |
---|---|---|
committer | Dave Mitchell <davem@fdisolutions.com> | 2005-01-22 00:17:40 +0000 |
commit | 89ebb4a3f2a55825eeed13aaf58db5c73d2140ef (patch) | |
tree | a66444144493fa61d6befce0c9bf1358973f9872 /utf8.h | |
parent | 80a13697042a4d823de61ba24b77aa9d893765d6 (diff) | |
download | perl-89ebb4a3f2a55825eeed13aaf58db5c73d2140ef.tar.gz |
Re: uc($long_utf8_string) exhausts memory
Message-Id: <41F1801C.3080201@iki.fi>
Make buffer size estimates for utf8 case conversion less maximally
pessimistic
p4raw-id: //depot/perl@23857
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 28 |
1 files changed, 20 insertions, 8 deletions
@@ -162,14 +162,26 @@ encoded character. #define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1) #define isALNUM_lazy(p) isALNUM_lazy_if(p,1) -/* how wide can a single UTF-8 encoded character become */ -#define UTF8_MAXLEN 13 -/* how wide a character can become when upper/lowercased */ -#define UTF8_MAXLEN_UCLC_MULT 3 -#define UTF8_MAXLEN_UCLC (UTF8_MAXLEN*UTF8_MAXLEN_UCLC_MULT) -/* how wide a character can become when casefolded */ -#define UTF8_MAXLEN_FOLD_MULT 3 -#define UTF8_MAXLEN_FOLD (UTF8_MAXLEN*UTF8_MAXLEN_FOLD_MULT) +#define UTF8_MAXBYTES 13 +/* How wide can a single UTF-8 encoded character become in bytes. + * NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 + * since UTF-8 is an encoding of Unicode and given Unicode's current + * upper limit only four bytes is possible. Perl thinks of UTF-8 + * as a way to encode non-negative integers in a binary format. */ +#define UTF8_MAXLEN UTF8_MAXBYTES + +#define UTF8_MAXLEN_UCLC 3 /* Obsolete, do not use. */ +#define UTF8_MAXLEN_UCLC_MULT 39 /* Obsolete, do not use. */ +#define UTF8_MAXLEN_FOLD 3 /* Obsolete, do not use. */ +#define UTF8_MAXLEN_FOLD_MULT 39 /* Obsolete, do not use. */ + +/* The maximum number of UTF-8 bytes a single Unicode character can + * uppercase/lowercase/fold into; this number depends on the Unicode + * version. An example of maximal expansion is the U+03B0 which + * uppercases to U+03C5 U+0308 U+0301. The Unicode databases that + * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and + * SpecialCasing.txt. */ +#define UTF8_MAXBYTES_CASE 6 #define IN_BYTES (PL_curcop->op_private & HINT_BYTES) #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES) |