diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-11-08 22:16:39 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-11-08 22:38:39 -0700 |
commit | a1dde8dee0a7efc73bdb7c985e74f1461e153f12 (patch) | |
tree | 37d8d55f6094d3437a7b264f9614f3952fce73fc | |
parent | f26f1b9ce33a811c1a4ea00612a18101543a46fd (diff) | |
download | perl-a1dde8dee0a7efc73bdb7c985e74f1461e153f12.tar.gz |
utf8.c: Faster latin1 folding
This adds a function, similar to the ones for the other three
case-changing operations, that works on latin1 characters only and
avoids having to go out to swashes. It changes to_uni_fold() and
to_utf8_fold() to call it on the appropriate input.
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | utf8.c | 48 |
4 files changed, 55 insertions, 1 deletions
@@ -603,6 +603,7 @@ Ap |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp #ifdef PERL_IN_UTF8_C sR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s +p |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const U8 flags #endif Ap |UV |to_uni_lower |UV c|NN U8 *p|NN STRLEN *lenp Amp |UV |to_uni_fold |UV c|NN U8 *p|NN STRLEN *lenp @@ -1571,6 +1571,7 @@ #define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_UTF8_C) +#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d) #define _to_upper_title_latin1(a,b,c,d) Perl__to_upper_title_latin1(aTHX_ a,b,c,d) #define is_utf8_char_slow S_is_utf8_char_slow #define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c) @@ -6983,6 +6983,12 @@ STATIC bool S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U #endif #if defined(PERL_IN_UTF8_C) +PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const U8 flags) + __attribute__nonnull__(pTHX_2) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \ + assert(p); assert(lenp) + PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3); @@ -1459,12 +1459,50 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) } UV +Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const U8 flags) +{ + UV converted; + + PERL_ARGS_ASSERT__TO_FOLD_LATIN1; + + if (c == MICRO_SIGN) { + converted = GREEK_SMALL_LETTER_MU; + } + else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) { + *(p)++ = 's'; + *p = 's'; + *lenp = 2; + return 's'; + } + else { /* In this range the fold of all other characters is their lower + case */ + converted = toLOWER_LATIN1(c); + } + + if (UNI_IS_INVARIANT(converted)) { + *p = (U8) converted; + *lenp = 1; + } + else { + *(p)++ = UTF8_TWO_BYTE_HI(converted); + *p = 
UTF8_TWO_BYTE_LO(converted); + *lenp = 2; + } + + return converted; +} + +UV Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) { PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; + if (c < 256) { + return _to_fold_latin1((U8) c, p, lenp, flags); + } + uvchr_to_utf8(p, c); - return _to_utf8_fold_flags(p, p, lenp, flags); + return CALL_FOLD_CASE(p, p, lenp, flags); } /* for now these all assume no locale info available for Unicode > 255 */ @@ -2180,6 +2218,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS; + if (UTF8_IS_INVARIANT(*p)) { + return _to_fold_latin1(*p, ustrp, lenp, flags); + } + else if UTF8_IS_DOWNGRADEABLE_START(*p) { + return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)), + ustrp, lenp, flags); + } + return CALL_FOLD_CASE(p, ustrp, lenp, flags); } |