From a1dde8dee0a7efc73bdb7c985e74f1461e153f12 Mon Sep 17 00:00:00 2001
From: Karl Williamson
Date: Tue, 8 Nov 2011 22:16:39 -0700
Subject: utf8.c: Faster latin1 folding

This adds a function similar to the ones for the other three case
changing operations that works on latin1 characters only, and avoids
having to go out to swashes.  It changes to_uni_fold() and
to_utf8_fold() to call it on the appropriate input.
---
 embed.fnc |  1 +
 embed.h   |  1 +
 proto.h   |  6 ++++++
 utf8.c    | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/embed.fnc b/embed.fnc
index e2911dd8a0..446faf5de9 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -603,6 +603,7 @@ Ap	|UV	|to_uni_title	|UV c|NN U8 *p|NN STRLEN *lenp
 #ifdef PERL_IN_UTF8_C
 sR	|U8	|to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
 p	|UV	|_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
+p	|UV	|_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const U8 flags
 #endif
 Ap	|UV	|to_uni_lower	|UV c|NN U8 *p|NN STRLEN *lenp
 Amp	|UV	|to_uni_fold	|UV c|NN U8 *p|NN STRLEN *lenp
diff --git a/embed.h b/embed.h
index 3d985b5f82..8540031730 100644
--- a/embed.h
+++ b/embed.h
@@ -1571,6 +1571,7 @@
 #define isa_lookup(a,b,c,d)	S_isa_lookup(aTHX_ a,b,c,d)
 #  endif
 #  if defined(PERL_IN_UTF8_C)
+#define _to_fold_latin1(a,b,c,d)	Perl__to_fold_latin1(aTHX_ a,b,c,d)
 #define _to_upper_title_latin1(a,b,c,d)	Perl__to_upper_title_latin1(aTHX_ a,b,c,d)
 #define is_utf8_char_slow	S_is_utf8_char_slow
 #define is_utf8_common(a,b,c)	S_is_utf8_common(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index 7f9621a47e..534fab8511 100644
--- a/proto.h
+++ b/proto.h
@@ -6983,6 +6983,12 @@ STATIC bool	S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
 #endif
 
 #if defined(PERL_IN_UTF8_C)
+PERL_CALLCONV UV	Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const U8 flags)
+			__attribute__nonnull__(pTHX_2)
+			__attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1	\
+	assert(p); assert(lenp)
+
 PERL_CALLCONV UV	Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s)
 			__attribute__nonnull__(pTHX_2)
 			__attribute__nonnull__(pTHX_3);
diff --git a/utf8.c b/utf8.c
index 38f5c6c00c..9c55d10c25 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1458,13 +1458,51 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
     return CALL_LOWER_CASE(p, p, lenp);
 }
 
+UV
+Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const U8 flags)
+{
+    UV converted;  /* code point that 'c' folds to */
+
+    PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
+
+    if (c == MICRO_SIGN) {
+	converted = GREEK_SMALL_LETTER_MU;
+    }
+    else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) {  /* fold to "ss" */
+	*(p)++ = 's';
+	*p = 's';
+	*lenp = 2;
+	return 's';  /* first of the two characters just stored */
+    }
+    else { /* In this range the fold of all other characters is their lower
+              case */
+	converted = toLOWER_LATIN1(c);
+    }
+
+    if (UNI_IS_INVARIANT(converted)) {  /* result is stored as one byte */
+	*p = (U8) converted;
+	*lenp = 1;
+    }
+    else {  /* result is stored as a two-byte sequence */
+	*(p)++ = UTF8_TWO_BYTE_HI(converted);
+	*p = UTF8_TWO_BYTE_LO(converted);
+	*lenp = 2;
+    }
+
+    return converted;
+}
+
 UV
 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
 {
     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
 
+    if (c < 256) {  /* handled here without calling CALL_FOLD_CASE */
+	return _to_fold_latin1((U8) c, p, lenp, flags);
+    }
+
     uvchr_to_utf8(p, c);
-    return _to_utf8_fold_flags(p, p, lenp, flags);
+    return CALL_FOLD_CASE(p, p, lenp, flags);
 }
 
 /* for now these all assume no locale info available for Unicode > 255 */
@@ -2180,6 +2218,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
 
     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
 
+    if (UTF8_IS_INVARIANT(*p)) {
+	return _to_fold_latin1(*p, ustrp, lenp, flags);
+    }
+    else if (UTF8_IS_DOWNGRADEABLE_START(*p)) {  /* two bytes, passed as a U8 */
+	return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
+	                       ustrp, lenp, flags);
+    }
+
     return CALL_FOLD_CASE(p, ustrp, lenp, flags);
 }
 
-- 
cgit v1.2.1