summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2011-11-08 22:16:39 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-11-08 22:38:39 -0700
commit: a1dde8dee0a7efc73bdb7c985e74f1461e153f12 (patch)
tree: 37d8d55f6094d3437a7b264f9614f3952fce73fc
parent: f26f1b9ce33a811c1a4ea00612a18101543a46fd (diff)
download: perl-a1dde8dee0a7efc73bdb7c985e74f1461e153f12.tar.gz
utf8.c: Faster latin1 folding
This adds a function, similar to the ones for the other three case-changing operations, that works on Latin-1 characters only and so avoids having to go out to swashes. It changes to_uni_fold() and to_utf8_fold() to call it on the appropriate input.
-rw-r--r-- embed.fnc 1
-rw-r--r-- embed.h 1
-rw-r--r-- proto.h 6
-rw-r--r-- utf8.c 48
4 files changed, 55 insertions, 1 deletions
diff --git a/embed.fnc b/embed.fnc
index e2911dd8a0..446faf5de9 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -603,6 +603,7 @@ Ap |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp
#ifdef PERL_IN_UTF8_C
sR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
+p |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const U8 flags
#endif
Ap |UV |to_uni_lower |UV c|NN U8 *p|NN STRLEN *lenp
Amp |UV |to_uni_fold |UV c|NN U8 *p|NN STRLEN *lenp
diff --git a/embed.h b/embed.h
index 3d985b5f82..8540031730 100644
--- a/embed.h
+++ b/embed.h
@@ -1571,6 +1571,7 @@
#define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d)
# endif
# if defined(PERL_IN_UTF8_C)
+#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d)
#define _to_upper_title_latin1(a,b,c,d) Perl__to_upper_title_latin1(aTHX_ a,b,c,d)
#define is_utf8_char_slow S_is_utf8_char_slow
#define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index 7f9621a47e..534fab8511 100644
--- a/proto.h
+++ b/proto.h
@@ -6983,6 +6983,12 @@ STATIC bool S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
#endif
#if defined(PERL_IN_UTF8_C)
+PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const U8 flags)
+ __attribute__nonnull__(pTHX_2)
+ __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \
+ assert(p); assert(lenp)
+
PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s)
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3);
diff --git a/utf8.c b/utf8.c
index 38f5c6c00c..9c55d10c25 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1459,12 +1459,50 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
}
UV
+Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const U8 flags)
+{
+ UV converted;
+
+ PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
+
+ if (c == MICRO_SIGN) {
+ converted = GREEK_SMALL_LETTER_MU;
+ }
+ else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) {
+ *(p)++ = 's';
+ *p = 's';
+ *lenp = 2;
+ return 's';
+ }
+ else { /* In this range the fold of all other characters is their lower
+ case */
+ converted = toLOWER_LATIN1(c);
+ }
+
+ if (UNI_IS_INVARIANT(converted)) {
+ *p = (U8) converted;
+ *lenp = 1;
+ }
+ else {
+ *(p)++ = UTF8_TWO_BYTE_HI(converted);
+ *p = UTF8_TWO_BYTE_LO(converted);
+ *lenp = 2;
+ }
+
+ return converted;
+}
+
+UV
Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
{
PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
+ if (c < 256) {
+ return _to_fold_latin1((U8) c, p, lenp, flags);
+ }
+
uvchr_to_utf8(p, c);
- return _to_utf8_fold_flags(p, p, lenp, flags);
+ return CALL_FOLD_CASE(p, p, lenp, flags);
}
/* for now these all assume no locale info available for Unicode > 255 */
@@ -2180,6 +2218,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
+ if (UTF8_IS_INVARIANT(*p)) {
+ return _to_fold_latin1(*p, ustrp, lenp, flags);
+ }
+ else if UTF8_IS_DOWNGRADEABLE_START(*p) {
+ return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
+ ustrp, lenp, flags);
+ }
+
return CALL_FOLD_CASE(p, ustrp, lenp, flags);
}