summary | refs | log | tree | commit | diff
path: root/utf8.c
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2012-05-06 09:28:01 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-05-22 08:24:21 -0600
commit: a027039367d8b0d7e425d682b287cef406d072e2 (patch)
tree: e6348723a5178078d85c6eba4c325a1c3f97d467 /utf8.c
parent: 50ba90ffe5821effdba066df4bc3986dee904e0c (diff)
download: perl-a027039367d8b0d7e425d682b287cef406d072e2.tar.gz
utf8.c: Add nomix-ASCII option to to_fold functions
Under /iaa regex matching, folds that cross the ASCII/non-ASCII boundary are prohibited. This changes the _to_uni_fold_flags() and _to_utf8_fold_flags() functions to take a new flag which, when set, tells them not to accept such folds. This allows us to later move the intelligence for handling this situation into these centralized functions.
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 83
1 files changed, 72 insertions, 11 deletions
diff --git a/utf8.c b/utf8.c
index 29b30cf927..3d57154284 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1761,20 +1761,44 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const bool flags)
}
UV
-Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const bool flags)
+Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
{
- /* Not currently externally documented, and subject to change, <flags> is
- * TRUE iff full folding is to be used */
+ /* Not currently externally documented, and subject to change
+ * <flags> bits meanings:
+ * FOLD_FLAGS_FULL iff full folding is to be used;
+ * FOLD_FLAGS_LOCALE iff in locale
+ * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
+ */
PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
if (c < 256) {
- return _to_fold_latin1((U8) c, p, lenp, flags);
+ UV result = _to_fold_latin1((U8) c, p, lenp,
+ cBOOL(((flags & FOLD_FLAGS_FULL)
+ /* If ASCII-safe, don't allow full folding,
+ * as that could include SHARP S => ss;
+ * otherwise there is no crossing of
+ * ascii/non-ascii in the latin1 range */
+ && ! (flags & FOLD_FLAGS_NOMIX_ASCII))));
+ /* It is illegal for the fold to cross the 255/256 boundary under
+ * locale; in this case return the original */
+ return (result > 256 && flags & FOLD_FLAGS_LOCALE)
+ ? c
+ : result;
+ }
+
+ /* If no special needs, just use the macro */
+ if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
+ uvchr_to_utf8(p, c);
+ return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
+ }
+ else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
+ the special flags. */
+ U8 utf8_c[UTF8_MAXBYTES + 1];
+ uvchr_to_utf8(utf8_c, c);
+ return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL);
}
-
- uvchr_to_utf8(p, c);
- return CALL_FOLD_CASE(p, p, lenp, flags);
}
/* for now these all assume no locale info available for Unicode > 255; and
@@ -2695,6 +2719,8 @@ The character at C<p> is assumed by this routine to be well-formed.
* POSIX, lowercase is used instead
* bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
* otherwise simple folds
+ * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
+ * prohibited
* <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
* were used in the calculation; otherwise unchanged. */
@@ -2707,6 +2733,9 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
+ /* These are mutually exclusive */
+ assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
+
assert(p != ustrp); /* Otherwise overwrites */
if (UTF8_IS_INVARIANT(*p)) {
@@ -2724,17 +2753,49 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
}
else {
return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
- ustrp, lenp, cBOOL(flags & FOLD_FLAGS_FULL));
+ ustrp, lenp,
+ cBOOL((flags & FOLD_FLAGS_FULL
+ /* If ASCII safe, don't allow full
+ * folding, as that could include SHARP
+ * S => ss; otherwise there is no
+ * crossing of ascii/non-ascii in the
+ * latin1 range */
+ && ! (flags & FOLD_FLAGS_NOMIX_ASCII))));
}
}
else { /* utf8, ord above 255 */
- result = CALL_FOLD_CASE(p, ustrp, lenp, flags);
+ result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
if ((flags & FOLD_FLAGS_LOCALE)) {
- result = check_locale_boundary_crossing(p, result, ustrp, lenp);
+ return check_locale_boundary_crossing(p, result, ustrp, lenp);
+ }
+ else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
+ return result;
}
+ else {
+ /* This is called when changing the case of a utf8-encoded
+ * character above the Latin1 range, and the result should not
+ * contain an ASCII character. */
+
+ UV original; /* To store the first code point of <p> */
+
+ /* Look at every character in the result; if any cross the
+ * boundary, the whole thing is disallowed */
+ U8* s = ustrp;
+ U8* e = ustrp + *lenp;
+ while (s < e) {
+ if (isASCII(*s)) {
+ /* Crossed, have to return the original */
+ original = valid_utf8_to_uvchr(p, lenp);
+ Copy(p, ustrp, *lenp, char);
+ return original;
+ }
+ s += UTF8SKIP(s);
+ }
- return result;
+ /* Here, no characters crossed, result is ok as-is */
+ return result;
+ }
}
/* Here, used locale rules. Convert back to utf8 */