summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MANIFEST1
-rw-r--r--charclass_invlists.h2
-rw-r--r--doop.c2
-rw-r--r--embed.fnc6
-rw-r--r--embed.h4
-rw-r--r--embedvar.h5
-rw-r--r--intrpvar.h8
-rw-r--r--lib/unicore/mktables2
-rw-r--r--lib/unicore/uni_keywords.pl2
-rw-r--r--lib/utf8_heavy.pl5
-rw-r--r--pod/perldiag.pod5
-rw-r--r--proto.h18
-rw-r--r--regcharclass.h2
-rw-r--r--sv.c10
-rw-r--r--t/uni/cache.t35
-rw-r--r--toke.c4
-rw-r--r--uni_keywords.h2
-rw-r--r--utf8.c690
18 files changed, 10 insertions, 793 deletions
diff --git a/MANIFEST b/MANIFEST
index 76e8c1d0b0..f9c93032fc 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -6103,7 +6103,6 @@ t/test_pl/tempfile.t Tests for the simple testing library
t/thread_it.pl Run regression tests in a new thread
t/uni/attrs.t See if Unicode attributes work
t/uni/bless.t See if Unicode bless works
-t/uni/cache.t See if Unicode swash caching works
t/uni/caller.t See if Unicode doesn't get mangled in caller()
t/uni/case.pl See if Unicode casing works
t/uni/chomp.t See if Unicode chomp works
diff --git a/charclass_invlists.h b/charclass_invlists.h
index bb365d1b23..2d782770b5 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -395307,7 +395307,7 @@ static const U8 WB_table[23][23] = {
* 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
* 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
* 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
- * 0e4964d17f9da0415977193f7f05f6db2994f59ea4a5be5281f1add183bcf8d0 lib/unicore/mktables
+ * ad9f9ebb4ad5378ccb7437b0d15b213b3f9d1b7e5c095284a44bee5d84206be6 lib/unicore/mktables
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl
diff --git a/doop.c b/doop.c
index 3cb1354540..5ac79420b0 100644
--- a/doop.c
+++ b/doop.c
@@ -391,7 +391,7 @@ S_do_trans_invmap(pTHX_ SV * const sv, AV * const invmap)
bool inplace = ! cBOOL(PL_op->op_private & OPpTRANS_GROWS);
const UV* from_array = invlist_array(from_invlist);
UV final_map;
- bool out_is_utf8 = SvUTF8(sv);
+ bool out_is_utf8 = cBOOL(SvUTF8(sv));
STRLEN s_len;
PERL_ARGS_ASSERT_DO_TRANS_INVMAP;
diff --git a/embed.fnc b/embed.fnc
index 76ec0c0909..64986dfaa3 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1868,8 +1868,6 @@ Apd |void |sv_vsetpvfn |NN SV *const sv|NN const char *const pat|const STRLEN pa
|NULLOK va_list *const args|NULLOK SV **const svargs \
|const Size_t sv_count|NULLOK bool *const maybe_tainted
ApR |NV |str_to_version |NN SV *sv
-EXpR |SV* |swash_init |NN const char* pkg|NN const char* name|NN SV* listsv|I32 minbits|I32 none
-EXp |UV |swash_fetch |NN SV *swash|NN const U8 *ptr|bool do_utf8
#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
EiR |SV* |add_cp_to_invlist |NULLOK SV* invlist|const UV cp
Ei |void |invlist_extend |NN SV* const invlist|const UV len
@@ -3093,10 +3091,6 @@ SR |UV |check_locale_boundary_crossing \
iR |bool |is_utf8_common |NN const U8 *const p \
|NN const U8 *const e \
|NULLOK SV* const invlist
-SR |SV* |swatch_get |NN SV* swash|UV start|UV span
-SR |U8* |swash_scan_list_line|NN U8* l|NN U8* const lend|NN UV* min \
- |NN UV* max|NN UV* val|const bool wants_value \
- |NN const U8* const typestr
#endif
EXiTp |void |append_utf8_from_native_byte|const U8 byte|NN U8** dest
diff --git a/embed.h b/embed.h
index 5a1c6fe190..6351956bfb 100644
--- a/embed.h
+++ b/embed.h
@@ -941,8 +941,6 @@
#define skipspace_flags(a,b) Perl_skipspace_flags(aTHX_ a,b)
#define sv_magicext_mglob(a) Perl_sv_magicext_mglob(aTHX_ a)
#define sv_only_taint_gmagic Perl_sv_only_taint_gmagic
-#define swash_fetch(a,b,c) Perl_swash_fetch(aTHX_ a,b,c)
-#define swash_init(a,b,c,d,e) Perl_swash_init(aTHX_ a,b,c,d,e)
#define utf16_to_utf8(a,b,c,d) Perl_utf16_to_utf8(aTHX_ a,b,c,d)
#define utf16_to_utf8_reversed(a,b,c,d) Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d)
#define validate_proto(a,b,c,d) Perl_validate_proto(aTHX_ a,b,c,d)
@@ -1924,8 +1922,6 @@
#define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c)
#define is_utf8_overlong_given_start_byte_ok S_is_utf8_overlong_given_start_byte_ok
#define new_msg_hv(a,b,c) S_new_msg_hv(aTHX_ a,b,c)
-#define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
-#define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c)
#define to_lower_latin1 S_to_lower_latin1
#define turkic_fc(a,b,c,d) S_turkic_fc(aTHX_ a,b,c,d)
#define turkic_lc(a,b,c,d) S_turkic_lc(aTHX_ a,b,c,d)
diff --git a/embedvar.h b/embedvar.h
index 120e5f72da..8c8b1748ba 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -168,11 +168,6 @@
#define PL_langinfo_buf (vTHX->Ilanginfo_buf)
#define PL_langinfo_bufsize (vTHX->Ilanginfo_bufsize)
#define PL_last_in_gv (vTHX->Ilast_in_gv)
-#define PL_last_swash_hv (vTHX->Ilast_swash_hv)
-#define PL_last_swash_key (vTHX->Ilast_swash_key)
-#define PL_last_swash_klen (vTHX->Ilast_swash_klen)
-#define PL_last_swash_slen (vTHX->Ilast_swash_slen)
-#define PL_last_swash_tmps (vTHX->Ilast_swash_tmps)
#define PL_lastfd (vTHX->Ilastfd)
#define PL_lastgotoprobe (vTHX->Ilastgotoprobe)
#define PL_laststatval (vTHX->Ilaststatval)
diff --git a/intrpvar.h b/intrpvar.h
index 94ac64717a..906a67a739 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -723,14 +723,6 @@ PERLVARI(I, underlying_numeric_obj, locale_t, NULL)
# endif
#endif /* !USE_LOCALE_NUMERIC */
-/* utf8 character class swashes */
-
-PERLVAR(I, last_swash_hv, HV *)
-PERLVAR(I, last_swash_tmps, U8 *)
-PERLVAR(I, last_swash_slen, STRLEN)
-PERLVARA(I, last_swash_key,UTF8_MAXBYTES-1, U8)
-PERLVAR(I, last_swash_klen, U8) /* Only needs to store 0-12 */
-
#ifdef FCRYPT
PERLVARI(I, cryptseen, bool, FALSE) /* has fast crypt() been initialized? */
#else
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index c8452cc486..47bf2d3f80 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -7614,7 +7614,7 @@ END
$return .= <<END;
-# The name this swash is to be known by, with the format of the mappings in
+# The name this table is to be known by, with the format of the mappings in
# the main body of the table, and what all code points missing from this file
# map to.
\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
index 311f7e9b80..92263939e2 100644
--- a/lib/unicore/uni_keywords.pl
+++ b/lib/unicore/uni_keywords.pl
@@ -1261,7 +1261,7 @@
# 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
# 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
# 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
-# 0e4964d17f9da0415977193f7f05f6db2994f59ea4a5be5281f1add183bcf8d0 lib/unicore/mktables
+# ad9f9ebb4ad5378ccb7437b0d15b213b3f9d1b7e5c095284a44bee5d84206be6 lib/unicore/mktables
# a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
# 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
# e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index ec6dbb69dd..1c54849a30 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -81,7 +81,6 @@ my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x;
## Called from swash_init (see utf8.c) or SWASHNEW itself.
##
## Callers of swash_init:
- ## op.c:pmtrans -- for tr/// and y///
## Unicode::UCD::prop_invlist
## Unicode::UCD::prop_invmap
##
@@ -102,7 +101,7 @@ my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x;
## $none is undocumented, so I'm (khw) trying to do some documentation
## of it now. It appears to be if there is a mapping in an input file
## that maps to 'XXXX', then that is replaced by $none+1, expressed in
- ## hexadecimal. It is used somehow in tr///.
+ ## hexadecimal. It is no longer used.
##
## To make the parsing of $type clear, this code takes the a rather
## unorthodox approach of last'ing out of the block once we have the
@@ -488,7 +487,7 @@ my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x;
my $taint = substr($list,0,0); # maintain taint
# Separate the extras from the code point list, and make sure
- # user-defined properties and tr/// are well-behaved for
+ # user-defined properties are well-behaved for
# downstream code.
if ($user_defined || $none) {
my @tmp = split(/^/m, $list);
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 1e07818529..465317bf92 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -5973,11 +5973,6 @@ assignment or as a subroutine argument for example).
(P) Perl tried to force the upgrade of an SV to a type which was actually
inferior to its current type.
-=item SWASHNEW didn't return an HV ref
-
-(P) Something went wrong internally when Perl was trying to look up
-Unicode characters.
-
=item Switch (?(condition)... contains too many branches in regex; marked by
S<<-- HERE> in m/%s/
diff --git a/proto.h b/proto.h
index 51c316e070..84a0aeadf2 100644
--- a/proto.h
+++ b/proto.h
@@ -3770,14 +3770,6 @@ PERL_CALLCONV void Perl_sv_vsetpvf_mg(pTHX_ SV *const sv, const char *const pat,
PERL_CALLCONV void Perl_sv_vsetpvfn(pTHX_ SV *const sv, const char *const pat, const STRLEN patlen, va_list *const args, SV **const svargs, const Size_t sv_count, bool *const maybe_tainted);
#define PERL_ARGS_ASSERT_SV_VSETPVFN \
assert(sv); assert(pat)
-PERL_CALLCONV UV Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8);
-#define PERL_ARGS_ASSERT_SWASH_FETCH \
- assert(swash); assert(ptr)
-PERL_CALLCONV SV* Perl_swash_init(pTHX_ const char* pkg, const char* name, SV* listsv, I32 minbits, I32 none)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_SWASH_INIT \
- assert(pkg); assert(name); assert(listsv)
-
PERL_CALLCONV void Perl_switch_to_global_locale(void);
#define PERL_ARGS_ASSERT_SWITCH_TO_GLOBAL_LOCALE
PERL_CALLCONV bool Perl_sync_locale(void);
@@ -6386,16 +6378,6 @@ STATIC HV * S_new_msg_hv(pTHX_ const char * const message, U32 categories, U32 f
#define PERL_ARGS_ASSERT_NEW_MSG_HV \
assert(message)
-STATIC U8* S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, const bool wants_value, const U8* const typestr)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE \
- assert(l); assert(lend); assert(min); assert(max); assert(val); assert(typestr)
-
-STATIC SV* S_swatch_get(pTHX_ SV* swash, UV start, UV span)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_SWATCH_GET \
- assert(swash)
-
STATIC U8 S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp, const char dummy)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_TO_LOWER_LATIN1
diff --git a/regcharclass.h b/regcharclass.h
index 220027a992..cf2a344d8c 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -1901,7 +1901,7 @@
* 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
* 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
* 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
- * 0e4964d17f9da0415977193f7f05f6db2994f59ea4a5be5281f1add183bcf8d0 lib/unicore/mktables
+ * ad9f9ebb4ad5378ccb7437b0d15b213b3f9d1b7e5c095284a44bee5d84206be6 lib/unicore/mktables
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* 8cffbf838b6e8ea5310e4ad2e0498ad9c1d87d4babead678081859473591317c regen/regcharclass.pl
diff --git a/sv.c b/sv.c
index ed07e6875e..8a081975d0 100644
--- a/sv.c
+++ b/sv.c
@@ -6654,9 +6654,6 @@ Perl_sv_clear(pTHX_ SV *const orig_sv)
sv_del_backref(MUTABLE_SV(stash), sv);
goto freescalar;
case SVt_PVHV:
- if (PL_last_swash_hv == (const HV *)sv) {
- PL_last_swash_hv = NULL;
- }
if (HvTOTALKEYS((HV*)sv) > 0) {
const HEK *hek;
/* this statement should match the one at the beginning of
@@ -15387,13 +15384,6 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
PL_globhook = proto_perl->Iglobhook;
- /* swatch cache */
- PL_last_swash_hv = NULL; /* reinits on demand */
- PL_last_swash_klen = 0;
- PL_last_swash_key[0]= '\0';
- PL_last_swash_tmps = (U8*)NULL;
- PL_last_swash_slen = 0;
-
PL_srand_called = proto_perl->Isrand_called;
Copy(&(proto_perl->Irandom_state), &PL_random_state, 1, PL_RANDOM_STATE_TYPE);
diff --git a/t/uni/cache.t b/t/uni/cache.t
deleted file mode 100644
index e72a1b1aa8..0000000000
--- a/t/uni/cache.t
+++ /dev/null
@@ -1,35 +0,0 @@
-BEGIN {
- chdir 't' if -d 't';
- require './test.pl';
- set_up_inc('../lib');
- skip_all("utf8_heavy no longer used much");
- skip_all_without_unicode_tables();
-}
-
-plan tests => 1;
-
-# Looks to see if a "do 'unicore/lib/Scx/Hira.pl'" is called more than once, by
-# putting a compile sub first on the library path;
-# XXX Kludge: requires exact path, which might change, and has deep knowledge
-# of how utf8_heavy.pl works, which might also change.
-
-BEGIN { # Make sure catches compile time references
- $::count = 0;
- unshift @INC, sub {
- $::count++ if $_[1] eq 'unicore/lib/Scx/Hira.pl';
- };
-}
-
-my $s = 'foo';
-
-# The second value is to prevent an optimization that exists at the time this
-# is written to re-use a property without trying to look it up if it is the
-# only thing in a character class. They differ in order to make sure that any
-# future optimizations that don't re-use identical character classes don't come
-# into play
-$s =~ m/[\p{Hiragana}\x{101}]/;
-$s =~ m/[\p{Hiragana}\x{102}]/;
-$s =~ m/[\p{Hiragana}\x{103}]/;
-$s =~ m/[\p{Hiragana}\x{104}]/;
-
-is($::count, 1, "Swatch hash caching kept us from reloading swatch hash.");
diff --git a/toke.c b/toke.c
index 2c448eb6de..862dbb453e 100644
--- a/toke.c
+++ b/toke.c
@@ -2691,8 +2691,8 @@ Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
}
else {
/* Similarly for utf8. For invariants can check directly; for other
- * Latin1, can calculate their code point and check; otherwise use a
- * swash */
+ * Latin1, can calculate their code point and check; otherwise use an
+ * inversion list */
if (UTF8_IS_INVARIANT(*s)) {
if (! isALPHAU(*s)) {
goto bad_charname;
diff --git a/uni_keywords.h b/uni_keywords.h
index 9d780f54d1..392e0169aa 100644
--- a/uni_keywords.h
+++ b/uni_keywords.h
@@ -7284,7 +7284,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
* 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
* 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
* 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
- * 0e4964d17f9da0415977193f7f05f6db2994f59ea4a5be5281f1add183bcf8d0 lib/unicore/mktables
+ * ad9f9ebb4ad5378ccb7437b0d15b213b3f9d1b7e5c095284a44bee5d84206be6 lib/unicore/mktables
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl
diff --git a/utf8.c b/utf8.c
index 6b98473fb3..86623b16ef 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3317,8 +3317,6 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
> HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
{
- /* As of Unicode 10.0, this means we avoid swash creation
- * for anything beyond high Plane 1 (below emojis) */
goto cases_to_self;
}
#endif
@@ -3966,694 +3964,6 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
}
-/* Note:
- * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
- * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
- * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
- */
-
-SV*
-Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
- I32 minbits, I32 none)
-{
- /* Returns a copy of a swash initiated by the called function. This is the
- * public interface, and returning a copy prevents others from doing
- * mischief on the original. The only remaining use of this is in tr/// */
-
- /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
- * use the following define */
-
-#define SWASH_INIT_RETURN(x) \
- PL_curpm= old_PL_curpm; \
- return newSVsv(x)
-
- /* Initialize and return a swash, creating it if necessary. It does this
- * by calling utf8_heavy.pl in the general case.
- *
- * pkg is the name of the package that <name> should be in.
- * name is the name of the swash to find.
- * listsv is a string to initialize the swash with. It must be of the form
- * documented as the subroutine return value in
- * L<perlunicode/User-Defined Character Properties>
- * minbits is the number of bits required to represent each data element.
- * none I (khw) do not understand this one, but it is used only in tr///.
- *
- * Thus there are two possible inputs to find the swash: <name> and
- * <listsv>. At least one must be specified. The result
- * will be the union of the specified ones, although <listsv>'s various
- * actions can intersect, etc. what <name> gives. To avoid going out to
- * disk at all, <invlist> should specify completely what the swash should
- * have, and <listsv> should be &PL_sv_undef and <name> should be "".
- */
-
- PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
-
- SV* retval = &PL_sv_undef;
-
- PERL_ARGS_ASSERT_SWASH_INIT;
-
- assert(listsv != &PL_sv_undef || strNE(name, ""));
-
- PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the
- regex that triggered the swash init and the swash init
- perl logic itself. See perl #122747 */
-
- /* If data was passed in to go out to utf8_heavy to find the swash of, do
- * so */
- if (listsv != &PL_sv_undef || strNE(name, "")) {
- dSP;
- const size_t pkg_len = strlen(pkg);
- const size_t name_len = strlen(name);
- HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
- SV* errsv_save;
- GV *method;
-
-
- PUSHSTACKi(PERLSI_MAGIC);
- ENTER;
- SAVEHINTS();
- save_re_context();
- /* We might get here via a subroutine signature which uses a utf8
- * parameter name, at which point PL_subname will have been set
- * but not yet used. */
- save_item(PL_subname);
- if (PL_parser && PL_parser->error_count)
- SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
- method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
- if (!method) { /* demand load UTF-8 */
- ENTER;
- if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
- GvSV(PL_errgv) = NULL;
-#ifndef NO_TAINT_SUPPORT
- /* It is assumed that callers of this routine are not passing in
- * any user derived data. */
- /* Need to do this after save_re_context() as it will set
- * PL_tainted to 1 while saving $1 etc (see the code after getrx:
- * in Perl_magic_get). Even line to create errsv_save can turn on
- * PL_tainted. */
- SAVEBOOL(TAINT_get);
- TAINT_NOT;
-#endif
- require_pv("utf8_heavy.pl");
- {
- /* Not ERRSV, as there is no need to vivify a scalar we are
- about to discard. */
- SV * const errsv = GvSV(PL_errgv);
- if (!SvTRUE(errsv)) {
- GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
- SvREFCNT_dec(errsv);
- }
- }
- LEAVE;
- }
- SPAGAIN;
- PUSHMARK(SP);
- EXTEND(SP,5);
- mPUSHp(pkg, pkg_len);
- mPUSHp(name, name_len);
- PUSHs(listsv);
- mPUSHi(minbits);
- mPUSHi(none);
- PUTBACK;
- if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
- GvSV(PL_errgv) = NULL;
- /* If we already have a pointer to the method, no need to use
- * call_method() to repeat the lookup. */
- if (method
- ? call_sv(MUTABLE_SV(method), G_SCALAR)
- : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
- {
- retval = *PL_stack_sp--;
- SvREFCNT_inc(retval);
- }
- {
- /* Not ERRSV. See above. */
- SV * const errsv = GvSV(PL_errgv);
- if (!SvTRUE(errsv)) {
- GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
- SvREFCNT_dec(errsv);
- }
- }
- LEAVE;
- POPSTACK;
- if (IN_PERL_COMPILETIME) {
- CopHINTS_set(PL_curcop, PL_hints);
- }
- } /* End of calling the module to find the swash */
-
- SWASH_INIT_RETURN(retval);
-#undef SWASH_INIT_RETURN
-}
-
-
-/* This API is wrong for special case conversions since we may need to
- * return several Unicode characters for a single Unicode character
- * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
- * the lower-level routine, and it is similarly broken for returning
- * multiple values. --jhi
- * For those, you should use S__to_utf8_case() instead */
-/* Now SWASHGET is recasted into S_swatch_get in this file. */
-
-/* Note:
- * Returns the value of property/mapping C<swash> for the first character
- * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
- * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
- * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
- *
- * A "swash" is a hash which contains initially the keys/values set up by
- * SWASHNEW. The purpose is to be able to completely represent a Unicode
- * property for all possible code points. Things are stored in a compact form
- * (see utf8_heavy.pl) so that calculation is required to find the actual
- * property value for a given code point. As code points are looked up, new
- * key/value pairs are added to the hash, so that the calculation doesn't have
- * to ever be re-done. Further, each calculation is done, not just for the
- * desired one, but for a whole block of code points adjacent to that one.
- * For binary properties on ASCII machines, the block is usually for 64 code
- * points, starting with a code point evenly divisible by 64. Thus if the
- * property value for code point 257 is requested, the code goes out and
- * calculates the property values for all 64 code points between 256 and 319,
- * and stores these as a single 64-bit long bit vector, called a "swatch",
- * under the key for code point 256. The key is the UTF-8 encoding for code
- * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
- * for a code point is 13 bytes, the key will be 12 bytes long. If the value
- * for code point 258 is then requested, this code realizes that it would be
- * stored under the key for 256, and would find that value and extract the
- * relevant bit, offset from 256.
- *
- * Non-binary properties are stored in as many bits as necessary to represent
- * their values (32 currently, though the code is more general than that), not
- * as single bits, but the principle is the same: the value for each key is a
- * vector that encompasses the property values for all code points whose UTF-8
- * representations are represented by the key. That is, for all code points
- * whose UTF-8 representations are length N bytes, and the key is the first N-1
- * bytes of that.
- */
-UV
-Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
-{
- HV *const hv = MUTABLE_HV(SvRV(swash));
- U32 klen;
- U32 off;
- STRLEN slen = 0;
- STRLEN needents;
- const U8 *tmps = NULL;
- SV *swatch;
- const U8 c = *ptr;
-
- PERL_ARGS_ASSERT_SWASH_FETCH;
-
- /* If it really isn't a hash, it isn't really swash; must be an inversion
- * list */
- if (SvTYPE(hv) != SVt_PVHV) {
- return _invlist_contains_cp((SV*)hv,
- (do_utf8)
- ? valid_utf8_to_uvchr(ptr, NULL)
- : c);
- }
-
- /* We store the values in a "swatch" which is a vec() value in a swash
- * hash. Code points 0-255 are a single vec() stored with key length
- * (klen) 0. All other code points have a UTF-8 representation
- * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
- * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
- * length for them is the length of the encoded char - 1. ptr[klen] is the
- * final byte in the sequence representing the character */
- if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
- klen = 0;
- needents = 256;
- off = c;
- }
- else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
- klen = 0;
- needents = 256;
- off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
- }
- else {
- klen = UTF8SKIP(ptr) - 1;
-
- /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
- * the vec is the final byte in the sequence. (In EBCDIC this is
- * converted to I8 to get consecutive values.) To help you visualize
- * all this:
- * Straight 1047 After final byte
- * UTF-8 UTF-EBCDIC I8 transform
- * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
- * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
- * ...
- * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
- * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
- * ...
- * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
- * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
- * ...
- * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
- * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
- * ...
- * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
- * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
- *
- * (There are no discontinuities in the elided (...) entries.)
- * The UTF-8 key for these 33 code points is '\xD0' (which also is the
- * key for the next 31, up through U+043F, whose UTF-8 final byte is
- * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
- * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
- * index into the vec() swatch (after subtracting 0x80, which we
- * actually do with an '&').
- * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
- * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
- * dicontinuities which go away by transforming it into I8, and we
- * effectively subtract 0xA0 to get the index. */
- needents = (1 << UTF_ACCUMULATION_SHIFT);
- off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
- }
-
- /*
- * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
- * suite. (That is, only 7-8% overall over just a hash cache. Still,
- * it's nothing to sniff at.) Pity we usually come through at least
- * two function calls to get here...
- *
- * NB: this code assumes that swatches are never modified, once generated!
- */
-
- if (hv == PL_last_swash_hv &&
- klen == PL_last_swash_klen &&
- (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
- {
- tmps = PL_last_swash_tmps;
- slen = PL_last_swash_slen;
- }
- else {
- /* Try our second-level swatch cache, kept in a hash. */
- SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
-
- /* If not cached, generate it via swatch_get */
- if (!svp || !SvPOK(*svp)
- || !(tmps = (const U8*)SvPV_const(*svp, slen)))
- {
- if (klen) {
- const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
- swatch = swatch_get(swash,
- code_point & ~((UV)needents - 1),
- needents);
- }
- else { /* For the first 256 code points, the swatch has a key of
- length 0 */
- swatch = swatch_get(swash, 0, needents);
- }
-
- if (IN_PERL_COMPILETIME)
- CopHINTS_set(PL_curcop, PL_hints);
-
- svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
-
- if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
- || (slen << 3) < needents)
- Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
- "svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
- svp, tmps, (UV)slen, (UV)needents);
- }
-
- PL_last_swash_hv = hv;
- assert(klen <= sizeof(PL_last_swash_key));
- PL_last_swash_klen = (U8)klen;
- /* FIXME change interpvar.h? */
- PL_last_swash_tmps = (U8 *) tmps;
- PL_last_swash_slen = slen;
- if (klen)
- Copy(ptr, PL_last_swash_key, klen, U8);
- }
-
- switch ((int)((slen << 3) / needents)) {
- case 1:
- return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
- case 8:
- return ((UV) tmps[off]);
- case 16:
- off <<= 1;
- return
- ((UV) tmps[off ] << 8) +
- ((UV) tmps[off + 1]);
- case 32:
- off <<= 2;
- return
- ((UV) tmps[off ] << 24) +
- ((UV) tmps[off + 1] << 16) +
- ((UV) tmps[off + 2] << 8) +
- ((UV) tmps[off + 3]);
- }
- Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
- "slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
- NORETURN_FUNCTION_END;
-}
-
-/* Read a single line of the main body of the swash input text. These are of
- * the form:
- * 0053 0056 0073
- * where each number is hex. The first two numbers form the minimum and
- * maximum of a range, and the third is the value associated with the range.
- * Not all swashes should have a third number
- *
- * On input: l points to the beginning of the line to be examined; it points
- * to somewhere in the string of the whole input text, and is
- * terminated by a \n or the null string terminator.
- * lend points to the null terminator of that string
- * wants_value is non-zero if the swash expects a third number
- * typestr is the name of the swash's mapping, like 'ToLower'
- * On output: *min, *max, and *val are set to the values read from the line.
- * returns a pointer just beyond the line examined. If there was no
- * valid min number on the line, returns lend+1
- */
-
-STATIC U8*
-S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
- const bool wants_value, const U8* const typestr)
-{
- const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
- STRLEN numlen; /* Length of the number */
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT
- | PERL_SCAN_DISALLOW_PREFIX
- | PERL_SCAN_SILENT_NON_PORTABLE;
-
- /* nl points to the next \n in the scan */
- U8* const nl = (U8*)memchr(l, '\n', lend - l);
-
- PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
-
- /* Get the first number on the line: the range minimum */
- numlen = lend - l;
- *min = grok_hex((char *)l, &numlen, &flags, NULL);
- *max = *min; /* So can never return without setting max */
- if (numlen) /* If found a hex number, position past it */
- l += numlen;
- else if (nl) { /* Else, go handle next line, if any */
- return nl + 1; /* 1 is length of "\n" */
- }
- else { /* Else, no next line */
- return lend + 1; /* to LIST's end at which \n is not found */
- }
-
- /* The max range value follows, separated by a BLANK */
- if (isBLANK(*l)) {
- ++l;
- flags = PERL_SCAN_SILENT_ILLDIGIT
- | PERL_SCAN_DISALLOW_PREFIX
- | PERL_SCAN_SILENT_NON_PORTABLE;
- numlen = lend - l;
- *max = grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else /* If no value here, it is a single element range */
- *max = *min;
-
- /* Non-binary tables have a third entry: what the first element of the
- * range maps to. The map for those currently read here is in hex */
- if (wants_value) {
- if (isBLANK(*l)) {
- ++l;
- flags = PERL_SCAN_SILENT_ILLDIGIT
- | PERL_SCAN_DISALLOW_PREFIX
- | PERL_SCAN_SILENT_NON_PORTABLE;
- numlen = lend - l;
- *val = grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else
- *val = 0;
- }
- else {
- *val = 0;
- if (typeto) {
- /* diag_listed_as: To%s: illegal mapping '%s' */
- Perl_croak(aTHX_ "%s: illegal mapping '%s'",
- typestr, l);
- }
- }
- }
- else
- *val = 0; /* bits == 1, then any val should be ignored */
- }
- else { /* Nothing following range min, should be single element with no
- mapping expected */
- if (wants_value) {
- *val = 0;
- if (typeto) {
- /* diag_listed_as: To%s: illegal mapping '%s' */
- Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
- }
- }
- else
- *val = 0; /* bits == 1, then val should be ignored */
- }
-
- /* Position to next line if any, or EOF */
- if (nl)
- l = nl + 1;
- else
- l = lend;
-
- return l;
-}
-
-/* Note:
- * Returns a swatch (a bit vector string) for a code point sequence
- * that starts from the value C<start> and comprises the number C<span>.
- * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
- * Should be used via swash_fetch, which will cache the swatch in C<swash>.
- */
-STATIC SV*
-S_swatch_get(pTHX_ SV* swash, UV start, UV span)
-{
- SV *swatch;
- U8 *l, *lend, *x, *xend, *s;
- STRLEN lcur, xcur, scur;
- HV *const hv = MUTABLE_HV(SvRV(swash));
-
- SV** listsvp = NULL; /* The string containing the main body of the table */
- SV** extssvp = NULL;
- U8* typestr = NULL;
- STRLEN bits = 0;
- STRLEN octets; /* if bits == 1, then octets == 0 */
- UV none;
- UV end = start + span;
-
- SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
- SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
- SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
- extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
- listsvp = hv_fetchs(hv, "LIST", FALSE);
-
- bits = SvUV(*bitssvp);
- none = SvUV(*nonesvp);
- typestr = (U8*)SvPV_nolen(*typesvp);
- octets = bits >> 3; /* if bits == 1, then octets == 0 */
-
- PERL_ARGS_ASSERT_SWATCH_GET;
-
- if (bits != 8 && bits != 16 && bits != 32) {
- Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
- (UV)bits);
- }
-
- /* If overflowed, use the max possible */
- if (end < start) {
- end = UV_MAX;
- span = end - start;
- }
-
- /* create and initialize $swatch */
- scur = octets ? (span * octets) : (span + 7) / 8;
- swatch = newSV(scur);
- SvPOK_on(swatch);
- s = (U8*)SvPVX(swatch);
- if (octets && none) {
- const U8* const e = s + scur;
- while (s < e) {
- if (bits == 8)
- *s++ = (U8)(none & 0xff);
- else if (bits == 16) {
- *s++ = (U8)((none >> 8) & 0xff);
- *s++ = (U8)( none & 0xff);
- }
- else if (bits == 32) {
- *s++ = (U8)((none >> 24) & 0xff);
- *s++ = (U8)((none >> 16) & 0xff);
- *s++ = (U8)((none >> 8) & 0xff);
- *s++ = (U8)( none & 0xff);
- }
- }
- *s = '\0';
- }
- else {
- (void)memzero((U8*)s, scur + 1);
- }
- SvCUR_set(swatch, scur);
- s = (U8*)SvPVX(swatch);
-
- /* read $swash->{LIST} */
- l = (U8*)SvPV(*listsvp, lcur);
- lend = l + lcur;
- while (l < lend) {
- UV min = 0, max = 0, val = 0, upper;
- l = swash_scan_list_line(l, lend, &min, &max, &val,
- cBOOL(octets), typestr);
- if (l > lend) {
- break;
- }
-
- /* If looking for something beyond this range, go try the next one */
- if (max < start)
- continue;
-
- /* <end> is generally 1 beyond where we want to set things, but at the
- * platform's infinity, where we can't go any higher, we want to
- * include the code point at <end> */
- upper = (max < end)
- ? max
- : (max != UV_MAX || end != UV_MAX)
- ? end - 1
- : end;
-
- if (octets) {
- UV key;
- if (min < start) {
- if (!none || val < none) {
- val += start - min;
- }
- min = start;
- }
- for (key = min; key <= upper; key++) {
- STRLEN offset;
- /* offset must be non-negative (start <= min <= key < end) */
- offset = octets * (key - start);
- if (bits == 8)
- s[offset] = (U8)(val & 0xff);
- else if (bits == 16) {
- s[offset ] = (U8)((val >> 8) & 0xff);
- s[offset + 1] = (U8)( val & 0xff);
- }
- else if (bits == 32) {
- s[offset ] = (U8)((val >> 24) & 0xff);
- s[offset + 1] = (U8)((val >> 16) & 0xff);
- s[offset + 2] = (U8)((val >> 8) & 0xff);
- s[offset + 3] = (U8)( val & 0xff);
- }
-
- if (!none || val < none)
- ++val;
- }
- }
- } /* while */
-
- /* read $swash->{EXTRAS} */
- x = (U8*)SvPV(*extssvp, xcur);
- xend = x + xcur;
- while (x < xend) {
- STRLEN namelen;
- U8 *namestr;
- SV** othersvp;
- HV* otherhv;
- STRLEN otherbits;
- SV **otherbitssvp, *other;
- U8 *s, *o, *nl;
- STRLEN slen, olen;
-
- const U8 opc = *x++;
- if (opc == '\n')
- continue;
-
- nl = (U8*)memchr(x, '\n', xend - x);
-
- if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
- if (nl) {
- x = nl + 1; /* 1 is length of "\n" */
- continue;
- }
- else {
- x = xend; /* to EXTRAS' end at which \n is not found */
- break;
- }
- }
-
- namestr = x;
- if (nl) {
- namelen = nl - namestr;
- x = nl + 1;
- }
- else {
- namelen = xend - namestr;
- x = xend;
- }
-
- othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
- otherhv = MUTABLE_HV(SvRV(*othersvp));
- otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
- otherbits = (STRLEN)SvUV(*otherbitssvp);
- if (bits < otherbits)
- Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
- "bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits);
-
- /* The "other" swatch must be destroyed after. */
- other = swatch_get(*othersvp, start, span);
- o = (U8*)SvPV(other, olen);
-
- if (!olen)
- Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
-
- s = (U8*)SvPV(swatch, slen);
- {
- STRLEN otheroctets = otherbits >> 3;
- STRLEN offset = 0;
- U8* const send = s + slen;
-
- while (s < send) {
- UV otherval = 0;
-
- if (otherbits == 1) {
- otherval = (o[offset >> 3] >> (offset & 7)) & 1;
- ++offset;
- }
- else {
- STRLEN vlen = otheroctets;
- otherval = *o++;
- while (--vlen) {
- otherval <<= 8;
- otherval |= *o++;
- }
- }
-
- if (opc == '+' && otherval)
- NOOP; /* replace with otherval */
- else if (opc == '!' && !otherval)
- otherval = 1;
- else if (opc == '-' && otherval)
- otherval = 0;
- else if (opc == '&' && !otherval)
- otherval = 0;
- else {
- s += octets; /* no replacement */
- continue;
- }
-
- if (bits == 8)
- *s++ = (U8)( otherval & 0xff);
- else if (bits == 16) {
- *s++ = (U8)((otherval >> 8) & 0xff);
- *s++ = (U8)( otherval & 0xff);
- }
- else if (bits == 32) {
- *s++ = (U8)((otherval >> 24) & 0xff);
- *s++ = (U8)((otherval >> 16) & 0xff);
- *s++ = (U8)((otherval >> 8) & 0xff);
- *s++ = (U8)( otherval & 0xff);
- }
- }
- }
- sv_free(other); /* through with it! */
- } /* while */
- return swatch;
-}
-
bool
Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
{