diff options
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | charclass_invlists.h | 2 | ||||
-rw-r--r-- | doop.c | 2 | ||||
-rw-r--r-- | embed.fnc | 6 | ||||
-rw-r--r-- | embed.h | 4 | ||||
-rw-r--r-- | embedvar.h | 5 | ||||
-rw-r--r-- | intrpvar.h | 8 | ||||
-rw-r--r-- | lib/unicore/mktables | 2 | ||||
-rw-r--r-- | lib/unicore/uni_keywords.pl | 2 | ||||
-rw-r--r-- | lib/utf8_heavy.pl | 5 | ||||
-rw-r--r-- | pod/perldiag.pod | 5 | ||||
-rw-r--r-- | proto.h | 18 | ||||
-rw-r--r-- | regcharclass.h | 2 | ||||
-rw-r--r-- | sv.c | 10 | ||||
-rw-r--r-- | t/uni/cache.t | 35 | ||||
-rw-r--r-- | toke.c | 4 | ||||
-rw-r--r-- | uni_keywords.h | 2 | ||||
-rw-r--r-- | utf8.c | 690 |
18 files changed, 10 insertions, 793 deletions
@@ -6103,7 +6103,6 @@ t/test_pl/tempfile.t Tests for the simple testing library t/thread_it.pl Run regression tests in a new thread t/uni/attrs.t See if Unicode attributes work t/uni/bless.t See if Unicode bless works -t/uni/cache.t See if Unicode swash caching works t/uni/caller.t See if Unicode doesn't get mangled in caller() t/uni/case.pl See if Unicode casing works t/uni/chomp.t See if Unicode chomp works diff --git a/charclass_invlists.h b/charclass_invlists.h index bb365d1b23..2d782770b5 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -395307,7 +395307,7 @@ static const U8 WB_table[23][23] = { * 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt * 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt * 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt - * 0e4964d17f9da0415977193f7f05f6db2994f59ea4a5be5281f1add183bcf8d0 lib/unicore/mktables + * ad9f9ebb4ad5378ccb7437b0d15b213b3f9d1b7e5c095284a44bee5d84206be6 lib/unicore/mktables * a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl @@ -391,7 +391,7 @@ S_do_trans_invmap(pTHX_ SV * const sv, AV * const invmap) bool inplace = ! cBOOL(PL_op->op_private & OPpTRANS_GROWS); const UV* from_array = invlist_array(from_invlist); UV final_map; - bool out_is_utf8 = SvUTF8(sv); + bool out_is_utf8 = cBOOL(SvUTF8(sv)); STRLEN s_len; PERL_ARGS_ASSERT_DO_TRANS_INVMAP; @@ -1868,8 +1868,6 @@ Apd |void |sv_vsetpvfn |NN SV *const sv|NN const char *const pat|const STRLEN pa |NULLOK va_list *const args|NULLOK SV **const svargs \ |const Size_t sv_count|NULLOK bool *const maybe_tainted ApR |NV |str_to_version |NN SV *sv -EXpR |SV* |swash_init |NN const char* pkg|NN const char* name|NN SV* listsv|I32 minbits|I32 none -EXp |UV |swash_fetch |NN SV *swash|NN const U8 *ptr|bool do_utf8 #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) EiR |SV* |add_cp_to_invlist |NULLOK SV* invlist|const UV cp Ei |void |invlist_extend |NN SV* const invlist|const UV len @@ -3093,10 +3091,6 @@ SR |UV |check_locale_boundary_crossing \ iR |bool |is_utf8_common |NN const U8 *const p \ |NN const U8 *const e \ |NULLOK SV* const invlist -SR |SV* |swatch_get |NN SV* swash|UV start|UV span -SR |U8* |swash_scan_list_line|NN U8* l|NN U8* const lend|NN UV* min \ - |NN UV* max|NN UV* val|const bool wants_value \ - |NN const U8* const typestr #endif EXiTp |void |append_utf8_from_native_byte|const U8 byte|NN U8** dest @@ -941,8 +941,6 @@ #define skipspace_flags(a,b) Perl_skipspace_flags(aTHX_ a,b) #define sv_magicext_mglob(a) Perl_sv_magicext_mglob(aTHX_ a) #define sv_only_taint_gmagic Perl_sv_only_taint_gmagic -#define swash_fetch(a,b,c) Perl_swash_fetch(aTHX_ a,b,c) -#define swash_init(a,b,c,d,e) Perl_swash_init(aTHX_ a,b,c,d,e) #define utf16_to_utf8(a,b,c,d) Perl_utf16_to_utf8(aTHX_ a,b,c,d) #define utf16_to_utf8_reversed(a,b,c,d) Perl_utf16_to_utf8_reversed(aTHX_ a,b,c,d) #define validate_proto(a,b,c,d) Perl_validate_proto(aTHX_ a,b,c,d) @@ -1924,8 +1922,6 @@ #define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c) #define is_utf8_overlong_given_start_byte_ok S_is_utf8_overlong_given_start_byte_ok #define new_msg_hv(a,b,c) S_new_msg_hv(aTHX_ a,b,c) -#define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g) -#define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c) #define to_lower_latin1 S_to_lower_latin1 #define turkic_fc(a,b,c,d) S_turkic_fc(aTHX_ a,b,c,d) #define turkic_lc(a,b,c,d) S_turkic_lc(aTHX_ a,b,c,d) diff --git a/embedvar.h b/embedvar.h index 120e5f72da..8c8b1748ba 100644 --- a/embedvar.h +++ b/embedvar.h @@ -168,11 +168,6 @@ #define PL_langinfo_buf (vTHX->Ilanginfo_buf) #define PL_langinfo_bufsize (vTHX->Ilanginfo_bufsize) #define PL_last_in_gv (vTHX->Ilast_in_gv) -#define PL_last_swash_hv (vTHX->Ilast_swash_hv) -#define PL_last_swash_key (vTHX->Ilast_swash_key) -#define PL_last_swash_klen (vTHX->Ilast_swash_klen) -#define PL_last_swash_slen (vTHX->Ilast_swash_slen) -#define PL_last_swash_tmps (vTHX->Ilast_swash_tmps) #define PL_lastfd (vTHX->Ilastfd) #define PL_lastgotoprobe (vTHX->Ilastgotoprobe) #define PL_laststatval (vTHX->Ilaststatval) diff --git a/intrpvar.h b/intrpvar.h index 94ac64717a..906a67a739 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -723,14 +723,6 @@ PERLVARI(I, underlying_numeric_obj, locale_t, NULL) # endif #endif /* !USE_LOCALE_NUMERIC */ -/* utf8 character class swashes */ - -PERLVAR(I, last_swash_hv, HV *) -PERLVAR(I, last_swash_tmps, U8 *) -PERLVAR(I, last_swash_slen, STRLEN) -PERLVARA(I, last_swash_key,UTF8_MAXBYTES-1, U8) -PERLVAR(I, last_swash_klen, U8) /* Only needs to store 0-12 */ - #ifdef FCRYPT PERLVARI(I, cryptseen, bool, FALSE) /* has fast crypt() been initialized? */ #else diff --git a/lib/unicore/mktables b/lib/unicore/mktables index c8452cc486..47bf2d3f80 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -7614,7 +7614,7 @@ END $return .= <<END; -# The name this swash is to be known by, with the format of the mappings in +# The name this table is to be known by, with the format of the mappings in # the main body of the table, and what all code points missing from this file # map to. \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format} diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index 311f7e9b80..92263939e2 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1261,7 +1261,7 @@ # 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt # 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt # 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt -# 0e4964d17f9da0415977193f7f05f6db2994f59ea4a5be5281f1add183bcf8d0 lib/unicore/mktables +# ad9f9ebb4ad5378ccb7437b0d15b213b3f9d1b7e5c095284a44bee5d84206be6 lib/unicore/mktables # a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version # 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl # e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl index ec6dbb69dd..1c54849a30 100644 --- a/lib/utf8_heavy.pl +++ b/lib/utf8_heavy.pl @@ -81,7 +81,6 @@ my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x; ## Called from swash_init (see utf8.c) or SWASHNEW itself. ## ## Callers of swash_init: - ## op.c:pmtrans -- for tr/// and y/// ## Unicode::UCD::prop_invlist ## Unicode::UCD::prop_invmap ## @@ -102,7 +101,7 @@ my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x; ## $none is undocumented, so I'm (khw) trying to do some documentation ## of it now. It appears to be if there is a mapping in an input file ## that maps to 'XXXX', then that is replaced by $none+1, expressed in - ## hexadecimal. It is used somehow in tr///. + ## hexadecimal. It is no longer used. ## ## To make the parsing of $type clear, this code takes the a rather ## unorthodox approach of last'ing out of the block once we have the @@ -488,7 +487,7 @@ my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x; my $taint = substr($list,0,0); # maintain taint # Separate the extras from the code point list, and make sure - # user-defined properties and tr/// are well-behaved for + # user-defined properties are well-behaved for # downstream code. if ($user_defined || $none) { my @tmp = split(/^/m, $list); diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 1e07818529..465317bf92 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -5973,11 +5973,6 @@ assignment or as a subroutine argument for example). (P) Perl tried to force the upgrade of an SV to a type which was actually inferior to its current type. -=item SWASHNEW didn't return an HV ref - -(P) Something went wrong internally when Perl was trying to look up -Unicode characters. - =item Switch (?(condition)... contains too many branches in regex; marked by S<<-- HERE> in m/%s/ @@ -3770,14 +3770,6 @@ PERL_CALLCONV void Perl_sv_vsetpvf_mg(pTHX_ SV *const sv, const char *const pat, PERL_CALLCONV void Perl_sv_vsetpvfn(pTHX_ SV *const sv, const char *const pat, const STRLEN patlen, va_list *const args, SV **const svargs, const Size_t sv_count, bool *const maybe_tainted); #define PERL_ARGS_ASSERT_SV_VSETPVFN \ assert(sv); assert(pat) -PERL_CALLCONV UV Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8); -#define PERL_ARGS_ASSERT_SWASH_FETCH \ - assert(swash); assert(ptr) -PERL_CALLCONV SV* Perl_swash_init(pTHX_ const char* pkg, const char* name, SV* listsv, I32 minbits, I32 none) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_SWASH_INIT \ - assert(pkg); assert(name); assert(listsv) - PERL_CALLCONV void Perl_switch_to_global_locale(void); #define PERL_ARGS_ASSERT_SWITCH_TO_GLOBAL_LOCALE PERL_CALLCONV bool Perl_sync_locale(void); @@ -6386,16 +6378,6 @@ STATIC HV * S_new_msg_hv(pTHX_ const char * const message, U32 categories, U32 f #define PERL_ARGS_ASSERT_NEW_MSG_HV \ assert(message) -STATIC U8* S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, const bool wants_value, const U8* const typestr) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE \ - assert(l); assert(lend); assert(min); assert(max); assert(val); assert(typestr) - -STATIC SV* S_swatch_get(pTHX_ SV* swash, UV start, UV span) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_SWATCH_GET \ - assert(swash) - STATIC U8 S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp, const char dummy) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_TO_LOWER_LATIN1 diff --git a/regcharclass.h b/regcharclass.h index 220027a992..cf2a344d8c 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1901,7 +1901,7 @@ * 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt * 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt * 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt - * 0e4964d17f9da0415977193f7f05f6db2994f59ea4a5be5281f1add183bcf8d0 lib/unicore/mktables + * ad9f9ebb4ad5378ccb7437b0d15b213b3f9d1b7e5c095284a44bee5d84206be6 lib/unicore/mktables * a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * 8cffbf838b6e8ea5310e4ad2e0498ad9c1d87d4babead678081859473591317c regen/regcharclass.pl @@ -6654,9 +6654,6 @@ Perl_sv_clear(pTHX_ SV *const orig_sv) sv_del_backref(MUTABLE_SV(stash), sv); goto freescalar; case SVt_PVHV: - if (PL_last_swash_hv == (const HV *)sv) { - PL_last_swash_hv = NULL; - } if (HvTOTALKEYS((HV*)sv) > 0) { const HEK *hek; /* this statement should match the one at the beginning of @@ -15387,13 +15384,6 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_globhook = proto_perl->Iglobhook; - /* swatch cache */ - PL_last_swash_hv = NULL; /* reinits on demand */ - PL_last_swash_klen = 0; - PL_last_swash_key[0]= '\0'; - PL_last_swash_tmps = (U8*)NULL; - PL_last_swash_slen = 0; - PL_srand_called = proto_perl->Isrand_called; Copy(&(proto_perl->Irandom_state), &PL_random_state, 1, PL_RANDOM_STATE_TYPE); diff --git a/t/uni/cache.t b/t/uni/cache.t deleted file mode 100644 index e72a1b1aa8..0000000000 --- a/t/uni/cache.t +++ /dev/null @@ -1,35 +0,0 @@ -BEGIN { - chdir 't' if -d 't'; - require './test.pl'; - set_up_inc('../lib'); - skip_all("utf8_heavy no longer used much"); - skip_all_without_unicode_tables(); -} - -plan tests => 1; - -# Looks to see if a "do 'unicore/lib/Scx/Hira.pl'" is called more than once, by -# putting a compile sub first on the library path; -# XXX Kludge: requires exact path, which might change, and has deep knowledge -# of how utf8_heavy.pl works, which might also change. - -BEGIN { # Make sure catches compile time references - $::count = 0; - unshift @INC, sub { - $::count++ if $_[1] eq 'unicore/lib/Scx/Hira.pl'; - }; -} - -my $s = 'foo'; - -# The second value is to prevent an optimization that exists at the time this -# is written to re-use a property without trying to look it up if it is the -# only thing in a character class. They differ in order to make sure that any -# future optimizations that don't re-use identical character classes don't come -# into play -$s =~ m/[\p{Hiragana}\x{101}]/; -$s =~ m/[\p{Hiragana}\x{102}]/; -$s =~ m/[\p{Hiragana}\x{103}]/; -$s =~ m/[\p{Hiragana}\x{104}]/; - -is($::count, 1, "Swatch hash caching kept us from reloading swatch hash."); @@ -2691,8 +2691,8 @@ Perl_get_and_check_backslash_N_name(pTHX_ const char* s, } else { /* Similarly for utf8. For invariants can check directly; for other - * Latin1, can calculate their code point and check; otherwise use a - * swash */ + * Latin1, can calculate their code point and check; otherwise use an + * inversion list */ if (UTF8_IS_INVARIANT(*s)) { if (! isALPHAU(*s)) { goto bad_charname; diff --git a/uni_keywords.h b/uni_keywords.h index 9d780f54d1..392e0169aa 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7284,7 +7284,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) { * 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt * 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt * 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt - * 0e4964d17f9da0415977193f7f05f6db2994f59ea4a5be5281f1add183bcf8d0 lib/unicore/mktables + * ad9f9ebb4ad5378ccb7437b0d15b213b3f9d1b7e5c095284a44bee5d84206be6 lib/unicore/mktables * a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl @@ -3317,8 +3317,6 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C)) { - /* As of Unicode 10.0, this means we avoid swash creation - * for anything beyond high Plane 1 (below emojis) */ goto cases_to_self; } #endif @@ -3966,694 +3964,6 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, } -/* Note: - * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch(). - * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8". - * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl. - */ - -SV* -Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, - I32 minbits, I32 none) -{ - /* Returns a copy of a swash initiated by the called function. This is the - * public interface, and returning a copy prevents others from doing - * mischief on the original. The only remaining use of this is in tr/// */ - - /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST - * use the following define */ - -#define SWASH_INIT_RETURN(x) \ - PL_curpm= old_PL_curpm; \ - return newSVsv(x) - - /* Initialize and return a swash, creating it if necessary. It does this - * by calling utf8_heavy.pl in the general case. - * - * pkg is the name of the package that <name> should be in. - * name is the name of the swash to find. - * listsv is a string to initialize the swash with. It must be of the form - * documented as the subroutine return value in - * L<perlunicode/User-Defined Character Properties> - * minbits is the number of bits required to represent each data element. - * none I (khw) do not understand this one, but it is used only in tr///. - * - * Thus there are two possible inputs to find the swash: <name> and - * <listsv>. At least one must be specified. The result - * will be the union of the specified ones, although <listsv>'s various - * actions can intersect, etc. what <name> gives. To avoid going out to - * disk at all, <invlist> should specify completely what the swash should - * have, and <listsv> should be &PL_sv_undef and <name> should be "". - */ - - PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */ - - SV* retval = &PL_sv_undef; - - PERL_ARGS_ASSERT_SWASH_INIT; - - assert(listsv != &PL_sv_undef || strNE(name, "")); - - PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the - regex that triggered the swash init and the swash init - perl logic itself. See perl #122747 */ - - /* If data was passed in to go out to utf8_heavy to find the swash of, do - * so */ - if (listsv != &PL_sv_undef || strNE(name, "")) { - dSP; - const size_t pkg_len = strlen(pkg); - const size_t name_len = strlen(name); - HV * const stash = gv_stashpvn(pkg, pkg_len, 0); - SV* errsv_save; - GV *method; - - - PUSHSTACKi(PERLSI_MAGIC); - ENTER; - SAVEHINTS(); - save_re_context(); - /* We might get here via a subroutine signature which uses a utf8 - * parameter name, at which point PL_subname will have been set - * but not yet used. */ - save_item(PL_subname); - if (PL_parser && PL_parser->error_count) - SAVEI8(PL_parser->error_count), PL_parser->error_count = 0; - method = gv_fetchmeth(stash, "SWASHNEW", 8, -1); - if (!method) { /* demand load UTF-8 */ - ENTER; - if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save); - GvSV(PL_errgv) = NULL; -#ifndef NO_TAINT_SUPPORT - /* It is assumed that callers of this routine are not passing in - * any user derived data. */ - /* Need to do this after save_re_context() as it will set - * PL_tainted to 1 while saving $1 etc (see the code after getrx: - * in Perl_magic_get). Even line to create errsv_save can turn on - * PL_tainted. */ - SAVEBOOL(TAINT_get); - TAINT_NOT; -#endif - require_pv("utf8_heavy.pl"); - { - /* Not ERRSV, as there is no need to vivify a scalar we are - about to discard. */ - SV * const errsv = GvSV(PL_errgv); - if (!SvTRUE(errsv)) { - GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save); - SvREFCNT_dec(errsv); - } - } - LEAVE; - } - SPAGAIN; - PUSHMARK(SP); - EXTEND(SP,5); - mPUSHp(pkg, pkg_len); - mPUSHp(name, name_len); - PUSHs(listsv); - mPUSHi(minbits); - mPUSHi(none); - PUTBACK; - if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save); - GvSV(PL_errgv) = NULL; - /* If we already have a pointer to the method, no need to use - * call_method() to repeat the lookup. */ - if (method - ? call_sv(MUTABLE_SV(method), G_SCALAR) - : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD)) - { - retval = *PL_stack_sp--; - SvREFCNT_inc(retval); - } - { - /* Not ERRSV. See above. */ - SV * const errsv = GvSV(PL_errgv); - if (!SvTRUE(errsv)) { - GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save); - SvREFCNT_dec(errsv); - } - } - LEAVE; - POPSTACK; - if (IN_PERL_COMPILETIME) { - CopHINTS_set(PL_curcop, PL_hints); - } - } /* End of calling the module to find the swash */ - - SWASH_INIT_RETURN(retval); -#undef SWASH_INIT_RETURN -} - - -/* This API is wrong for special case conversions since we may need to - * return several Unicode characters for a single Unicode character - * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is - * the lower-level routine, and it is similarly broken for returning - * multiple values. --jhi - * For those, you should use S__to_utf8_case() instead */ -/* Now SWASHGET is recasted into S_swatch_get in this file. */ - -/* Note: - * Returns the value of property/mapping C<swash> for the first character - * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is - * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr> - * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>. - * - * A "swash" is a hash which contains initially the keys/values set up by - * SWASHNEW. The purpose is to be able to completely represent a Unicode - * property for all possible code points. Things are stored in a compact form - * (see utf8_heavy.pl) so that calculation is required to find the actual - * property value for a given code point. As code points are looked up, new - * key/value pairs are added to the hash, so that the calculation doesn't have - * to ever be re-done. Further, each calculation is done, not just for the - * desired one, but for a whole block of code points adjacent to that one. - * For binary properties on ASCII machines, the block is usually for 64 code - * points, starting with a code point evenly divisible by 64. Thus if the - * property value for code point 257 is requested, the code goes out and - * calculates the property values for all 64 code points between 256 and 319, - * and stores these as a single 64-bit long bit vector, called a "swatch", - * under the key for code point 256. The key is the UTF-8 encoding for code - * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding - * for a code point is 13 bytes, the key will be 12 bytes long. If the value - * for code point 258 is then requested, this code realizes that it would be - * stored under the key for 256, and would find that value and extract the - * relevant bit, offset from 256. - * - * Non-binary properties are stored in as many bits as necessary to represent - * their values (32 currently, though the code is more general than that), not - * as single bits, but the principle is the same: the value for each key is a - * vector that encompasses the property values for all code points whose UTF-8 - * representations are represented by the key. That is, for all code points - * whose UTF-8 representations are length N bytes, and the key is the first N-1 - * bytes of that. - */ -UV -Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) -{ - HV *const hv = MUTABLE_HV(SvRV(swash)); - U32 klen; - U32 off; - STRLEN slen = 0; - STRLEN needents; - const U8 *tmps = NULL; - SV *swatch; - const U8 c = *ptr; - - PERL_ARGS_ASSERT_SWASH_FETCH; - - /* If it really isn't a hash, it isn't really swash; must be an inversion - * list */ - if (SvTYPE(hv) != SVt_PVHV) { - return _invlist_contains_cp((SV*)hv, - (do_utf8) - ? valid_utf8_to_uvchr(ptr, NULL) - : c); - } - - /* We store the values in a "swatch" which is a vec() value in a swash - * hash. Code points 0-255 are a single vec() stored with key length - * (klen) 0. All other code points have a UTF-8 representation - * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which - * share 0xAA..0xYY, which is the key in the hash to that vec. So the key - * length for them is the length of the encoded char - 1. ptr[klen] is the - * final byte in the sequence representing the character */ - if (!do_utf8 || UTF8_IS_INVARIANT(c)) { - klen = 0; - needents = 256; - off = c; - } - else if (UTF8_IS_DOWNGRADEABLE_START(c)) { - klen = 0; - needents = 256; - off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1)); - } - else { - klen = UTF8SKIP(ptr) - 1; - - /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into - * the vec is the final byte in the sequence. (In EBCDIC this is - * converted to I8 to get consecutive values.) To help you visualize - * all this: - * Straight 1047 After final byte - * UTF-8 UTF-EBCDIC I8 transform - * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0 - * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1 - * ... - * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9 - * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA - * ... - * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2 - * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3 - * ... - * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB - * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC - * ... - * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF - * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41 - * - * (There are no discontinuities in the elided (...) entries.) - * The UTF-8 key for these 33 code points is '\xD0' (which also is the - * key for the next 31, up through U+043F, whose UTF-8 final byte is - * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points. - * The final UTF-8 byte, which ranges between \x80 and \xBF, is an - * index into the vec() swatch (after subtracting 0x80, which we - * actually do with an '&'). - * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32 - * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has - * dicontinuities which go away by transforming it into I8, and we - * effectively subtract 0xA0 to get the index. */ - needents = (1 << UTF_ACCUMULATION_SHIFT); - off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK; - } - - /* - * This single-entry cache saves about 1/3 of the UTF-8 overhead in test - * suite. (That is, only 7-8% overall over just a hash cache. Still, - * it's nothing to sniff at.) Pity we usually come through at least - * two function calls to get here... - * - * NB: this code assumes that swatches are never modified, once generated! - */ - - if (hv == PL_last_swash_hv && - klen == PL_last_swash_klen && - (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) ) - { - tmps = PL_last_swash_tmps; - slen = PL_last_swash_slen; - } - else { - /* Try our second-level swatch cache, kept in a hash. */ - SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE); - - /* If not cached, generate it via swatch_get */ - if (!svp || !SvPOK(*svp) - || !(tmps = (const U8*)SvPV_const(*svp, slen))) - { - if (klen) { - const UV code_point = valid_utf8_to_uvchr(ptr, NULL); - swatch = swatch_get(swash, - code_point & ~((UV)needents - 1), - needents); - } - else { /* For the first 256 code points, the swatch has a key of - length 0 */ - swatch = swatch_get(swash, 0, needents); - } - - if (IN_PERL_COMPILETIME) - CopHINTS_set(PL_curcop, PL_hints); - - svp = hv_store(hv, (const char *)ptr, klen, swatch, 0); - - if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) - || (slen << 3) < needents) - Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, " - "svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf, - svp, tmps, (UV)slen, (UV)needents); - } - - PL_last_swash_hv = hv; - assert(klen <= sizeof(PL_last_swash_key)); - PL_last_swash_klen = (U8)klen; - /* FIXME change interpvar.h? */ - PL_last_swash_tmps = (U8 *) tmps; - PL_last_swash_slen = slen; - if (klen) - Copy(ptr, PL_last_swash_key, klen, U8); - } - - switch ((int)((slen << 3) / needents)) { - case 1: - return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0; - case 8: - return ((UV) tmps[off]); - case 16: - off <<= 1; - return - ((UV) tmps[off ] << 8) + - ((UV) tmps[off + 1]); - case 32: - off <<= 2; - return - ((UV) tmps[off ] << 24) + - ((UV) tmps[off + 1] << 16) + - ((UV) tmps[off + 2] << 8) + - ((UV) tmps[off + 3]); - } - Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, " - "slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents); - NORETURN_FUNCTION_END; -} - -/* Read a single line of the main body of the swash input text. These are of - * the form: - * 0053 0056 0073 - * where each number is hex. The first two numbers form the minimum and - * maximum of a range, and the third is the value associated with the range. - * Not all swashes should have a third number - * - * On input: l points to the beginning of the line to be examined; it points - * to somewhere in the string of the whole input text, and is - * terminated by a \n or the null string terminator. - * lend points to the null terminator of that string - * wants_value is non-zero if the swash expects a third number - * typestr is the name of the swash's mapping, like 'ToLower' - * On output: *min, *max, and *val are set to the values read from the line. - * returns a pointer just beyond the line examined. If there was no - * valid min number on the line, returns lend+1 - */ - -STATIC U8* -S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val, - const bool wants_value, const U8* const typestr) -{ - const int typeto = typestr[0] == 'T' && typestr[1] == 'o'; - STRLEN numlen; /* Length of the number */ - I32 flags = PERL_SCAN_SILENT_ILLDIGIT - | PERL_SCAN_DISALLOW_PREFIX - | PERL_SCAN_SILENT_NON_PORTABLE; - - /* nl points to the next \n in the scan */ - U8* const nl = (U8*)memchr(l, '\n', lend - l); - - PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE; - - /* Get the first number on the line: the range minimum */ - numlen = lend - l; - *min = grok_hex((char *)l, &numlen, &flags, NULL); - *max = *min; /* So can never return without setting max */ - if (numlen) /* If found a hex number, position past it */ - l += numlen; - else if (nl) { /* Else, go handle next line, if any */ - return nl + 1; /* 1 is length of "\n" */ - } - else { /* Else, no next line */ - return lend + 1; /* to LIST's end at which \n is not found */ - } - - /* The max range value follows, separated by a BLANK */ - if (isBLANK(*l)) { - ++l; - flags = PERL_SCAN_SILENT_ILLDIGIT - | PERL_SCAN_DISALLOW_PREFIX - | PERL_SCAN_SILENT_NON_PORTABLE; - numlen = lend - l; - *max = grok_hex((char *)l, &numlen, &flags, NULL); - if (numlen) - l += numlen; - else /* If no value here, it is a single element range */ - *max = *min; - - /* Non-binary tables have a third entry: what the first element of the - * range maps to. The map for those currently read here is in hex */ - if (wants_value) { - if (isBLANK(*l)) { - ++l; - flags = PERL_SCAN_SILENT_ILLDIGIT - | PERL_SCAN_DISALLOW_PREFIX - | PERL_SCAN_SILENT_NON_PORTABLE; - numlen = lend - l; - *val = grok_hex((char *)l, &numlen, &flags, NULL); - if (numlen) - l += numlen; - else - *val = 0; - } - else { - *val = 0; - if (typeto) { - /* diag_listed_as: To%s: illegal mapping '%s' */ - Perl_croak(aTHX_ "%s: illegal mapping '%s'", - typestr, l); - } - } - } - else - *val = 0; /* bits == 1, then any val should be ignored */ - } - else { /* Nothing following range min, should be single element with no - mapping expected */ - if (wants_value) { - *val = 0; - if (typeto) { - /* diag_listed_as: To%s: illegal mapping '%s' */ - Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l); - } - } - else - *val = 0; /* bits == 1, then val should be ignored */ - } - - /* Position to next line if any, or EOF */ - if (nl) - l = nl + 1; - else - l = lend; - - return l; -} - -/* Note: - * Returns a swatch (a bit vector string) for a code point sequence - * that starts from the value C<start> and comprises the number C<span>. - * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl). - * Should be used via swash_fetch, which will cache the swatch in C<swash>. - */ -STATIC SV* -S_swatch_get(pTHX_ SV* swash, UV start, UV span) -{ - SV *swatch; - U8 *l, *lend, *x, *xend, *s; - STRLEN lcur, xcur, scur; - HV *const hv = MUTABLE_HV(SvRV(swash)); - - SV** listsvp = NULL; /* The string containing the main body of the table */ - SV** extssvp = NULL; - U8* typestr = NULL; - STRLEN bits = 0; - STRLEN octets; /* if bits == 1, then octets == 0 */ - UV none; - UV end = start + span; - - SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); - SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); - SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); - extssvp = hv_fetchs(hv, "EXTRAS", FALSE); - listsvp = hv_fetchs(hv, "LIST", FALSE); - - bits = SvUV(*bitssvp); - none = SvUV(*nonesvp); - typestr = (U8*)SvPV_nolen(*typesvp); - octets = bits >> 3; /* if bits == 1, then octets == 0 */ - - PERL_ARGS_ASSERT_SWATCH_GET; - - if (bits != 8 && bits != 16 && bits != 32) { - Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf, - (UV)bits); - } - - /* If overflowed, use the max possible */ - if (end < start) { - end = UV_MAX; - span = end - start; - } - - /* create and initialize $swatch */ - scur = octets ? (span * octets) : (span + 7) / 8; - swatch = newSV(scur); - SvPOK_on(swatch); - s = (U8*)SvPVX(swatch); - if (octets && none) { - const U8* const e = s + scur; - while (s < e) { - if (bits == 8) - *s++ = (U8)(none & 0xff); - else if (bits == 16) { - *s++ = (U8)((none >> 8) & 0xff); - *s++ = (U8)( none & 0xff); - } - else if (bits == 32) { - *s++ = (U8)((none >> 24) & 0xff); - *s++ = (U8)((none >> 16) & 0xff); - *s++ = (U8)((none >> 8) & 0xff); - *s++ = (U8)( none & 0xff); - } - } - *s = '\0'; - } - else { - (void)memzero((U8*)s, scur + 1); - } - SvCUR_set(swatch, scur); - s = (U8*)SvPVX(swatch); - - /* read $swash->{LIST} */ - l = (U8*)SvPV(*listsvp, lcur); - lend = l + lcur; - while (l < lend) { - UV min = 0, max = 0, val = 0, upper; - l = swash_scan_list_line(l, lend, &min, &max, &val, - cBOOL(octets), typestr); - if (l > lend) { - break; - } - - /* If looking for something beyond this range, go try the next one */ - if (max < start) - continue; - - /* <end> is generally 1 beyond where we want to set things, but at the - * platform's infinity, where we can't go any higher, we want to - * include the code point at <end> */ - upper = (max < end) - ? max - : (max != UV_MAX || end != UV_MAX) - ? end - 1 - : end; - - if (octets) { - UV key; - if (min < start) { - if (!none || val < none) { - val += start - min; - } - min = start; - } - for (key = min; key <= upper; key++) { - STRLEN offset; - /* offset must be non-negative (start <= min <= key < end) */ - offset = octets * (key - start); - if (bits == 8) - s[offset] = (U8)(val & 0xff); - else if (bits == 16) { - s[offset ] = (U8)((val >> 8) & 0xff); - s[offset + 1] = (U8)( val & 0xff); - } - else if (bits == 32) { - s[offset ] = (U8)((val >> 24) & 0xff); - s[offset + 1] = (U8)((val >> 16) & 0xff); - s[offset + 2] = (U8)((val >> 8) & 0xff); - s[offset + 3] = (U8)( val & 0xff); - } - - if (!none || val < none) - ++val; - } - } - } /* while */ - - /* read $swash->{EXTRAS} */ - x = (U8*)SvPV(*extssvp, xcur); - xend = x + xcur; - while (x < xend) { - STRLEN namelen; - U8 *namestr; - SV** othersvp; - HV* otherhv; - STRLEN otherbits; - SV **otherbitssvp, *other; - U8 *s, *o, *nl; - STRLEN slen, olen; - - const U8 opc = *x++; - if (opc == '\n') - continue; - - nl = (U8*)memchr(x, '\n', xend - x); - - if (opc != '-' && opc != '+' && opc != '!' && opc != '&') { - if (nl) { - x = nl + 1; /* 1 is length of "\n" */ - continue; - } - else { - x = xend; /* to EXTRAS' end at which \n is not found */ - break; - } - } - - namestr = x; - if (nl) { - namelen = nl - namestr; - x = nl + 1; - } - else { - namelen = xend - namestr; - x = xend; - } - - othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE); - otherhv = MUTABLE_HV(SvRV(*othersvp)); - otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE); - otherbits = (STRLEN)SvUV(*otherbitssvp); - if (bits < otherbits) - Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, " - "bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits); - - /* The "other" swatch must be destroyed after. */ - other = swatch_get(*othersvp, start, span); - o = (U8*)SvPV(other, olen); - - if (!olen) - Perl_croak(aTHX_ "panic: swatch_get got improper swatch"); - - s = (U8*)SvPV(swatch, slen); - { - STRLEN otheroctets = otherbits >> 3; - STRLEN offset = 0; - U8* const send = s + slen; - - while (s < send) { - UV otherval = 0; - - if (otherbits == 1) { - otherval = (o[offset >> 3] >> (offset & 7)) & 1; - ++offset; - } - else { - STRLEN vlen = otheroctets; - otherval = *o++; - while (--vlen) { - otherval <<= 8; - otherval |= *o++; - } - } - - if (opc == '+' && otherval) - NOOP; /* replace with otherval */ - else if (opc == '!' && !otherval) - otherval = 1; - else if (opc == '-' && otherval) - otherval = 0; - else if (opc == '&' && !otherval) - otherval = 0; - else { - s += octets; /* no replacement */ - continue; - } - - if (bits == 8) - *s++ = (U8)( otherval & 0xff); - else if (bits == 16) { - *s++ = (U8)((otherval >> 8) & 0xff); - *s++ = (U8)( otherval & 0xff); - } - else if (bits == 32) { - *s++ = (U8)((otherval >> 24) & 0xff); - *s++ = (U8)((otherval >> 16) & 0xff); - *s++ = (U8)((otherval >> 8) & 0xff); - *s++ = (U8)( otherval & 0xff); - } - } - } - sv_free(other); /* through with it! */ - } /* while */ - return swatch; -} - bool Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) { |