utf8.c: New function to retrieve non-copy of swash

Currently, swash_init returns a copy of the swash it finds. The core portions of the swash are read-only, and the non-read-only portions are derived from them. When the value for a code point is looked up, the results for it and adjacent code points are stored in a new element, so that the lookup never has to be performed again. But since a copy is returned, those results are stored only in the copy, and any other uses of the same logical stash don't have access to them, so the lookups have to be performed for each logical use. Here's an example. If you have 2 occurrences of /\p{Upper}/ in your program, there are 2 different swashes created, both initialized identically. As you start matching against code points, say "A" =~ /\p{Upper}/, the swashes diverge, as the results for each match are saved in the one applicable to that match. If you match "A" in each swash, it has to be looked up in each swash, and an (identical) element will be saved for it in each swash. This is wasteful of both time and memory. This patch renames the function and returns the original and not a copy, thus eliminating the overhead for stashes accessed through the new interface. The old function name is serviced by a new function which merely wraps the new name result with a copy, thus preserving the interface for existing calls. Thus, in the example above, there is only one swash, and matching "A" against it results in only one new element, and so the second use will find that, and not have to go out looking again. In a program with lots of regular expressions, the savings in time and memory can be quite large. The new name is restricted to use only in regcomp.c and utf8.c (unless XS code cheats the preprocessor), where we will code so as to not destroy the original's data. Otherwise, a change to that would change the definition of a Unicode property everywhere in the program. Note that there are no current callers of the new interface; these will be added in future commits.
author: Karl Williamson <public@khwilliamson.com> 2011-11-22 12:06:41 -0700
committer: Karl Williamson <public@khwilliamson.com> 2012-01-13 09:58:34 -0700
commit: c4a5db0c445f65e05b7efe68951a24a5f1826c18 (patch)
tree: ababcfae06e346506975815726d89977c636def1 /utf8.c
parent: 4a7e937e5e38b45d73f74bc6acde04dfb15ad1aa (diff)
download: perl-c4a5db0c445f65e05b7efe68951a24a5f1826c18.tar.gz
1 files changed, 38 insertions, 5 deletions
diff --git a/utf8.c b/utf8.c
index 13aa2dc19f..63741257e9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2454,11 +2454,43 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
  */
+
 SV*
 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
 {
+    PERL_ARGS_ASSERT_SWASH_INIT;
+
+    /* Returns a copy of a swash initiated by the called function.  This is the
+     * public interface, and returning a copy prevents others from doing
+     * mischief on the original */
+
+    return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none));
+}
+
+SV*
+Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
+{
+    /* Initialize and return a swash, creating it if necessary.  It does this
+     * by calling utf8_heavy.pl.
+     *
+     * This interface should only be used by functions that won't destroy or
+     * adversely change the swash, as doing so affects all other uses of the
+     * swash in the program; the general public should use 'Perl_swash_init'
+     * instead.
+     *
+     * pkg  is the name of the package that <name> should be in.
+     * name is the name of the swash to find.  Typically it is a Unicode
+     *	    property name, including user-defined ones
+     * listsv is a string to initialize the swash with.  It must be of the form
+     *	    documented as the subroutine return value in
+     *	    L<perlunicode/User-Defined Character Properties>
+     * minbits is the number of bits required to represent each data element.
+     *	    It is '1' for binary properties.
+     * none I (khw) do not understand this one, but it is used only in tr///.
+     */
+
     dVAR;
-    SV* retval;
+    SV* retval = &PL_sv_undef;
     dSP;
     const size_t pkg_len = strlen(pkg);
     const size_t name_len = strlen(name);
@@ -2466,7 +2498,7 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
     SV* errsv_save;
     GV *method;
 
-    PERL_ARGS_ASSERT_SWASH_INIT;
+    PERL_ARGS_ASSERT__CORE_SWASH_INIT;
 
     PUSHSTACKi(PERLSI_MAGIC);
     ENTER;
@@ -2506,9 +2538,10 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
        to repeat the lookup.  */
     if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
 	: call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
-	retval = newSVsv(*PL_stack_sp--);
-    else
-	retval = &PL_sv_undef;
+    {
+        retval = *PL_stack_sp--;
+        SvREFCNT_inc(retval);
+    }
     if (!SvTRUE(ERRSV))
 	sv_setsv(ERRSV, errsv_save);
     SvREFCNT_dec(errsv_save);
author	Karl Williamson <public@khwilliamson.com>	2011-11-22 12:06:41 -0700
committer	Karl Williamson <public@khwilliamson.com>	2012-01-13 09:58:34 -0700
commit	c4a5db0c445f65e05b7efe68951a24a5f1826c18 (patch)
tree	ababcfae06e346506975815726d89977c636def1 /utf8.c
parent	4a7e937e5e38b45d73f74bc6acde04dfb15ad1aa (diff)
download	perl-c4a5db0c445f65e05b7efe68951a24a5f1826c18.tar.gz