summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-08-25 14:49:47 -0600
committerKarl Williamson <public@khwilliamson.com>2012-08-25 23:21:30 -0600
commit87367d5f9dc9bbf7db1a6cf87820cea76571bf1a (patch)
tree0065ddd270e63dc5f72f6596aa1a74d0218b5f1d /utf8.c
parent5ab9d2ef14d72dec0d833310911032a980efcf09 (diff)
downloadperl-87367d5f9dc9bbf7db1a6cf87820cea76571bf1a.tar.gz
utf8.c: Prefer binsearch over swash hash for small swashes
A binary swash is a hash of bitmaps used to cache the results of looking up if a code point matches a Unicode property or regex bracketed character class. An inversion list is a data structure that also holds information about what code points match a Unicode property or character class. It is implemented as an SV* to a sorted C array, and hence can be searched using a binary search. This patch converts to using a binary search of an inversion list instead of a hash look-up for inversion lists that are no more than 512 elements (9 iterations of the search loop). That number can be easily adjusted, if necessary. Theoretically, a hash is faster than a binary lookup over a very long period. So this may negatively impact long-running servers. But in the short run, where most programs reside, the binary search is significantly faster. A swash is filled as necessary as time goes on, caching each new distinct code point it is called with. If it is called with many, many such code points, its performance can degrade as collisions increase. A binary search does not have that drawback. However, most real-world scenarios do not have a program being called with huge numbers of distinct code points. Mostly, the program will be called with code points from just one or a few of the world's scripts, so will remain sparse. The bitmaps in a swash are each 64 bits long (except for ASCII, where it is 128). That means that when the swash is populated, a lookup of a single code point that hasn't been checked before will have to lookup the 63 adjoining code points as well, increasing its startup overhead. Of course, if one of those 63 code points is later accessed, no extra populate happens. This is a typical case where a languages code points are all near each other. The bottom line, though, is in the short term, this patch speeds up the processing of \X regex matching about 35-40%, with modern Korean (which has uniquely complicated \X processing) closer to 40%, and other scripts closer to 35%. The 512 boundary means that over 90% of the official Unicode properties are handled using binary search. I settled on that number by experimenting with several properties besides \X and with various powers-of-2 limits. Until I got that high, performance kept improving when the property went from being a swash to a binary search. \X improved even up to 2048, which encompasses 100% of the official Unicode properties. The implementation changes so that an inversion list instead of a swash is returned by swash_init() when the input flags allows it to do so, for all inversion lists shorter than the compiled in constant of 512 (actually <= 512). The other functions that access swashes have added intelligence to deal with an object of either type. Should someone in CPAN be using the public swash_init() interface, they will not see any difference, as the option to get an inversion list is not available to them.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c53
1 files changed, 46 insertions, 7 deletions
diff --git a/utf8.c b/utf8.c
index f3e7f59fb0..edae6ba5c4 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1957,8 +1957,10 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
* validating routine */
if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
return FALSE;
- if (!*swash)
- *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
+ if (!*swash) {
+ U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+ *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
+ }
return swash_fetch(*swash, p, TRUE) != 0;
}
@@ -2922,7 +2924,11 @@ SV*
Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
{
/* Initialize and return a swash, creating it if necessary. It does this
- * by calling utf8_heavy.pl in the general case.
+ * by calling utf8_heavy.pl in the general case. The returned value may be
+ * the swash's inversion list instead if the input parameters allow it.
+ * Which is returned should be immaterial to callers, as the only
+ * operations permitted on a swash, swash_fetch() and
+ * _get_swash_invlist(), handle both these transparently.
*
* This interface should only be used by functions that won't destroy or
* adversely change the swash, as doing so affects all other uses of the
@@ -2947,6 +2953,10 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
* came from a user-defined property. (I O)
* _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
* when the swash cannot be located, to simply return NULL. (I)
+ * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
+ * return of an inversion list instead of a swash hash if this routine
+ * thinks that would result in faster execution of swash_fetch() later
+ * on. (I)
*
* Thus there are three possible inputs to find the swash: <name>,
* <listsv>, and <invlist>. At least one must be specified. The result
@@ -2958,6 +2968,11 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
dVAR;
SV* retval = &PL_sv_undef;
HV* swash_hv = NULL;
+ const int invlist_swash_boundary =
+ (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
+ ? 512 /* Based on some benchmarking, but not extensive, see commit
+ message */
+ : -1; /* Never return just an inversion list */
assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
assert(! invlist || minbits == 1);
@@ -3093,9 +3108,12 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
}
else {
- /* Here, there is no swash already. Set up a minimal one */
+ /* Here, there is no swash already. Set up a minimal one, if
+ * we are going to return a swash */
+ if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
swash_hv = newHV();
retval = newRV_inc(MUTABLE_SV(swash_hv));
+ }
swash_invlist = invlist;
}
}
@@ -3103,12 +3121,19 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
/* Here, we have computed the union of all the passed-in data. It may
* be that there was an inversion list in the swash which didn't get
* touched; otherwise save the one computed one */
- if (! invlist_in_swash_is_valid) {
+ if (! invlist_in_swash_is_valid
+ && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
+ {
if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
{
Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
}
}
+
+ if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
+ SvREFCNT_dec(retval);
+ retval = newRV_inc(swash_invlist);
+ }
}
return retval;
@@ -3174,6 +3199,15 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
PERL_ARGS_ASSERT_SWASH_FETCH;
+ /* If it really isn't a hash, it isn't really swash; must be an inversion
+ * list */
+ if (SvTYPE(hv) != SVt_PVHV) {
+ return _invlist_contains_cp((SV*)hv,
+ (do_utf8)
+ ? valid_utf8_to_uvchr(ptr, NULL)
+ : c);
+ }
+
/* Convert to utf8 if not already */
if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
@@ -4147,12 +4181,17 @@ Perl__get_swash_invlist(pTHX_ SV* const swash)
PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
- if (! SvROK(swash) || SvTYPE(SvRV(swash)) != SVt_PVHV) {
+ if (! SvROK(swash)) {
return NULL;
}
- ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
+ /* If it really isn't a hash, it isn't really swash; must be an inversion
+ * list */
+ if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
+ return SvRV(swash);
+ }
+ ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
if (! ptr) {
return NULL;
}