summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSADAHIRO Tomoyuki <BQW10602@nifty.com>2005-11-28 02:02:02 +0900
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2005-11-30 14:29:23 +0000
commit979f29225180f8c09f4adec52f850ae45694fd81 (patch)
tree2a19e5d269deb62b035de4c272f457862d7574f8
parent8f7f721921e56db1ab4fa5e3365e8f86077b2518 (diff)
downloadperl-979f29225180f8c09f4adec52f850ae45694fd81.tar.gz
Re: XS-assisted SWASHGET (esp. for t/uni/class.t speedup)
Message-Id: <20051127170016.A786.BQW10602@nifty.com> p4raw-id: //depot/perl@26229
-rw-r--r--embed.fnc3
-rw-r--r--embed.h2
-rw-r--r--lib/utf8_heavy.pl8
-rw-r--r--pod/perlapi.pod2
-rw-r--r--pod/perlintern.pod71
-rw-r--r--proto.h4
-rw-r--r--universal.c413
-rw-r--r--utf8.c382
-rw-r--r--utf8.h2
-rw-r--r--utfebcdic.h4
10 files changed, 390 insertions, 501 deletions
diff --git a/embed.fnc b/embed.fnc
index f3385fc502..ba2994dba4 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1385,7 +1385,8 @@ sn |NV|mulexp10 |NV value|I32 exponent
#if defined(PERL_IN_UTF8_C) || defined(PERL_DECL_PROT)
s |STRLEN |is_utf8_char_slow|NN const U8 *s|const STRLEN len
-spR |bool |is_utf8_common |NN const U8 *const p|NN SV **swash|NN const char * const swashname
+sR |bool |is_utf8_common |NN const U8 *const p|NN SV **swash|NN const char * const swashname
+sR |SV* |swash_get |NN SV* swash|UV start|UV span
#endif
START_EXTERN_C
diff --git a/embed.h b/embed.h
index d534f17553..3812e2da3e 100644
--- a/embed.h
+++ b/embed.h
@@ -1425,6 +1425,7 @@
#ifdef PERL_CORE
#define is_utf8_char_slow S_is_utf8_char_slow
#define is_utf8_common S_is_utf8_common
+#define swash_get S_swash_get
#endif
#endif
#define sv_setsv_flags Perl_sv_setsv_flags
@@ -3430,6 +3431,7 @@
#ifdef PERL_CORE
#define is_utf8_char_slow(a,b) S_is_utf8_char_slow(aTHX_ a,b)
#define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c)
+#define swash_get(a,b,c) S_swash_get(aTHX_ a,b,c)
#endif
#endif
#define sv_setsv_flags(a,b,c) Perl_sv_setsv_flags(aTHX_ a,b,c)
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index 229ed97536..e5fd6e3634 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -266,12 +266,6 @@ sub SWASHNEW {
return $SWASH;
}
-# NOTE: utf8.c:swash_init() assumes entries are never modified once generated.
-sub SWASHGET {
- # See utf8.c:Perl_swash_fetch for problems with this interface.
- # See universal.c for XS utf8::SWASHGET_heavy.
- # USAGE: $swatch = utf8::SWASHGET_heavy($self, $start, $len, DEBUG);
- return utf8::SWASHGET_heavy($_[0], $_[1], $_[2], DEBUG);
-}
+# Now SWASHGET is recasted into a C function S_swash_get (see utf8.c).
1;
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 497ad9f94a..be9249b025 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -6008,7 +6008,7 @@ of the result.
The "swashp" is a pointer to the swash to use.
Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
-and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
+and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
but not always, a multicharacter mapping), is tried first.
The "special" is a string like "utf8::ToSpecLower", which means the
diff --git a/pod/perlintern.pod b/pod/perlintern.pod
index ee9de89269..4e6119d3e2 100644
--- a/pod/perlintern.pod
+++ b/pod/perlintern.pod
@@ -897,38 +897,6 @@ Found in file pp.h
=over 8
-=item find_uninit_var
-X<find_uninit_var>
-
-Find the name of the undefined variable (if any) that caused the operator o
-to issue a "Use of uninitialized value" warning.
-If match is true, only return a name if it's value matches uninit_sv.
-So roughly speaking, if a unary operator (such as OP_COS) generates a
-warning, then following the direct child of the op may yield an
-OP_PADSV or OP_GV that gives the name of the undefined variable. On the
-other hand, with OP_ADD there are two branches to follow, so we only print
-the variable name if we get an exact match.
-
-The name is returned as a mortal SV.
-
-Assumes that PL_op is the op that originally triggered the error, and that
-PL_comppad/PL_curpad points to the currently executing pad.
-
- SV* find_uninit_var(OP* obase, SV* uninit_sv, bool top)
-
-=for hackers
-Found in file sv.c
-
-=item report_uninit
-X<report_uninit>
-
-Print appropriate "Use of uninitialized variable" warning
-
- void report_uninit(SV* uninit_sv)
-
-=for hackers
-Found in file sv.c
-
=item sv_add_arena
X<sv_add_arena>
@@ -976,6 +944,45 @@ Found in file sv.c
=back
+=head1 Unicode Support
+
+=over 8
+
+=item find_uninit_var
+X<find_uninit_var>
+
+Find the name of the undefined variable (if any) that caused the operator o
+to issue a "Use of uninitialized value" warning.
+If match is true, only return a name if it's value matches uninit_sv.
+So roughly speaking, if a unary operator (such as OP_COS) generates a
+warning, then following the direct child of the op may yield an
+OP_PADSV or OP_GV that gives the name of the undefined variable. On the
+other hand, with OP_ADD there are two branches to follow, so we only print
+the variable name if we get an exact match.
+
+The name is returned as a mortal SV.
+
+Assumes that PL_op is the op that originally triggered the error, and that
+PL_comppad/PL_curpad points to the currently executing pad.
+
+ SV* find_uninit_var(OP* obase, SV* uninit_sv, bool top)
+
+=for hackers
+Found in file sv.c
+
+=item report_uninit
+X<report_uninit>
+
+Print appropriate "Use of uninitialized variable" warning
+
+ void report_uninit(SV* uninit_sv)
+
+=for hackers
+Found in file sv.c
+
+
+=back
+
=head1 AUTHORS
The autodocumentation system was originally added to the Perl core by
diff --git a/proto.h b/proto.h
index 4ad3b66b05..71be05a7e4 100644
--- a/proto.h
+++ b/proto.h
@@ -3806,6 +3806,10 @@ STATIC bool S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, const char * c
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3);
+STATIC SV* S_swash_get(pTHX_ SV* swash, UV start, UV span)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(pTHX_1);
+
#endif
START_EXTERN_C
diff --git a/universal.c b/universal.c
index b3a742b384..10dddb5efe 100644
--- a/universal.c
+++ b/universal.c
@@ -199,7 +199,6 @@ XS(XS_Regexp_DESTROY);
XS(XS_Internals_hash_seed);
XS(XS_Internals_rehash_seed);
XS(XS_Internals_HvREHASH);
-XS(XS_utf8_SWASHGET_heavy);
void
Perl_boot_core_UNIVERSAL(pTHX)
@@ -248,7 +247,6 @@ Perl_boot_core_UNIVERSAL(pTHX)
newXSproto("Internals::hash_seed",XS_Internals_hash_seed, file, "");
newXSproto("Internals::rehash_seed",XS_Internals_rehash_seed, file, "");
newXSproto("Internals::HvREHASH", XS_Internals_HvREHASH, file, "\\%");
- newXS("utf8::SWASHGET_heavy", XS_utf8_SWASHGET_heavy, file);
}
@@ -951,417 +949,6 @@ XS(XS_Internals_HvREHASH) /* Subject to change */
Perl_croak(aTHX_ "Internals::HvREHASH $hashref");
}
-XS(XS_utf8_SWASHGET_heavy)
-{
- dXSARGS;
- if (items != 4) {
- Perl_croak(aTHX_
- "Usage: utf8::SWASHGET_heavy($self, $start, $len, DEBUG)");
- }
- {
- SV* self = ST(0);
- const I32 i_start = (I32)SvIV(ST(1));
- const I32 i_len = (I32)SvIV(ST(2));
- const I32 debug = (I32)SvIV(ST(3));
- U32 start = (U32)i_start;
- U32 len = (U32)i_len;
-
- HV *hv;
- SV **listsvp, **typesvp, **bitssvp, **nonesvp, **extssvp, *swatch;
- U8 *l, *lend, *x, *xend, *s, *nextline;
- STRLEN lcur, xcur, scur;
- U8* typestr;
- int typeto;
- U32 bits, none, end, octets;
-
- if (SvROK(self) && SvTYPE(SvRV(self))==SVt_PVHV)
- hv = (HV*)SvRV(self);
- else
- Perl_croak(aTHX_ "hv is not a hash reference");
-
- if (i_start < 0)
- Perl_croak(aTHX_ "SWASHGET negative start");
- if (i_len < 0)
- Perl_croak(aTHX_ "SWASHGET negative len");
-
- listsvp = hv_fetch(hv, "LIST", 4, FALSE);
- typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
- bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
- nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
- extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
- typestr = (U8*)SvPV_nolen(*typesvp);
- typeto = typestr[0] == 'T' && typestr[1] == 'o';
- bits = (U32)SvUV(*bitssvp);
- none = (U32)SvUV(*nonesvp);
- end = start + len;
- octets = bits >> 3; /* if bits == 1, then octets == 0 */
-
- if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
- Perl_croak(aTHX_ "SWASHGET unknown bits %"UVuf, (UV)bits);
- }
- if (debug) {
- char* selfstr = SvPV_nolen(self);
- PerlIO_printf(Perl_error_log, "SWASHGET ");
- PerlIO_printf(Perl_error_log, "%s %"UVuf" %"UVuf" ",
- selfstr, (UV)start, (UV)len);
- PerlIO_printf(Perl_error_log, "[%s/%"UVuf"/%"UVuf"]\n",
- typestr, (UV)bits, (UV)none);
- }
-
- /* initialize $swatch */
- swatch = newSVpvn("",0);
- scur = octets ? (len * octets) : (len + 7) / 8;
- SvGROW(swatch, scur + 1);
- s = (U8*)SvPVX(swatch);
- if (octets && none) {
- const U8* e = s + scur;
- while (s < e) {
- if (bits == 8)
- *s++ = (U8)(none & 0xff);
- else if (bits == 16) {
- *s++ = (U8)((none >> 8) & 0xff);
- *s++ = (U8)( none & 0xff);
- }
- else if (bits == 32) {
- *s++ = (U8)((none >> 24) & 0xff);
- *s++ = (U8)((none >> 16) & 0xff);
- *s++ = (U8)((none >> 8) & 0xff);
- *s++ = (U8)( none & 0xff);
- }
- }
- *s = '\0';
- }
- else {
- (void)memzero((U8*)s, scur + 1);
- }
- SvCUR_set(swatch, scur);
- s = (U8*)SvPVX(swatch);
-
- /* read $self->{LIST} */
- l = (U8*)SvPV(*listsvp, lcur);
- lend = l + lcur;
- while (l < lend) {
- U32 min, max, val, key;
- STRLEN numlen;
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-
- nextline = (U8*)memchr(l, '\n', lend - l);
-
- numlen = lend - l;
- min = (U32)grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else if (nextline) {
- l = nextline + 1; /* 1 is length of "\n" */
- continue;
- }
- else {
- l = lend; /* to the end of LIST, at which no \n */
- break;
- }
-
- if (isBLANK(*l)) {
- ++l;
- flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
- numlen = lend - l;
- max = (U32)grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else
- max = min;
-
- if (octets) {
- if (isBLANK(*l)) {
- ++l;
- flags = PERL_SCAN_SILENT_ILLDIGIT |
- PERL_SCAN_DISALLOW_PREFIX;
- numlen = lend - l;
- val = (U32)grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else
- val = 0;
- }
- else {
- val = 0;
- if (typeto) {
- Perl_croak(aTHX_ "%s: illegal mapping '%s'",
- typestr, l);
- }
- }
- }
- }
- else {
- max = min;
- if (octets) {
- val = 0;
- if (typeto) {
- Perl_croak(aTHX_ "%s: illegal mapping '%s'",
- typestr, l);
- }
- }
- }
-
- if (nextline)
- l = nextline + 1;
- else
- l = lend;
-
- if (max < start)
- continue;
-
- if (octets) {
- if (debug) {
- PerlIO_printf(Perl_error_log,
- "%"UVuf" %"UVuf" %"UVuf"\n",
- (UV)min, (UV)max, (UV)val);
- }
- if (min < start) {
- if (!none || val < none) {
- val += start - min;
- }
- min = start;
- }
- for (key = min; key <= max; key++) {
- U32 offset;
- if (key >= end)
- goto go_out_list;
- if (debug) {
- PerlIO_printf(Perl_error_log,
- "%"UVuf" => %"UVuf"\n",
- (UV)key, (UV)val);
- }
-
- /* offset must be non-negative (start <= min <= key < end) */
- offset = (key - start) * octets;
- if (bits == 8)
- s[offset] = (U8)(val & 0xff);
- else if (bits == 16) {
- s[offset ] = (U8)((val >> 8) & 0xff);
- s[offset + 1] = (U8)( val & 0xff);
- }
- else if (bits == 32) {
- s[offset ] = (U8)((val >> 24) & 0xff);
- s[offset + 1] = (U8)((val >> 16) & 0xff);
- s[offset + 2] = (U8)((val >> 8) & 0xff);
- s[offset + 3] = (U8)( val & 0xff);
- }
-
- if (!none || val < none)
- ++val;
- }
- }
- else {
- if (min < start)
- min = start;
- for (key = min; key <= max; key++) {
- U32 offset = key - start;
- if (key >= end)
- goto go_out_list;
- if (debug) {
- PerlIO_printf(Perl_error_log,
- "%"UVuf" => 1\n", (UV)key);
- }
- s[offset >> 3] |= 1 << (offset & 7);
- }
- }
- }
- go_out_list:
-
- /* read $self->{EXTRAS} */
- x = (U8*)SvPV(*extssvp, xcur);
- xend = x + xcur;
- while (x < xend) {
- STRLEN namelen;
- U8 *namestr;
- SV** othersvp;
- U32 otherbits;
-
- U8 opc = *x++;
- if (opc == '\n')
- continue;
-
- nextline = (U8*)memchr(x, '\n', xend - x);
-
- if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
- if (nextline) {
- x = nextline + 1;
- continue;
- }
- else {
- x = xend;
- break;
- }
- }
-
- namestr = x;
-
- if (nextline) {
- namelen = nextline - namestr;
- x = nextline + 1;
- }
- else {
- namelen = xend - namestr;
- x = xend;
- }
-
- if (debug) {
- U8* tmpstr;
- Newx(tmpstr, namelen + 1, U8);
- Move(namestr, tmpstr, namelen, U8);
- tmpstr[namelen] = '\0';
- PerlIO_printf(Perl_error_log,
- "INDIRECT %c %s\n", opc, tmpstr);
- Safefree(tmpstr);
- }
-
- {
- HV* otherhv;
- SV **otherbitssvp;
-
- othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
- if (*othersvp && SvROK(*othersvp) &&
- SvTYPE(SvRV(*othersvp))==SVt_PVHV)
- otherhv = (HV*)SvRV(*othersvp);
- else
- Perl_croak(aTHX_ "otherhv is not a hash reference");
-
- otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
- otherbits = (U32)SvUV(*otherbitssvp);
- if (bits < otherbits)
- Perl_croak(aTHX_ "SWASHGET size mismatch");
- }
-
- {
- dSP;
- ENTER;
- SAVETMPS;
- PUSHMARK(SP);
- EXTEND(SP,3);
- PUSHs(*othersvp);
- PUSHs(sv_2mortal(newSViv(start)));
- PUSHs(sv_2mortal(newSViv(len)));
- PUTBACK;
- if (call_method("SWASHGET", G_SCALAR)) {
- U8 *s, *o;
- STRLEN slen, olen;
- SV* tmpsv = *PL_stack_sp--;
- o = (U8*)SvPV(tmpsv, olen);
-
- if (!olen)
- Perl_croak(aTHX_ "SWASHGET didn't return valid swatch");
- s = (U8*)SvPV(swatch, slen);
- if (bits == 1 && otherbits == 1) {
- if (slen != olen)
- Perl_croak(aTHX_ "SWASHGET length mismatch");
-
- switch (opc) {
- case '+':
- while (slen--)
- *s++ |= *o++;
- break;
- case '!':
- while (slen--)
- *s++ |= ~*o++;
- break;
- case '-':
- while (slen--)
- *s++ &= ~*o++;
- break;
- case '&':
- while (slen--)
- *s++ &= *o++;
- break;
- default:
- break;
- }
- }
- else {
- U32 otheroctets = otherbits / 8;
- U32 offset = 0;
- U8* send = s + slen;
-
- while (s < send) {
- U32 val = 0;
-
- if (otherbits == 1) {
- val = (o[offset >> 3] >> (offset & 7)) & 1;
- ++offset;
- }
- else {
- U32 vlen = otheroctets;
- val = *o++;
- while (--vlen) {
- val <<= 8;
- val |= *o++;
- }
- }
-
- if (opc == '+' && val)
- val = 1;
- else if (opc == '!' && !val)
- val = 1;
- else if (opc == '-' && val)
- val = 0;
- else if (opc == '&' && !val)
- val = 0;
- else {
- s += octets;
- continue;
- }
-
- if (bits == 8)
- *s++ = (U8)( val & 0xff);
- else if (bits == 16) {
- *s++ = (U8)((val >> 8) & 0xff);
- *s++ = (U8)( val & 0xff);
- }
- else if (bits == 32) {
- *s++ = (U8)((val >> 24) & 0xff);
- *s++ = (U8)((val >> 16) & 0xff);
- *s++ = (U8)((val >> 8) & 0xff);
- *s++ = (U8)( val & 0xff);
- }
- }
- }
- }
- FREETMPS;
- LEAVE;
- }
- }
-
- if (debug) {
- U8* s = (U8*)SvPVX(swatch);
- PerlIO_printf(Perl_error_log, "CELLS ");
- if (bits == 1) {
- U32 key;
- for (key = 0; key < len; key++) {
- int val = (s[key >> 3] >> (key & 7)) & 1;
- PerlIO_printf(Perl_error_log, val ? "1 " : "0 ");
- }
- }
- else {
- U8* send = s + len * octets;
- while (s < send) {
- U32 vlen = octets;
- U32 val = *s++;
- while (--vlen) {
- val <<= 8;
- val |= *s++;
- }
- PerlIO_printf(Perl_error_log, "%"UVuf" ", (UV)val);
- }
- }
- PerlIO_printf(Perl_error_log, "\n");
- }
-
- ST(0) = swatch;
- sv_2mortal(ST(0));
- }
- XSRETURN(1);
-}
-
-
/*
* Local variables:
* c-indentation-style: bsd
diff --git a/utf8.c b/utf8.c
index 690e4c2e0f..813a64fcdd 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1621,6 +1621,7 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
* the lower-level routine, and it is similarly broken for returning
* multiple values. --jhi */
+/* Now SWASHGET is recasted into S_swash_get in this file. */
UV
Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
{
@@ -1632,14 +1633,14 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
STRLEN needents;
const U8 *tmps = NULL;
U32 bit;
- SV *retval;
+ SV *swatch;
U8 tmputf8[2];
UV c = NATIVE_TO_ASCII(*ptr);
if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
- tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
- tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
- ptr = tmputf8;
+ tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
+ tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
+ ptr = tmputf8;
}
/* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
* then the "swatch" is a vec() for al the chars which start
@@ -1649,20 +1650,18 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
klen = UTF8SKIP(ptr) - 1;
off = ptr[klen];
- if (klen == 0)
- {
+ if (klen == 0) {
/* If char in invariant then swatch is for all the invariant chars
* In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
*/
- needents = UTF_CONTINUATION_MARK;
- off = NATIVE_TO_UTF(ptr[klen]);
- }
- else
- {
+ needents = UTF_CONTINUATION_MARK;
+ off = NATIVE_TO_UTF(ptr[klen]);
+ }
+ else {
/* If char is encoded then swatch is for the prefix */
- needents = (1 << UTF_ACCUMULATION_SHIFT);
- off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
- }
+ needents = (1 << UTF_ACCUMULATION_SHIFT);
+ off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+ }
/*
* This single-entry cache saves about 1/3 of the utf8 overhead in test
@@ -1684,46 +1683,28 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
/* Try our second-level swatch cache, kept in a hash. */
SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
- /* If not cached, generate it via utf8::SWASHGET */
- if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
- dSP;
+ /* If not cached, generate it via swash_get */
+ if (!svp || !SvPOK(*svp)
+ || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
/* We use utf8n_to_uvuni() as we want an index into
Unicode tables, not a native character number.
*/
const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
- SV *errsv_save;
- ENTER;
- SAVETMPS;
- /* save_re_context(); */ /* Now SWASHGET doesn't use regex */
- PUSHSTACKi(PERLSI_MAGIC);
- PUSHMARK(SP);
- EXTEND(SP,3);
- PUSHs((SV*)sv);
- /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
- PUSHs(sv_2mortal(newSViv((klen) ?
- (code_point & ~(needents - 1)) : 0)));
- PUSHs(sv_2mortal(newSViv(needents)));
- PUTBACK;
- errsv_save = newSVsv(ERRSV);
- if (call_method("SWASHGET", G_SCALAR))
- retval = newSVsv(*PL_stack_sp--);
- else
- retval = &PL_sv_undef;
- if (!SvTRUE(ERRSV))
- sv_setsv(ERRSV, errsv_save);
- SvREFCNT_dec(errsv_save);
- POPSTACK;
- FREETMPS;
- LEAVE;
+ swatch = swash_get(sv,
+ /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
+ (klen) ? (code_point & ~(needents - 1)) : 0,
+ needents);
+
if (IN_PERL_COMPILETIME)
PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
- svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
+ svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
- if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
- Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
+ if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
+ || (slen << 3) < needents)
+ Perl_croak(aTHX_ "The swatch does not have proper length");
}
PL_last_swash_hv = hv;
@@ -1753,6 +1734,319 @@ Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
return 0;
}
+/* Note:
+ * Returns a swatch (a bit vector string) for a code point sequence
+ * that starts from the value C<start> and comprises the number C<span>.
+ * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
+ * Should be used via swash_fetch, which will cache the swatch in C<swash>.
+ */
+STATIC SV*
+S_swash_get(pTHX_ SV* swash, UV start, UV span)
+{
+ SV *swatch;
+ U8 *l, *lend, *x, *xend, *s, *nl;
+ STRLEN lcur, xcur, scur;
+
+ HV* const hv = (HV*)SvRV(swash);
+ SV** listsvp = hv_fetch(hv, "LIST", 4, FALSE);
+ SV** typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
+ SV** bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
+ SV** nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
+ SV** extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
+ U8* typestr = (U8*)SvPV_nolen(*typesvp);
+ int typeto = typestr[0] == 'T' && typestr[1] == 'o';
+ STRLEN bits = SvUV(*bitssvp);
+ STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+ UV none = SvUV(*nonesvp);
+ UV end = start + span;
+
+ if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
+ Perl_croak(aTHX_ "swash_get: unknown bits %"UVuf, (UV) bits);
+ }
+
+ /* create and initialize $swatch */
+ swatch = newSVpvn("",0);
+ scur = octets ? (span * octets) : (span + 7) / 8;
+ SvGROW(swatch, scur + 1);
+ s = (U8*)SvPVX(swatch);
+ if (octets && none) {
+ const U8* e = s + scur;
+ while (s < e) {
+ if (bits == 8)
+ *s++ = (U8)(none & 0xff);
+ else if (bits == 16) {
+ *s++ = (U8)((none >> 8) & 0xff);
+ *s++ = (U8)( none & 0xff);
+ }
+ else if (bits == 32) {
+ *s++ = (U8)((none >> 24) & 0xff);
+ *s++ = (U8)((none >> 16) & 0xff);
+ *s++ = (U8)((none >> 8) & 0xff);
+ *s++ = (U8)( none & 0xff);
+ }
+ }
+ *s = '\0';
+ }
+ else {
+ (void)memzero((U8*)s, scur + 1);
+ }
+ SvCUR_set(swatch, scur);
+ s = (U8*)SvPVX(swatch);
+
+ /* read $swash->{LIST} */
+ l = (U8*)SvPV(*listsvp, lcur);
+ lend = l + lcur;
+ while (l < lend) {
+ UV min, max, val, key;
+ STRLEN numlen;
+ I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+
+ nl = (U8*)memchr(l, '\n', lend - l);
+
+ numlen = lend - l;
+ min = grok_hex((char *)l, &numlen, &flags, NULL);
+ if (numlen)
+ l += numlen;
+ else if (nl) {
+ l = nl + 1; /* 1 is length of "\n" */
+ continue;
+ }
+ else {
+ l = lend; /* to LIST's end at which \n is not found */
+ break;
+ }
+
+ if (isBLANK(*l)) {
+ ++l;
+ flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+ numlen = lend - l;
+ max = grok_hex((char *)l, &numlen, &flags, NULL);
+ if (numlen)
+ l += numlen;
+ else
+ max = min;
+
+ if (octets) {
+ if (isBLANK(*l)) {
+ ++l;
+ flags = PERL_SCAN_SILENT_ILLDIGIT |
+ PERL_SCAN_DISALLOW_PREFIX;
+ numlen = lend - l;
+ val = grok_hex((char *)l, &numlen, &flags, NULL);
+ if (numlen)
+ l += numlen;
+ else
+ val = 0;
+ }
+ else {
+ val = 0;
+ if (typeto) {
+ Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+ typestr, l);
+ }
+ }
+ }
+ }
+ else {
+ max = min;
+ if (octets) {
+ val = 0;
+ if (typeto) {
+ Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
+ }
+ }
+ }
+
+ if (nl)
+ l = nl + 1;
+ else
+ l = lend;
+
+ if (max < start)
+ continue;
+
+ if (octets) {
+ if (min < start) {
+ if (!none || val < none) {
+ val += start - min;
+ }
+ min = start;
+ }
+ for (key = min; key <= max; key++) {
+ STRLEN offset;
+ if (key >= end)
+ goto go_out_list;
+ /* offset must be non-negative (start <= min <= key < end) */
+ offset = octets * (key - start);
+ if (bits == 8)
+ s[offset] = (U8)(val & 0xff);
+ else if (bits == 16) {
+ s[offset ] = (U8)((val >> 8) & 0xff);
+ s[offset + 1] = (U8)( val & 0xff);
+ }
+ else if (bits == 32) {
+ s[offset ] = (U8)((val >> 24) & 0xff);
+ s[offset + 1] = (U8)((val >> 16) & 0xff);
+ s[offset + 2] = (U8)((val >> 8) & 0xff);
+ s[offset + 3] = (U8)( val & 0xff);
+ }
+
+ if (!none || val < none)
+ ++val;
+ }
+ }
+ else {
+ if (min < start)
+ min = start;
+ for (key = min; key <= max; key++) {
+ STRLEN offset = (STRLEN)(key - start);
+ if (key >= end)
+ goto go_out_list;
+ s[offset >> 3] |= 1 << (offset & 7);
+ }
+ }
+ } /* while */
+ go_out_list:
+
+ /* read $swash->{EXTRAS} */
+ x = (U8*)SvPV(*extssvp, xcur);
+ xend = x + xcur;
+ while (x < xend) {
+ STRLEN namelen;
+ U8 *namestr;
+ SV** othersvp;
+ HV* otherhv;
+ STRLEN otherbits;
+ SV **otherbitssvp, *other;
+ U8 *s, *o;
+ STRLEN slen, olen;
+
+ U8 opc = *x++;
+ if (opc == '\n')
+ continue;
+
+ nl = (U8*)memchr(x, '\n', xend - x);
+
+ if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
+ if (nl) {
+ x = nl + 1; /* 1 is length of "\n" */
+ continue;
+ }
+ else {
+ x = xend; /* to EXTRAS' end at which \n is not found */
+ break;
+ }
+ }
+
+ namestr = x;
+ if (nl) {
+ namelen = nl - namestr;
+ x = nl + 1;
+ }
+ else {
+ namelen = xend - namestr;
+ x = xend;
+ }
+
+ othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
+ if (*othersvp && SvROK(*othersvp) &&
+ SvTYPE(SvRV(*othersvp))==SVt_PVHV)
+ otherhv = (HV*)SvRV(*othersvp);
+ else
+ Perl_croak(aTHX_ "otherhv is not a hash reference");
+
+ otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
+ otherbits = (STRLEN)SvUV(*otherbitssvp);
+ if (bits < otherbits)
+ Perl_croak(aTHX_ "swash_get: swatch size mismatch");
+
+ /* The "other" swatch must be destroyed after. */
+ other = swash_get(*othersvp, start, span);
+ o = (U8*)SvPV(other, olen);
+
+ if (!olen)
+ Perl_croak(aTHX_ "swash_get didn't return valid swatch for other");
+
+ s = (U8*)SvPV(swatch, slen);
+ if (bits == 1 && otherbits == 1) {
+ if (slen != olen)
+ Perl_croak(aTHX_ "swash_get: swatch length mismatch");
+
+ switch (opc) {
+ case '+':
+ while (slen--)
+ *s++ |= *o++;
+ break;
+ case '!':
+ while (slen--)
+ *s++ |= ~*o++;
+ break;
+ case '-':
+ while (slen--)
+ *s++ &= ~*o++;
+ break;
+ case '&':
+ while (slen--)
+ *s++ &= *o++;
+ break;
+ default:
+ break;
+ }
+ }
+ else { /* bits >= 8 */
+ /* XXX: but weirdly otherval is treated as boolean */
+ STRLEN otheroctets = otherbits >> 3;
+ STRLEN offset = 0;
+ U8* send = s + slen;
+
+ while (s < send) {
+ UV otherval = 0;
+
+ if (otherbits == 1) {
+ otherval = (o[offset >> 3] >> (offset & 7)) & 1;
+ ++offset;
+ }
+ else {
+ STRLEN vlen = otheroctets;
+ otherval = *o++;
+ while (--vlen) {
+ otherval <<= 8;
+ otherval |= *o++;
+ }
+ }
+
+ if (opc == '+' && otherval)
+ otherval = 1;
+ else if (opc == '!' && !otherval)
+ otherval = 1;
+ else if (opc == '-' && otherval)
+ otherval = 0;
+ else if (opc == '&' && !otherval)
+ otherval = 0;
+ else {
+ s += octets; /* not modify orig swatch */
+ continue;
+ }
+
+ if (bits == 8)
+ *s++ = (U8)( otherval & 0xff);
+ else if (bits == 16) {
+ *s++ = (U8)((otherval >> 8) & 0xff);
+ *s++ = (U8)( otherval & 0xff);
+ }
+ else if (bits == 32) {
+ *s++ = (U8)((otherval >> 24) & 0xff);
+ *s++ = (U8)((otherval >> 16) & 0xff);
+ *s++ = (U8)((otherval >> 8) & 0xff);
+ *s++ = (U8)( otherval & 0xff);
+ }
+ }
+ }
+ sv_free(other); /* through with it! */
+ } /* while */
+ return swatch;
+}
+
/*
=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
diff --git a/utf8.h b/utf8.h
index 837f7fb826..d1989ecc88 100644
--- a/utf8.h
+++ b/utf8.h
@@ -145,7 +145,7 @@ encoded character.
* Note: we try to be careful never to call the isXXX_utf8() functions
* unless we're pretty sure we've seen the beginning of a UTF-8 character
* (that is, the two high bits are set). Otherwise we risk loading in the
- * heavy-duty SWASHINIT and SWASHGET routines unnecessarily.
+ * heavy-duty swash_init and swash_fetch routines unnecessarily.
*/
#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
? isIDFIRST(*(p)) \
diff --git a/utfebcdic.h b/utfebcdic.h
index 7d03608b41..bdc1359ed8 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -357,8 +357,8 @@ END_EXTERN_C
/*
* Note: we should try and be careful never to call the isXXX_utf8() functions
* unless we're pretty sure we've seen the beginning of a UTF-EBCDIC character
- * Otherwise we risk loading in the heavy-duty SWASHINIT and SWASHGET routines
- * unnecessarily.
+ * Otherwise we risk loading in the heavy-duty swash_init and swash_fetch
+ * routines unnecessarily.
*/
#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || UTF8_IS_INVARIANT(*p))) \