summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ext/Devel-Peek/t/Peek.t57
-rw-r--r--pod/perldelta.pod6
-rw-r--r--pp.c61
-rw-r--r--regexec.c12
-rw-r--r--util.c85
5 files changed, 188 insertions, 33 deletions
diff --git a/ext/Devel-Peek/t/Peek.t b/ext/Devel-Peek/t/Peek.t
index 642d34c09b..d582a8f03c 100644
--- a/ext/Devel-Peek/t/Peek.t
+++ b/ext/Devel-Peek/t/Peek.t
@@ -874,9 +874,10 @@ unless ($Config{useithreads}) {
LEN = \d+
MAGIC = $ADDR
MG_VIRTUAL = &PL_vtbl_regexp
+ MG_PRIVATE = 1
MG_TYPE = PERL_MAGIC_study\\(G\\)
- MG_LEN = 1044
- MG_PTR = $ADDR "\\\\377\\\\377\\\\377\\\\377.*"
+ MG_LEN = 261
+ MG_PTR = $ADDR "\\\\377.*"
';
is(study beer, 1, "Our studies were successful");
@@ -903,10 +904,58 @@ unless ($Config{useithreads}) {
LEN = \d+
MAGIC = $ADDR
MG_VIRTUAL = &PL_vtbl_regexp
+ MG_PRIVATE = 1
MG_TYPE = PERL_MAGIC_study\\(G\\)
- MG_LEN = 1040
- MG_PTR = $ADDR "\\\\377\\\\377\\\\377\\\\377.*"
+ MG_LEN = 260
+ MG_PTR = $ADDR "\\\\377.*"
');
}
+{
+ my %z;
+ foreach (1, 254, 255, 65534, 65535) {
+ $z{$_} = "\0" x $_;
+ study $z{$_};
+ }
+ do_test('short studied representation', $z{1},
+'SV = PVMG\\($ADDR\\) at $ADDR
+ REFCNT = 1
+ FLAGS = \\(SMG,POK,pPOK,SCREAM\\)
+ IV = 0
+ NV = 0
+ PV = $ADDR "\\\\0"\\\0
+ CUR = 1
+ LEN = \d+
+ MAGIC = $ADDR
+ MG_VIRTUAL = &PL_vtbl_regexp
+ MG_PRIVATE = 1
+ MG_TYPE = PERL_MAGIC_study\\(G\\)
+ MG_LEN = 257
+ MG_PTR = $ADDR "\\\\0(?:\\\\377){256}"
+');
+
+ foreach ([254, 1], [255, 2], [65534, 2], [65535, 4]
+ ) {
+ my ($length, $bytes) = @$_;
+ my $quant = $length <= 32766 ? "{$length}" : '*';
+ do_test("studied representation for length $length", $z{$length},
+ sprintf
+'SV = PVMG\\($ADDR\\) at $ADDR
+ REFCNT = 1
+ FLAGS = \\(SMG,POK,pPOK,SCREAM\\)
+ IV = 0
+ NV = 0
+ PV = $ADDR "(?:\\\\0)%s"\\\0
+ CUR = %d
+ LEN = \d+
+ MAGIC = $ADDR
+ MG_VIRTUAL = &PL_vtbl_regexp
+ MG_PRIVATE = %d
+ MG_TYPE = PERL_MAGIC_study\\(G\\)
+ MG_LEN = %d
+ MG_PTR = $ADDR "\\\\0.*\\\\377"
+', $quant, $length, $bytes, (256 + $length) * $bytes);
+ }
+}
+
done_testing();
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index c65ee3e145..b06fc7a408 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -91,6 +91,12 @@ The implementation of C<s///r> makes one fewer copy of the scalar's value.
If a studied scalar is C<split> with a regex, the engine will now take
advantage of the C<study> data.
+=item *
+
+C<study> now uses considerably less memory for shorter strings. Strings shorter
+than 65535 characters use roughly half the memory than previously, strings
+shorter than 255 characters use roughly one quarter of the memory.
+
=back
=head1 Modules and Pragmata
diff --git a/pp.c b/pp.c
index f177165a0f..98d64827e0 100644
--- a/pp.c
+++ b/pp.c
@@ -707,10 +707,11 @@ PP(pp_study)
{
dVAR; dSP; dPOPss;
register unsigned char *s;
- U32 *sfirst;
- U32 *snext;
+ char *sfirst_raw;
STRLEN len;
MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_study) : NULL;
+ U8 quanta;
+ STRLEN size;
if (mg && SvSCREAM(sv))
RETPUSHYES;
@@ -724,28 +725,64 @@ PP(pp_study)
stringification. Also refuse to study an FBM scalar, as this gives
more flexibility in SV flag usage. No real-world code would ever
end up studying an FBM scalar, so this isn't a real pessimisation.
+ Endemic use of I32 in Perl_screaminstr makes it hard to safely push
+ the study length limit from I32_MAX to U32_MAX - 1.
*/
RETPUSHNO;
}
- Newx(sfirst, 256 + len, U32);
+ if (len < 0xFF) {
+ quanta = 1;
+ } else if (len < 0xFFFF) {
+ quanta = 2;
+ } else
+ quanta = 4;
- if (!sfirst)
+ size = (256 + len) * quanta;
+ sfirst_raw = (char *)safemalloc(size);
+
+ if (!sfirst_raw)
DIE(aTHX_ "do_study: out of memory");
SvSCREAM_on(sv);
if (!mg)
mg = sv_magicext(sv, NULL, PERL_MAGIC_study, &PL_vtbl_regexp, NULL, 0);
- mg->mg_ptr = (char *) sfirst;
- mg->mg_len = (256 + len) * sizeof(U32);
+ mg->mg_ptr = sfirst_raw;
+ mg->mg_len = size;
+ mg->mg_private = quanta;
- snext = sfirst + 256;
- memset(sfirst, ~0, 256 * sizeof(U32));
+ memset(sfirst_raw, ~0, 256 * quanta);
- while (len-- > 0) {
- const U8 ch = s[len];
- snext[len] = sfirst[ch];
- sfirst[ch] = len;
+ /* The assumption here is that most studied strings are fairly short, hence
+ the pain of the extra code is worth it, given the memory savings.
+ 80 character string, 336 bytes as U8, down from 1344 as U32
+ 800 character string, 2112 bytes as U16, down from 4224 as U32
+ */
+
+ if (quanta == 1) {
+ U8 *const sfirst = (U8 *)sfirst_raw;
+ U8 *const snext = sfirst + 256;
+ while (len-- > 0) {
+ const U8 ch = s[len];
+ snext[len] = sfirst[ch];
+ sfirst[ch] = len;
+ }
+ } else if (quanta == 2) {
+ U16 *const sfirst = (U16 *)sfirst_raw;
+ U16 *const snext = sfirst + 256;
+ while (len-- > 0) {
+ const U8 ch = s[len];
+ snext[len] = sfirst[ch];
+ sfirst[ch] = len;
+ }
+ } else {
+ U32 *const sfirst = (U32 *)sfirst_raw;
+ U32 *const snext = sfirst + 256;
+ while (len-- > 0) {
+ const U8 ch = s[len];
+ snext[len] = sfirst[ch];
+ sfirst[ch] = len;
+ }
}
RETPUSHYES;
diff --git a/regexec.c b/regexec.c
index 516bf9556c..99ac5b381c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -696,12 +696,22 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
I32 p = -1; /* Internal iterator of scream. */
I32 * const pp = data ? data->scream_pos : &p;
const MAGIC *mg;
+ bool found = FALSE;
assert(SvMAGICAL(sv));
mg = mg_find(sv, PERL_MAGIC_study);
assert(mg);
- if (((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0
+ if (mg->mg_private == 1) {
+ found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0;
+ } else if (mg->mg_private == 2) {
+ found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0;
+ } else {
+ assert (mg->mg_private == 4);
+ found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0;
+ }
+
+ if (found
|| ( BmRARE(check) == '\n'
&& (BmPREVIOUS(check) == SvCUR(check) - 1)
&& SvTAIL(check) ))
diff --git a/util.c b/util.c
index 4dbd15e689..fcfeda9814 100644
--- a/util.c
+++ b/util.c
@@ -854,7 +854,7 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift
{
dVAR;
register const unsigned char *big;
- U32 pos;
+ U32 pos = 0; /* hush a gcc warning */
register I32 previous;
register I32 first;
register const unsigned char *little;
@@ -862,9 +862,8 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift
register const unsigned char *littleend;
bool found = FALSE;
const MAGIC * mg;
- U32 *screamfirst;
- U32 *screamnext;
- U32 const nope = ~0;
+ const void *screamnext_raw = NULL; /* hush a gcc warning */
+ bool cant_find = FALSE; /* hush a gcc warning */
PERL_ARGS_ASSERT_SCREAMINSTR;
@@ -874,12 +873,37 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift
assert(SvTYPE(littlestr) == SVt_PVMG);
assert(SvVALID(littlestr));
- screamfirst = (U32 *)mg->mg_ptr;
- screamnext = screamfirst + 256;
+ if (mg->mg_private == 1) {
+ const U8 *const screamfirst = (U8 *)mg->mg_ptr;
+ const U8 *const screamnext = screamfirst + 256;
- pos = *old_posp == -1
- ? screamfirst[BmRARE(littlestr)] : screamnext[*old_posp];
- if (pos == nope) {
+ screamnext_raw = (const void *)screamnext;
+
+ pos = *old_posp == -1
+ ? screamfirst[BmRARE(littlestr)] : screamnext[*old_posp];
+ cant_find = pos == (U8)~0;
+ } else if (mg->mg_private == 2) {
+ const U16 *const screamfirst = (U16 *)mg->mg_ptr;
+ const U16 *const screamnext = screamfirst + 256;
+
+ screamnext_raw = (const void *)screamnext;
+
+ pos = *old_posp == -1
+ ? screamfirst[BmRARE(littlestr)] : screamnext[*old_posp];
+ cant_find = pos == (U16)~0;
+ } else if (mg->mg_private == 4) {
+ const U32 *const screamfirst = (U32 *)mg->mg_ptr;
+ const U32 *const screamnext = screamfirst + 256;
+
+ screamnext_raw = (const void *)screamnext;
+
+ pos = *old_posp == -1
+ ? screamfirst[BmRARE(littlestr)] : screamnext[*old_posp];
+ cant_find = pos == (U32)~0;
+ } else
+ Perl_croak(aTHX_ "panic: unknown study size %u", mg->mg_private);
+
+ if (cant_find) {
cant_find:
if ( BmRARE(littlestr) == '\n'
&& BmPREVIOUS(littlestr) == SvCUR(littlestr) - 1) {
@@ -910,13 +934,30 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift
#endif
return NULL;
}
- while ((I32)pos < previous + start_shift) {
- pos = screamnext[pos];
- if (pos == nope)
- goto cant_find;
+ if (mg->mg_private == 1) {
+ const U8 *const screamnext = (const U8 *const) screamnext_raw;
+ while ((I32)pos < previous + start_shift) {
+ pos = screamnext[pos];
+ if (pos == (U8)~0)
+ goto cant_find;
+ }
+ } else if (mg->mg_private == 2) {
+ const U16 *const screamnext = (const U16 *const) screamnext_raw;
+ while ((I32)pos < previous + start_shift) {
+ pos = screamnext[pos];
+ if (pos == (U16)~0)
+ goto cant_find;
+ }
+ } else if (mg->mg_private == 4) {
+ const U32 *const screamnext = (const U32 *const) screamnext_raw;
+ while ((I32)pos < previous + start_shift) {
+ pos = screamnext[pos];
+ if (pos == (U32)~0)
+ goto cant_find;
+ }
}
big -= previous;
- do {
+ while (1) {
if ((I32)pos >= stop_pos) break;
if (big[pos] == first) {
const unsigned char *s = little;
@@ -932,8 +973,20 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift
found = TRUE;
}
}
- pos = screamnext[pos];
- } while (pos != nope);
+ if (mg->mg_private == 1) {
+ pos = ((const U8 *const)screamnext_raw)[pos];
+ if (pos == (U8)~0)
+ break;
+ } else if (mg->mg_private == 2) {
+ pos = ((const U16 *const)screamnext_raw)[pos];
+ if (pos == (U16)~0)
+ break;
+ } else if (mg->mg_private == 4) {
+ pos = ((const U32 *const)screamnext_raw)[pos];
+ if (pos == (U32)~0)
+ break;
+ }
+ };
if (last && found)
return (char *)(big+(*old_posp));
check_tail: