diff options
-rw-r--r-- | ext/Devel-Peek/t/Peek.t | 57 | ||||
-rw-r--r-- | pod/perldelta.pod | 6 | ||||
-rw-r--r-- | pp.c | 61 | ||||
-rw-r--r-- | regexec.c | 12 | ||||
-rw-r--r-- | util.c | 85 |
5 files changed, 188 insertions, 33 deletions
diff --git a/ext/Devel-Peek/t/Peek.t b/ext/Devel-Peek/t/Peek.t index 642d34c09b..d582a8f03c 100644 --- a/ext/Devel-Peek/t/Peek.t +++ b/ext/Devel-Peek/t/Peek.t @@ -874,9 +874,10 @@ unless ($Config{useithreads}) { LEN = \d+ MAGIC = $ADDR MG_VIRTUAL = &PL_vtbl_regexp + MG_PRIVATE = 1 MG_TYPE = PERL_MAGIC_study\\(G\\) - MG_LEN = 1044 - MG_PTR = $ADDR "\\\\377\\\\377\\\\377\\\\377.*" + MG_LEN = 261 + MG_PTR = $ADDR "\\\\377.*" '; is(study beer, 1, "Our studies were successful"); @@ -903,10 +904,58 @@ unless ($Config{useithreads}) { LEN = \d+ MAGIC = $ADDR MG_VIRTUAL = &PL_vtbl_regexp + MG_PRIVATE = 1 MG_TYPE = PERL_MAGIC_study\\(G\\) - MG_LEN = 1040 - MG_PTR = $ADDR "\\\\377\\\\377\\\\377\\\\377.*" + MG_LEN = 260 + MG_PTR = $ADDR "\\\\377.*" '); } +{ + my %z; + foreach (1, 254, 255, 65534, 65535) { + $z{$_} = "\0" x $_; + study $z{$_}; + } + do_test('short studied representation', $z{1}, +'SV = PVMG\\($ADDR\\) at $ADDR + REFCNT = 1 + FLAGS = \\(SMG,POK,pPOK,SCREAM\\) + IV = 0 + NV = 0 + PV = $ADDR "\\\\0"\\\0 + CUR = 1 + LEN = \d+ + MAGIC = $ADDR + MG_VIRTUAL = &PL_vtbl_regexp + MG_PRIVATE = 1 + MG_TYPE = PERL_MAGIC_study\\(G\\) + MG_LEN = 257 + MG_PTR = $ADDR "\\\\0(?:\\\\377){256}" +'); + + foreach ([254, 1], [255, 2], [65534, 2], [65535, 4] + ) { + my ($length, $bytes) = @$_; + my $quant = $length <= 32766 ? "{$length}" : '*'; + do_test("studied representation for length $length", $z{$length}, + sprintf +'SV = PVMG\\($ADDR\\) at $ADDR + REFCNT = 1 + FLAGS = \\(SMG,POK,pPOK,SCREAM\\) + IV = 0 + NV = 0 + PV = $ADDR "(?:\\\\0)%s"\\\0 + CUR = %d + LEN = \d+ + MAGIC = $ADDR + MG_VIRTUAL = &PL_vtbl_regexp + MG_PRIVATE = %d + MG_TYPE = PERL_MAGIC_study\\(G\\) + MG_LEN = %d + MG_PTR = $ADDR "\\\\0.*\\\\377" +', $quant, $length, $bytes, (256 + $length) * $bytes); + } +} + done_testing(); diff --git a/pod/perldelta.pod b/pod/perldelta.pod index c65ee3e145..b06fc7a408 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -91,6 +91,12 @@ The implementation of C<s///r> makes one fewer copy of the scalar's value. If a studied scalar is C<split> with a regex, the engine will now take advantage of the C<study> data. +=item * + +C<study> now uses considerably less memory for shorter strings. Strings shorter +than 65535 characters use roughly half the memory than previously, strings +shorter than 255 characters use roughly one quarter of the memory. + =back =head1 Modules and Pragmata @@ -707,10 +707,11 @@ PP(pp_study) { dVAR; dSP; dPOPss; register unsigned char *s; - U32 *sfirst; - U32 *snext; + char *sfirst_raw; STRLEN len; MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_study) : NULL; + U8 quanta; + STRLEN size; if (mg && SvSCREAM(sv)) RETPUSHYES; @@ -724,28 +725,64 @@ PP(pp_study) stringification. Also refuse to study an FBM scalar, as this gives more flexibility in SV flag usage. No real-world code would ever end up studying an FBM scalar, so this isn't a real pessimisation. + Endemic use of I32 in Perl_screaminstr makes it hard to safely push + the study length limit from I32_MAX to U32_MAX - 1. */ RETPUSHNO; } - Newx(sfirst, 256 + len, U32); + if (len < 0xFF) { + quanta = 1; + } else if (len < 0xFFFF) { + quanta = 2; + } else + quanta = 4; - if (!sfirst) + size = (256 + len) * quanta; + sfirst_raw = (char *)safemalloc(size); + + if (!sfirst_raw) DIE(aTHX_ "do_study: out of memory"); SvSCREAM_on(sv); if (!mg) mg = sv_magicext(sv, NULL, PERL_MAGIC_study, &PL_vtbl_regexp, NULL, 0); - mg->mg_ptr = (char *) sfirst; - mg->mg_len = (256 + len) * sizeof(U32); + mg->mg_ptr = sfirst_raw; + mg->mg_len = size; + mg->mg_private = quanta; - snext = sfirst + 256; - memset(sfirst, ~0, 256 * sizeof(U32)); + memset(sfirst_raw, ~0, 256 * quanta); - while (len-- > 0) { - const U8 ch = s[len]; - snext[len] = sfirst[ch]; - sfirst[ch] = len; + /* The assumption here is that most studied strings are fairly short, hence + the pain of the extra code is worth it, given the memory savings. + 80 character string, 336 bytes as U8, down from 1344 as U32 + 800 character string, 2112 bytes as U16, down from 4224 as U32 + */ + + if (quanta == 1) { + U8 *const sfirst = (U8 *)sfirst_raw; + U8 *const snext = sfirst + 256; + while (len-- > 0) { + const U8 ch = s[len]; + snext[len] = sfirst[ch]; + sfirst[ch] = len; + } + } else if (quanta == 2) { + U16 *const sfirst = (U16 *)sfirst_raw; + U16 *const snext = sfirst + 256; + while (len-- > 0) { + const U8 ch = s[len]; + snext[len] = sfirst[ch]; + sfirst[ch] = len; + } + } else { + U32 *const sfirst = (U32 *)sfirst_raw; + U32 *const snext = sfirst + 256; + while (len-- > 0) { + const U8 ch = s[len]; + snext[len] = sfirst[ch]; + sfirst[ch] = len; + } } RETPUSHYES; @@ -696,12 +696,22 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, I32 p = -1; /* Internal iterator of scream. */ I32 * const pp = data ? data->scream_pos : &p; const MAGIC *mg; + bool found = FALSE; assert(SvMAGICAL(sv)); mg = mg_find(sv, PERL_MAGIC_study); assert(mg); - if (((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0 + if (mg->mg_private == 1) { + found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0; + } else if (mg->mg_private == 2) { + found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0; + } else { + assert (mg->mg_private == 4); + found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0; + } + + if (found || ( BmRARE(check) == '\n' && (BmPREVIOUS(check) == SvCUR(check) - 1) && SvTAIL(check) )) @@ -854,7 +854,7 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift { dVAR; register const unsigned char *big; - U32 pos; + U32 pos = 0; /* hush a gcc warning */ register I32 previous; register I32 first; register const unsigned char *little; @@ -862,9 +862,8 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift register const unsigned char *littleend; bool found = FALSE; const MAGIC * mg; - U32 *screamfirst; - U32 *screamnext; - U32 const nope = ~0; + const void *screamnext_raw = NULL; /* hush a gcc warning */ + bool cant_find = FALSE; /* hush a gcc warning */ PERL_ARGS_ASSERT_SCREAMINSTR; @@ -874,12 +873,37 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift assert(SvTYPE(littlestr) == SVt_PVMG); assert(SvVALID(littlestr)); - screamfirst = (U32 *)mg->mg_ptr; - screamnext = screamfirst + 256; + if (mg->mg_private == 1) { + const U8 *const screamfirst = (U8 *)mg->mg_ptr; + const U8 *const screamnext = screamfirst + 256; - pos = *old_posp == -1 - ? screamfirst[BmRARE(littlestr)] : screamnext[*old_posp]; - if (pos == nope) { + screamnext_raw = (const void *)screamnext; + + pos = *old_posp == -1 + ? screamfirst[BmRARE(littlestr)] : screamnext[*old_posp]; + cant_find = pos == (U8)~0; + } else if (mg->mg_private == 2) { + const U16 *const screamfirst = (U16 *)mg->mg_ptr; + const U16 *const screamnext = screamfirst + 256; + + screamnext_raw = (const void *)screamnext; + + pos = *old_posp == -1 + ? screamfirst[BmRARE(littlestr)] : screamnext[*old_posp]; + cant_find = pos == (U16)~0; + } else if (mg->mg_private == 4) { + const U32 *const screamfirst = (U32 *)mg->mg_ptr; + const U32 *const screamnext = screamfirst + 256; + + screamnext_raw = (const void *)screamnext; + + pos = *old_posp == -1 + ? screamfirst[BmRARE(littlestr)] : screamnext[*old_posp]; + cant_find = pos == (U32)~0; + } else + Perl_croak(aTHX_ "panic: unknown study size %u", mg->mg_private); + + if (cant_find) { cant_find: if ( BmRARE(littlestr) == '\n' && BmPREVIOUS(littlestr) == SvCUR(littlestr) - 1) { @@ -910,13 +934,30 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift #endif return NULL; } - while ((I32)pos < previous + start_shift) { - pos = screamnext[pos]; - if (pos == nope) - goto cant_find; + if (mg->mg_private == 1) { + const U8 *const screamnext = (const U8 *const) screamnext_raw; + while ((I32)pos < previous + start_shift) { + pos = screamnext[pos]; + if (pos == (U8)~0) + goto cant_find; + } + } else if (mg->mg_private == 2) { + const U16 *const screamnext = (const U16 *const) screamnext_raw; + while ((I32)pos < previous + start_shift) { + pos = screamnext[pos]; + if (pos == (U16)~0) + goto cant_find; + } + } else if (mg->mg_private == 4) { + const U32 *const screamnext = (const U32 *const) screamnext_raw; + while ((I32)pos < previous + start_shift) { + pos = screamnext[pos]; + if (pos == (U32)~0) + goto cant_find; + } } big -= previous; - do { + while (1) { if ((I32)pos >= stop_pos) break; if (big[pos] == first) { const unsigned char *s = little; @@ -932,8 +973,20 @@ Perl_screaminstr(pTHX_ SV *bigstr, SV *littlestr, I32 start_shift, I32 end_shift found = TRUE; } } - pos = screamnext[pos]; - } while (pos != nope); + if (mg->mg_private == 1) { + pos = ((const U8 *const)screamnext_raw)[pos]; + if (pos == (U8)~0) + break; + } else if (mg->mg_private == 2) { + pos = ((const U16 *const)screamnext_raw)[pos]; + if (pos == (U16)~0) + break; + } else if (mg->mg_private == 4) { + pos = ((const U32 *const)screamnext_raw)[pos]; + if (pos == (U32)~0) + break; + } + }; if (last && found) return (char *)(big+(*old_posp)); check_tail: |