diff options
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | pod.lst | 1 | ||||
-rw-r--r-- | pod/perl.pod | 1 | ||||
-rw-r--r-- | pod/perlreapi.pod | 498 | ||||
-rw-r--r-- | pod/perlreguts.pod | 328 | ||||
-rw-r--r-- | vms/descrip_mms.template | 6 | ||||
-rw-r--r-- | win32/pod.mak | 4 |
7 files changed, 549 insertions, 290 deletions
@@ -3204,6 +3204,7 @@ pod/perlpod.pod Perl plain old documentation pod/perlpodspec.pod Perl plain old documentation format specification pod/perlport.pod Perl portability guide pod/perlpragma.pod Perl modules: writing a user pragma +pod/perlreapi.pod Perl regexp plugin interface pod/perlref.pod Perl references, the rest of the story pod/perlreftut.pod Perl references short introduction pod/perlreguts.pod Perl regular expression engine internals @@ -108,6 +108,7 @@ h Internals and C Language Interface perlclib Internal replacements for standard C library functions perlguts Perl internal functions for those doing extensions perlcall Perl calling conventions from C + perlreapi Perl regular expression plugin interface perlreguts Perl regular expression engine internals perlapi Perl API listing (autogenerated) diff --git a/pod/perl.pod b/pod/perl.pod index 75b537a7fc..d6bbd61135 100644 --- a/pod/perl.pod +++ b/pod/perl.pod @@ -125,6 +125,7 @@ For ease of access, the Perl manual has been split up into several sections. perlclib Internal replacements for standard C library functions perlguts Perl internal functions for those doing extensions perlcall Perl calling conventions from C + perlreapi Perl regular expression plugin interface perlreguts Perl regular expression engine internals perlapi Perl API listing (autogenerated) diff --git a/pod/perlreapi.pod b/pod/perlreapi.pod new file mode 100644 index 0000000000..02e1ccb265 --- /dev/null +++ b/pod/perlreapi.pod @@ -0,0 +1,498 @@ +=head1 NAME + +perlreapi - perl regular expression plugin interface + +=head1 DESCRIPTION + +As of Perl 5.9.5 there is a new interface for using other regexp engines than +the default one. 
Each engine is supposed to provide access to a constant +structure of the following format: + + typedef struct regexp_engine { + regexp* (*comp) (pTHX_ char* exp, char* xend, U32 pm_flags); + I32 (*exec) (pTHX_ regexp* prog, char* stringarg, char* strend, + char* strbeg, I32 minend, SV* screamer, + void* data, U32 flags); + char* (*intuit) (pTHX_ regexp *prog, SV *sv, char *strpos, + char *strend, U32 flags, + struct re_scream_pos_data_s *data); + SV* (*checkstr) (pTHX_ regexp *prog); + void (*free) (pTHX_ struct regexp* r); + SV* (*numbered_buff_get) (pTHX_ const REGEXP * const rx, I32 paren, SV* usesv); + SV* (*named_buff_get)(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags); + SV* (*qr_pkg)(pTHX_ const REGEXP * const rx); + #ifdef USE_ITHREADS + void* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param); + #endif + } regexp_engine; + +When a regexp is compiled, its C<engine> field is then set to point at +the appropriate structure so that when it needs to be used Perl can find +the right routines to do so. + +In order to install a new regexp handler, C<$^H{regcomp}> is set +to an integer which (when casted appropriately) resolves to one of these +structures. When compiling, the C<comp> method is executed, and the +resulting regexp structure's engine field is expected to point back at +the same structure. + +The pTHX_ symbol in the definition is a macro used by perl under threading +to provide an extra argument to the routine holding a pointer back to +the interpreter that is executing the regexp. So under threading all +routines get an extra argument. + +The routines are as follows: + +=head2 comp + + regexp* comp(char *exp, char *xend, U32 flags); + +Compile the pattern between exp and xend using the given flags and return a +pointer to a prepared regexp structure that can perform the match. See L</The +REGEXP structure> below for an explanation of the individual fields in the +REGEXP struct. 
+ +The C<flags> parameter is a bitfield which indicates which of the +C<msixk> flags the regex was compiled with. In addition it contains +info about whether C<use locale> is in effect and optimization info +for C<split>. A regex engine might want to use the same split +optimizations with a different syntax, for instance a Perl6 engine +would treat C<split /^^/> equivalently to perl's C<split /^/>, see +L<split documentation|perlfunc> and the relevant code in C<pp_split> +in F<pp.c> to find out whether your engine should be setting these. + +The C<eogc> flags are stripped out before being passed to the comp +routine. The regex engine does not need to know whether any of these +are set. + +=over 4 + +=item RXf_SKIPWHITE + +C<split ' '> or C<split> with no arguments (which really means +C<split(' ', $_)>, see L<split|perlfunc>). + +=item RXf_START_ONLY + +Set if the pattern is C</^/> (C<< r->prelen == 1 && r->precomp[0] == +'^' >>). Will be used by the C<split> operator to split the given +string on C<\n> (even under C</^/s>, see L<split|perlfunc>). + +=item RXf_WHITE + +Set if the pattern is exactly C</\s+/> and used by C<split>, the +definition of whitespace varies depending on whether RXf_UTF8 or +RXf_PMf_LOCALE is set. + +=item RXf_PMf_LOCALE + +Makes C<split> use the locale dependent definition of whitespace under C<use +locale> when RXf_SKIPWHITE or RXf_WHITE is in effect. Under ASCII whitespace is +defined as per L<isSPACE|perlapi/ISSPACE>, and by the internal macros +C<is_utf8_space> under UTF-8 and C<isSPACE_LC> under C<use locale>. + +=item RXf_PMf_MULTILINE + +The C</m> flag, this ends up being passed to C<Perl_fbm_instr> by +C<pp_split> regardless of the engine. + +=item RXf_PMf_SINGLELINE + +The C</s> flag. Guaranteed not to be used outside the regex engine. + +=item RXf_PMf_FOLD + +The C</i> flag. Guaranteed not to be used outside the regex engine. + +=item RXf_PMf_EXTENDED + +The C</x> flag. Guaranteed not to be used outside the regex +engine. 
However if present on a regex C<#> comments will be stripped +by the tokenizer regardless of the engine currently in use. + +=item RXf_PMf_KEEPCOPY + +The C</k> flag. + +=item RXf_UTF8 + +Set if the pattern is L<SvUTF8()|perlapi/SvUTF8>, set by Perl_pmruntime. + +=back + +In general these flags should be preserved in regex->extflags after +compilation, although it is possible the regex includes constructs +that changes them. The perl engine for instance may upgrade non-utf8 +strings to utf8 if the pattern includes constructs such as C<\x{...}> +that can only match unicode values. RXf_SKIPWHITE should always be +preserved verbatim in regex->extflags. + +=head2 exec + + I32 exec(regexp* prog, + char *stringarg, char* strend, char* strbeg, + I32 minend, SV* screamer, + void* data, U32 flags); + +Execute a regexp. + +=head2 intuit + + char* intuit( regexp *prog, + SV *sv, char *strpos, char *strend, + U32 flags, struct re_scream_pos_data_s *data); + +Find the start position where a regex match should be attempted, +or possibly whether the regex engine should not be run because the +pattern can't match. This is called as appropriate by the core +depending on the values of the extflags member of the regexp +structure. + +=head2 checkstr + + SV* checkstr(regexp *prog); + +Return a SV containing a string that must appear in the pattern. Used +by C<split> for optimising matches. + +=head2 free + + void free(regexp *prog); + +Called by perl when it is freeing a regexp pattern so that the engine +can release any resources pointed to by the C<pprivate> member of the +regexp structure. This is only responsible for freeing private data; +perl will handle releasing anything else contained in the regexp structure. 
+ +=head2 numbered_buff_get + + SV* numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv); + +TODO: document + +=head2 named_buff_get + + SV* named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags); + +TODO: document + +=head2 qr_pkg + + SV* qr_pkg(pTHX_ const REGEXP * const rx); + +The package the qr// magic object is blessed into (as seen by C<ref +qr//>). It is recommended that engines change this to their package +name, for instance: + + SV* + Example_reg_qr_pkg(pTHX_ const REGEXP * const rx) + { + PERL_UNUSED_ARG(rx); + return newSVpvs("re::engine::Example"); + } + +Any method calls on an object created with C<qr//> will be dispatched to the +package as a normal object. + + use re::engine::Example; + my $re = qr//; + $re->meth; # dispatched to re::engine::Example::meth() + +To retrieve the C<REGEXP> object from the scalar in an XS function use the +following snippet: + + void meth(SV * sv) + PPCODE: + MAGIC * mg; + REGEXP * re; + + if (SvMAGICAL(sv)) + mg_get(sv); + if (SvROK(sv) && + (sv = (SV*)SvRV(sv)) && /* assignment deliberate */ + SvTYPE(sv) == SVt_PVMG && + (mg = mg_find(sv, PERL_MAGIC_qr))) /* assignment deliberate */ + { + re = (REGEXP *)mg->mg_obj; + } + +Or use the (CURRENTLY UNDOCUMENTED!) C<Perl_get_re_arg> function: + + void meth(SV * rv) + PPCODE: + const REGEXP * const re = (REGEXP *)Perl_get_re_arg( aTHX_ rv, 0, NULL ); + +=head2 dupe + + void* dupe(const regexp *r, CLONE_PARAMS *param); + +On threaded builds a regexp may need to be duplicated so that the pattern +can be used by multiple threads. This routine is expected to handle the +duplication of any private data pointed to by the C<pprivate> member of +the regexp structure. 
It will be called with the preconstructed new +regexp structure as an argument, the C<pprivate> member will point at +the B<old> private structure, and it is this routine's responsibility to +construct a copy and return a pointer to it (which perl will then use to +overwrite the field as passed to this routine.) + +This allows the engine to dupe its private data but also if necessary +modify the final structure if it really must. + +On unthreaded builds this field doesn't exist. + +=head1 The REGEXP structure + +The REGEXP struct is defined in F<regexp.h>. All regex engines must be able to +correctly build such a structure in their L</comp> routine. + +The REGEXP structure contains all the data that perl needs to be aware of +to properly work with the regular expression. It includes data about +optimisations that perl can use to determine if the regex engine should +really be used, and various other control info that is needed to properly +execute patterns in various contexts such as whether the pattern is anchored in +some way, or what flags were used during the compile, or whether the +program contains special constructs that perl needs to be aware of. + +In addition it contains two fields that are intended for the private use +of the regex engine that compiled the pattern. These are the C<intflags> +and C<pprivate> members. The C<pprivate> is a void pointer to an arbitrary +structure whose use and management is the responsibility of the compiling +engine. perl will never modify either of these values. + + typedef struct regexp { + /* what engine created this regexp? */ + const struct regexp_engine* engine; + + /* what re is this a lightweight copy of? 
*/ + struct regexp* mother_re; + + /* Information about the match that the perl core uses to manage things */ + U32 extflags; /* Flags used both externally and internally */ + I32 minlen; /* mininum possible length of string to match */ + I32 minlenret; /* mininum possible length of $& */ + U32 gofs; /* chars left of pos that we search from */ + + /* substring data about strings that must appear + in the final match, used for optimisations */ + struct reg_substr_data *substrs; + + U32 nparens; /* number of capture buffers */ + + /* private engine specific data */ + U32 intflags; /* Engine Specific Internal flags */ + void *pprivate; /* Data private to the regex engine which + created this object. */ + + /* Data about the last/current match. These are modified during matching*/ + U32 lastparen; /* last open paren matched */ + U32 lastcloseparen; /* last close paren matched */ + regexp_paren_pair *swap; /* Swap copy of *offs */ + regexp_paren_pair *offs; /* Array of offsets for (@-) and (@+) */ + + char *subbeg; /* saved or original string so \digit works forever. */ + SV_SAVED_COPY /* If non-NULL, SV which is COW from original */ + I32 sublen; /* Length of string pointed by subbeg */ + + /* Information about the match that isn't often used */ + I32 prelen; /* length of precomp */ + const char *precomp; /* pre-compilation regular expression */ + + /* wrapped can't be const char*, as it is returned by sv_2pv_flags */ + char *wrapped; /* wrapped version of the pattern */ + I32 wraplen; /* length of wrapped */ + + I32 seen_evals; /* number of eval groups in the pattern - for security checks */ + HV *paren_names; /* Optional hash of paren names */ + + /* Refcount of this regexp */ + I32 refcnt; /* Refcount of this regexp */ + } regexp; + +The fields are discussed in more detail below: + +=over 4 + +=item C<engine> + +This field points at a regexp_engine structure which contains pointers +to the subroutines that are to be used for performing a match. 
It +is the compiling routine's responsibility to populate this field before +returning the regexp object. + +Internally this is set to C<NULL> unless a custom engine is specified in +C<$^H{regcomp}>, perl's own set of callbacks can be accessed in the struct +pointed to by C<RE_ENGINE_PTR>. + +=item C<mother_re> + +TODO, see L<http://www.mail-archive.com/perl5-changes@perl.org/msg17328.html> + +=item C<extflags> + +This will be used by perl to see what flags the regexp was compiled with, this +will normally be set to the value of the flags parameter on L</comp>. + +=item C<minlen> C<minlenret> + +The minimum string length required for the pattern to match. This is used to +prune the search space by not bothering to match any closer to the end of a +string than would allow a match. For instance there is no point in even +starting the regex engine if the minlen is 10 but the string is only 5 +characters long. There is no way that the pattern can match. + +C<minlenret> is the minimum length of the string that would be found +in $& after a match. + +The difference between C<minlen> and C<minlenret> can be seen in the +following pattern: + + /ns(?=\d)/ + +where the C<minlen> would be 3 but C<minlenret> would only be 2 as the \d is +required to match but is not actually included in the matched content. This +distinction is particularly important as the substitution logic uses the +C<minlenret> to tell whether it can do in-place substitution which can result in +considerable speedup. + +=item C<gofs> + +Left offset from pos() to start match at. + +=item C<substrs> + +TODO: document + +=item C<nparens>, C<lastparen>, and C<lastcloseparen> + +These fields are used to keep track of how many paren groups could be matched +in the pattern, which was the last open paren to be entered, and which was +the last close paren to be entered. + +=item C<intflags> + +The engine's private copy of the flags the pattern was compiled with. 
Usually +this is the same as C<extflags> unless the engine chose to modify one of them. + +=item C<pprivate> + +A void* pointing to an engine-defined data structure. The perl engine uses the +C<regexp_internal> structure (see L<perlreguts/Base Structures>) but a custom +engine should use something else. + +=item C<swap> + +TODO: document + +=item C<offs> + +A C<regexp_paren_pair> structure which defines offsets into the string being +matched which correspond to the C<$&> and C<$1>, C<$2> etc. captures, the +C<regexp_paren_pair> struct is defined as follows: + + typedef struct regexp_paren_pair { + I32 start; + I32 end; + } regexp_paren_pair; + +If C<< ->offs[num].start >> or C<< ->offs[num].end >> is C<-1> then that +capture buffer did not match. C<< ->offs[0].start/end >> represents C<$&> (or +C<${^MATCH}> under C<//p>) and C<< ->offs[paren].end >> matches C<$$paren> where +C<< $paren >= 1 >>. + +=item C<precomp> C<prelen> + +Used for debugging purposes. C<precomp> holds a copy of the pattern +that was compiled and C<prelen> its length. + +=item C<paren_names> + +This is a hash used internally to track named capture buffers and their +offsets. The keys are the names of the buffers; the values are dualvars, +with the IV slot holding the number of buffers with the given name and the +pv being an embedded array of I32. The values may also be contained +independently in the data array in cases where named backreferences are +used. + +=item C<reg_substr_data> + +Holds information on the longest string that must occur at a fixed +offset from the start of the pattern, and the longest string that must +occur at a floating offset from the start of the pattern. Used to do +Fast-Boyer-Moore searches on the string to find out if it's worth using +the regex engine at all, and if so where in the string to search. + +=item C<startp>, C<endp> + +These fields store arrays that are used to hold the offsets of the beginning +and end of each capture group that has matched. 
-1 is used to indicate no match. + +These are the source for @- and @+. + +=item C<subbeg> C<sublen> C<saved_copy> + + #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) + if (RX_MATCH_COPIED(ret)) + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); + else + ret->subbeg = NULL; + +C<< PL_sawampersand || rx->extflags & RXf_PMf_KEEPCOPY >> + +These are used during the execution phase for managing search and replace +patterns. + +=item C<wrapped> C<wraplen> + +Stores the string C<qr//> stringifies to, for example C<(?-xism:eek)> +in the case of C<qr/eek/>. + +When using a custom engine that doesn't support the C<(?:)> construct for +inline modifiers it's best to have C<qr//> stringify to the supplied pattern, +note that this will create invalid patterns in cases such as: + + my $x = qr/a|b/; # "a|b" + my $y = qr/c/; # "c" + my $z = qr/$x$y/; # "a|bc" + +There's no solution for such problems other than making the custom engine +understand some form of inline modifiers. + +The C<Perl_reg_stringify> function in F<regcomp.c> does the stringification work. + +=item C<seen_evals> + +This stores the number of eval groups in the pattern. This is used for security +purposes when embedding compiled regexes into larger patterns with C<qr//>. + +=item C<refcnt> + +The number of times the structure is referenced. When this falls to 0 the +regexp is automatically freed by a call to pregfree. This should be set to 1 in +each engine's L</comp> routine. + +=back + +=head2 De-allocation and Cloning + +Any patch that adds data items to the REGEXP struct will need to include +changes to F<sv.c> (C<Perl_re_dup()>) and F<regcomp.c> (C<pregfree()>). This +involves freeing or cloning items in the regexp's data array based on the data +item's type. + +=head1 HISTORY + +Originally part of L<perlreguts>. + +=head1 AUTHORS + +Originally written by Yves Orton, expanded by E<AElig>var ArnfjE<ouml>rE<eth> +Bjarmason. + +=head1 LICENSE + +Copyright 2006 Yves Orton and 2007 E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason. 
+ +This program is free software; you can redistribute it and/or modify it under +the same terms as Perl itself. + +=cut diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index 577f672bf4..125a9f9f41 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -12,14 +12,15 @@ author's experience, comments in the source code, other papers on the regex engine, feedback on the perl5-porters mail list, and no doubt other places as well. -B<WARNING!> It should be clearly understood that this document represents -the state of the regex engine as the author understands it at the time of -writing. Unless stated otherwise it is B<NOT> an API definition; it is -purely an internals guide for those who want to hack the regex engine, or -understand how the regex engine works. Readers of this document are -expected to understand perl's regex syntax and its usage in detail. If you -want to learn about the basics of Perl's regular expressions, see -L<perlre>. +B<NOTICE!> It should be clearly understood that the behavior and +structures discussed in this document represent the state of the engine as the +author understood it at the time of writing. It is B<NOT> an API +definition; it is purely an internals guide for those who want to hack +the regex engine, or understand how the regex engine works. Readers of +this document are expected to understand perl's regex syntax and its +usage in detail. If you want to learn about the basics of Perl's +regular expressions, see L<perlre>. And if you want to replace the +regex engine with your own see L<perlreapi>. =head1 OVERVIEW @@ -384,9 +385,9 @@ A grammar form might be something like this: =head3 Debug Output -In the 5.9.x development version of perl you can C<< use re Debug => 'PARSE'; >> to see some trace -information about the parse process. We will start with some simple -patterns and build up to more complex patterns. 
+In the 5.9.x development version of perl you can C<< use re Debug => 'PARSE' >> +to see some trace information about the parse process. We will start with some +simple patterns and build up to more complex patterns. So when we parse C</foo/> we see something like the following table. The left shows what is being parsed, and the number indicates where the next regop @@ -743,11 +744,28 @@ tricky this can be: =head2 Base Structures +The C<regexp> structure described in L<perlreapi> is common to all +regex engines. Two of its fields are intended for the private use +of the regex engine that compiled the pattern. These are the +C<intflags> and C<pprivate> members. The C<pprivate> is a void pointer to +an arbitrary structure whose use and management is the responsibility +of the compiling engine. perl will never modify either of these +values. In the case of the stock engine the structure pointed to by +C<pprivate> is called C<regexp_internal>. + +Its C<pprivate> and C<intflags> fields contain data +specific to each engine. + There are two structures used to store a compiled regular expression. -One, the regexp structure, is considered to be perl's property, and the -other is considered to be the property of the regex engine which -compiled the regular expression; in the case of the stock engine this -structure is called regexp_internal. +One, the C<regexp> structure described in L<perlreapi>, is populated by +the engine currently being used, and some of its fields are read by perl to +implement things such as the stringification of C<qr//>. + + +The other structure is pointed to by the C<regexp> struct's +C<pprivate> field and is, in addition to C<intflags> in the same struct, +considered to be the property of the regex engine which compiled the +regular expression. The regexp structure contains all the data that perl needs to be aware of to properly work with the regular expression. 
It includes data about @@ -768,151 +786,11 @@ will be a pointer to a regexp_internal structure which holds the compiled program and any additional data that is private to the regex engine implementation. -=head3 Perl Inspectable Data About Pattern - -F<regexp.h> contains the "public" structure definition. All regex engines -must be able to correctly build a regexp structure. - - typedef struct regexp { - /* what engine created this regexp? */ - const struct regexp_engine* engine; - - /* Information about the match that the perl core uses to manage things */ - U32 extflags; /* Flags used both externally and internally */ - I32 minlen; /* mininum possible length of string to match */ - I32 minlenret; /* mininum possible length of $& */ - U32 gofs; /* chars left of pos that we search from */ - struct reg_substr_data *substrs; /* substring data about strings that must appear - in the final match, used for optimisations */ - U32 nparens; /* number of capture buffers */ - - /* private engine specific data */ - U32 intflags; /* Engine Specific Internal flags */ - void *pprivate; /* Data private to the regex engine which - created this object. */ - - /* Data about the last/current match. These are modified during matching*/ - U32 lastparen; /* last open paren matched */ - U32 lastcloseparen; /* last close paren matched */ - I32 *startp; /* Array of offsets from start of string (@-) */ - I32 *endp; /* Array of offsets from start of string (@+) */ - char *subbeg; /* saved or original string - so \digit works forever. 
*/ - I32 sublen; /* Length of string pointed by subbeg */ - SV_SAVED_COPY /* If non-NULL, SV which is COW from original */ - - - /* Information about the match that isn't often used */ - char *precomp; /* pre-compilation regular expression */ - I32 prelen; /* length of precomp */ - I32 seen_evals; /* number of eval groups in the pattern - for security checks */ - HV *paren_names; /* Optional hash of paren names */ - - /* Refcount of this regexp */ - I32 refcnt; /* Refcount of this regexp */ - } regexp; - -The fields are discussed in more detail below: - -=over 5 - - -=item C<refcnt> - -The number of times the structure is referenced. When this falls to 0 -the regexp is automatically freed by a call to pregfree. - -=item C<engine> - -This field points at a regexp_engine structure which contains pointers -to the subroutines that are to be used for performing a match. It -is the compiling routine's responsibility to populate this field before -returning the regexp object. - -=item C<precomp> C<prelen> - -Used for debugging purposes. C<precomp> holds a copy of the pattern -that was compiled. - -=item C<extflags> - -This is used to store various flags about the pattern, such as whether it -contains a \G or a ^ or $ symbol. - -=item C<minlen> C<minlenret> - -C<minlen> is the minimum string length required for the pattern to match. -This is used to prune the search space by not bothering to match any -closer to the end of a string than would allow a match. For instance -there is no point in even starting the regex engine if the minlen is -10 but the string is only 5 characters long. There is no way that the -pattern can match. - -C<minlenret> is the minimum length of the string that would be found -in $& after a match. 
- -The difference between C<minlen> and C<minlenret> can be seen in the -following pattern: - - /ns(?=\d)/ - -where the C<minlen> would be 3 but the minlen ret would only be 2 as -the \d is required to match but is not actually included in the matched -content. This distinction is particularly important as the substitution -logic uses the C<minlenret> to tell whether it can do in-place substition -which can result in considerable speedup. - -=item C<gofs> - -Left offset from pos() to start match at. - -=item C<nparens>, C<lasparen>, and C<lastcloseparen> - -These fields are used to keep track of how many paren groups could be matched -in the pattern, which was the last open paren to be entered, and which was -the last close paren to be entered. - -=item C<paren_names> - -This is a hash used internally to track named capture buffers and their -offsets. The keys are the names of the buffers the values are dualvars, -with the IV slot holding the number of buffers with the given name and the -pv being an embedded array of I32. The values may also be contained -independently in the data array in cases where named backreferences are -used. - -=item C<reg_substr_data> - -Holds information on the longest string that must occur at a fixed -offset from the start of the pattern, and the longest string that must -occur at a floating offset from the start of the pattern. Used to do -Fast-Boyer-Moore searches on the string to find out if its worth using -the regex engine at all, and if so where in the string to search. - -=item C<startp>, C<endp> - -These fields store arrays that are used to hold the offsets of the begining -and end of each capture group that has matched. -1 is used to indicate no match. - -These are the source for @- and @+. - -=item C<subbeg> C<sublen> C<saved_copy> - -These are used during execution phase for managing search and replace -patterns. 
- -=item C<seen_evals> +=head3 Perl's C<pprivate> structure -This stores the number of eval groups in the pattern. This is used -for security purposes when embedding compiled regexes into larger -patterns. - -=back - -=head3 Engine Private Data About Pattern - -Additionally, regexp.h contains the following "private" definition which is -perl-specific and is only of curiosity value to other engine implementations. +The following structure is used as the C<pprivate> struct by perl's +regex engine. Since it is specific to perl it is only of curiosity +value to other engine implementations. typedef struct regexp_internal { regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */ @@ -980,138 +858,10 @@ treated as a single blob. =back -=head2 Pluggable Interface - -As of Perl 5.9.5 there is a new interface for using other regexp engines -than the default one. Each engine is supposed to provide access to -a constant structure of the following format: - - typedef struct regexp_engine { - regexp* (*comp) (pTHX_ char* exp, char* xend, U32 pm_flags); - I32 (*exec) (pTHX_ regexp* prog, char* stringarg, char* strend, - char* strbeg, I32 minend, SV* screamer, - void* data, U32 flags); - char* (*intuit) (pTHX_ regexp *prog, SV *sv, char *strpos, - char *strend, U32 flags, - struct re_scream_pos_data_s *data); - SV* (*checkstr) (pTHX_ regexp *prog); - void (*free) (pTHX_ struct regexp* r); - #ifdef USE_ITHREADS - void* (*dupe) (pTHX_ const regexp *r, CLONE_PARAMS *param); - #endif - } regexp_engine; - -When a regexp is compiled, its C<engine> field is then set to point at -the appropriate structure so that when it needs to be used Perl can find -the right routines to do so. - -In order to install a new regexp handler, C<$^H{regcomp}> is set -to an integer which (when casted appropriately) resolves to one of these -structures. When compiling, the C<comp> method is executed, and the -resulting regexp structure's engine field is expected to point back at -the same structure. 
- -The pTHX_ symbol in the definition is a macro used by perl under threading -to provide an extra argument to the routine holding a pointer back to -the interpreter that is executing the regexp. So under threading all -routines get an extra argument. - -The routines are as follows: - -=over 4 - -=item comp - - regexp* comp(char *exp, char *xend, U32 pm_flags); - -Compile the pattern between exp and xend using the flags contained in -pm and return a pointer to a prepared regexp structure that can perform -the match. pm flags will have the following flag bits set as determined -by the context that comp() has been called from: - - RXf_UTF8 pattern is encoded in UTF8 - RXf_PMf_LOCALE use locale - RXf_PMf_MULTILINE /m - RXf_PMf_SINGLELINE /s - RXf_PMf_FOLD /i - RXf_PMf_EXTENDED /x - RXf_PMf_KEEPCOPY /k - RXf_SKIPWHITE split ' ' or split with no args - -In general these flags should be preserved in regex->extflags after -compilation, although it is possible the regex includes constructs that -changes them. The perl engine for instance may upgrade non-utf8 strings -to utf8 if the pattern includes constructs such as C<\x{...}> that can only -match unicode values. RXf_SKIPWHITE should always be preserved verbatim -in regex->extflags. - -=item exec - - I32 exec(regexp* prog, - char *stringarg, char* strend, char* strbeg, - I32 minend, SV* screamer, - void* data, U32 flags); - -Execute a regexp. - -=item intuit - - char* intuit( regexp *prog, - SV *sv, char *strpos, char *strend, - U32 flags, struct re_scream_pos_data_s *data); - -Find the start position where a regex match should be attempted, -or possibly whether the regex engine should not be run because the -pattern can't match. This is called as appropriate by the core -depending on the values of the extflags member of the regexp -structure. - -=item checkstr - - SV* checkstr(regexp *prog); - -Return a SV containing a string that must appear in the pattern. Used -for optimising matches. 
- -=item free - - void free(regexp *prog); - -Called by perl when it is freeing a regexp pattern so that the engine -can release any resources pointed to by the C<pprivate> member of the -regexp structure. This is only responsible for freeing private data; -perl will handle releasing anything else contained in the regexp structure. - -=item dupe - - void* dupe(const regexp *r, CLONE_PARAMS *param); - -On threaded builds a regexp may need to be duplicated so that the pattern -can be used by mutiple threads. This routine is expected to handle the -duplication of any private data pointed to by the C<pprivate> member of -the regexp structure. It will be called with the preconstructed new -regexp structure as an argument, the C<pprivate> member will point at -the B<old> private structue, and it is this routine's responsibility to -construct a copy and return a pointer to it (which perl will then use to -overwrite the field as passed to this routine.) - -This allows the engine to dupe its private data but also if necessary -modify the final structure if it really must. - -On unthreaded builds this field doesn't exist. - -=back - - -=head2 De-allocation and Cloning - -Any patch that adds data items to the regexp will need to include -changes to F<sv.c> (C<Perl_re_dup()>) and F<regcomp.c> (C<pregfree()>). This -involves freeing or cloning items in the regexp's data array based -on the data item's type. 
- =head1 SEE ALSO +L<perlreapi> + L<perlre> L<perlunitut> diff --git a/vms/descrip_mms.template b/vms/descrip_mms.template index d5741a3b85..30d219a7de 100644 --- a/vms/descrip_mms.template +++ b/vms/descrip_mms.template @@ -408,7 +408,7 @@ pod17 = [.lib.pods]perlmodinstall.pod [.lib.pods]perlmodlib.pod [.lib.pods]perlm pod18 = [.lib.pods]perlnewmod.pod [.lib.pods]perlnumber.pod [.lib.pods]perlobj.pod [.lib.pods]perlop.pod [.lib.pods]perlopenbsd.pod pod19 = [.lib.pods]perlopentut.pod [.lib.pods]perlos2.pod [.lib.pods]perlos390.pod [.lib.pods]perlos400.pod [.lib.pods]perlothrtut.pod pod20 = [.lib.pods]perlpacktut.pod [.lib.pods]perlplan9.pod [.lib.pods]perlpod.pod [.lib.pods]perlpodspec.pod [.lib.pods]perlport.pod -pod21 = [.lib.pods]perlpragma.pod [.lib.pods]perlqnx.pod [.lib.pods]perlre.pod [.lib.pods]perlref.pod [.lib.pods]perlreftut.pod [.lib.pods]perlreguts.pod +pod21 = [.lib.pods]perlpragma.pod [.lib.pods]perlqnx.pod [.lib.pods]perlre.pod [.lib.pods]perlref.pod [.lib.pods]perlreftut.pod [.lib.pods]perlreapi.pod [.lib.pods]perlreguts.pod pod22 = [.lib.pods]perlrequick.pod [.lib.pods]perlreref.pod [.lib.pods]perlretut.pod [.lib.pods]perlriscos.pod [.lib.pods]perlrun.pod [.lib.pods]perlsec.pod pod23 = [.lib.pods]perlsolaris.pod [.lib.pods]perlstyle.pod [.lib.pods]perlsub.pod [.lib.pods]perlsymbian.pod [.lib.pods]perlsyn.pod pod24 = [.lib.pods]perlthrtut.pod [.lib.pods]perltie.pod [.lib.pods]perltoc.pod [.lib.pods]perltodo.pod [.lib.pods]perltooc.pod [.lib.pods]perltoot.pod @@ -1180,6 +1180,10 @@ makeppport : $(MINIPERL_EXE) $(ARCHDIR)Config.pm @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] +[.lib.pods]perlreapi.pod : [.pod]perlreapi.pod + @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] + Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] + [.lib.pods]perlreguts.pod : [.pod]perlreguts.pod @ If F$Search("[.lib]pods.dir").eqs."" Then Create/Directory [.lib.pods] 
Copy/NoConfirm/Log $(MMS$SOURCE) [.lib.pods] diff --git a/win32/pod.mak b/win32/pod.mak index d2db1de974..ad21b147c2 100644 --- a/win32/pod.mak +++ b/win32/pod.mak @@ -104,6 +104,7 @@ POD = \ perlre.pod \ perlref.pod \ perlreftut.pod \ + perlreapi.pod \ perlreguts.pod \ perlrequick.pod \ perlreref.pod \ @@ -219,6 +220,7 @@ MAN = \ perlre.man \ perlref.man \ perlreftut.man \ + perlreapi.man \ perlreguts.man \ perlrequick.man \ perlreref.man \ @@ -334,6 +336,7 @@ HTML = \ perlre.html \ perlref.html \ perlreftut.html \ + perlreapi.html \ perlreguts.html \ perlrequick.html \ perlreref.html \ @@ -449,6 +452,7 @@ TEX = \ perlre.tex \ perlref.tex \ perlreftut.tex \ + perlreapi.tex \ perlreguts.tex \ perlrequick.tex \ perlreref.tex \ |