diff options
-rw-r--r-- | regcomp.h | 31 | ||||
-rw-r--r-- | regcomp.pl | 41 | ||||
-rw-r--r-- | regcomp.sym | 85 | ||||
-rw-r--r-- | regnodes.h | 24 |
4 files changed, 104 insertions, 77 deletions
@@ -447,37 +447,6 @@ START_EXTERN_C #include "regnodes.h" #endif -/* The following have no fixed length. U8 so we can do strchr() on it. */ -#ifndef DOINIT -EXTCONST U8 PL_varies[]; -#else -EXTCONST U8 PL_varies[] = { - BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, - WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, - NREF, NREFF, NREFFL, - 0 -}; -#endif - -/* The following always have a length of 1. U8 we can do strchr() on it. */ -/* (Note that length 1 means "one character" under UTF8, not "one octet".) */ -#ifndef DOINIT -EXTCONST U8 PL_simple[]; -#else -EXTCONST U8 PL_simple[] = { - REG_ANY, SANY, CANY, - ANYOF, - ALNUM, ALNUML, - NALNUM, NALNUML, - SPACE, SPACEL, - NSPACE, NSPACEL, - DIGIT, NDIGIT, - VERTWS, NVERTWS, - HORIZWS, NHORIZWS, - 0 -}; -#endif - #ifndef PLUGGABLE_RE_EXTENSION #ifndef DOINIT EXTCONST regexp_engine PL_core_reg_engine; diff --git a/regcomp.pl b/regcomp.pl index 2fbe6c68f9..0c63b94380 100644 --- a/regcomp.pl +++ b/regcomp.pl @@ -24,7 +24,7 @@ use warnings; open DESC, 'regcomp.sym'; my $ind = 0; -my (@name,@rest,@type,@code,@args,@longj); +my (@name,@rest,@type,@code,@args,@flags,@longj); my ($desc,$lastregop); while (<DESC>) { s/#.*$//; @@ -37,8 +37,8 @@ while (<DESC>) { unless ($lastregop) { $ind++; ($name[$ind], $desc, $rest[$ind]) = /^(\S+)\s+([^\t]+)\s*;\s*(.*)/; - ($type[$ind], $code[$ind], $args[$ind], $longj[$ind]) - = split /[,\s]\s*/, $desc, 4; + ($type[$ind], $code[$ind], $args[$ind], $flags[$ind], $longj[$ind]) + = split /[,\s]\s*/, $desc; } else { my ($type,@lists)=split /\s+/, $_; die "No list? $type" if !@lists; @@ -79,6 +79,29 @@ close DESC; die "Too many regexp/state opcodes! Maximum is 256, but there are $lastregop in file!" if $lastregop>256; +sub process_flags { + my ($flag, $varname, $comment) = @_; + $comment = '' unless defined $comment; + + $ind = 0; + my @selected; + while (++$ind <= $lastregop) { + push @selected, $name[$ind] if $flags[$ind] && $flags[$ind] eq $flag; + } + my $out_string = join ', ', @selected, 0; + $out_string =~ s/(.{1,70},) /$1\n /g; + return $comment . <<"EOP"; +#ifndef DOINIT +EXTCONST U8 PL_${varname}[]; +#else +EXTCONST U8 PL_${varname}[] = { + $out_string +}; +#endif /* DOINIT */ + +EOP +} + my $tmp_h = 'regnodes.h-new'; unlink $tmp_h if -f $tmp_h; @@ -236,6 +259,18 @@ print $out <<EOP; }; #endif /* DOINIT */ +EOP + +print $out process_flags('V', 'varies', <<'EOC'); +/* The following have no fixed length. U8 so we can do strchr() on it. */ +EOC + +print $out process_flags('S', 'simple', <<'EOC'); +/* The following always have a length of 1. U8 we can do strchr() on it. */ +/* (Note that length 1 means "one character" under UTF8, not "one octet".) */ +EOC + +print $out <<EOP; /* ex: set ro: */ EOP safer_close($out); diff --git a/regcomp.sym b/regcomp.sym index 32935bf9d3..ac1c2e01a8 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -9,7 +9,7 @@ # Note that the order in this file is important. # # Format for first section: -# NAME \s+ TYPE, arg-description [num-args] [longjump-len] ; DESCRIPTION +# NAME \s+ TYPE, arg-description [flags] [num-args] [longjump-len] ; DESCRIPTION # # # run perl regen.pl after editing this file @@ -38,23 +38,23 @@ GPOS GPOS, no ; Matches where last m//g left off. #* [Special] alternatives: (14..30) -REG_ANY REG_ANY, no ; Match any one character (except newline). -SANY REG_ANY, no ; Match any one character. -CANY REG_ANY, no ; Match any one byte. -ANYOF ANYOF, sv ; Match character in (or not in) this class. -ALNUM ALNUM, no ; Match any alphanumeric character -ALNUML ALNUM, no ; Match any alphanumeric char in locale -NALNUM NALNUM, no ; Match any non-alphanumeric character -NALNUML NALNUM, no ; Match any non-alphanumeric char in locale -SPACE SPACE, no ; Match any whitespace character -SPACEL SPACE, no ; Match any whitespace char in locale -NSPACE NSPACE, no ; Match any non-whitespace character -NSPACEL NSPACE, no ; Match any non-whitespace char in locale -DIGIT DIGIT, no ; Match any numeric character +REG_ANY REG_ANY, no 0 S ; Match any one character (except newline). +SANY REG_ANY, no 0 S ; Match any one character. +CANY REG_ANY, no 0 S ; Match any one byte. +ANYOF ANYOF, sv 0 S ; Match character in (or not in) this class. +ALNUM ALNUM, no 0 S ; Match any alphanumeric character +ALNUML ALNUM, no 0 S ; Match any alphanumeric char in locale +NALNUM NALNUM, no 0 S ; Match any non-alphanumeric character +NALNUML NALNUM, no 0 S ; Match any non-alphanumeric char in locale +SPACE SPACE, no 0 S ; Match any whitespace character +SPACEL SPACE, no 0 S ; Match any whitespace char in locale +NSPACE NSPACE, no 0 S ; Match any non-whitespace character +NSPACEL NSPACE, no 0 S ; Match any non-whitespace char in locale +DIGIT DIGIT, no 0 S ; Match any numeric character DIGITL DIGIT, no ; Match any numeric character in locale -NDIGIT NDIGIT, no ; Match any non-numeric character +NDIGIT NDIGIT, no 0 S ; Match any non-numeric character NDIGITL NDIGIT, no ; Match any non-numeric character in locale -CLUMP CLUMP, no ; Match any combining character sequence +CLUMP CLUMP, no 0 V ; Match any combining character sequence #* Alternation (31) @@ -66,14 +66,14 @@ CLUMP CLUMP, no ; Match any combining character sequence # final "next" pointer of each individual branch points; each # branch starts with the operand node of a BRANCH node. # -BRANCH BRANCH, node ; Match this alternative, or the next... +BRANCH BRANCH, node 0 V ; Match this alternative, or the next... #*Back pointer (32) # BACK Normal "next" pointers all implicitly point forward; BACK # exists to make loop structures possible. # not used -BACK BACK, no ; Match "", "next" ptr points backward. +BACK BACK, no 0 V ; Match "", "next" ptr points backward. #*Literals (33..35) @@ -94,16 +94,16 @@ TAIL NOTHING, no ; Match empty string. Can jump here from outsi # per match) are implemented with STAR and PLUS for speed # and to minimize recursive plunges. # -STAR STAR, node ; Match this (simple) thing 0 or more times. -PLUS PLUS, node ; Match this (simple) thing 1 or more times. +STAR STAR, node 0 V ; Match this (simple) thing 0 or more times. +PLUS PLUS, node 0 V ; Match this (simple) thing 1 or more times. -CURLY CURLY, sv 2 ; Match this simple thing {n,m} times. -CURLYN CURLY, no 2 ; Capture next-after-this simple thing -CURLYM CURLY, no 2 ; Capture this medium-complex thing {n,m} times. -CURLYX CURLY, sv 2 ; Match this complex thing {n,m} times. +CURLY CURLY, sv 2 V ; Match this simple thing {n,m} times. +CURLYN CURLY, no 2 V ; Capture next-after-this simple thing +CURLYM CURLY, no 2 V ; Capture this medium-complex thing {n,m} times. +CURLYX CURLY, sv 2 V ; Match this complex thing {n,m} times. # This terminator creates a loop structure for CURLYX -WHILEM WHILEM, no ; Do curly processing and see if rest matches. +WHILEM WHILEM, no 0 V ; Do curly processing and see if rest matches. #*Buffer related (45..49) @@ -111,22 +111,21 @@ WHILEM WHILEM, no ; Do curly processing and see if rest matches. OPEN OPEN, num 1 ; Mark this point in input as start of #n. CLOSE CLOSE, num 1 ; Analogous to OPEN. -REF REF, num 1 ; Match some already matched string -REFF REF, num 1 ; Match already matched string, folded -REFFL REF, num 1 ; Match already matched string, folded in loc. +REF REF, num 1 V ; Match some already matched string +REFF REF, num 1 V ; Match already matched string, folded +REFFL REF, num 1 V ; Match already matched string, folded in loc. -#*Grouping assertions (50..54) -IFMATCH BRANCHJ, off 1 2 ; Succeeds if the following matches. -UNLESSM BRANCHJ, off 1 2 ; Fails if the following matches. -SUSPEND BRANCHJ, off 1 1 ; "Independent" sub-RE. -IFTHEN BRANCHJ, off 1 1 ; Switch, should be preceeded by switcher . +IFMATCH BRANCHJ, off 1 . 2 ; Succeeds if the following matches. +UNLESSM BRANCHJ, off 1 . 2 ; Fails if the following matches. +SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE. +IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceeded by switcher . GROUPP GROUPP, num 1 ; Whether the group matched. #*Support for long RE (55..56) -LONGJMP LONGJMP, off 1 1 ; Jump far away. -BRANCHJ BRANCHJ, off 1 1 ; BRANCH with long offset. +LONGJMP LONGJMP, off 1 . 1 ; Jump far away. +BRANCHJ BRANCHJ, off 1 V 1 ; BRANCH with long offset. #*The heavy worker (57) @@ -138,7 +137,7 @@ MINMOD MINMOD, no ; Next operator is not greedy. LOGICAL LOGICAL, no ; Next opcode should set the flag only. # This is not used yet (60) -RENUM BRANCHJ, off 1 1 ; Group with independently numbered parens. +RENUM BRANCHJ, off 1 . 1 ; Group with independently numbered parens. #*Trie Related (61..62) @@ -158,9 +157,9 @@ GOSUB GOSUB, num/ofs 2L ; recurse to paren arg1 at (signed) ofs ar GOSTART GOSTART, no ; recurse to start of pattern #*Named references (67..69) -NREF REF, no-sv 1 ; Match some already matched string -NREFF REF, no-sv 1 ; Match already matched string, folded -NREFFL REF, no-sv 1 ; Match already matched string, folded in loc. +NREF REF, no-sv 1 V ; Match some already matched string +NREFF REF, no-sv 1 V ; Match already matched string, folded +NREFFL REF, no-sv 1 V ; Match already matched string, folded in loc. #*Special conditionals (70..72) @@ -187,10 +186,10 @@ KEEPS KEEPS, no ; $& begins here. #*New charclass like patterns LNBREAK LNBREAK, none ; generic newline pattern -VERTWS VERTWS, none ; vertical whitespace (Perl 6) -NVERTWS NVERTWS, none ; not vertical whitespace (Perl 6) -HORIZWS HORIZWS, none ; horizontal whitespace (Perl 6) -NHORIZWS NHORIZWS, none ; not horizontal whitespace (Perl 6) +VERTWS VERTWS, none 0 S ; vertical whitespace (Perl 6) +NVERTWS NVERTWS, none 0 S ; not vertical whitespace (Perl 6) +HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6) +NHORIZWS NHORIZWS, none 0 S ; not horizontal whitespace (Perl 6) FOLDCHAR FOLDCHAR, codepoint 1 ; codepoint with tricky case folding properties. diff --git a/regnodes.h b/regnodes.h index d87acfd4b5..a501416f9a 100644 --- a/regnodes.h +++ b/regnodes.h @@ -661,4 +661,28 @@ EXTCONST char * const PL_reg_extflags_name[] = { }; #endif /* DOINIT */ +/* The following have no fixed length. U8 so we can do strchr() on it. */ +#ifndef DOINIT +EXTCONST U8 PL_varies[]; +#else +EXTCONST U8 PL_varies[] = { + CLUMP, BRANCH, BACK, STAR, PLUS, CURLY, CURLYN, CURLYM, CURLYX, WHILEM, + REF, REFF, REFFL, SUSPEND, IFTHEN, BRANCHJ, NREF, NREFF, NREFFL, + 0 +}; +#endif /* DOINIT */ + +/* The following always have a length of 1. U8 we can do strchr() on it. */ +/* (Note that length 1 means "one character" under UTF8, not "one octet".) */ +#ifndef DOINIT +EXTCONST U8 PL_simple[]; +#else +EXTCONST U8 PL_simple[] = { + REG_ANY, SANY, CANY, ANYOF, ALNUM, ALNUML, NALNUM, NALNUML, SPACE, + SPACEL, NSPACE, NSPACEL, DIGIT, NDIGIT, VERTWS, NVERTWS, HORIZWS, + NHORIZWS, + 0 +}; +#endif /* DOINIT */ + /* ex: set ro: */ |