diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-02-01 10:43:49 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-02-02 16:31:22 -0700 |
commit | d764b54e5a93ff224d371bad25b14285e5a543cd (patch) | |
tree | ba3d3af8037bece8de968852c49ca8aa2516b34c /utf8.c | |
parent | 2f833f5208e26b208886e51e09e2c072b5eabb46 (diff) | |
download | perl-d764b54e5a93ff224d371bad25b14285e5a543cd.tar.gz |
Add initial inversion list object
Going forward the intent is to convert from swashes to the better-suited
inversion list data structure. This adds rudimentary inversion lists that have
only the functionality needed for 5.14. As a result, they are as much as
possible static to one file.
What's necessary for 5.14 is enough to allow folding of ANYOF nodes to be moved
from regexec to regcomp. Why they are needed for that is to generate as
compact as possible class definitions; otherwise, very long linear lists might
be generated. (They still may be, but that's inherent in the problem domain;
this generates as compact as possible, combining overlapping ranges, etc.)
The only two non-trivial methods in this object are from published algorithms.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 65 |
1 files changed, 65 insertions, 0 deletions
@@ -2713,6 +2713,71 @@ Perl__swash_inversion_hash(pTHX_ SV* swash) return ret; } +HV* +Perl__swash_to_invlist(pTHX_ SV* const swash) +{ + + /* Subject to change or removal. For use only in one place in regcomp.c */ + + U8 *l, *lend; + char *loc; + STRLEN lcur; + HV *const hv = MUTABLE_HV(SvRV(swash)); + UV elements = 0; /* Number of elements in the inversion list */ + + /* The string containing the main body of the table */ + SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); + SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); + SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); + + const U8* const typestr = (U8*)SvPV_nolen(*typesvp); + const STRLEN bits = SvUV(*bitssvp); + const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */ + + HV* invlist; + + PERL_ARGS_ASSERT__SWASH_TO_INVLIST; + + /* read $swash->{LIST} */ + l = (U8*)SvPV(*listsvp, lcur); + loc = (char *) l; + lend = l + lcur; + + /* Scan the input to count the number of lines to preallocate array size + * based on worst possible case, which is each line in the input creates 2 + * elements in the inversion list: 1) the beginning of a range in the list; + * 2) the beginning of a range not in the list. */ + while ((loc = (strchr(loc, '\n'))) != NULL) { + elements += 2; + loc++; + } + + /* If the ending is somehow corrupt and isn't a new line, add another + * element for the final range that isn't in the inversion list */ + if (! (*lend == '\n' || (*lend == '\0' && *(lend - 1) == '\n'))) { + elements++; + } + + invlist = _new_invlist(elements); + + /* Now go through the input again, adding each range to the list */ + while (l < lend) { + UV start, end; + UV val; /* Not used by this function */ + + l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val, + cBOOL(octets), typestr); + + if (l > lend) { + break; + } + + _append_range_to_invlist(invlist, start, end); + } + + return invlist; +} + /* =for apidoc uvchr_to_utf8 |