summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2011-02-01 10:43:49 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-02-02 16:31:22 -0700
commit: d764b54e5a93ff224d371bad25b14285e5a543cd (patch)
tree: ba3d3af8037bece8de968852c49ca8aa2516b34c /utf8.c
parent: 2f833f5208e26b208886e51e09e2c072b5eabb46 (diff)
download: perl-d764b54e5a93ff224d371bad25b14285e5a543cd.tar.gz
Add initial inversion list object
Going forward, the intent is to convert from swashes to the better-suited inversion list data structure. This adds rudimentary inversion lists that have only the functionality needed for 5.14. As a result, they are, as much as possible, static to one file. What's necessary for 5.14 is enough to allow folding of ANYOF nodes to be moved from regexec to regcomp. Inversion lists are needed for that in order to generate class definitions that are as compact as possible; otherwise, very long linear lists might be generated. (They still may be, but that's inherent in the problem domain; this generates lists that are as compact as possible, combining overlapping ranges, etc.) The only two non-trivial methods in this object are from published algorithms.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c65
1 files changed, 65 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
index 6276308b10..16e0814e09 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2713,6 +2713,71 @@ Perl__swash_inversion_hash(pTHX_ SV* swash)
return ret;
}
+HV*
+Perl__swash_to_invlist(pTHX_ SV* const swash)
+{
+    /* Builds an inversion list from the "LIST" string of the hash that swash
+     * references, and returns it.  Subject to change or removal.  For use
+     * only in one place in regcomp.c */
+    U8 *l, *lend;
+    char *loc;
+    STRLEN lcur;
+    HV *const hv = MUTABLE_HV(SvRV(swash));
+    UV elements = 0;    /* Number of elements in the inversion list */
+
+    /* The string containing the main body of the table */
+    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+
+    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+    const STRLEN bits = SvUV(*bitssvp);
+    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+
+    HV* invlist;
+
+    PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
+
+    /* read $swash->{LIST}; strchr() below needs it NUL-terminated at lend */
+    l = (U8*)SvPV(*listsvp, lcur);
+    loc = (char *) l;
+    lend = l + lcur;
+
+    /* Scan the input to count the number of lines to preallocate array size
+     * based on worst possible case, which is each line in the input creates 2
+     * elements in the inversion list: 1) the beginning of a range in the list;
+     * 2) the beginning of a range not in the list. */
+    while ((loc = (strchr(loc, '\n'))) != NULL) {
+        elements += 2;
+        loc++;
+    }
+
+    /* If the ending is somehow corrupt and isn't a new line, add another
+     * element.  An empty list has no byte before lend, so skip lend[-1] */
+    if (! (*lend == '\n'
+           || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n')))) {
+        elements++;
+    }
+
+    invlist = _new_invlist(elements);
+
+    /* Now go through the input again, adding each range to the list */
+    while (l < lend) {
+        UV start, end;
+        UV val;         /* Not used by this function */
+
+        l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
+                                   cBOOL(octets), typestr);
+        /* NOTE(review): presumably means no further usable line -- confirm */
+        if (l > lend) {
+            break;
+        }
+        _append_range_to_invlist(invlist, start, end);
+    }
+
+    return invlist;
+}
+
/*
=for apidoc uvchr_to_utf8