summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c132
1 files changed, 132 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
index 432e4ad88d..818af02904 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2377,6 +2377,138 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
return swatch;
}
+HV*
+Perl__swash_inversion_hash(pTHX_ SV* swash)
+{
+
+ /* Subject to change or removal. For use only in one place in regexec.c
+ *
+ * Returns a hash which is the inversion and closure of a swash mapping.
+ * For example, consider the input lines:
+ * 004B 006B
+ * 004C 006C
+ * 212A 006B
+ *
+ * The returned hash would have two keys, the utf8 for 006B and the utf8 for
+ * 006C. The value for each key is an array. For 006C, the array would
+ * have a two elements, the utf8 for itself, and for 004C. For 006B, there
+ * would be three elements in its array, the utf8 for 006B, 004B and 212A.
+ *
+ * Essentially, for any code point, it gives all the code points that map to
+ * it, or the list of 'froms' for that point.
+ *
+ * Currently it only looks at the main body of the swash, and ignores any
+ * additions or deletions from other swashes */
+
+ U8 *l, *lend;
+ STRLEN lcur;
+ HV *const hv = MUTABLE_HV(SvRV(swash));
+
+ /* The string containing the main body of the table */
+ SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
+ SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+ SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+ SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
+ /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
+ const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+ const STRLEN bits = SvUV(*bitssvp);
+ const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+ const UV none = SvUV(*nonesvp);
+
+ HV* ret = newHV();
+
+ PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
+
+ /* Must have at least 8 bits to get the mappings */
+ if (bits != 8 && bits != 16 && bits != 32) {
+ Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
+ (UV)bits);
+ }
+
+ /* read $swash->{LIST} */
+ l = (U8*)SvPV(*listsvp, lcur);
+ lend = l + lcur;
+
+ /* Go through each input line */
+ while (l < lend) {
+ UV min, max, val;
+ UV inverse;
+ l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+ cBOOL(octets), typestr);
+ if (l > lend) {
+ break;
+ }
+
+ /* Each element in the range is to be inverted */
+ for (inverse = min; inverse <= max; inverse++) {
+ AV* list;
+ SV* element;
+ SV** listp;
+ IV i;
+ bool found_key = FALSE;
+
+ /* The key is the inverse mapping */
+ char key[UTF8_MAXBYTES+1];
+ char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
+ STRLEN key_len = key_end - key;
+
+ /* And the value is what the forward mapping is from. */
+ char utf8_inverse[UTF8_MAXBYTES+1];
+ char *utf8_inverse_end = (char *) uvuni_to_utf8((U8*) utf8_inverse, inverse);
+
+ /* Get the list for the map */
+ if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
+ list = (AV*) *listp;
+ }
+ else { /* No entry yet for it: create one */
+ list = newAV();
+ if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
+ Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+ }
+ }
+
+ for (i = 0; i < av_len(list); i++) {
+ SV** entryp = av_fetch(list, i, FALSE);
+ SV* entry;
+ if (entryp == NULL) {
+ Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
+ }
+ entry = *entryp;
+ if (SvCUR(entry) != key_len) {
+ continue;
+ }
+ if (memEQ(key, SvPVX(entry), key_len)) {
+ found_key = TRUE;
+ break;
+ }
+ }
+ if (! found_key) {
+ element = newSVpvn_flags(key, key_len, SVf_UTF8);
+ av_push(list, element);
+ }
+
+
+ /* Simply add the value to the list */
+ element = newSVpvn_flags(utf8_inverse, utf8_inverse_end - utf8_inverse, SVf_UTF8);
+ av_push(list, element);
+
+ /* swash_get() increments the value of val for each element in the
+ * range. That makes more compact tables possible. You can
+ * express the capitalization, for example, of all consecutive
+ * letters with a single line: 0061\t007A\t0041 This maps 0061 to
+ * 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
+ * and it's not documented, and perhaps not even currently used,
+ * but I copied the semantics from swash_get(), just in case */
+ if (!none || val < none) {
+ ++val;
+ }
+ }
+ }
+
+ return ret;
+}
+
/*
=for apidoc uvchr_to_utf8