Add ANYOFR regnode

This matches a single range of code points. It is both faster and smaller than other ANYOF-type nodes, requiring, after set-up, a single subtraction and conditional branch. The vast majority of Unicode properties match a single range (though most of the properties likely to be used in real world applications have more than a single range). But things like [ij] are a single range, and those are quite commonly encountered. This new regnode matches them more efficiently than a bitmap would, and doesn't require the space for one either. The flags field is used to store the minimum matchable start byte for UTF-8 strings, and is ignored for non-UTF-8 targets. This, like ANYOFH nodes which have a similar mechanism, allows for quick weeding out of many possible matches without having to convert the UTF-8 to its corresponding code point. This regnode packs the 32 bit argument with 20 bits for the minimum code point the node matches, and 12 bits for the maximum range. If the input is a value outside these, it simply won't compile to this regnode, instead going to one of the ANYOFH flavors. ANYOFR is sufficient to match all of Unicode except for the final (private use) 65K plane.
author: Karl Williamson <khw@cpan.org> 2019-09-19 16:03:04 -0600
committer: Karl Williamson <khw@cpan.org> 2019-11-17 21:20:07 -0700
commit: 13fcf6522466471a1b1c5fc2d760dd5367fd8940 (patch)
tree: f6272af8bf0e7308ab5792219978ab12d75d78d3 /regexec.c
parent: d913538e4f136a14760fb7c73de064901acfc25b (diff)
download: perl-13fcf6522466471a1b1c5fc2d760dd5367fd8940.tar.gz
1 files changed, 65 insertions, 0 deletions
diff --git a/regexec.c b/regexec.c
index 9b8f74165c..8265af4dd4 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2205,6 +2205,21 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         }
         break;
 
+    case ANYOFR:
+        if (utf8_target) {
+            REXEC_FBC_CLASS_SCAN(TRUE,
+                  (   NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
+                   && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
+                                                    (U8 *) strend,
+                                                    NULL),
+                                  ANYOFRbase(c), ANYOFRdelta(c))));
+        }
+        else {
+            REXEC_FBC_CLASS_SCAN(0, withinCOUNT((U8) *s,
+                                               ANYOFRbase(c), ANYOFRdelta(c)));
+        }
+        break;
+
     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
         assert(! is_utf8_pat);
 	/* FALLTHROUGH */
@@ -6874,6 +6889,31 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             goto increment_locinput;
             break;
 
+        case ANYOFR:
+            if (NEXTCHR_IS_EOS) {
+                sayNO;
+            }
+
+            if (utf8_target) {
+                if (    ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
+                   || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
+                                                (U8 *) reginfo->strend,
+                                                NULL),
+                                    ANYOFRbase(scan), ANYOFRdelta(scan)))
+                {
+                    sayNO;
+                }
+            }
+            else {
+                if (! withinCOUNT((U8) *locinput,
+                                  ANYOFRbase(scan), ANYOFRdelta(scan)))
+                {
+                    sayNO;
+                }
+            }
+            goto increment_locinput;
+            break;
+
         /* The argument (FLAGS) to all the POSIX node types is the class number
          * */
 
@@ -9703,6 +9743,31 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         }
         break;
 
+    case ANYOFR:
+        if (utf8_target) {
+            while (   hardcount < max
+                   && scan < this_eol
+                   && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
+                   && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
+                                                (U8 *) this_eol,
+                                                NULL),
+                                  ANYOFRbase(p), ANYOFRdelta(p)))
+            {
+                scan += UTF8SKIP(scan);
+                hardcount++;
+            }
+        }
+        else {
+            while (   hardcount < max
+                   && scan < this_eol
+                   && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
+            {
+                scan++;
+                hardcount++;
+            }
+        }
+        break;
+
     /* The argument (FLAGS) to all the POSIX node types is the class number */
 
     case NPOSIXL:
author	Karl Williamson <khw@cpan.org>	2019-09-19 16:03:04 -0600
committer	Karl Williamson <khw@cpan.org>	2019-11-17 21:20:07 -0700
commit	13fcf6522466471a1b1c5fc2d760dd5367fd8940 (patch)
tree	f6272af8bf0e7308ab5792219978ab12d75d78d3 /regexec.c
parent	d913538e4f136a14760fb7c73de064901acfc25b (diff)
download	perl-13fcf6522466471a1b1c5fc2d760dd5367fd8940.tar.gz