Refactor rninstr()

This function finds a needle in a haystack. If memrchr() is available, use it to find the final byte in the needle; If the needle is length 1, that's all we need, and this is special cased. Otherwise it uses memEQ of the other bytes of the needle, from where the needle would have to start, and if no match, repeats. If no memrchr(), it emulates that by just looking backward from the end until it finds the needle's final byte; then proceeds as above.
author: Karl Williamson <khw@cpan.org> 2020-04-26 17:47:47 -0600
committer: Karl Williamson <khw@cpan.org> 2020-08-22 06:38:40 -0600
commit: 2e8a5b7670884e12a43041522bfe7cc4d36a033c (patch)
tree: daa07b4455f4b2708351d642803f477b4ae96d31 /util.c
parent: bbd8607595f9856d6e75ed63130034cf645feb4a (diff)
download: perl-2e8a5b7670884e12a43041522bfe7cc4d36a033c.tar.gz
1 files changed, 83 insertions, 21 deletions
diff --git a/util.c b/util.c
index 9adbe7ab98..98a861b917 100644
--- a/util.c
+++ b/util.c
@@ -720,32 +720,94 @@ such occurrence.
 char *
 Perl_rninstr(const char *big, const char *bigend, const char *little, const char *lend)
 {
-    const char *bigbeg;
-    const I32 first = *little;
-    const char * const littleend = lend;
+    const Ptrdiff_t little_len = lend - little;
+    const Ptrdiff_t big_len = bigend - big;
 
     PERL_ARGS_ASSERT_RNINSTR;
 
-    if (little >= littleend)
+    /* A non-existent needle trivially matches the rightmost possible position
+     * in the haystack */
+    if (UNLIKELY(little_len <= 0)) {
 	return (char*)bigend;
-    bigbeg = big;
-    big = bigend - (littleend - little++);
-    while (big >= bigbeg) {
-	const char *s, *x;
-	if (*big-- != first)
-	    continue;
-	for (x=big+2,s=little; s < littleend; /**/ ) {
-	    if (*s != *x)
-		break;
-	    else {
-		x++;
-		s++;
-	    }
-	}
-	if (s >= littleend)
-	    return (char*)(big+1);
     }
-    return NULL;
+
+    /* If the needle is larger than the haystack, it can't possibly be in it */
+    if (UNLIKELY(little_len > big_len)) {
+        return NULL;
+    }
+
+    /* Special case length 1 needles.  It's trivial if we have memrchr();
+     * and otherwise we just do a per-byte search backwards.
+     *
+     * XXX In any case, when we don't have memrchr, we could use something like
+     * S_find_next_masked( or S_find_span_end() to do per-word searches */
+    if (little_len == 1) {
+        const char final = *little;
+
+#ifdef HAS_MEMRCHR
+
+        return (char *) memrchr(big, final, big_len);
+#else
+        const char * cur = bigend - 1;
+
+        do {
+            if (*cur == final) {
+                return (char *) cur;
+            }
+        } while (--cur >= big);
+
+        return NULL;
+#endif
+
+    }
+    else {  /* Below, the needle is longer than a single byte */
+
+        /* We search backwards in the haystack for the final character of the
+         * needle.  Each time one is found, we see if the characters just
+         * before it in the haystack match the rest of the needle. */
+        const char final = *(lend - 1);
+
+        /* What matches consists of 'little_len'-1 characters, then the final
+         * one */
+        const Size_t prefix_len = little_len - 1;
+
+        /* If the final character in the needle is any closer than this to the
+         * left edge, there wouldn't be enough room for all of it to fit in the
+         * haystack */
+        const char * const left_fence = big + prefix_len;
+
+        /* Start at the right edge */
+        char * cur = (char *) bigend;
+
+        /* memrchr() makes the search easy (and fast); otherwise, look
+         * backwards byte-by-byte. */
+        do {
+
+#ifdef HAS_MEMRCHR
+
+            cur = (char *) memrchr(left_fence, final, cur - left_fence);
+            if (cur == NULL) {
+                return NULL;
+            }
+#else
+            do {
+                cur--;
+                if (cur < left_fence) {
+                    return NULL;
+                }
+            }
+            while (*cur != final);
+#endif
+
+            /* Here, we know that *cur is 'final'; see if the preceding bytes
+             * of the needle also match the corresponding haystack bytes */
+            if memEQ(cur - prefix_len, little, prefix_len) {
+                return cur - prefix_len;
+            }
+        } while (cur > left_fence);
+
+        return NULL;
+    }
 }
 
 /* As a space optimization, we do not compile tables for strings of length
author	Karl Williamson <khw@cpan.org>	2020-04-26 17:47:47 -0600
committer	Karl Williamson <khw@cpan.org>	2020-08-22 06:38:40 -0600
commit	2e8a5b7670884e12a43041522bfe7cc4d36a033c (patch)
tree	daa07b4455f4b2708351d642803f477b4ae96d31 /util.c
parent	bbd8607595f9856d6e75ed63130034cf645feb4a (diff)
download	perl-2e8a5b7670884e12a43041522bfe7cc4d36a033c.tar.gz