apr_brigades: Add apr_brigade_split_boundary(), allowing us to split

brigades on boundaries of arbitrary length. git-svn-id: https://svn.apache.org/repos/asf/apr/apr/trunk@1894380 13f79535-47bb-0310-9956-ffa450edef68
author: Graham Leggett <minfrin@apache.org> 2021-10-19 14:30:37 +0000
committer: Graham Leggett <minfrin@apache.org> 2021-10-19 14:30:37 +0000
commit: f65813689e5a0061e65eca5d626ebe11d088b87a (patch)
tree: f9d88d1e5f251da447c3c9404d21aabb8ac28ed7
parent: 424e4ecebab9e3df6fe25c1507cc4093bc00f715 (diff)
download: apr-f65813689e5a0061e65eca5d626ebe11d088b87a.tar.gz
3 files changed, 357 insertions, 0 deletions
diff --git a/buckets/apr_brigade.c b/buckets/apr_brigade.c
index c81d29a6c..27bcffd96 100644
--- a/buckets/apr_brigade.c
+++ b/buckets/apr_brigade.c
@@ -387,6 +387,269 @@ APR_DECLARE(apr_status_t) apr_brigade_split_line(apr_bucket_brigade *bbOut,
     return APR_SUCCESS;
 }
 
+/*
+ * Split bbIn at the first occurrence of a boundary.
+ *
+ * Data in front of the boundary is moved to bbOut, the boundary itself is
+ * deleted, and whatever follows it stays in bbIn.
+ *
+ * bbOut        brigade receiving the data that precedes the boundary
+ * bbIn         brigade to search, consumed from the front
+ * block        blocking mode handed to apr_bucket_read()
+ * boundary     bytes to look for; must not be NULL or start with NUL
+ * boundary_len length of boundary, or APR_BUCKETS_STRING to use strlen()
+ * maxbytes     give up once at least this many bytes went to bbOut
+ *
+ * Returns APR_SUCCESS if the boundary was found and removed, APR_INCOMPLETE
+ * if a metadata bucket, the maxbytes limit or the end of bbIn came first,
+ * APR_EINVAL for a NULL, empty or zero length boundary, and otherwise the
+ * status returned by a failed apr_bucket_read().
+ */
+APR_DECLARE(apr_status_t) apr_brigade_split_boundary(apr_bucket_brigade *bbOut,
+                                                     apr_bucket_brigade *bbIn,
+                                                     apr_read_type_e block,
+                                                     const char *boundary,
+                                                     apr_size_t boundary_len,
+                                                     apr_off_t maxbytes)
+{
+    apr_off_t outbytes = 0;
+
+    if (!boundary || !boundary[0]) {
+        return APR_EINVAL;
+    }
+
+    if (APR_BUCKETS_STRING == boundary_len) {
+        boundary_len = strlen(boundary);
+    }
+
+    /* a zero length boundary matches everywhere and never makes progress */
+    if (!boundary_len) {
+        return APR_EINVAL;
+    }
+
+    /*
+     * While the call describes itself as searching for a boundary string,
+     * what we actually do is search for anything that is definitely not
+     * a boundary string, and allow that not-boundary data to pass through.
+     *
+     * If we find data that might be a boundary, we try to read more data in
+     * until we know for sure.
+     */
+    while (!APR_BRIGADE_EMPTY(bbIn)) {
+        const char *pos;
+        const char *str;
+        apr_bucket *e, *next, *prev;
+        apr_off_t inbytes = 0;
+        apr_size_t len;
+        apr_status_t rv;
+
+        /* We didn't find a boundary within the maximum line length. */
+        if (outbytes >= maxbytes) {
+            return APR_INCOMPLETE;
+        }
+
+        e = APR_BRIGADE_FIRST(bbIn);
+
+        /* We hit a metadata bucket, stop and let the caller handle it */
+        if (APR_BUCKET_IS_METADATA(e)) {
+            return APR_INCOMPLETE;
+        }
+
+        rv = apr_bucket_read(e, &str, &len, block);
+        if (rv != APR_SUCCESS) {
+            return rv;
+        }
+
+        /* bytes of e that may still be part of the boundary; trimmed below */
+        inbytes += len;
+
+        /*
+         * Fast path.
+         *
+         * If we have at least one boundary worth of data, search the bucket
+         * for the complete boundary, and split quickly if found.
+         */
+        if (len >= boundary_len) {
+            apr_size_t off;
+            apr_size_t leftover;
+
+            /*
+             * Length bounded bytewise search. Unlike strnstr() this exists
+             * on every platform, does not stop at a NUL byte in the data
+             * and honours boundary_len instead of a NUL terminator.
+             */
+            pos = NULL;
+            for (off = 0; off <= len - boundary_len; off++) {
+                if (str[off] == boundary[0]
+                        && !memcmp(str + off, boundary, boundary_len)) {
+                    pos = str + off;
+                    break;
+                }
+            }
+
+            /* definitely found it, we leave */
+            if (pos != NULL) {
+                off = pos - str;
+                /* everything up to the boundary */
+                if (off) {
+                    apr_bucket_split(e, off);
+                    APR_BUCKET_REMOVE(e);
+                    APR_BRIGADE_INSERT_TAIL(bbOut, e);
+
+                    e = APR_BRIGADE_FIRST(bbIn);
+                }
+
+                /* cut out the boundary */
+                apr_bucket_split(e, boundary_len);
+                apr_bucket_delete(e);
+
+                return APR_SUCCESS;
+            }
+
+            /* any partial matches at the end? */
+            leftover = boundary_len - 1;
+            off = (len - leftover);
+
+            while (leftover) {
+                if (!memcmp(str + off, boundary, leftover)) {
+                    if (off) {
+                        apr_bucket_split(e, off);
+                        APR_BUCKET_REMOVE(e);
+                        APR_BRIGADE_INSERT_TAIL(bbOut, e);
+
+                        e = APR_BRIGADE_FIRST(bbIn);
+                    }
+
+                    outbytes += off;
+                    inbytes -= off;
+
+                    goto skip;
+                }
+                off++;
+                leftover--;
+            }
+
+            APR_BUCKET_REMOVE(e);
+            APR_BRIGADE_INSERT_TAIL(bbOut, e);
+            outbytes += len;
+
+            continue;
+        }
+
+        /*
+         * Slow path.
+         *
+         * We need to read ahead at least one boundary worth of data so
+         * we can search across the bucket edges.
+         */
+        else {
+            apr_size_t off = 0;
+
+            /* find all definite non matches */
+            while (len) {
+                if (!memcmp(str + off, boundary, len)) {
+                    if (off) {
+                        apr_bucket_split(e, off);
+                        APR_BUCKET_REMOVE(e);
+                        APR_BRIGADE_INSERT_TAIL(bbOut, e);
+
+                        e = APR_BRIGADE_FIRST(bbIn);
+                    }
+
+                    /* bytes passed on count against maxbytes here too */
+                    outbytes += off;
+                    inbytes -= off;
+
+                    goto skip;
+                }
+                off++;
+                len--;
+            }
+
+            APR_BUCKET_REMOVE(e);
+            APR_BRIGADE_INSERT_TAIL(bbOut, e);
+            /* len has been counted down to zero, off is the bucket length */
+            outbytes += off;
+
+            continue;
+        }
+
+        /*
+         * If we reach skip, it means the bucket in e is:
+         *
+         * - shorter than the boundary
+         * - matches the boundary up to the bucket length
+         * - might match more buckets
+         *
+         * Read further buckets and check whether the boundary matches all
+         * the way to the end. If so, we have a match. If no match, shave off
+         * one byte and continue round to try again.
+         */
+skip:
+        for (next = APR_BUCKET_NEXT(e);
+                inbytes < boundary_len && next != APR_BRIGADE_SENTINEL(bbIn);
+                next = APR_BUCKET_NEXT(next)) {
+            const char *data;
+            apr_size_t need;
+            apr_size_t dlen;
+
+            rv = apr_bucket_read(next, &data, &dlen, block);
+            if (rv != APR_SUCCESS) {
+                return rv;
+            }
+
+            /* bytes of the boundary that still have to match */
+            need = boundary_len - inbytes;
+
+            if (dlen >= need) {
+                /* not a match, bail out */
+                if (memcmp(data, boundary + inbytes, need)) {
+                    break;
+                }
+
+                /* a match! keep what follows the boundary in this bucket */
+                if (dlen > need) {
+                    apr_bucket_split(next, need);
+                }
+
+                /* remove the boundary and return */
+                e = APR_BUCKET_NEXT(next);
+
+                for (prev = APR_BRIGADE_FIRST(bbIn);
+                        prev != e;
+                        prev = APR_BRIGADE_FIRST(bbIn)) {
+                    apr_bucket_delete(prev);
+                }
+
+                return APR_SUCCESS;
+            }
+            else if (dlen) {
+                /* not a match, bail out */
+                if (memcmp(data, boundary + inbytes, dlen)) {
+                    break;
+                }
+
+                /* still hope for a match */
+                inbytes += dlen;
+            }
+        }
+
+        /*
+         * If we reach this point, the bucket e did not match the boundary
+         * in the subsequent buckets.
+         *
+         * Bump one byte off, and loop round to search again.
+         */
+        apr_bucket_split(e, 1);
+        APR_BUCKET_REMOVE(e);
+        APR_BRIGADE_INSERT_TAIL(bbOut, e);
+
+        outbytes++;
+    }
+
+    return APR_INCOMPLETE;
+}
+
 
 APR_DECLARE(apr_status_t) apr_brigade_to_iovec(apr_bucket_brigade *b, 
                                                struct iovec *vec, int *nvec)
diff --git a/include/apr_buckets.h b/include/apr_buckets.h
index 0725c6cd9..065058316 100644
--- a/include/apr_buckets.h
+++ b/include/apr_buckets.h
@@ -53,6 +53,11 @@ extern "C" {
 /** default bucket buffer size - 8KB minus room for memory allocator headers */
 #define APR_BUCKET_BUFF_SIZE 8000
 
+/** Sentinel for the boundary_len argument of apr_brigade_split_boundary():
+ * when passed, the length of the boundary is calculated with strlen().
+ */
+#define APR_BUCKETS_STRING -1
+
 /** Determines how a bucket or brigade should be read */
 typedef enum {
     APR_BLOCK_READ,   /**< block until data becomes available */
@@ -791,6 +796,38 @@ APR_DECLARE(apr_status_t) apr_brigade_split_line(apr_bucket_brigade *bbOut,
                           __attribute__((nonnull(1,2)));
 
 /**
+ * Split a brigade based on the provided boundary, or metadata buckets,
+ * whichever are encountered first.
+ *
+ * If the boundary is found, all buckets prior to the boundary are passed
+ * into bbOut, the boundary is removed from bbIn, and APR_SUCCESS is returned.
+ *
+ * If a metadata bucket is found, if bbIn is exhausted, or if the boundary is
+ * not found within the limit specified by maxbytes, all prior buckets are
+ * passed into bbOut, and APR_INCOMPLETE is returned.
+ *
+ * A partial match at the end of a bucket is checked against the buckets
+ * that follow it. If the boundary is NULL or the empty string, APR_EINVAL
+ * is returned. If an error is encountered while reading a bucket, the APR
+ * error code will be returned.
+ *
+ * @param bbOut The bucket brigade that receives the data before the boundary.
+ * @param bbIn The input bucket brigade to search for the boundary.
+ * @param block The blocking mode to be used when reading buckets.
+ * @param boundary The boundary string.
+ * @param boundary_len The length of the boundary string. If set to
+ *        APR_BUCKETS_STRING, the length will be calculated.
+ * @param maxbytes The maximum bytes to read.
+ */
+APR_DECLARE(apr_status_t) apr_brigade_split_boundary(apr_bucket_brigade *bbOut,
+                                                     apr_bucket_brigade *bbIn,
+                                                     apr_read_type_e block,
+                                                     const char *boundary,
+                                                     apr_size_t boundary_len,
+                                                     apr_off_t maxbytes)
+                          __attribute__((nonnull(1,2)));
+
+/**
  * Create an iovec of the elements in a bucket_brigade... return number 
  * of elements used.  This is useful for writing to a file or to the
  * network efficiently.
diff --git a/test/testbuckets.c b/test/testbuckets.c
index 31bed0c1b..2b789f1a0 100644
--- a/test/testbuckets.c
+++ b/test/testbuckets.c
@@ -209,6 +209,62 @@ static void test_splitline(abts_case *tc, void *data)
     apr_bucket_alloc_destroy(ba);
 }
 
+/* apr_brigade_split_boundary(): boundary found, split over buckets, absent */
+static void test_splitboundary(abts_case *tc, void *data)
+{
+    apr_bucket_alloc_t *ba = apr_bucket_alloc_create(p);
+    apr_bucket_brigade *src;
+    apr_bucket_brigade *dst;
+    apr_status_t rv;
+
+    /* boundary lies wholly within the first bucket */
+    src = make_simple_brigade(ba, "quick brown fox",
+                              " jumped over the lazy dog");
+    dst = apr_brigade_create(p, ba);
+
+    rv = apr_brigade_split_boundary(dst, src, APR_BLOCK_READ, "brown",
+                                    APR_BUCKETS_STRING, 100);
+    APR_ASSERT_SUCCESS(tc, "split boundary", rv);
+
+    flatten_match(tc, "split boundary", dst, "quick ");
+    flatten_match(tc, "remainder", src, " fox jumped over the lazy dog");
+
+    apr_brigade_destroy(dst);
+    apr_brigade_destroy(src);
+
+    /* boundary starts in the first bucket and ends in the second */
+    src = make_simple_brigade(ba, "quick brown fox jum",
+                              "ped over the lazy dog");
+    dst = apr_brigade_create(p, ba);
+
+    rv = apr_brigade_split_boundary(dst, src, APR_BLOCK_READ, "jumped",
+                                    APR_BUCKETS_STRING, 100);
+    APR_ASSERT_SUCCESS(tc, "split boundary", rv);
+
+    flatten_match(tc, "split boundary", dst, "quick brown fox ");
+    flatten_match(tc, "remainder", src, " over the lazy dog");
+
+    apr_brigade_destroy(dst);
+    apr_brigade_destroy(src);
+
+    /* boundary never occurs: everything is passed through */
+    src = make_simple_brigade(ba, "quick brown fox jum",
+                              "ped over the lazy dog");
+    dst = apr_brigade_create(p, ba);
+
+    rv = apr_brigade_split_boundary(dst, src, APR_BLOCK_READ, "jumping",
+                                    APR_BUCKETS_STRING, 100);
+    ABTS_ASSERT(tc, "split boundary", rv == APR_INCOMPLETE);
+
+    flatten_match(tc, "split boundary", dst, "quick brown fox jumped over the lazy dog");
+    flatten_match(tc, "remainder", src, "");
+
+    apr_brigade_destroy(dst);
+    apr_brigade_destroy(src);
+
+    apr_bucket_alloc_destroy(ba);
+}
+
 /* Test that bucket E has content EDATA of length ELEN. */
 static void test_bucket_content(abts_case *tc,
                                 apr_bucket *e,
@@ -521,6 +577,7 @@ abts_suite *testbuckets(abts_suite *suite)
     abts_run_test(suite, test_split, NULL);
     abts_run_test(suite, test_bwrite, NULL);
     abts_run_test(suite, test_splitline, NULL);
+    abts_run_test(suite, test_splitboundary, NULL);
     abts_run_test(suite, test_splits, NULL);
     abts_run_test(suite, test_insertfile, NULL);
     abts_run_test(suite, test_manyfile, NULL);
author	Graham Leggett <minfrin@apache.org>	2021-10-19 14:30:37 +0000
committer	Graham Leggett <minfrin@apache.org>	2021-10-19 14:30:37 +0000
commit	f65813689e5a0061e65eca5d626ebe11d088b87a (patch)
tree	f9d88d1e5f251da447c3c9404d21aabb8ac28ed7
parent	424e4ecebab9e3df6fe25c1507cc4093bc00f715 (diff)
download	apr-f65813689e5a0061e65eca5d626ebe11d088b87a.tar.gz