Merge pull request #9167 from keszybz/ellipsization

Ellipsization fixes based on unit-testing and fuzzing
author: Lennart Poettering <lennart@poettering.net> 2018-06-04 13:45:03 +0200
committer: GitHub <noreply@github.com> 2018-06-04 13:45:03 +0200
commit: b5b74e4b12866019a2aafbd9e8f5185d491351b8 (patch)
tree: 4b2cb080cc5509c3d8f6a4d7a463f503959f08d9
parent: 0be9b12be2eadfd9c296f12874dffb5d17c68484 (diff)
parent: 9924aef690ad20f35c5391aad7fd7c5a9576cc21 (diff)
download: systemd-b5b74e4b12866019a2aafbd9e8f5185d491351b8.tar.gz
5 files changed, 133 insertions, 40 deletions
diff --git a/src/basic/string-util.c b/src/basic/string-util.c
index 4c7ab3c4d3..a9362bf8bd 100644
--- a/src/basic/string-util.c
+++ b/src/basic/string-util.c
@@ -469,8 +469,8 @@ static int write_ellipsis(char *buf, bool unicode) {
 }
 
 static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
-        size_t x, need_space;
-        char *r;
+        size_t x, need_space, suffix_len;
+        char *t;
 
         assert(s);
         assert(percent <= 100);
@@ -506,8 +506,8 @@ static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_le
          * either for the UTF-8 encoded character or for three ASCII characters. */
         need_space = is_locale_utf8() ? 1 : 3;
 
-        r = new(char, new_length+3);
-        if (!r)
+        t = new(char, new_length+3);
+        if (!t)
                 return NULL;
 
         assert(new_length >= need_space);
@@ -515,13 +515,13 @@ static char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_le
         x = ((new_length - need_space) * percent + 50) / 100;
         assert(x <= new_length - need_space);
 
-        memcpy(r, s, x);
-        write_ellipsis(r + x, false);
-        memcpy(r + x + 3,
-               s + old_length - (new_length - x - need_space),
-               new_length - x - need_space + 1);
+        memcpy(t, s, x);
+        write_ellipsis(t + x, false);
+        suffix_len = new_length - x - need_space;
+        memcpy(t + x + 3, s + old_length - suffix_len, suffix_len);
+        *(t + x + 3 + suffix_len) = '\0';
 
-        return r;
+        return t;
 }
 
 char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) {
@@ -559,35 +559,49 @@ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigne
         assert(x <= new_length - 1);
 
         k = 0;
-        for (i = s; k < x && i < s + old_length; i = utf8_next_char(i)) {
+        for (i = s; i < s + old_length; i = utf8_next_char(i)) {
                 char32_t c;
+                int w;
 
                 r = utf8_encoded_to_unichar(i, &c);
                 if (r < 0)
                         return NULL;
-                k += unichar_iswide(c) ? 2 : 1;
-        }
 
-        if (k > x) /* last character was wide and went over quota */
-                x++;
+                w = unichar_iswide(c) ? 2 : 1;
+                if (k + w <= x)
+                        k += w;
+                else
+                        break;
+        }
 
-        for (j = s + old_length; k < new_length && j > i; ) {
+        for (j = s + old_length; j > i; ) {
                 char32_t c;
+                int w;
+                const char *jj;
 
-                j = utf8_prev_char(j);
-                r = utf8_encoded_to_unichar(j, &c);
+                jj = utf8_prev_char(j);
+                r = utf8_encoded_to_unichar(jj, &c);
                 if (r < 0)
                         return NULL;
-                k += unichar_iswide(c) ? 2 : 1;
+
+                w = unichar_iswide(c) ? 2 : 1;
+                if (k + w <= new_length) {
+                        k += w;
+                        j = jj;
+                } else
+                        break;
         }
         assert(i <= j);
 
         /* we don't actually need to ellipsize */
         if (i == j)
-                return memdup(s, old_length + 1);
+                return memdup_suffix0(s, old_length);
 
-        /* make space for ellipsis */
-        j = utf8_next_char(j);
+        /* make space for ellipsis, if possible */
+        if (j < s + old_length)
+                j = utf8_next_char(j);
+        else if (i > s)
+                i = utf8_prev_char(i);
 
         len = i - s;
         len2 = s + old_length - j;
@@ -602,18 +616,12 @@ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigne
 
         memcpy(e, s, len);
         write_ellipsis(e + len, true);
-        memcpy(e + len + 3, j, len2 + 1);
+        memcpy(e + len + 3, j, len2);
+        *(e + len + 3 + len2) = '\0';
 
         return e;
 }
 
-char *ellipsize(const char *s, size_t length, unsigned percent) {
-        if (length == (size_t) -1)
-                return strdup(s);
-
-        return ellipsize_mem(s, strlen(s), length, percent);
-}
-
 char *cellescape(char *buf, size_t len, const char *s) {
         /* Escape and ellipsize s into buffer buf of size len. Only non-control ASCII
          * characters are copied as they are, everything else is escaped. The result
diff --git a/src/basic/string-util.h b/src/basic/string-util.h
index 25980e7cc8..7e6880b26b 100644
--- a/src/basic/string-util.h
+++ b/src/basic/string-util.h
@@ -156,7 +156,10 @@ static inline bool _pure_ in_charset(const char *s, const char* charset) {
 bool string_has_cc(const char *p, const char *ok) _pure_;
 
 char *ellipsize_mem(const char *s, size_t old_length_bytes, size_t new_length_columns, unsigned percent);
-char *ellipsize(const char *s, size_t length, unsigned percent);
+static inline char *ellipsize(const char *s, size_t length, unsigned percent) {
+        return ellipsize_mem(s, strlen(s), length, percent);
+}
+
 char *cellescape(char *buf, size_t len, const char *s);
 
 /* This limit is arbitrary, enough to give some idea what the string contains */
diff --git a/src/test/test-ellipsize.c b/src/test/test-ellipsize.c
index 902bc3342f..7d6b2b5449 100644
--- a/src/test/test-ellipsize.c
+++ b/src/test/test-ellipsize.c
@@ -10,10 +10,80 @@
 #include "alloc-util.h"
 #include "def.h"
 #include "string-util.h"
+#include "strv.h"
 #include "terminal-util.h"
 #include "util.h"
+#include "utf8.h"
 
-static void test_one(const char *p) {
+static void test_ellipsize_mem_one(const char *s, size_t old_length, size_t new_length) {
+        _cleanup_free_ char *n = NULL;
+        _cleanup_free_ char *t1 = NULL, *t2 = NULL, *t3 = NULL;
+        char buf[LINE_MAX];
+        bool has_wide_chars;
+        size_t max_width;
+
+        n = memdup_suffix0(s, old_length);
+
+        if (!utf8_is_valid(n))
+                /* We don't support invalid sequences… */
+                return;
+
+        /* Report our inputs. We duplicate the data so that cellescape
+         * can properly report truncated multibyte sequences. */
+        log_info("%s \"%s\" old_length=%zu/%zu new_length=%zu", __func__,
+                 cellescape(buf, sizeof buf, n),
+                 old_length, utf8_console_width(n),
+                 new_length);
+
+        /* To keep this test simple, any case with wide chars starts with this glyph */
+        has_wide_chars = startswith(s, "你");
+        max_width = MIN(utf8_console_width(n), new_length);
+
+        t1 = ellipsize_mem(n, old_length, new_length, 30);
+        log_info("30%% → %s utf8_console_width=%zu", t1, utf8_console_width(t1));
+        if (!has_wide_chars)
+                assert_se(utf8_console_width(t1) == max_width);
+        else
+                assert_se(utf8_console_width(t1) <= max_width);
+
+        t2 = ellipsize_mem(n, old_length, new_length, 90);
+        log_info("90%% → %s utf8_console_width=%zu", t2, utf8_console_width(t2));
+        if (!has_wide_chars)
+                assert_se(utf8_console_width(t2) == max_width);
+        else
+                assert_se(utf8_console_width(t2) <= max_width);
+
+        t3 = ellipsize_mem(n, old_length, new_length, 100);
+        log_info("100%% → %s utf8_console_width=%zu", t3, utf8_console_width(t3));
+        if (!has_wide_chars)
+                assert_se(utf8_console_width(t3) == max_width);
+        else
+                assert_se(utf8_console_width(t3) <= max_width);
+
+        if (new_length >= old_length) {
+                assert_se(streq(t1, n));
+                assert_se(streq(t2, n));
+                assert_se(streq(t3, n));
+        }
+}
+
+static void test_ellipsize_mem(void) {
+        const char *s;
+        ssize_t l, k;
+
+        FOREACH_STRING(s,
+                       "_XXXXXXXXXXX_", /* ASCII */
+                       "_aąęółśćńżźć_", /* two-byte utf-8 */
+                       "გამარჯობა",     /* multi-byte utf-8 */
+                       "你好世界",       /* wide characters */
+                       "你გą世óoó界")    /* a mix */
+
+                for (l = strlen(s); l >= 0; l--)
+                        for (k = strlen(s) + 1; k >= 0; k--)
+                                test_ellipsize_mem_one(s, l, k);
+}
+
+static void test_ellipsize_one(const char *p) {
         _cleanup_free_ char *t;
         t = ellipsize(p, columns(), 70);
         puts(t);
@@ -43,15 +113,20 @@ static void test_one(const char *p) {
         puts(t);
 }
 
+static void test_ellipsize(void) {
+        test_ellipsize_one(DIGITS LETTERS DIGITS LETTERS);
+        test_ellipsize_one("한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어");
+        test_ellipsize_one("-日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国");
+        test_ellipsize_one("中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国-中国中国中国中国中国中国中国中国中国中国中国中国中国");
+        test_ellipsize_one("sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd");
+        test_ellipsize_one("🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮");
+        test_ellipsize_one("Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.");
+        test_ellipsize_one("shórt");
+}
+
 int main(int argc, char *argv[]) {
-        test_one(DIGITS LETTERS DIGITS LETTERS);
-        test_one("한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어한국어");
-        test_one("-日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国日本国");
-        test_one("中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国中国-中国中国中国中国中国中国中国中国中国中国中国中国中国");
-        test_one("sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd sÿstëmd");
-        test_one("🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮🐮");
-        test_one("Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.");
-        test_one("shórt");
+        test_ellipsize_mem();
+        test_ellipsize();
 
         return 0;
 }
diff --git a/src/test/test-string-util.c b/src/test/test-string-util.c
index 8b176781de..d6eca393ee 100644
--- a/src/test/test-string-util.c
+++ b/src/test/test-string-util.c
@@ -10,6 +10,7 @@
 #include "macro.h"
 #include "string-util.h"
 #include "strv.h"
+#include "utf8.h"
 
 static void test_string_erase(void) {
         char *x;
diff --git a/test/fuzz-regressions/fuzz-journal-remote/oss-fuzz-8686 b/test/fuzz-regressions/fuzz-journal-remote/oss-fuzz-8686
new file mode 100644
index 0000000000..7c73c8cd9b
--- /dev/null
+++ b/test/fuzz-regressions/fuzz-journal-remote/oss-fuzz-8686
@@ -0,0 +1,6 @@
+__REALTIME_TIMESTAMP=  6
+SYSLOG_IDENTIFIER=             
+MESSAGE=                        ᅟ                                                                                                                                                                                                                                                                                
+SYSLOG_PID=            
+
+  
+\ No newline at end of file
author	Lennart Poettering <lennart@poettering.net>	2018-06-04 13:45:03 +0200
committer	GitHub <noreply@github.com>	2018-06-04 13:45:03 +0200
commit	b5b74e4b12866019a2aafbd9e8f5185d491351b8 (patch)
tree	4b2cb080cc5509c3d8f6a4d7a463f503959f08d9
parent	0be9b12be2eadfd9c296f12874dffb5d17c68484 (diff)
parent	9924aef690ad20f35c5391aad7fd7c5a9576cc21 (diff)
download	systemd-b5b74e4b12866019a2aafbd9e8f5185d491351b8.tar.gz