diff options
Diffstat (limited to 'subversion/libsvn_subr/utf_validate.c')
-rw-r--r-- | subversion/libsvn_subr/utf_validate.c | 109 |
1 files changed, 107 insertions, 2 deletions
diff --git a/subversion/libsvn_subr/utf_validate.c b/subversion/libsvn_subr/utf_validate.c
index 6f00f73..8311fd7 100644
--- a/subversion/libsvn_subr/utf_validate.c
+++ b/subversion/libsvn_subr/utf_validate.c
@@ -57,6 +57,8 @@
  */
 
 #include "private/svn_utf_private.h"
+#include "private/svn_eol_private.h"
+#include "private/svn_dep_compat.h"
 
 /* Lookup table to categorise each octet in the string. */
 static const char octet_category[256] = {
@@ -249,12 +251,100 @@ static const char machine [9][14] = {
    FSM_ERROR},         /* 0xf5-0xff */
 };
 
+/* Scan MAX_LEN bytes in *DATA for chars that are not in the octet
+ * category 0 (FSM_START).  Return the position of the first such char
+ * or DATA + MAX_LEN if all were cat 0.
+ */
+static const char *
+first_non_fsm_start_char(const char *data, apr_size_t max_len)
+{
+#if !SVN_UNALIGNED_ACCESS_IS_OK
+
+  /* On some systems, we need to make sure that DATA is properly aligned
+   * for chunky data access: byte-scan up to the next word boundary,
+   * which is (0 - address) modulo the word size bytes away. */
+  if ((apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1))
+    {
+      apr_size_t len = (0 - (apr_uintptr_t)data) & (sizeof(apr_uintptr_t)-1);
+      if (len > max_len)
+        len = max_len;
+      max_len -= len;
+
+      for (; len > 0; ++data, --len)
+        if (*data < 0 || *data >= 0x80)
+          return data;
+    }
+
+#endif
+
+  /* Scan the input one machine word at a time. */
+  for (; max_len > sizeof(apr_uintptr_t)
+       ; data += sizeof(apr_uintptr_t), max_len -= sizeof(apr_uintptr_t))
+    if (*(const apr_uintptr_t *)data & SVN__BIT_7_SET)
+      break;
+
+  /* The remaining odd bytes will be examined the naive way: */
+  for (; max_len > 0; ++data, --max_len)
+    if (*data < 0 || *data >= 0x80)
+      break;
+
+  return data;
+}
+
+/* Scan the C string in *DATA for chars that are not in the octet
+ * category 0 (FSM_START).  Return the position of either the first
+ * such char or of the terminating NUL.
+ */
+static const char *
+first_non_fsm_start_char_cstring(const char *data)
+{
+  /* We need to make sure that DATA is properly aligned for chunky data
+   * access because we don't know the string's length.  Unaligned chunk
+   * read access beyond the NUL terminator could therefore result in a
+   * segfault.
+   */
+  for (; (apr_uintptr_t)data & (sizeof(apr_uintptr_t)-1); ++data)
+    if (*data <= 0 || *data >= 0x80)
+      return data;
+
+  /* Scan the input one machine word at a time. */
+#ifndef SVN_UTF_NO_UNINITIALISED_ACCESS
+  /* This may read allocated but uninitialised bytes beyond the
+     terminating null.  Any such bytes are always readable and this
+     code operates correctly whatever the uninitialised values happen
+     to be.  However memory checking tools such as valgrind and GCC
+     4.8's address sanitizer will object so this bit of code can be
+     disabled at compile time. */
+  for (; ; data += sizeof(apr_uintptr_t))
+    {
+      /* Check for non-ASCII chars: */
+      apr_uintptr_t chunk = *(const apr_uintptr_t *)data;
+      if (chunk & SVN__BIT_7_SET)
+        break;
+
+      /* This is the well-known strlen test: */
+      chunk |= (chunk & SVN__LOWER_7BITS_SET) + SVN__LOWER_7BITS_SET;
+      if ((chunk & SVN__BIT_7_SET) != SVN__BIT_7_SET)
+        break;
+    }
+#endif
+
+  /* The remaining odd bytes will be examined the naive way: */
+  for (; ; ++data)
+    if (*data <= 0 || *data >= 0x80)
+      break;
+
+  return data;
+}
 
 const char *
 svn_utf__last_valid(const char *data, apr_size_t len)
 {
-  const char *start = data, *end = data + len;
+  const char *start = first_non_fsm_start_char(data, len);
+  const char *end = data + len;
   int state = FSM_START;
+
+  data = start;
   while (data < end)
     {
       unsigned char octet = *data++;
@@ -270,6 +360,12 @@ svn_boolean_t
 svn_utf__cstring_is_valid(const char *data)
 {
   int state = FSM_START;
+
+  if (!data)
+    return FALSE;
+
+  data = first_non_fsm_start_char_cstring(data);
+
   while (*data)
     {
       unsigned char octet = *data++;
@@ -284,6 +380,12 @@ svn_utf__is_valid(const char *data, apr_size_t len)
 {
   const char *end = data + len;
   int state = FSM_START;
+
+  if (!data)
+    return FALSE;
+
+  data = first_non_fsm_start_char(data, len);
+
   while (data < end)
     {
       unsigned char octet = *data++;
@@ -296,8 +398,11 @@ svn_utf__is_valid(const char *data, apr_size_t len)
 const char *
 svn_utf__last_valid2(const char *data, apr_size_t len)
 {
-  const char *start = data, *end = data + len;
+  const char *start = first_non_fsm_start_char(data, len);
+  const char *end = data + len;
   int state = FSM_START;
+
+  data = start;
   while (data < end)
     {
       unsigned char octet = *data++;