diff options
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 258 |
1 files changed, 258 insertions, 0 deletions
@@ -0,0 +1,258 @@ +/* $OpenBSD: utf8.c,v 1.1 2016/05/25 23:48:45 schwarze Exp $ */ +/* + * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Utility functions for multibyte-character handling, + * in particular to sanitize untrusted strings for terminal output. + */ + +#include <sys/types.h> +#include <langinfo.h> +#include <limits.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <vis.h> +#include <wchar.h> + +#include "utf8.h" + +static int dangerous_locale(void); +static int vasnmprintf(char **, size_t, int *, const char *, va_list); + + +/* + * For US-ASCII and UTF-8 encodings, we can safely recover from + * encoding errors and from non-printable characters. For any + * other encodings, err to the side of caution and abort parsing: + * For state-dependent encodings, recovery is impossible. + * For arbitrary encodings, replacement of non-printable + * characters would be non-trivial and too fragile. + */ + +static int +dangerous_locale(void) { + char *loc; + + loc = nl_langinfo(CODESET); + return strcmp(loc, "US-ASCII") && strcmp(loc, "UTF-8"); +} + +/* + * The following two functions limit the number of bytes written, + * including the terminating '\0', to sz. Unless wp is NULL, + * they limit the number of display columns occupied to *wp. + * Whichever is reached first terminates the output string. + * To stay close to the standard interfaces, they return the number of + * non-NUL bytes that would have been written if both were unlimited. + * If wp is NULL, newline, carriage return, and tab are allowed; + * otherwise, the actual number of columns occupied by what was + * written is returned in *wp. + */ + +static int +vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap) +{ + char *src; /* Source string returned from vasprintf. */ + char *sp; /* Pointer into src. */ + char *dst; /* Destination string to be returned. */ + char *dp; /* Pointer into dst. */ + char *tp; /* Temporary pointer for dst. */ + size_t sz; /* Number of bytes allocated for dst. */ + size_t tsz; /* Temporary size while extending dst. */ + wchar_t wc; /* Wide character at sp. */ + int len; /* Number of bytes in the character at sp. */ + int ret; /* Number of bytes needed to format src. */ + int width; /* Display width of the character wc. */ + int total_width, max_width, print; + + src = dst = NULL; + if (vasprintf(&src, fmt, ap) <= 0) + goto fail; + + sz = strlen(src); + if ((dst = malloc(sz)) == NULL) + goto fail; + + if (maxsz > INT_MAX) + maxsz = INT_MAX; + + sp = src; + dp = dst; + ret = 0; + print = 1; + total_width = 0; + max_width = wp == NULL ? INT_MAX : *wp; + while (*sp != '\0') { + if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) { + (void)mbtowc(NULL, NULL, MB_CUR_MAX); + if (dangerous_locale()) { + ret = -1; + break; + } + len = 1; + width = -1; + } else if (wp == NULL && + (wc == L'\n' || wc == L'\r' || wc == L'\t')) { + /* + * Don't use width uninitialized; the actual + * value doesn't matter because total_width + * is only returned for wp != NULL. + */ + width = 0; + } else if ((width = wcwidth(wc)) == -1 && + dangerous_locale()) { + ret = -1; + break; + } + + /* Valid, printable character. */ + + if (width >= 0) { + if (print && (dp - dst >= (int)maxsz - len || + total_width > max_width - width)) + print = 0; + if (print) { + total_width += width; + memcpy(dp, sp, len); + dp += len; + } + sp += len; + if (ret >= 0) + ret += len; + continue; + } + + /* Escaping required. */ + + while (len > 0) { + if (print && (dp - dst >= (int)maxsz - 4 || + total_width > max_width - 4)) + print = 0; + if (print) { + if (dp + 4 >= dst + sz) { + tsz = sz + 128; + if (tsz > maxsz) + tsz = maxsz; + tp = realloc(dst, tsz); + if (tp == NULL) { + ret = -1; + break; + } + dp = tp + (dp - dst); + dst = tp; + sz = tsz; + } + tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0); + width = tp - dp; + total_width += width; + dp = tp; + } else + width = 4; + len--; + sp++; + if (ret >= 0) + ret += width; + } + if (len > 0) + break; + } + free(src); + *dp = '\0'; + *str = dst; + if (wp != NULL) + *wp = total_width; + + /* + * If the string was truncated by the width limit but + * would have fit into the size limit, the only sane way + * to report the problem is using the return value, such + * that the usual idiom "if (ret < 0 || ret >= sz) error" + * works as expected. + */ + + if (ret < (int)maxsz && !print) + ret = -1; + return ret; + +fail: + free(src); + free(dst); + *str = NULL; + if (wp != NULL) + *wp = 0; + return -1; +} + +int +snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...) +{ + va_list ap; + char *cp; + int ret; + + va_start(ap, fmt); + ret = vasnmprintf(&cp, sz, wp, fmt, ap); + va_end(ap); + (void)strlcpy(str, cp, sz); + free(cp); + return ret; +} + +/* + * To stay close to the standard interfaces, the following functions + * return the number of non-NUL bytes written. + */ + +int +vfmprintf(FILE *stream, const char *fmt, va_list ap) +{ + char *str; + int ret; + + if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) + return -1; + if (fputs(str, stream) == EOF) + ret = -1; + free(str); + return ret; +} + +int +fmprintf(FILE *stream, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = vfmprintf(stream, fmt, ap); + va_end(ap); + return ret; +} + +int +mprintf(const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = vfmprintf(stdout, fmt, ap); + va_end(ap); + return ret; +} |