diff options
author | Christos Zoulas <christos@zoulas.com> | 2019-02-19 20:30:35 +0000 |
---|---|---|
committer | Christos Zoulas <christos@zoulas.com> | 2019-02-19 20:30:35 +0000 |
commit | f0a26da7b371127e4460cc6d2da1b410c3d85ad9 (patch) | |
tree | 31cd4865aa13baaac61fdfe5acff2435e83a898c /src/encoding.c | |
parent | 642f269ef99930b44daa2236908da7d05a68eb08 (diff) | |
download | file-git-f0a26da7b371127e4460cc6d2da1b410c3d85ad9.tar.gz |
PR/61: tmc: Add UCS-32 built-in detection.
Diffstat (limited to 'src/encoding.c')
-rw-r--r-- | src/encoding.c | 66 |
1 files changed, 57 insertions, 9 deletions
diff --git a/src/encoding.c b/src/encoding.c index ef8493d5..b9727fd8 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -35,7 +35,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: encoding.c,v 1.15 2018/10/15 16:29:16 christos Exp $") +FILE_RCSID("@(#)$File: encoding.c,v 1.16 2019/02/19 20:30:35 christos Exp $") #endif /* lint */ #include "magic.h" @@ -49,6 +49,7 @@ private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, size_t *); private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *); private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *); +private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *); private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); private void from_ebcdic(const unsigned char *, size_t, unsigned char *); @@ -116,6 +117,15 @@ file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf, DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); *code = "UTF-8 Unicode"; *code_mime = "utf-8"; + } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) { + if (ucs_type == 1) { + *code = "Little-endian UTF-32 Unicode"; + *code_mime = "utf-32le"; + } else { + *code = "Big-endian UTF-32 Unicode"; + *code_mime = "utf-32be"; + } + DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen)); } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { if (ucs_type == 1) { *code = "Little-endian UTF-16 Unicode"; @@ -410,7 +420,7 @@ looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) } private int -looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, +looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf, size_t *ulen) { int bigend; @@ -419,9 +429,9 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, if (nbytes < 2) return 0; - if (buf[0] == 0xff && buf[1] == 0xfe) + if (bf[0] == 0xff && bf[1] == 0xfe) bigend = 0; - else if (buf[0] == 0xfe && buf[1] == 0xff) + else if (bf[0] == 0xfe && bf[1] == 0xff) bigend = 1; else return 0; @@ -432,20 +442,58 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, /* XXX fix to properly handle chars > 65536 */ if (bigend) - ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i]; + ubf[(*ulen)++] = bf[i + 1] + 256 * bf[i]; else - ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1]; + ubf[(*ulen)++] = bf[i] + 256 * bf[i + 1]; - if (ubuf[*ulen - 1] == 0xfffe) + if (ubf[*ulen - 1] == 0xfffe) return 0; - if (ubuf[*ulen - 1] < 128 && - text_chars[(size_t)ubuf[*ulen - 1]] != T) + if (ubf[*ulen - 1] < 128 && + text_chars[(size_t)ubf[*ulen - 1]] != T) return 0; } return 1 + bigend; } +private int +looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf, + size_t *ulen) +{ + int bigend; + size_t i; + + if (nbytes < 4) + return 0; + + if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0) + bigend = 0; + else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff) + bigend = 1; + else + return 0; + + *ulen = 0; + + for (i = 4; i + 1 < nbytes; i += 4) { + /* XXX fix to properly handle chars > 65536 */ + + if (bigend) + ubf[(*ulen)++] = bf[i + 3] | (bf[i + 2] << 8) + | (bf[i + 1] << 16) | bf[i] << 24; + else + ubf[(*ulen)++] = bf[i] | (bf[i + 1] << 8) + | (bf[i + 2] << 16) | (bf[i + 3] << 24); + + if (ubf[*ulen - 1] == 0xfffe) + return 0; + if (ubf[*ulen - 1] < 128 && + text_chars[(size_t)ubf[*ulen - 1]] != T) + return 0; + } + + return 1 + bigend; +} #undef F #undef T #undef I |