summary | refs | log | tree | commit | diff
path: root/src/encoding.c
diff options
context:
space:
mode:
author Christos Zoulas <christos@zoulas.com> 2019-02-19 20:30:35 +0000
committer Christos Zoulas <christos@zoulas.com> 2019-02-19 20:30:35 +0000
commit f0a26da7b371127e4460cc6d2da1b410c3d85ad9 (patch)
tree 31cd4865aa13baaac61fdfe5acff2435e83a898c /src/encoding.c
parent 642f269ef99930b44daa2236908da7d05a68eb08 (diff)
download file-git-f0a26da7b371127e4460cc6d2da1b410c3d85ad9.tar.gz
PR/61: tmc: Add UCS-32 built-in detection.
Diffstat (limited to 'src/encoding.c')
-rw-r--r-- src/encoding.c 66
1 files changed, 57 insertions, 9 deletions
diff --git a/src/encoding.c b/src/encoding.c
index ef8493d5..b9727fd8 100644
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -35,7 +35,7 @@
#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.15 2018/10/15 16:29:16 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.16 2019/02/19 20:30:35 christos Exp $")
#endif /* lint */
#include "magic.h"
@@ -49,6 +49,7 @@ private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
size_t *);
private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_ucs32(const unsigned char *, size_t, unichar *, size_t *);
private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
@@ -116,6 +117,15 @@ file_encoding(struct magic_set *ms, const struct buffer *b, unichar **ubuf,
DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
*code = "UTF-8 Unicode";
*code_mime = "utf-8";
+ } else if ((ucs_type = looks_ucs32(buf, nbytes, *ubuf, ulen)) != 0) {
+ if (ucs_type == 1) {
+ *code = "Little-endian UTF-32 Unicode";
+ *code_mime = "utf-32le";
+ } else {
+ *code = "Big-endian UTF-32 Unicode";
+ *code_mime = "utf-32be";
+ }
+ DPRINTF(("ucs32 %" SIZE_T_FORMAT "u\n", *ulen));
} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
if (ucs_type == 1) {
*code = "Little-endian UTF-16 Unicode";
@@ -410,7 +420,7 @@ looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
}
private int
-looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+looks_ucs16(const unsigned char *bf, size_t nbytes, unichar *ubf,
size_t *ulen)
{
int bigend;
@@ -419,9 +429,9 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
if (nbytes < 2)
return 0;
- if (buf[0] == 0xff && buf[1] == 0xfe)
+ if (bf[0] == 0xff && bf[1] == 0xfe)
bigend = 0;
- else if (buf[0] == 0xfe && buf[1] == 0xff)
+ else if (bf[0] == 0xfe && bf[1] == 0xff)
bigend = 1;
else
return 0;
@@ -432,20 +442,58 @@ looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
/* XXX fix to properly handle chars > 65536 */
if (bigend)
- ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
+ ubf[(*ulen)++] = bf[i + 1] + 256 * bf[i];
else
- ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
+ ubf[(*ulen)++] = bf[i] + 256 * bf[i + 1];
- if (ubuf[*ulen - 1] == 0xfffe)
+ if (ubf[*ulen - 1] == 0xfffe)
return 0;
- if (ubuf[*ulen - 1] < 128 &&
- text_chars[(size_t)ubuf[*ulen - 1]] != T)
+ if (ubf[*ulen - 1] < 128 &&
+ text_chars[(size_t)ubf[*ulen - 1]] != T)
return 0;
}
return 1 + bigend;
}
+/*
+ * Check whether the buffer starts with a UTF-32 byte-order mark and, if so,
+ * decode the 32-bit units that follow it.
+ *
+ * bf, nbytes: input bytes and their count.
+ * ubf, ulen:  output; once a BOM has been recognised, *ulen is reset to 0
+ *             and every decoded unit is appended to ubf, with *ulen counting
+ *             the units stored so far (also when decoding stops early).
+ *
+ * Returns 0 when fewer than 4 bytes are available, when no UTF-32 BOM is
+ * present, or when a decoded unit is 0xfffe or is below 128 and not marked
+ * T in text_chars.  Otherwise returns 1 for little-endian input and 2 for
+ * big-endian input.
+ */
+private int
+looks_ucs32(const unsigned char *bf, size_t nbytes, unichar *ubf,
+    size_t *ulen)
+{
+	int bigend;
+	size_t i;
+
+	if (nbytes < 4)
+		return 0;
+
+	if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
+		bigend = 0;
+	else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
+		bigend = 1;
+	else
+		return 0;
+
+	*ulen = 0;
+
+	/*
+	 * Each iteration reads bf[i] .. bf[i + 3], so the guard must cover
+	 * i + 3; an incomplete trailing unit (1-3 bytes) is ignored rather
+	 * than read past the end of the buffer.
+	 */
+	for (i = 4; i + 3 < nbytes; i += 4) {
+		/*
+		 * Widen each byte to unichar before shifting: a byte >= 0x80
+		 * promoted to int and shifted left by 24 would overflow a
+		 * signed int.
+		 */
+		if (bigend)
+			ubf[(*ulen)++] = (unichar)bf[i + 3]
+			    | ((unichar)bf[i + 2] << 8)
+			    | ((unichar)bf[i + 1] << 16)
+			    | ((unichar)bf[i] << 24);
+		else
+			ubf[(*ulen)++] = (unichar)bf[i]
+			    | ((unichar)bf[i + 1] << 8)
+			    | ((unichar)bf[i + 2] << 16)
+			    | ((unichar)bf[i + 3] << 24);
+
+		if (ubf[*ulen - 1] == 0xfffe)
+			return 0;
+		if (ubf[*ulen - 1] < 128 &&
+		    text_chars[(size_t)ubf[*ulen - 1]] != T)
+			return 0;
+	}
+
+	return 1 + bigend;
+}
#undef F
#undef T
#undef I