commit/commit-tree: correct latin1 to utf-8

When a line in the message is not valid UTF-8, "git mailinfo" attempts to convert it to UTF-8 assuming the input is Latin-1 (and punts if it does not convert cleanly). Using the same heuristics in "git commit" and "git commit-tree" lets the editor output be in Latin-1, which makes the overall system more consistent.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-06-28 11:24:14 -0700
committer: Junio C Hamano <gitster@pobox.com> 2012-08-21 16:10:53 -0700
commit: 08a94a145c3231c0fa36469682591a3c45222271 (patch)
tree: f25e5463156188f41e00da2f284fd8eb1b756bf2 /commit.c
parent: 4c8a9db6f7dd9e10b5ce9bfbcd5faa82a8c86ce3 (diff)
download: git-08a94a145c3231c0fa36469682591a3c45222271.tar.gz
1 files changed, 86 insertions, 2 deletions
diff --git a/commit.c b/commit.c
index 8248a994a5..1360bbd2cb 100644
--- a/commit.c
+++ b/commit.c
@@ -1112,8 +1112,92 @@ int commit_tree(const struct strbuf *msg, unsigned char *tree,
 	return result;
 }
 
+static int find_invalid_utf8(const char *buf, int len)
+{
+	int offset = 0;	/* bytes of buf consumed so far */
+	/* Returns the offset in buf of the first byte that starts an invalid sequence, or -1 if none. */
+	while (len) {
+		unsigned char c = *buf++;
+		int bytes, bad_offset;
+
+		len--;
+		offset++;
+
+		/* Simple US-ASCII? No worries. */
+		if (c < 0x80)
+			continue;
+
+		bad_offset = offset-1;	/* index of the lead byte 'c' just read */
+
+		/*
+		 * Count the consecutive set bits below the top one: that's
+		 * how many more bytes this sequence should have.
+		 */
+		bytes = 0;
+		while (c & 0x40) {
+			c <<= 1;
+			bytes++;
+		}
+
+		/* Must be between 1 and 5 more bytes; 0 means a stray continuation byte */
+		if (bytes < 1 || bytes > 5)
+			return bad_offset;
+
+		/* Do we *have* that many bytes? */
+		if (len < bytes)
+			return bad_offset;
+
+		offset += bytes;	/* account for the continuation bytes checked below */
+		len -= bytes;
+
+		/* And verify that they are good continuation bytes (10xxxxxx) */
+		do {
+			if ((*buf++ & 0xc0) != 0x80)
+				return bad_offset;
+		} while (--bytes);
+
+		/* We could/should check the value and length here too */
+	}
+	return -1;
+}
+
+/*
+ * Verify that the buffer is structurally valid UTF-8, fixing it in place.
+ * Each byte that starts an invalid sequence is assumed to be Latin1 and
+ * is replaced by its two-byte UTF-8 encoding. Returns 1 if the buffer
+ * needed no changes, 0 if at least one byte was converted.
+ *
+ * Fixme: we should probably also disallow overlong forms and
+ * invalid characters. But we don't do that currently.
+ */
+static int verify_utf8(struct strbuf *buf)
+{
+	int ok = 1;	/* cleared once any byte has to be converted */
+	long pos = 0;	/* everything before pos has been checked (and fixed) */
+
+	for (;;) {
+		int bad;
+		unsigned char c;
+		unsigned char replace[2];
+
+		bad = find_invalid_utf8(buf->buf + pos, buf->len - pos);
+		if (bad < 0)
+			return ok;
+		pos += bad;	/* 'bad' is relative to the start of this scan */
+		ok = 0;
+		c = buf->buf[pos];
+		strbuf_remove(buf, pos, 1);
+
+		/* We know 'c' must be in the range 128-255, so it encodes as two bytes */
+		replace[0] = 0xc0 + (c >> 6);
+		replace[1] = 0x80 + (c & 0x3f);
+		strbuf_insert(buf, pos, replace, 2);
+		pos += 2;	/* resume scanning after the inserted sequence */
+	}
+}
+
 static const char commit_utf8_warn[] =
-"Warning: commit message does not conform to UTF-8.\n"
+"Warning: commit message did not conform to UTF-8.\n"
 "You may want to amend it after fixing the message, or set the config\n"
 "variable i18n.commitencoding to the encoding your project uses.\n";
 
@@ -1170,7 +1254,7 @@ int commit_tree_extended(const struct strbuf *msg, unsigned char *tree,
 	strbuf_addbuf(&buffer, msg);
 
 	/* And check the encoding */
-	if (encoding_is_utf8 && !is_utf8(buffer.buf))
+	if (encoding_is_utf8 && !verify_utf8(&buffer))
 		fprintf(stderr, commit_utf8_warn);
 
 	if (sign_commit && do_sign_commit(&buffer, sign_commit))
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-06-28 11:24:14 -0700
committer	Junio C Hamano <gitster@pobox.com>	2012-08-21 16:10:53 -0700
commit	08a94a145c3231c0fa36469682591a3c45222271 (patch)
tree	f25e5463156188f41e00da2f284fd8eb1b756bf2 /commit.c
parent	4c8a9db6f7dd9e10b5ce9bfbcd5faa82a8c86ce3 (diff)
download	git-08a94a145c3231c0fa36469682591a3c45222271.tar.gz