summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJunio C Hamano <junkio@cox.net>2006-12-28 19:03:02 -0800
committerJunio C Hamano <junkio@cox.net>2006-12-28 19:03:02 -0800
commiteff73751bb94b0241fd2204effb8680fe9973cbf (patch)
tree5fe6f0f55988d64a7b8b841c62b1c996a3fda5bf
parent013672bc589da395fcba9bf62d699e70f3764689 (diff)
parent7255ff0446217ad723dad6088504a18f6afb15db (diff)
downloadgit-eff73751bb94b0241fd2204effb8680fe9973cbf.tar.gz
Merge branch 'jc/utf8'
* jc/utf8: t3900: test conversion to non UTF-8 as well Rename t3900 test vector file UTF-8: introduce i18n.logoutputencoding. Teach log family --encoding i18n.logToUTF8: convert commit log message to UTF-8 Move encoding conversion routine out of mailinfo to utf8.c Conflicts: commit.c
-rw-r--r--Documentation/config.txt4
-rw-r--r--builtin-commit-tree.c13
-rw-r--r--builtin-log.c16
-rw-r--r--builtin-mailinfo.c42
-rw-r--r--cache.h4
-rw-r--r--commit.c66
-rw-r--r--config.c8
-rwxr-xr-xcontrib/completion/git-completion.bash1
-rw-r--r--environment.c3
-rw-r--r--revision.h1
-rwxr-xr-xt/t3900-i18n-commit.sh115
-rw-r--r--t/t3900/1-UTF-8.txt3
-rw-r--r--t/t3900/2-UTF-8.txt4
-rw-r--r--t/t3900/EUCJP.txt4
-rw-r--r--t/t3900/ISO-2022-JP.txt4
-rw-r--r--t/t3900/ISO-8859-1.txt3
-rw-r--r--utf8.c54
-rw-r--r--utf8.h6
18 files changed, 308 insertions, 43 deletions
diff --git a/Documentation/config.txt b/Documentation/config.txt
index 6452a8be14..178e0e1e20 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -267,6 +267,10 @@ i18n.commitEncoding::
browser (and possibly at other places in the future or in other
porcelains). See e.g. gitlink:git-mailinfo[1]. Defaults to 'utf-8'.
+i18n.logOutputEncoding::
+ Character encoding the commit messages are converted to when
+ running `git-log` and friends.
+
log.showroot::
If true, the initial commit will be shown as a big creation event.
This is equivalent to a diff against an empty tree.
diff --git a/builtin-commit-tree.c b/builtin-commit-tree.c
index f641787988..146aaffd28 100644
--- a/builtin-commit-tree.c
+++ b/builtin-commit-tree.c
@@ -92,6 +92,7 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix)
char comment[1000];
char *buffer;
unsigned int size;
+ int encoding_is_utf8;
setup_ident();
git_config(git_default_config);
@@ -117,6 +118,10 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix)
parents++;
}
+ /* Not having i18n.commitencoding is the same as having utf-8 */
+ encoding_is_utf8 = (!git_commit_encoding ||
+ !strcmp(git_commit_encoding, "utf-8"));
+
init_buffer(&buffer, &size);
add_buffer(&buffer, &size, "tree %s\n", sha1_to_hex(tree_sha1));
@@ -130,7 +135,11 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix)
/* Person/date information */
add_buffer(&buffer, &size, "author %s\n", git_author_info(1));
- add_buffer(&buffer, &size, "committer %s\n\n", git_committer_info(1));
+ add_buffer(&buffer, &size, "committer %s\n", git_committer_info(1));
+ if (!encoding_is_utf8)
+ add_buffer(&buffer, &size,
+ "encoding %s\n", git_commit_encoding);
+ add_buffer(&buffer, &size, "\n");
/* And add the comment */
while (fgets(comment, sizeof(comment), stdin) != NULL)
@@ -138,7 +147,7 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix)
/* And check the encoding */
buffer[size] = '\0';
- if (!strcmp(git_commit_encoding, "utf-8") && !is_utf8(buffer))
+ if (encoding_is_utf8 && !is_utf8(buffer))
fprintf(stderr, commit_utf8_warn);
if (!write_sha1_file(buffer, size, commit_type, commit_sha1)) {
diff --git a/builtin-log.c b/builtin-log.c
index 8df3c1394a..a59b4acef1 100644
--- a/builtin-log.c
+++ b/builtin-log.c
@@ -20,6 +20,8 @@ void add_head(struct rev_info *revs);
static void cmd_log_init(int argc, const char **argv, const char *prefix,
struct rev_info *rev)
{
+ int i;
+
rev->abbrev = DEFAULT_ABBREV;
rev->commit_format = CMIT_FMT_DEFAULT;
rev->verbose_header = 1;
@@ -27,8 +29,18 @@ static void cmd_log_init(int argc, const char **argv, const char *prefix,
argc = setup_revisions(argc, argv, rev, "HEAD");
if (rev->diffopt.pickaxe || rev->diffopt.filter)
rev->always_show_header = 0;
- if (argc > 1)
- die("unrecognized argument: %s", argv[1]);
+ for (i = 1; i < argc; i++) {
+ const char *arg = argv[i];
+ if (!strncmp(arg, "--encoding=", 11)) {
+ arg += 11;
+ if (strcmp(arg, "none"))
+ git_log_output_encoding = strdup(arg);
+ else
+ git_log_output_encoding = "";
+ }
+ else
+ die("unrecognized argument: %s", arg);
+ }
}
static int cmd_log_walk(struct rev_info *rev)
diff --git a/builtin-mailinfo.c b/builtin-mailinfo.c
index e6472293d4..a67f3eb90b 100644
--- a/builtin-mailinfo.c
+++ b/builtin-mailinfo.c
@@ -4,6 +4,7 @@
*/
#include "cache.h"
#include "builtin.h"
+#include "utf8.h"
static FILE *cmitmsg, *patchfile, *fin, *fout;
@@ -510,40 +511,18 @@ static int decode_b_segment(char *in, char *ot, char *ep)
static void convert_to_utf8(char *line, char *charset)
{
-#ifndef NO_ICONV
- char *in, *out;
- size_t insize, outsize, nrc;
- char outbuf[4096]; /* cheat */
static char latin_one[] = "latin1";
char *input_charset = *charset ? charset : latin_one;
- iconv_t conv = iconv_open(metainfo_charset, input_charset);
-
- if (conv == (iconv_t) -1) {
- static int warned_latin1_once = 0;
- if (input_charset != latin_one) {
- fprintf(stderr, "cannot convert from %s to %s\n",
- input_charset, metainfo_charset);
- *charset = 0;
- }
- else if (!warned_latin1_once) {
- warned_latin1_once = 1;
- fprintf(stderr, "tried to convert from %s to %s, "
- "but your iconv does not work with it.\n",
- input_charset, metainfo_charset);
- }
+ char *out = reencode_string(line, metainfo_charset, input_charset);
+
+ if (!out) {
+ fprintf(stderr, "cannot convert from %s to %s\n",
+ input_charset, metainfo_charset);
+ *charset = 0;
return;
}
- in = line;
- insize = strlen(in);
- out = outbuf;
- outsize = sizeof(outbuf);
- nrc = iconv(conv, &in, &insize, &out, &outsize);
- iconv_close(conv);
- if (nrc == (size_t) -1)
- return;
- *out = 0;
- strcpy(line, outbuf);
-#endif
+ strcpy(line, out);
+ free(out);
}
static int decode_header_bq(char *it)
@@ -827,7 +806,8 @@ int cmd_mailinfo(int argc, const char **argv, const char *prefix)
if (!strcmp(argv[1], "-k"))
keep_subject = 1;
else if (!strcmp(argv[1], "-u"))
- metainfo_charset = git_commit_encoding;
+ metainfo_charset = (git_commit_encoding
+ ? git_commit_encoding : "utf-8");
else if (!strncmp(argv[1], "--encoding=", 11))
metainfo_charset = argv[1] + 11;
else
diff --git a/cache.h b/cache.h
index 4943056c19..29dd290c92 100644
--- a/cache.h
+++ b/cache.h
@@ -416,8 +416,8 @@ extern int check_repository_format_version(const char *var, const char *value);
extern char git_default_email[MAX_GITNAME];
extern char git_default_name[MAX_GITNAME];
-#define MAX_ENCODING_LENGTH 64
-extern char git_commit_encoding[MAX_ENCODING_LENGTH];
+extern char *git_commit_encoding;
+extern char *git_log_output_encoding;
extern int copy_fd(int ifd, int ofd);
extern void write_or_die(int fd, const void *buf, size_t count);
diff --git a/commit.c b/commit.c
index 59ea77c577..eb06afbbe0 100644
--- a/commit.c
+++ b/commit.c
@@ -2,6 +2,7 @@
#include "tag.h"
#include "commit.h"
#include "pkt-line.h"
+#include "utf8.h"
int save_commit_buffer = 1;
@@ -597,10 +598,61 @@ static int add_merge_info(enum cmit_fmt fmt, char *buf, const struct commit *com
return offset;
}
-unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit,
- unsigned long len, char *buf, unsigned long space,
+static char *get_header(const struct commit *commit, const char *key)
+{
+ int key_len = strlen(key);
+ const char *line = commit->buffer;
+
+ for (;;) {
+ const char *eol = strchr(line, '\n'), *next;
+
+ if (line == eol)
+ return NULL;
+ if (!eol) {
+ eol = line + strlen(line);
+ next = NULL;
+ } else
+ next = eol + 1;
+ if (!strncmp(line, key, key_len) && line[key_len] == ' ') {
+ int len = eol - line - key_len;
+ char *ret = xmalloc(len);
+ memcpy(ret, line + key_len + 1, len - 1);
+ ret[len - 1] = '\0';
+ return ret;
+ }
+ line = next;
+ }
+}
+
+static char *logmsg_reencode(const struct commit *commit)
+{
+ char *encoding;
+ char *out;
+ char *output_encoding = (git_log_output_encoding
+ ? git_log_output_encoding
+ : git_commit_encoding);
+
+ if (!output_encoding)
+ return NULL;
+ encoding = get_header(commit, "encoding");
+ if (!encoding || !strcmp(encoding, output_encoding)) {
+ free(encoding);
+ return NULL;
+ }
+ out = reencode_string(commit->buffer, output_encoding, encoding);
+ free(encoding);
+ if (!out)
+ return NULL;
+ return out;
+}
+
+unsigned long pretty_print_commit(enum cmit_fmt fmt,
+ const struct commit *commit,
+ unsigned long len,
+ char *buf, unsigned long space,
int abbrev, const char *subject,
- const char *after_subject, int relative_date)
+ const char *after_subject,
+ int relative_date)
{
int hdr = 1, body = 0;
unsigned long offset = 0;
@@ -608,6 +660,10 @@ unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit
int parents_shown = 0;
const char *msg = commit->buffer;
int plain_non_ascii = 0;
+ char *reencoded = logmsg_reencode(commit);
+
+ if (reencoded)
+ msg = reencoded;
if (fmt == CMIT_FMT_ONELINE || fmt == CMIT_FMT_EMAIL)
indent = 0;
@@ -624,7 +680,7 @@ unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit
for (in_body = i = 0; (ch = msg[i]) && i < len; i++) {
if (!in_body) {
/* author could be non 7-bit ASCII but
- * the log may so; skip over the
+ * the log may be so; skip over the
* header part first.
*/
if (ch == '\n' &&
@@ -755,6 +811,8 @@ unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit
if (fmt == CMIT_FMT_EMAIL && !body)
buf[offset++] = '\n';
buf[offset] = '\0';
+
+ free(reencoded);
return offset;
}
diff --git a/config.c b/config.c
index 1662a4626e..fcccf7e2a4 100644
--- a/config.c
+++ b/config.c
@@ -309,10 +309,16 @@ int git_default_config(const char *var, const char *value)
}
if (!strcmp(var, "i18n.commitencoding")) {
- strlcpy(git_commit_encoding, value, sizeof(git_commit_encoding));
+ git_commit_encoding = strdup(value);
return 0;
}
+ if (!strcmp(var, "i18n.logoutputencoding")) {
+ git_log_output_encoding = strdup(value);
+ return 0;
+ }
+
+
if (!strcmp(var, "pager.color") || !strcmp(var, "color.pager")) {
pager_use_color = git_config_bool(var,value);
return 0;
diff --git a/contrib/completion/git-completion.bash b/contrib/completion/git-completion.bash
index 234cd0954b..7c7520ea29 100755
--- a/contrib/completion/git-completion.bash
+++ b/contrib/completion/git-completion.bash
@@ -711,6 +711,7 @@ _git_repo_config ()
core.compression
core.legacyHeaders
i18n.commitEncoding
+ i18n.logOutputEncoding
diff.color
color.diff
diff.renameLimit
diff --git a/environment.c b/environment.c
index f8c7dbcead..a1502c4e87 100644
--- a/environment.c
+++ b/environment.c
@@ -18,7 +18,8 @@ int prefer_symlink_refs;
int log_all_ref_updates;
int warn_ambiguous_refs = 1;
int repository_format_version;
-char git_commit_encoding[MAX_ENCODING_LENGTH] = "utf-8";
+char *git_commit_encoding;
+char *git_log_output_encoding;
int shared_repository = PERM_UMASK;
const char *apply_default_whitespace;
int zlib_compression_level = Z_DEFAULT_COMPRESSION;
diff --git a/revision.h b/revision.h
index ec991e5c57..8f7907d7ab 100644
--- a/revision.h
+++ b/revision.h
@@ -72,6 +72,7 @@ struct rev_info {
const char *ref_message_id;
const char *add_signoff;
const char *extra_headers;
+ const char *log_reencode;
/* Filter by commit log message */
struct grep_opt *grep_filter;
diff --git a/t/t3900-i18n-commit.sh b/t/t3900-i18n-commit.sh
new file mode 100755
index 0000000000..46fd47cb0f
--- /dev/null
+++ b/t/t3900-i18n-commit.sh
@@ -0,0 +1,115 @@
+#!/bin/sh
+#
+# Copyright (c) 2006 Junio C Hamano
+#
+
+test_description='commit and log output encodings'
+
+. ./test-lib.sh
+
+compare_with () {
+ git-show -s "$1" | sed -e '1,/^$/d' -e 's/^ //' -e '$d' >current &&
+ diff -u current "$2"
+}
+
+test_expect_success setup '
+ : >F &&
+ git-add F &&
+ T=$(git-write-tree) &&
+ C=$(git-commit-tree $T <../t3900/1-UTF-8.txt) &&
+ git-update-ref HEAD $C &&
+ git-tag C0
+'
+
+test_expect_success 'no encoding header for base case' '
+ E=$(git-cat-file commit C0 | sed -ne "s/^encoding //p") &&
+ test z = "z$E"
+'
+
+for H in ISO-8859-1 EUCJP ISO-2022-JP
+do
+ test_expect_success "$H setup" '
+ git-repo-config i18n.commitencoding $H &&
+ git-checkout -b $H C0 &&
+ echo $H >F &&
+ git-commit -a -F ../t3900/$H.txt
+ '
+done
+
+for H in ISO-8859-1 EUCJP ISO-2022-JP
+do
+ test_expect_success "check encoding header for $H" '
+ E=$(git-cat-file commit '$H' | sed -ne "s/^encoding //p") &&
+ test "z$E" = "z'$H'"
+ '
+done
+
+test_expect_success 'repo-config to remove customization' '
+ git-repo-config --unset-all i18n.commitencoding &&
+ if Z=$(git-repo-config --get-all i18n.commitencoding)
+ then
+ echo Oops, should have failed.
+ false
+ else
+ test z = "z$Z"
+ fi &&
+ git-repo-config i18n.commitencoding utf-8
+'
+
+test_expect_success 'ISO-8859-1 should be shown in UTF-8 now' '
+ compare_with ISO-8859-1 ../t3900/1-UTF-8.txt
+'
+
+for H in EUCJP ISO-2022-JP
+do
+ test_expect_success "$H should be shown in UTF-8 now" '
+ compare_with '$H' ../t3900/2-UTF-8.txt
+ '
+done
+
+test_expect_success 'repo-config to add customization' '
+ git-repo-config --unset-all i18n.commitencoding &&
+ if Z=$(git-repo-config --get-all i18n.commitencoding)
+ then
+ echo Oops, should have failed.
+ false
+ else
+ test z = "z$Z"
+ fi
+'
+
+for H in ISO-8859-1 EUCJP ISO-2022-JP
+do
+ test_expect_success "$H should be shown in itself now" '
+ git-repo-config i18n.commitencoding '$H' &&
+ compare_with '$H' ../t3900/'$H'.txt
+ '
+done
+
+test_expect_success 'repo-config to tweak customization' '
+ git-repo-config i18n.logoutputencoding utf-8
+'
+
+test_expect_success 'ISO-8859-1 should be shown in UTF-8 now' '
+ compare_with ISO-8859-1 ../t3900/1-UTF-8.txt
+'
+
+for H in EUCJP ISO-2022-JP
+do
+ test_expect_success "$H should be shown in UTF-8 now" '
+ compare_with '$H' ../t3900/2-UTF-8.txt
+ '
+done
+
+for J in EUCJP ISO-2022-JP
+do
+ git-repo-config i18n.logoutputencoding $J
+ for H in EUCJP ISO-2022-JP
+ do
+ test_expect_success "$H should be shown in $J now" '
+ compare_with '$H' ../t3900/'$J'.txt
+ '
+ done
+done
+
+test_done
diff --git a/t/t3900/1-UTF-8.txt b/t/t3900/1-UTF-8.txt
new file mode 100644
index 0000000000..ee31e19738
--- /dev/null
+++ b/t/t3900/1-UTF-8.txt
@@ -0,0 +1,3 @@
+ÄËÑÏÖ
+
+Ábçdèfg
diff --git a/t/t3900/2-UTF-8.txt b/t/t3900/2-UTF-8.txt
new file mode 100644
index 0000000000..63f4f8f121
--- /dev/null
+++ b/t/t3900/2-UTF-8.txt
@@ -0,0 +1,4 @@
+はれひほふ
+
+しているのが、いるので。
+濱浜ほれぷりぽれまびぐりろへ。
diff --git a/t/t3900/EUCJP.txt b/t/t3900/EUCJP.txt
new file mode 100644
index 0000000000..546f2aac01
--- /dev/null
+++ b/t/t3900/EUCJP.txt
@@ -0,0 +1,4 @@
+ϤҤۤ
+
+ƤΤΤǡ
+ͤۤפݤޤӤء
diff --git a/t/t3900/ISO-2022-JP.txt b/t/t3900/ISO-2022-JP.txt
new file mode 100644
index 0000000000..74b533042f
--- /dev/null
+++ b/t/t3900/ISO-2022-JP.txt
@@ -0,0 +1,4 @@
+$B$O$l$R$[$U(B
+
+$B$7$F$$$k$N$,!"$$$k$N$G!#(B
+$B_@IM$[$l$W$j$]$l$^$S$0$j$m$X!#(B
diff --git a/t/t3900/ISO-8859-1.txt b/t/t3900/ISO-8859-1.txt
new file mode 100644
index 0000000000..7cbef0ee6f
--- /dev/null
+++ b/t/t3900/ISO-8859-1.txt
@@ -0,0 +1,3 @@
+
+
+bdfg
diff --git a/utf8.c b/utf8.c
index 8fa62571aa..1eedd8b61a 100644
--- a/utf8.c
+++ b/utf8.c
@@ -276,3 +276,57 @@ void print_wrapped_text(const char *text, int indent, int indent2, int width)
}
}
}
+
+/*
+ * Given a buffer and its encoding, return it re-encoded
+ * with iconv. If the conversion fails, returns NULL.
+ */
+#ifndef NO_ICONV
+char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding)
+{
+ iconv_t conv;
+ size_t insz, outsz, outalloc;
+ char *out, *outpos, *cp;
+
+ if (!in_encoding)
+ return NULL;
+ conv = iconv_open(out_encoding, in_encoding);
+ if (conv == (iconv_t) -1)
+ return NULL;
+ insz = strlen(in);
+ outsz = insz;
+ outalloc = outsz + 1; /* for terminating NUL */
+ out = xmalloc(outalloc);
+ outpos = out;
+ cp = (char *)in;
+
+ while (1) {
+ size_t cnt = iconv(conv, &cp, &insz, &outpos, &outsz);
+
+ if (cnt == -1) {
+ size_t sofar;
+ if (errno != E2BIG) {
+ free(out);
+ iconv_close(conv);
+ return NULL;
+ }
+ /* insz has remaining number of bytes.
+ * since we started outsz the same as insz,
+ * it is likely that insz is not enough for
+ * converting the rest.
+ */
+ sofar = outpos - out;
+ outalloc = sofar + insz * 2 + 32;
+ out = xrealloc(out, outalloc);
+ outpos = out + sofar;
+ outsz = outalloc - sofar - 1;
+ }
+ else {
+ *outpos = '\0';
+ break;
+ }
+ }
+ iconv_close(conv);
+ return out;
+}
+#endif
diff --git a/utf8.h b/utf8.h
index a0d7f591ad..cae2a8e665 100644
--- a/utf8.h
+++ b/utf8.h
@@ -5,4 +5,10 @@ int utf8_width(const char **start);
int is_utf8(const char *text);
void print_wrapped_text(const char *text, int indent, int indent2, int len);
+#ifndef NO_ICONV
+char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding);
+#else
+#define reencode_string(a,b,c) NULL
+#endif
+
#endif