diff options
author | Junio C Hamano <junkio@cox.net> | 2006-12-28 19:03:02 -0800 |
---|---|---|
committer | Junio C Hamano <junkio@cox.net> | 2006-12-28 19:03:02 -0800 |
commit | eff73751bb94b0241fd2204effb8680fe9973cbf (patch) | |
tree | 5fe6f0f55988d64a7b8b841c62b1c996a3fda5bf | |
parent | 013672bc589da395fcba9bf62d699e70f3764689 (diff) | |
parent | 7255ff0446217ad723dad6088504a18f6afb15db (diff) | |
download | git-eff73751bb94b0241fd2204effb8680fe9973cbf.tar.gz |
Merge branch 'jc/utf8'
* jc/utf8:
t3900: test conversion to non UTF-8 as well
Rename t3900 test vector file
UTF-8: introduce i18n.logoutputencoding.
Teach log family --encoding
i18n.logToUTF8: convert commit log message to UTF-8
Move encoding conversion routine out of mailinfo to utf8.c
Conflicts:
commit.c
-rw-r--r-- | Documentation/config.txt | 4 | ||||
-rw-r--r-- | builtin-commit-tree.c | 13 | ||||
-rw-r--r-- | builtin-log.c | 16 | ||||
-rw-r--r-- | builtin-mailinfo.c | 42 | ||||
-rw-r--r-- | cache.h | 4 | ||||
-rw-r--r-- | commit.c | 66 | ||||
-rw-r--r-- | config.c | 8 | ||||
-rwxr-xr-x | contrib/completion/git-completion.bash | 1 | ||||
-rw-r--r-- | environment.c | 3 | ||||
-rw-r--r-- | revision.h | 1 | ||||
-rwxr-xr-x | t/t3900-i18n-commit.sh | 115 | ||||
-rw-r--r-- | t/t3900/1-UTF-8.txt | 3 | ||||
-rw-r--r-- | t/t3900/2-UTF-8.txt | 4 | ||||
-rw-r--r-- | t/t3900/EUCJP.txt | 4 | ||||
-rw-r--r-- | t/t3900/ISO-2022-JP.txt | 4 | ||||
-rw-r--r-- | t/t3900/ISO-8859-1.txt | 3 | ||||
-rw-r--r-- | utf8.c | 54 | ||||
-rw-r--r-- | utf8.h | 6 |
18 files changed, 308 insertions, 43 deletions
diff --git a/Documentation/config.txt b/Documentation/config.txt index 6452a8be14..178e0e1e20 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -267,6 +267,10 @@ i18n.commitEncoding:: browser (and possibly at other places in the future or in other porcelains). See e.g. gitlink:git-mailinfo[1]. Defaults to 'utf-8'. +i18n.logOutputEncoding:: + Character encoding the commit messages are converted to when + running `git-log` and friends. + log.showroot:: If true, the initial commit will be shown as a big creation event. This is equivalent to a diff against an empty tree. diff --git a/builtin-commit-tree.c b/builtin-commit-tree.c index f641787988..146aaffd28 100644 --- a/builtin-commit-tree.c +++ b/builtin-commit-tree.c @@ -92,6 +92,7 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix) char comment[1000]; char *buffer; unsigned int size; + int encoding_is_utf8; setup_ident(); git_config(git_default_config); @@ -117,6 +118,10 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix) parents++; } + /* Not having i18n.commitencoding is the same as having utf-8 */ + encoding_is_utf8 = (!git_commit_encoding || + !strcmp(git_commit_encoding, "utf-8")); + init_buffer(&buffer, &size); add_buffer(&buffer, &size, "tree %s\n", sha1_to_hex(tree_sha1)); @@ -130,7 +135,11 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix) /* Person/date information */ add_buffer(&buffer, &size, "author %s\n", git_author_info(1)); - add_buffer(&buffer, &size, "committer %s\n\n", git_committer_info(1)); + add_buffer(&buffer, &size, "committer %s\n", git_committer_info(1)); + if (!encoding_is_utf8) + add_buffer(&buffer, &size, + "encoding %s\n", git_commit_encoding); + add_buffer(&buffer, &size, "\n"); /* And add the comment */ while (fgets(comment, sizeof(comment), stdin) != NULL) @@ -138,7 +147,7 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix) /* And check the encoding */ buffer[size] = '\0'; - if (!strcmp(git_commit_encoding, "utf-8") && !is_utf8(buffer)) + if (encoding_is_utf8 && !is_utf8(buffer)) fprintf(stderr, commit_utf8_warn); if (!write_sha1_file(buffer, size, commit_type, commit_sha1)) { diff --git a/builtin-log.c b/builtin-log.c index 8df3c1394a..a59b4acef1 100644 --- a/builtin-log.c +++ b/builtin-log.c @@ -20,6 +20,8 @@ void add_head(struct rev_info *revs); static void cmd_log_init(int argc, const char **argv, const char *prefix, struct rev_info *rev) { + int i; + rev->abbrev = DEFAULT_ABBREV; rev->commit_format = CMIT_FMT_DEFAULT; rev->verbose_header = 1; @@ -27,8 +29,18 @@ static void cmd_log_init(int argc, const char **argv, const char *prefix, argc = setup_revisions(argc, argv, rev, "HEAD"); if (rev->diffopt.pickaxe || rev->diffopt.filter) rev->always_show_header = 0; - if (argc > 1) - die("unrecognized argument: %s", argv[1]); + for (i = 1; i < argc; i++) { + const char *arg = argv[i]; + if (!strncmp(arg, "--encoding=", 11)) { + arg += 11; + if (strcmp(arg, "none")) + git_log_output_encoding = strdup(arg); + else + git_log_output_encoding = ""; + } + else + die("unrecognized argument: %s", arg); + } } static int cmd_log_walk(struct rev_info *rev) diff --git a/builtin-mailinfo.c b/builtin-mailinfo.c index e6472293d4..a67f3eb90b 100644 --- a/builtin-mailinfo.c +++ b/builtin-mailinfo.c @@ -4,6 +4,7 @@ */ #include "cache.h" #include "builtin.h" +#include "utf8.h" static FILE *cmitmsg, *patchfile, *fin, *fout; @@ -510,40 +511,18 @@ static int decode_b_segment(char *in, char *ot, char *ep) static void convert_to_utf8(char *line, char *charset) { -#ifndef NO_ICONV - char *in, *out; - size_t insize, outsize, nrc; - char outbuf[4096]; /* cheat */ static char latin_one[] = "latin1"; char *input_charset = *charset ? charset : latin_one; - iconv_t conv = iconv_open(metainfo_charset, input_charset); - - if (conv == (iconv_t) -1) { - static int warned_latin1_once = 0; - if (input_charset != latin_one) { - fprintf(stderr, "cannot convert from %s to %s\n", - input_charset, metainfo_charset); - *charset = 0; - } - else if (!warned_latin1_once) { - warned_latin1_once = 1; - fprintf(stderr, "tried to convert from %s to %s, " - "but your iconv does not work with it.\n", - input_charset, metainfo_charset); - } + char *out = reencode_string(line, metainfo_charset, input_charset); + + if (!out) { + fprintf(stderr, "cannot convert from %s to %s\n", + input_charset, metainfo_charset); + *charset = 0; return; } - in = line; - insize = strlen(in); - out = outbuf; - outsize = sizeof(outbuf); - nrc = iconv(conv, &in, &insize, &out, &outsize); - iconv_close(conv); - if (nrc == (size_t) -1) - return; - *out = 0; - strcpy(line, outbuf); -#endif + strcpy(line, out); + free(out); } static int decode_header_bq(char *it) @@ -827,7 +806,8 @@ int cmd_mailinfo(int argc, const char **argv, const char *prefix) if (!strcmp(argv[1], "-k")) keep_subject = 1; else if (!strcmp(argv[1], "-u")) - metainfo_charset = git_commit_encoding; + metainfo_charset = (git_commit_encoding + ? git_commit_encoding : "utf-8"); else if (!strncmp(argv[1], "--encoding=", 11)) metainfo_charset = argv[1] + 11; else @@ -416,8 +416,8 @@ extern int check_repository_format_version(const char *var, const char *value); extern char git_default_email[MAX_GITNAME]; extern char git_default_name[MAX_GITNAME]; -#define MAX_ENCODING_LENGTH 64 -extern char git_commit_encoding[MAX_ENCODING_LENGTH]; +extern char *git_commit_encoding; +extern char *git_log_output_encoding; extern int copy_fd(int ifd, int ofd); extern void write_or_die(int fd, const void *buf, size_t count); @@ -2,6 +2,7 @@ #include "tag.h" #include "commit.h" #include "pkt-line.h" +#include "utf8.h" int save_commit_buffer = 1; @@ -597,10 +598,61 @@ static int add_merge_info(enum cmit_fmt fmt, char *buf, const struct commit *com return offset; } -unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit, - unsigned long len, char *buf, unsigned long space, +static char *get_header(const struct commit *commit, const char *key) +{ + int key_len = strlen(key); + const char *line = commit->buffer; + + for (;;) { + const char *eol = strchr(line, '\n'), *next; + + if (line == eol) + return NULL; + if (!eol) { + eol = line + strlen(line); + next = NULL; + } else + next = eol + 1; + if (!strncmp(line, key, key_len) && line[key_len] == ' ') { + int len = eol - line - key_len; + char *ret = xmalloc(len); + memcpy(ret, line + key_len + 1, len - 1); + ret[len - 1] = '\0'; + return ret; + } + line = next; + } +} + +static char *logmsg_reencode(const struct commit *commit) +{ + char *encoding; + char *out; + char *output_encoding = (git_log_output_encoding + ? git_log_output_encoding + : git_commit_encoding); + + if (!output_encoding) + return NULL; + encoding = get_header(commit, "encoding"); + if (!encoding || !strcmp(encoding, output_encoding)) { + free(encoding); + return NULL; + } + out = reencode_string(commit->buffer, output_encoding, encoding); + free(encoding); + if (!out) + return NULL; + return out; +} + +unsigned long pretty_print_commit(enum cmit_fmt fmt, + const struct commit *commit, + unsigned long len, + char *buf, unsigned long space, int abbrev, const char *subject, - const char *after_subject, int relative_date) + const char *after_subject, + int relative_date) { int hdr = 1, body = 0; unsigned long offset = 0; @@ -608,6 +660,10 @@ unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit int parents_shown = 0; const char *msg = commit->buffer; int plain_non_ascii = 0; + char *reencoded = logmsg_reencode(commit); + + if (reencoded) + msg = reencoded; if (fmt == CMIT_FMT_ONELINE || fmt == CMIT_FMT_EMAIL) indent = 0; @@ -624,7 +680,7 @@ unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit for (in_body = i = 0; (ch = msg[i]) && i < len; i++) { if (!in_body) { /* author could be non 7-bit ASCII but - * the log may so; skip over the + * the log may be so; skip over the * header part first. */ if (ch == '\n' && @@ -755,6 +811,8 @@ unsigned long pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit if (fmt == CMIT_FMT_EMAIL && !body) buf[offset++] = '\n'; buf[offset] = '\0'; + + free(reencoded); return offset; } @@ -309,10 +309,16 @@ int git_default_config(const char *var, const char *value) } if (!strcmp(var, "i18n.commitencoding")) { - strlcpy(git_commit_encoding, value, sizeof(git_commit_encoding)); + git_commit_encoding = strdup(value); return 0; } + if (!strcmp(var, "i18n.logoutputencoding")) { + git_log_output_encoding = strdup(value); + return 0; + } + + if (!strcmp(var, "pager.color") || !strcmp(var, "color.pager")) { pager_use_color = git_config_bool(var,value); return 0; diff --git a/contrib/completion/git-completion.bash b/contrib/completion/git-completion.bash index 234cd0954b..7c7520ea29 100755 --- a/contrib/completion/git-completion.bash +++ b/contrib/completion/git-completion.bash @@ -711,6 +711,7 @@ _git_repo_config () core.compression core.legacyHeaders i18n.commitEncoding + i18n.logOutputEncoding diff.color color.diff diff.renameLimit diff --git a/environment.c b/environment.c index f8c7dbcead..a1502c4e87 100644 --- a/environment.c +++ b/environment.c @@ -18,7 +18,8 @@ int prefer_symlink_refs; int log_all_ref_updates; int warn_ambiguous_refs = 1; int repository_format_version; -char git_commit_encoding[MAX_ENCODING_LENGTH] = "utf-8"; +char *git_commit_encoding; +char *git_log_output_encoding; int shared_repository = PERM_UMASK; const char *apply_default_whitespace; int zlib_compression_level = Z_DEFAULT_COMPRESSION; diff --git a/revision.h b/revision.h index ec991e5c57..8f7907d7ab 100644 --- a/revision.h +++ b/revision.h @@ -72,6 +72,7 @@ struct rev_info { const char *ref_message_id; const char *add_signoff; const char *extra_headers; + const char *log_reencode; /* Filter by commit log message */ struct grep_opt *grep_filter; diff --git a/t/t3900-i18n-commit.sh b/t/t3900-i18n-commit.sh new file mode 100755 index 0000000000..46fd47cb0f --- /dev/null +++ b/t/t3900-i18n-commit.sh @@ -0,0 +1,115 @@ +#!/bin/sh +# +# Copyright (c) 2006 Junio C Hamano +# + +test_description='commit and log output encodings' + +. ./test-lib.sh + +compare_with () { + git-show -s "$1" | sed -e '1,/^$/d' -e 's/^ //' -e '$d' >current && + diff -u current "$2" +} + +test_expect_success setup ' + : >F && + git-add F && + T=$(git-write-tree) && + C=$(git-commit-tree $T <../t3900/1-UTF-8.txt) && + git-update-ref HEAD $C && + git-tag C0 +' + +test_expect_success 'no encoding header for base case' ' + E=$(git-cat-file commit C0 | sed -ne "s/^encoding //p") && + test z = "z$E" +' + +for H in ISO-8859-1 EUCJP ISO-2022-JP +do + test_expect_success "$H setup" ' + git-repo-config i18n.commitencoding $H && + git-checkout -b $H C0 && + echo $H >F && + git-commit -a -F ../t3900/$H.txt + ' +done + +for H in ISO-8859-1 EUCJP ISO-2022-JP +do + test_expect_success "check encoding header for $H" ' + E=$(git-cat-file commit '$H' | sed -ne "s/^encoding //p") && + test "z$E" = "z'$H'" + ' +done + +test_expect_success 'repo-config to remove customization' ' + git-repo-config --unset-all i18n.commitencoding && + if Z=$(git-repo-config --get-all i18n.commitencoding) + then + echo Oops, should have failed. + false + else + test z = "z$Z" + fi && + git-repo-config i18n.commitencoding utf-8 +' + +test_expect_success 'ISO-8859-1 should be shown in UTF-8 now' ' + compare_with ISO-8859-1 ../t3900/1-UTF-8.txt +' + +for H in EUCJP ISO-2022-JP +do + test_expect_success "$H should be shown in UTF-8 now" ' + compare_with '$H' ../t3900/2-UTF-8.txt + ' +done + +test_expect_success 'repo-config to add customization' ' + git-repo-config --unset-all i18n.commitencoding && + if Z=$(git-repo-config --get-all i18n.commitencoding) + then + echo Oops, should have failed. + false + else + test z = "z$Z" + fi +' + +for H in ISO-8859-1 EUCJP ISO-2022-JP +do + test_expect_success "$H should be shown in itself now" ' + git-repo-config i18n.commitencoding '$H' && + compare_with '$H' ../t3900/'$H'.txt + ' +done + +test_expect_success 'repo-config to tweak customization' ' + git-repo-config i18n.logoutputencoding utf-8 +' + +test_expect_success 'ISO-8859-1 should be shown in UTF-8 now' ' + compare_with ISO-8859-1 ../t3900/1-UTF-8.txt +' + +for H in EUCJP ISO-2022-JP +do + test_expect_success "$H should be shown in UTF-8 now" ' + compare_with '$H' ../t3900/2-UTF-8.txt + ' +done + +for J in EUCJP ISO-2022-JP +do + git-repo-config i18n.logoutputencoding $J + for H in EUCJP ISO-2022-JP + do + test_expect_success "$H should be shown in $J now" ' + compare_with '$H' ../t3900/'$J'.txt + ' + done +done + +test_done diff --git a/t/t3900/1-UTF-8.txt b/t/t3900/1-UTF-8.txt new file mode 100644 index 0000000000..ee31e19738 --- /dev/null +++ b/t/t3900/1-UTF-8.txt @@ -0,0 +1,3 @@ +ÄËÑÏÖ + +Ábçdèfg diff --git a/t/t3900/2-UTF-8.txt b/t/t3900/2-UTF-8.txt new file mode 100644 index 0000000000..63f4f8f121 --- /dev/null +++ b/t/t3900/2-UTF-8.txt @@ -0,0 +1,4 @@ +はれひほふ + +しているのが、いるので。 +濱浜ほれぷりぽれまびぐりろへ。 diff --git a/t/t3900/EUCJP.txt b/t/t3900/EUCJP.txt new file mode 100644 index 0000000000..546f2aac01 --- /dev/null +++ b/t/t3900/EUCJP.txt @@ -0,0 +1,4 @@ +ϤҤۤ + +ƤΤΤǡ +ͤۤפݤޤӤء diff --git a/t/t3900/ISO-2022-JP.txt b/t/t3900/ISO-2022-JP.txt new file mode 100644 index 0000000000..74b533042f --- /dev/null +++ b/t/t3900/ISO-2022-JP.txt @@ -0,0 +1,4 @@ +$B$O$l$R$[$U(B + +$B$7$F$$$k$N$,!"$$$k$N$G!#(B +$B_@IM$[$l$W$j$]$l$^$S$0$j$m$X!#(B diff --git a/t/t3900/ISO-8859-1.txt b/t/t3900/ISO-8859-1.txt new file mode 100644 index 0000000000..7cbef0ee6f --- /dev/null +++ b/t/t3900/ISO-8859-1.txt @@ -0,0 +1,3 @@ + + +bdfg @@ -276,3 +276,57 @@ void print_wrapped_text(const char *text, int indent, int indent2, int width) } } } + +/* + * Given a buffer and its encoding, return it re-encoded + * with iconv. If the conversion fails, returns NULL. + */ +#ifndef NO_ICONV +char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding) +{ + iconv_t conv; + size_t insz, outsz, outalloc; + char *out, *outpos, *cp; + + if (!in_encoding) + return NULL; + conv = iconv_open(out_encoding, in_encoding); + if (conv == (iconv_t) -1) + return NULL; + insz = strlen(in); + outsz = insz; + outalloc = outsz + 1; /* for terminating NUL */ + out = xmalloc(outalloc); + outpos = out; + cp = (char *)in; + + while (1) { + size_t cnt = iconv(conv, &cp, &insz, &outpos, &outsz); + + if (cnt == -1) { + size_t sofar; + if (errno != E2BIG) { + free(out); + iconv_close(conv); + return NULL; + } + /* insz has remaining number of bytes. + * since we started outsz the same as insz, + * it is likely that insz is not enough for + * converting the rest. + */ + sofar = outpos - out; + outalloc = sofar + insz * 2 + 32; + out = xrealloc(out, outalloc); + outpos = out + sofar; + outsz = outalloc - sofar - 1; + } + else { + *outpos = '\0'; + break; + } + } + iconv_close(conv); + return out; +} +#endif @@ -5,4 +5,10 @@ int utf8_width(const char **start); int is_utf8(const char *text); void print_wrapped_text(const char *text, int indent, int indent2, int len); +#ifndef NO_ICONV +char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding); +#else +#define reencode_string(a,b,c) NULL +#endif + #endif |