summaryrefslogtreecommitdiff
path: root/storage/mroonga/vendor/groonga/plugins/tokenizers
diff options
context:
space:
mode:
Diffstat (limited to 'storage/mroonga/vendor/groonga/plugins/tokenizers')
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt11
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am6
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c120
3 files changed, 102 insertions, 35 deletions
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt
index 9209b44d36f..8eec25d683c 100644
--- a/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt
@@ -22,6 +22,17 @@ if(GRN_WITH_MECAB)
read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/mecab_sources.am MECAB_SOURCES)
include_directories(${MECAB_INCLUDE_DIRS})
link_directories(${MECAB_LIBRARY_DIRS})
+ if(GRN_WITH_BUNDLED_MECAB)
+ set(GRN_BUNDLED_MECAB_RELATIVE_RC_PATH "${CONFIG_DIR}/mecabrc")
+ set(MECAB_COMPILE_DEFINITIONS
+ "GRN_WITH_BUNDLED_MECAB"
+ "GRN_BUNDLED_MECAB_RELATIVE_RC_PATH=\"${GRN_BUNDLED_MECAB_RELATIVE_RC_PATH}\""
+ "GRN_BUNDLED_MECAB_RC_PATH=\"${CMAKE_INSTALL_PREFIX}/${GRN_BUNDLED_MECAB_RELATIVE_RC_PATH}\"")
+ set_source_files_properties(${MECAB_SOURCES}
+ PROPERTIES
+ COMPILE_DEFINITIONS
+ "${MECAB_COMPILE_DEFINITIONS}")
+ endif()
set_source_files_properties(${MECAB_SOURCES}
PROPERTIES
COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am
index 386b1554f73..9e10612baaa 100644
--- a/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am
@@ -14,12 +14,12 @@ AM_LDFLAGS = \
LIBS = \
$(top_builddir)/lib/libgroonga.la
-tokenizers_plugins_LTLIBRARIES =
+tokenizer_plugins_LTLIBRARIES =
if WITH_MECAB
-tokenizers_plugins_LTLIBRARIES += mecab.la
+tokenizer_plugins_LTLIBRARIES += mecab.la
endif
if WITH_KYTEA
-tokenizers_plugins_LTLIBRARIES += kytea.la
+tokenizer_plugins_LTLIBRARIES += kytea.la
endif
include mecab_sources.am
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
index bade2f9d3de..eeb027acb92 100644
--- a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
@@ -1,5 +1,6 @@
/* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2009-2015 Brazil
+/*
+ Copyright(C) 2009-2016 Brazil
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -64,14 +65,14 @@ mecab_global_error_message(void)
static grn_encoding
translate_mecab_charset_to_grn_encoding(const char *charset)
{
- if (strcasecmp(charset, "euc-jp") == 0) {
+ if (grn_strcasecmp(charset, "euc-jp") == 0) {
return GRN_ENC_EUC_JP;
- } else if (strcasecmp(charset, "utf-8") == 0 ||
- strcasecmp(charset, "utf8") == 0) {
+ } else if (grn_strcasecmp(charset, "utf-8") == 0 ||
+ grn_strcasecmp(charset, "utf8") == 0) {
return GRN_ENC_UTF8;
- } else if (strcasecmp(charset, "shift_jis") == 0 ||
- strcasecmp(charset, "shift-jis") == 0 ||
- strcasecmp(charset, "sjis") == 0) {
+ } else if (grn_strcasecmp(charset, "shift_jis") == 0 ||
+ grn_strcasecmp(charset, "shift-jis") == 0 ||
+ grn_strcasecmp(charset, "sjis") == 0) {
return GRN_ENC_SJIS;
}
return GRN_ENC_NONE;
@@ -169,7 +170,7 @@ chunked_tokenize_utf8_chunk(grn_ctx *ctx,
tokenized_chunk_length = strlen(tokenized_chunk);
if (tokenized_chunk_length >= 1 &&
- isspace(tokenized_chunk[tokenized_chunk_length - 1])) {
+ isspace((unsigned char)tokenized_chunk[tokenized_chunk_length - 1])) {
GRN_TEXT_PUT(ctx, &(tokenizer->buf),
tokenized_chunk, tokenized_chunk_length - 1);
} else {
@@ -271,6 +272,65 @@ chunked_tokenize_utf8(grn_ctx *ctx,
}
}
+static mecab_t *
+mecab_create(grn_ctx *ctx)
+{
+ mecab_t *mecab;
+ int argc = 0;
+ const char *argv[4];
+
+ argv[argc++] = "Groonga";
+ argv[argc++] = "-Owakati";
+#ifdef GRN_WITH_BUNDLED_MECAB
+ argv[argc++] = "--rcfile";
+# ifdef WIN32
+ {
+ static char windows_mecab_rc_file[PATH_MAX];
+
+ grn_strcpy(windows_mecab_rc_file,
+ PATH_MAX,
+ grn_plugin_windows_base_dir());
+ grn_strcat(windows_mecab_rc_file,
+ PATH_MAX,
+ "/");
+ grn_strcat(windows_mecab_rc_file,
+ PATH_MAX,
+ GRN_BUNDLED_MECAB_RELATIVE_RC_PATH);
+ {
+ char *c;
+ for (c = windows_mecab_rc_file; *c != '\0'; c++) {
+ if (*c == '/') {
+ *c = '\\';
+ }
+ }
+ }
+ argv[argc++] = windows_mecab_rc_file;
+ }
+# else /* WIN32 */
+ argv[argc++] = GRN_BUNDLED_MECAB_RC_PATH;
+# endif /* WIN32 */
+#endif /* GRN_WITH_BUNDLED_MECAB */
+ mecab = mecab_new(argc, (char **)argv);
+
+ if (!mecab) {
+#ifdef GRN_WITH_BUNDLED_MECAB
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][mecab] failed to create mecab_t: %s: "
+ "mecab_new(\"%s\", \"%s\", \"%s\", \"%s\")",
+ mecab_global_error_message(),
+ argv[0], argv[1], argv[2], argv[3]);
+#else /* GRN_WITH_BUNDLED_MECAB */
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][mecab] failed to create mecab_t: %s: "
+ "mecab_new(\"%s\", \"%s\")",
+ mecab_global_error_message(),
+ argv[0], argv[1]);
+#endif /* GRN_WITH_BUNDLED_MECAB */
+ }
+
+ return mecab;
+}
+
/*
This function is called for a full text search query or a document to be
indexed. This means that both short/long strings are given.
@@ -294,13 +354,8 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
if (!sole_mecab) {
grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
if (!sole_mecab) {
- sole_mecab = mecab_new2("-Owakati");
- if (!sole_mecab) {
- GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
- "[tokenizer][mecab] "
- "mecab_new2() failed on mecab_init(): %s",
- mecab_global_error_message());
- } else {
+ sole_mecab = mecab_create(ctx);
+ if (sole_mecab) {
sole_mecab_encoding = get_mecab_encoding(sole_mecab);
}
}
@@ -479,28 +534,24 @@ check_mecab_dictionary_encoding(grn_ctx *ctx)
{
#ifdef HAVE_MECAB_DICTIONARY_INFO_T
mecab_t *mecab;
+ grn_encoding encoding;
+ grn_bool have_same_encoding_dictionary;
- mecab = mecab_new2("-Owakati");
- if (mecab) {
- grn_encoding encoding;
- grn_bool have_same_encoding_dictionary;
+ mecab = mecab_create(ctx);
+ if (!mecab) {
+ return;
+ }
- encoding = GRN_CTX_GET_ENCODING(ctx);
- have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
- mecab_destroy(mecab);
+ encoding = GRN_CTX_GET_ENCODING(ctx);
+ have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
+ mecab_destroy(mecab);
- if (!have_same_encoding_dictionary) {
- GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
- "[tokenizer][mecab] "
- "MeCab has no dictionary that uses the context encoding"
- ": <%s>",
- grn_encoding_to_string(encoding));
- }
- } else {
+ if (!have_same_encoding_dictionary) {
GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
"[tokenizer][mecab] "
- "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
- mecab_global_error_message());
+ "MeCab has no dictionary that uses the context encoding"
+ ": <%s>",
+ grn_encoding_to_string(encoding));
}
#endif
}
@@ -549,6 +600,11 @@ GRN_PLUGIN_INIT(grn_ctx *ctx)
}
check_mecab_dictionary_encoding(ctx);
+ if (ctx->rc != GRN_SUCCESS) {
+ grn_plugin_mutex_close(ctx, sole_mecab_mutex);
+ sole_mecab_mutex = NULL;
+ }
+
return ctx->rc;
}