diff options
Diffstat (limited to 'storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c')
-rw-r--r-- | storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c | 291 |
1 files changed, 269 insertions, 22 deletions
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
index 4ac99f9a9e8..9207f94229e 100644
--- a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
@@ -1,5 +1,5 @@
 /* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2009-2012 Brazil
+/* Copyright(C) 2009-2015 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -15,13 +15,14 @@
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 
-#include <str.h>
+#include <grn_str.h>
 
 #include <groonga.h>
 #include <groonga/tokenizer.h>
 
 #include <mecab.h>
 
+#include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 
@@ -29,6 +30,9 @@ static mecab_t *sole_mecab = NULL;
 static grn_plugin_mutex *sole_mecab_mutex = NULL;
 static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;
 
+static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE;
+static int grn_mecab_chunk_size_threshold = 8192;
+
 typedef struct {
   mecab_t *mecab;
   grn_obj buf;
@@ -38,6 +42,21 @@ typedef struct {
   grn_tokenizer_token token;
 } grn_mecab_tokenizer;
 
+static const char *
+mecab_global_error_message(void)
+{
+  double version;
+
+  version = atof(mecab_version());
+  /* MeCab <= 0.993 doesn't support mecab_strerror(NULL). */
+  if (version <= 0.993) {
+    return "Unknown";
+  }
+
+  return mecab_strerror(NULL);
+}
+
+
 static grn_encoding
 translate_mecab_charset_to_grn_encoding(const char *charset)
 {
@@ -67,6 +86,187 @@ get_mecab_encoding(mecab_t *mecab)
   return encoding;
 }
 
+static inline grn_bool
+is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes)
+{
+  switch (character_bytes) {
+  case 1 :
+    switch (character[0]) {
+    case ',' :
+    case '.' :
+    case '!' :
+    case '?' :
+      return GRN_TRUE;
+    default :
+      return GRN_FALSE;
+    }
+  case 3 :
+    switch ((unsigned char)(character[0])) {
+    case 0xE3 :
+      switch ((unsigned char)(character[1])) {
+      case 0x80 :
+        switch ((unsigned char)(character[2])) {
+        case 0x81 : /* U+3001 (0xE3 0x80 0x81 in UTF-8) IDEOGRAPHIC COMMA */
+        case 0x82 : /* U+3002 (0xE3 0x80 0x82 in UTF-8) IDEOGRAPHIC FULL STOP */
+          return GRN_TRUE;
+        default :
+          return GRN_FALSE;
+        }
+      default :
+        return GRN_FALSE;
+      }
+      return GRN_FALSE;
+    case 0xEF :
+      switch ((unsigned char)(character[1])) {
+      case 0xBC :
+        switch ((unsigned char)(character[2])) {
+        case 0x81 :
+          /* U+FF01 (0xEF 0xBC 0x81 in UTF-8) FULLWIDTH EXCLAMATION MARK */
+        case 0x9F :
+          /* U+FF1F (0xEF 0xBC 0x9F in UTF-8) FULLWIDTH QUESTION MARK */
+          return GRN_TRUE;
+        default :
+          return GRN_FALSE;
+        }
+      default :
+        return GRN_FALSE;
+      }
+      return GRN_FALSE;
+    default :
+      return GRN_FALSE;
+    }
+  default :
+    return GRN_FALSE;
+  }
+}
+
+static grn_bool
+chunked_tokenize_utf8_chunk(grn_ctx *ctx,
+                            grn_mecab_tokenizer *tokenizer,
+                            const char *chunk,
+                            unsigned int chunk_bytes)
+{
+  const char *tokenized_chunk;
+  size_t tokenized_chunk_length;
+
+  tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes);
+  if (!tokenized_chunk) {
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer][mecab][chunk] "
+                     "mecab_sparse_tostr2() failed len=%d err=%s",
+                     chunk_bytes,
+                     mecab_strerror(tokenizer->mecab));
+    return GRN_FALSE;
+  }
+
+  if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) {
+    GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " ");
+  }
+
+  tokenized_chunk_length = strlen(tokenized_chunk);
+  if (tokenized_chunk_length >= 1 && /* <ctype.h> needs unsigned char values */
+      isspace((unsigned char)tokenized_chunk[tokenized_chunk_length - 1])) {
+    GRN_TEXT_PUT(ctx, &(tokenizer->buf),
+                 tokenized_chunk, tokenized_chunk_length - 1);
+  } else {
+    GRN_TEXT_PUT(ctx, &(tokenizer->buf),
+                 tokenized_chunk, tokenized_chunk_length);
+  }
+
+  return GRN_TRUE;
+}
+
+static grn_bool
+chunked_tokenize_utf8(grn_ctx *ctx,
+                      grn_mecab_tokenizer *tokenizer,
+                      const char *string,
+                      unsigned int string_bytes)
+{
+  const char *chunk_start;
+  const char *current;
+  const char *last_delimiter;
+  const char *string_end = string + string_bytes;
+  grn_encoding encoding = tokenizer->query->encoding;
+
+  if (string_bytes < (unsigned int)grn_mecab_chunk_size_threshold) { /* always > 0 */
+    return chunked_tokenize_utf8_chunk(ctx,
+                                       tokenizer,
+                                       string,
+                                       string_bytes);
+  }
+
+  chunk_start = current = string;
+  last_delimiter = NULL;
+  while (current < string_end) {
+    int space_bytes;
+    int character_bytes;
+    const char *current_character;
+
+    space_bytes = grn_isspace(current, encoding);
+    if (space_bytes > 0) {
+      if (chunk_start != current) {
+        grn_bool succeeded;
+        succeeded = chunked_tokenize_utf8_chunk(ctx,
+                                                tokenizer,
+                                                chunk_start,
+                                                current - chunk_start);
+        if (!succeeded) {
+          return succeeded;
+        }
+      }
+      current += space_bytes;
+      chunk_start = current;
+      last_delimiter = NULL;
+      continue;
+    }
+
+    character_bytes = grn_charlen_(ctx, current, string_end, encoding);
+    if (character_bytes == 0) {
+      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                       "[tokenizer][mecab][chunk] "
+                       "invalid byte sequence: position=%d",
+                       (int)(current - string));
+      return GRN_FALSE;
+    }
+
+    current_character = current;
+    current += character_bytes;
+    if (is_delimiter_character(ctx, current_character, character_bytes)) {
+      last_delimiter = current;
+    }
+
+    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
+      grn_bool succeeded;
+      if (last_delimiter) {
+        succeeded = chunked_tokenize_utf8_chunk(ctx,
+                                                tokenizer,
+                                                chunk_start,
+                                                last_delimiter - chunk_start);
+        chunk_start = last_delimiter;
+      } else {
+        succeeded = chunked_tokenize_utf8_chunk(ctx,
+                                                tokenizer,
+                                                chunk_start,
+                                                current - chunk_start);
+        chunk_start = current;
+      }
+      if (!succeeded) {
+        return succeeded;
+      }
+      last_delimiter = NULL;
+    }
+  }
+
+  if (current == chunk_start) {
+    return GRN_TRUE;
+  } else {
+    return chunked_tokenize_utf8_chunk(ctx,
+                                       tokenizer,
+                                       chunk_start,
+                                       current - chunk_start);
+  }
+}
+
 /*
   This function is called for a full text search query or a document to be
   indexed. This means that both short/long strings are given.
@@ -76,7 +276,6 @@ get_mecab_encoding(mecab_t *mecab)
 static grn_obj *
 mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 {
-  const char *s;
   grn_mecab_tokenizer *tokenizer;
   unsigned int normalizer_flags = 0;
   grn_tokenizer_query *query;
@@ -96,7 +295,7 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
         GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                          "[tokenizer][mecab] "
                          "mecab_new2() failed on mecab_init(): %s",
-                         mecab_strerror(NULL));
+                         mecab_global_error_message());
       } else {
         sole_mecab_encoding = get_mecab_encoding(sole_mecab);
       }
@@ -143,21 +342,33 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
       tokenizer->next = "";
       tokenizer->end = tokenizer->next;
     } else {
+      grn_bool succeeded;
       grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
-      s = mecab_sparse_tostr2(tokenizer->mecab,
-                              normalized_string,
-                              normalized_string_length);
-      if (!s) {
-        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
-                         "[tokenizer][mecab] "
-                         "mecab_sparse_tostr() failed len=%d err=%s",
-                         normalized_string_length,
-                         mecab_strerror(tokenizer->mecab));
+      if (grn_mecab_chunked_tokenize_enabled &&
+          ctx->encoding == GRN_ENC_UTF8) {
+        succeeded = chunked_tokenize_utf8(ctx,
+                                          tokenizer,
+                                          normalized_string,
+                                          normalized_string_length);
       } else {
-        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+        const char *s;
+        s = mecab_sparse_tostr2(tokenizer->mecab,
+                                normalized_string,
+                                normalized_string_length);
+        if (!s) {
+          succeeded = GRN_FALSE;
+          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                           "[tokenizer][mecab] "
+                           "mecab_sparse_tostr() failed len=%d err=%s",
+                           normalized_string_length,
+                           mecab_strerror(tokenizer->mecab));
+        } else {
+          succeeded = GRN_TRUE;
+          GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+        }
       }
       grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
-      if (!s) {
+      if (!succeeded) {
         grn_tokenizer_query_close(ctx, tokenizer->query);
         GRN_PLUGIN_FREE(ctx, tokenizer);
         return NULL;
@@ -207,19 +418,31 @@ mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
     grn_tokenizer_status status;
 
     for (r = p; r < e; r += cl) {
+      int space_len;
+
+      space_len = grn_isspace(r, encoding);
+      if (space_len > 0 && r == p) {
+        cl = space_len;
+        p = r + cl;
+        continue;
+      }
+
       if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
         tokenizer->next = e;
         break;
       }
-      if (grn_isspace(r, encoding)) {
-        const char *q = r;
-        while ((cl = grn_isspace(q, encoding))) { q += cl; }
+
+      if (space_len > 0) {
+        const char *q = r + space_len;
+        while (q < e && (space_len = grn_isspace(q, encoding))) {
+          q += space_len;
+        }
         tokenizer->next = q;
         break;
       }
     }
 
-    if (r == e) {
+    if (r == e || tokenizer->next == e) {
       status = GRN_TOKENIZER_LAST;
     } else {
       status = GRN_TOKENIZER_CONTINUE;
@@ -256,10 +479,10 @@ check_mecab_dictionary_encoding(grn_ctx *ctx)
   mecab = mecab_new2("-Owakati");
   if (mecab) {
     grn_encoding encoding;
-    int have_same_encoding_dictionary = 0;
+    grn_bool have_same_encoding_dictionary;
 
     encoding = GRN_CTX_GET_ENCODING(ctx);
-    have_same_encoding_dictionary = encoding == get_mecab_encoding(mecab);
+    have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
     mecab_destroy(mecab);
 
     if (!have_same_encoding_dictionary) {
@@ -273,7 +496,7 @@ check_mecab_dictionary_encoding(grn_ctx *ctx)
     GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                      "[tokenizer][mecab] "
                      "mecab_new2 failed in check_mecab_dictionary_encoding: %s",
-                     mecab_strerror(NULL));
+                     mecab_global_error_message());
   }
 #endif
 }
@@ -285,6 +508,30 @@ check_mecab_dictionary_encoding(grn_ctx *ctx)
 grn_rc
 GRN_PLUGIN_INIT(grn_ctx *ctx)
 {
+  {
+    const char *env;
+
+    env = getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED");
+    grn_mecab_chunked_tokenize_enabled = (env && strcmp(env, "yes") == 0);
+  }
+
+  {
+    const char *env;
+
+    env = getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD");
+    if (env) {
+      int threshold = -1;
+      const char *end;
+      const char *rest;
+
+      end = env + strlen(env);
+      threshold = grn_atoi(env, end, &rest);
+      if (end > env && end == rest && threshold > 0) { /* reject 0/negative sizes */
+        grn_mecab_chunk_size_threshold = threshold;
+      }
+    }
+  }
+
   sole_mecab = NULL;
   sole_mecab_mutex = grn_plugin_mutex_open(ctx);
   if (!sole_mecab_mutex) {