summaryrefslogtreecommitdiff
path: root/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c')
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c291
1 file changed, 269 insertions, 22 deletions
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
index 4ac99f9a9e8..9207f94229e 100644
--- a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
@@ -1,5 +1,5 @@
/* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2009-2012 Brazil
+/* Copyright(C) 2009-2015 Brazil
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -15,13 +15,14 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include <str.h>
+#include <grn_str.h>
#include <groonga.h>
#include <groonga/tokenizer.h>
#include <mecab.h>
+#include <stdlib.h>
#include <string.h>
#include <ctype.h>
@@ -29,6 +30,9 @@ static mecab_t *sole_mecab = NULL;
static grn_plugin_mutex *sole_mecab_mutex = NULL;
static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;
+static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE;
+static int grn_mecab_chunk_size_threshold = 8192;
+
typedef struct {
mecab_t *mecab;
grn_obj buf;
@@ -38,6 +42,21 @@ typedef struct {
grn_tokenizer_token token;
} grn_mecab_tokenizer;
/*
 * Returns a human-readable description of the latest global MeCab
 * error.  MeCab 0.993 and earlier cannot report a global error via
 * mecab_strerror(NULL), so a fixed placeholder is returned for those
 * versions.
 */
static const char *
mecab_global_error_message(void)
{
  const double parsed_version = atof(mecab_version());

  /* mecab_strerror(NULL) is only usable on MeCab newer than 0.993. */
  return (parsed_version > 0.993) ? mecab_strerror(NULL) : "Unknown";
}
+
+
static grn_encoding
translate_mecab_charset_to_grn_encoding(const char *charset)
{
@@ -67,6 +86,187 @@ get_mecab_encoding(mecab_t *mecab)
return encoding;
}
+static inline grn_bool
+is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes)
+{
+ switch (character_bytes) {
+ case 1 :
+ switch (character[0]) {
+ case ',' :
+ case '.' :
+ case '!' :
+ case '?' :
+ return GRN_TRUE;
+ default :
+ return GRN_FALSE;
+ }
+ case 3 :
+ switch ((unsigned char)(character[0])) {
+ case 0xE3 :
+ switch ((unsigned char)(character[1])) {
+ case 0x80 :
+ switch ((unsigned char)(character[2])) {
+ case 0x81 : /* U+3001 (0xE3 0x80 0x81 in UTF-8) IDEOGRAPHIC COMMA */
+ case 0x82 : /* U+3002 (0xE3 0x80 0x82 in UTF-8) IDEOGRAPHIC FULL STOP */
+ return GRN_TRUE;
+ default :
+ return GRN_FALSE;
+ }
+ default :
+ return GRN_FALSE;
+ }
+ return GRN_FALSE;
+ case 0xEF :
+ switch ((unsigned char)(character[1])) {
+ case 0xBC :
+ switch ((unsigned char)(character[2])) {
+ case 0x81 :
+ /* U+FF01 (0xEF 0xBC 0x81 in UTF-8) FULLWIDTH EXCLAMATION MARK */
+ case 0x9F :
+ /* U+FF1F (0xEF 0xBC 0x9F in UTF-8) FULLWIDTH QUESTION MARK */
+ return GRN_TRUE;
+ default :
+ return GRN_FALSE;
+ }
+ default :
+ return GRN_FALSE;
+ }
+ return GRN_FALSE;
+ default :
+ return GRN_FALSE;
+ }
+ default :
+ return GRN_FALSE;
+ }
+}
+
+static grn_bool
+chunked_tokenize_utf8_chunk(grn_ctx *ctx,
+ grn_mecab_tokenizer *tokenizer,
+ const char *chunk,
+ unsigned int chunk_bytes)
+{
+ const char *tokenized_chunk;
+ size_t tokenized_chunk_length;
+
+ tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes);
+ if (!tokenized_chunk) {
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][mecab][chunk] "
+ "mecab_sparse_tostr2() failed len=%d err=%s",
+ chunk_bytes,
+ mecab_strerror(tokenizer->mecab));
+ return GRN_FALSE;
+ }
+
+ if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) {
+ GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " ");
+ }
+
+ tokenized_chunk_length = strlen(tokenized_chunk);
+ if (tokenized_chunk_length >= 1 &&
+ isspace(tokenized_chunk[tokenized_chunk_length - 1])) {
+ GRN_TEXT_PUT(ctx, &(tokenizer->buf),
+ tokenized_chunk, tokenized_chunk_length - 1);
+ } else {
+ GRN_TEXT_PUT(ctx, &(tokenizer->buf),
+ tokenized_chunk, tokenized_chunk_length);
+ }
+
+ return GRN_TRUE;
+}
+
/*
 * Splits a long UTF-8 string into chunks of at most (roughly)
 * grn_mecab_chunk_size_threshold bytes and feeds each chunk to
 * chunked_tokenize_utf8_chunk().  Chunk boundaries are preferably
 * placed at whitespace or just after a delimiter character (see
 * is_delimiter_character()); when an over-long run contains no
 * delimiter, the chunk is cut at the character where the threshold was
 * reached.
 *
 * Returns GRN_TRUE when every chunk was tokenized successfully, and
 * GRN_FALSE on the first failure (the error has already been reported
 * by the failing callee).
 */
static grn_bool
chunked_tokenize_utf8(grn_ctx *ctx,
                      grn_mecab_tokenizer *tokenizer,
                      const char *string,
                      unsigned int string_bytes)
{
  const char *chunk_start;
  const char *current;
  const char *last_delimiter;
  const char *string_end = string + string_bytes;
  grn_encoding encoding = tokenizer->query->encoding;

  /* Short input fits in a single chunk; skip the scanning loop.
     NOTE(review): string_bytes is unsigned while the threshold is a
     signed int; a negative threshold would convert to a huge unsigned
     value here -- confirm the threshold is always non-negative. */
  if (string_bytes < grn_mecab_chunk_size_threshold) {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       string,
                                       string_bytes);
  }

  chunk_start = current = string;
  last_delimiter = NULL;
  while (current < string_end) {
    int space_bytes;
    int character_bytes;
    const char *current_character;

    /* Whitespace always ends the pending chunk and is itself excluded
       from every chunk. */
    space_bytes = grn_isspace(current, encoding);
    if (space_bytes > 0) {
      if (chunk_start != current) {
        grn_bool succeeded;
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        if (!succeeded) {
          return succeeded;
        }
      }
      current += space_bytes;
      chunk_start = current;
      last_delimiter = NULL;
      continue;
    }

    character_bytes = grn_charlen_(ctx, current, string_end, encoding);
    if (character_bytes == 0) {
      /* Invalid byte sequence: abort rather than guess a boundary. */
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                       "[tokenizer][mecab][chunk] "
                       "invalid byte sequence: position=%d",
                       (int)(current - string));
      return GRN_FALSE;
    }

    current_character = current;
    current += character_bytes;
    /* Remember the position just past the most recent delimiter as the
       preferred cut point for the next over-long chunk. */
    if (is_delimiter_character(ctx, current_character, character_bytes)) {
      last_delimiter = current;
    }

    /* Chunk reached the threshold: flush it now, cutting at the last
       delimiter when one was seen, otherwise right here. */
    if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
      grn_bool succeeded;
      if (last_delimiter) {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                last_delimiter - chunk_start);
        chunk_start = last_delimiter;
      } else {
        succeeded = chunked_tokenize_utf8_chunk(ctx,
                                                tokenizer,
                                                chunk_start,
                                                current - chunk_start);
        chunk_start = current;
      }
      if (!succeeded) {
        return succeeded;
      }
      last_delimiter = NULL;
    }
  }

  /* Flush whatever remains after the last flushed boundary. */
  if (current == chunk_start) {
    return GRN_TRUE;
  } else {
    return chunked_tokenize_utf8_chunk(ctx,
                                       tokenizer,
                                       chunk_start,
                                       current - chunk_start);
  }
}
+
/*
This function is called for a full text search query or a document to be
indexed. This means that both short/long strings are given.
@@ -76,7 +276,6 @@ get_mecab_encoding(mecab_t *mecab)
static grn_obj *
mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
{
- const char *s;
grn_mecab_tokenizer *tokenizer;
unsigned int normalizer_flags = 0;
grn_tokenizer_query *query;
@@ -96,7 +295,7 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
"[tokenizer][mecab] "
"mecab_new2() failed on mecab_init(): %s",
- mecab_strerror(NULL));
+ mecab_global_error_message());
} else {
sole_mecab_encoding = get_mecab_encoding(sole_mecab);
}
@@ -143,21 +342,33 @@ mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
tokenizer->next = "";
tokenizer->end = tokenizer->next;
} else {
+ grn_bool succeeded;
grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
- s = mecab_sparse_tostr2(tokenizer->mecab,
- normalized_string,
- normalized_string_length);
- if (!s) {
- GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
- "[tokenizer][mecab] "
- "mecab_sparse_tostr() failed len=%d err=%s",
- normalized_string_length,
- mecab_strerror(tokenizer->mecab));
+ if (grn_mecab_chunked_tokenize_enabled &&
+ ctx->encoding == GRN_ENC_UTF8) {
+ succeeded = chunked_tokenize_utf8(ctx,
+ tokenizer,
+ normalized_string,
+ normalized_string_length);
} else {
- GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+ const char *s;
+ s = mecab_sparse_tostr2(tokenizer->mecab,
+ normalized_string,
+ normalized_string_length);
+ if (!s) {
+ succeeded = GRN_FALSE;
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][mecab] "
+ "mecab_sparse_tostr() failed len=%d err=%s",
+ normalized_string_length,
+ mecab_strerror(tokenizer->mecab));
+ } else {
+ succeeded = GRN_TRUE;
+ GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+ }
}
grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
- if (!s) {
+ if (!succeeded) {
grn_tokenizer_query_close(ctx, tokenizer->query);
GRN_PLUGIN_FREE(ctx, tokenizer);
return NULL;
@@ -207,19 +418,31 @@ mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
grn_tokenizer_status status;
for (r = p; r < e; r += cl) {
+ int space_len;
+
+ space_len = grn_isspace(r, encoding);
+ if (space_len > 0 && r == p) {
+ cl = space_len;
+ p = r + cl;
+ continue;
+ }
+
if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
tokenizer->next = e;
break;
}
- if (grn_isspace(r, encoding)) {
- const char *q = r;
- while ((cl = grn_isspace(q, encoding))) { q += cl; }
+
+ if (space_len > 0) {
+ const char *q = r + space_len;
+ while (q < e && (space_len = grn_isspace(q, encoding))) {
+ q += space_len;
+ }
tokenizer->next = q;
break;
}
}
- if (r == e) {
+ if (r == e || tokenizer->next == e) {
status = GRN_TOKENIZER_LAST;
} else {
status = GRN_TOKENIZER_CONTINUE;
@@ -256,10 +479,10 @@ check_mecab_dictionary_encoding(grn_ctx *ctx)
mecab = mecab_new2("-Owakati");
if (mecab) {
grn_encoding encoding;
- int have_same_encoding_dictionary = 0;
+ grn_bool have_same_encoding_dictionary;
encoding = GRN_CTX_GET_ENCODING(ctx);
- have_same_encoding_dictionary = encoding == get_mecab_encoding(mecab);
+ have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab));
mecab_destroy(mecab);
if (!have_same_encoding_dictionary) {
@@ -273,7 +496,7 @@ check_mecab_dictionary_encoding(grn_ctx *ctx)
GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
"[tokenizer][mecab] "
"mecab_new2 failed in check_mecab_dictionary_encoding: %s",
- mecab_strerror(NULL));
+ mecab_global_error_message());
}
#endif
}
@@ -285,6 +508,30 @@ check_mecab_dictionary_encoding(grn_ctx *ctx)
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
+ {
+ const char *env;
+
+ env = getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED");
+ grn_mecab_chunked_tokenize_enabled = (env && strcmp(env, "yes") == 0);
+ }
+
+ {
+ const char *env;
+
+ env = getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD");
+ if (env) {
+ int threshold = -1;
+ const char *end;
+ const char *rest;
+
+ end = env + strlen(env);
+ threshold = grn_atoi(env, end, &rest);
+ if (end > env && end == rest) {
+ grn_mecab_chunk_size_threshold = threshold;
+ }
+ }
+ }
+
sole_mecab = NULL;
sole_mecab_mutex = grn_plugin_mutex_open(ctx);
if (!sole_mecab_mutex) {