diff options
author | Kentoku SHIBA <kentokushiba@gmail.com> | 2014-09-21 00:33:45 +0900 |
---|---|---|
committer | Kentoku SHIBA <kentokushiba@gmail.com> | 2014-09-21 00:33:45 +0900 |
commit | 0cc855cdc8cd0baa6ba50662632b299a3843ff13 (patch) | |
tree | eaf50856703412b5c1c43f8c0e5a6a5318601c17 /storage/mroonga/vendor/groonga/plugins | |
parent | 989dd4d9ec09450ff7b25987b14ee9fdfd21ad4e (diff) | |
download | mariadb-git-0cc855cdc8cd0baa6ba50662632b299a3843ff13.tar.gz |
Update Mroonga to the latest version on 2014-09-21T00:33:44+0900
Diffstat (limited to 'storage/mroonga/vendor/groonga/plugins')
27 files changed, 3325 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/plugins/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/CMakeLists.txt new file mode 100644 index 00000000000..a2a8f288d52 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/CMakeLists.txt @@ -0,0 +1,20 @@ +# Copyright(C) 2012 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +add_subdirectory(suggest) +add_subdirectory(tokenizers) +add_subdirectory(table) +add_subdirectory(query_expanders) +add_subdirectory(ruby) diff --git a/storage/mroonga/vendor/groonga/plugins/Makefile.am b/storage/mroonga/vendor/groonga/plugins/Makefile.am new file mode 100644 index 00000000000..75a00597e06 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/Makefile.am @@ -0,0 +1,9 @@ +SUBDIRS = \ + tokenizers \ + suggest \ + table \ + query_expanders \ + ruby + +EXTRA_DIST = \ + CMakeLists.txt diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/query_expanders/CMakeLists.txt new file mode 100644 index 00000000000..1e2a2c23b09 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright(C) 2012-2013 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by 
the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ) + +set(QUERY_EXPANDERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/query_expanders") +read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/tsv_sources.am TSV_SOURCES) +set_source_files_properties(${TSV_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") +add_library(tsv_query_expander MODULE ${TSV_SOURCES}) +set_target_properties(tsv_query_expander PROPERTIES + PREFIX "" + OUTPUT_NAME "tsv") +target_link_libraries(tsv_query_expander libgroonga) +if(NOT MRN_GROONGA_BUNDLED) + install(TARGETS tsv_query_expander DESTINATION "${QUERY_EXPANDERS_DIR}") +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/Makefile.am b/storage/mroonga/vendor/groonga/plugins/query_expanders/Makefile.am new file mode 100644 index 00000000000..ca26760d332 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/Makefile.am @@ -0,0 +1,20 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la + +query_expanders_plugins_LTLIBRARIES = +query_expanders_plugins_LTLIBRARIES += tsv.la + +include tsv_sources.am diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv.c b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv.c new file mode 100644 index 00000000000..6b1fc51d6dc --- /dev/null +++ 
b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv.c @@ -0,0 +1,299 @@ +/* -*- c-basic-offset: 2 -*- */ +/* Copyright(C) 2012 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <groonga/plugin.h> + +/* groonga's internal headers */ +/* for grn_text_fgets(): We don't want to require stdio.h for groonga.h. + What should we do? Should we split header file such as groonga/stdio.h? 
*/ +#include <str.h> + +#include <stdio.h> +#include <string.h> + +#ifdef HAVE__STRNICMP +# define strncasecmp(s1,s2,n) _strnicmp(s1,s2,n) +#endif /* HAVE__STRNICMP */ + +#define MAX_SYNONYM_BYTES 4096 + +static grn_hash *synonyms = NULL; + +#ifdef WIN32 +static char *win32_synonyms_file = NULL; +const char * +get_system_synonyms_file(void) +{ + if (!win32_synonyms_file) { + const char *base_dir; + const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE; + char *synonyms_file; + char *path; + size_t base_dir_length; + + base_dir = grn_plugin_win32_base_dir(); + base_dir_length = strlen(base_dir); + synonyms_file = + malloc(base_dir_length + strlen("/") + strlen(relative_path) + 1); + strcpy(synonyms_file, base_dir); + strcat(synonyms_file, "/"); + strcat(synonyms_file, relative_path); + win32_synonyms_file = synonyms_file; + } + return win32_synonyms_file; +} + +#else /* WIN32 */ +const char * +get_system_synonyms_file(void) +{ + return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE; +} +#endif /* WIN32 */ + +static inline grn_bool +is_comment_mark(char character) +{ + return character == '#'; +} + +static inline grn_encoding +detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length) +{ + grn_encoding encoding = GRN_ENC_NONE; + grn_obj null_terminated_line_buffer; + const char *c_line; + const char *coding_part_keyword = "coding: "; + const char *coding_part; + const char *encoding_name; + + GRN_TEXT_INIT(&null_terminated_line_buffer, 0); + GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length); + GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0'); + + c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer); + coding_part = strstr(c_line, coding_part_keyword); + if (coding_part) { + encoding_name = coding_part + strlen(coding_part_keyword); + if (strncasecmp(encoding_name, "utf-8", strlen("utf-8")) == 0 || + strncasecmp(encoding_name, "utf8", strlen("utf8")) == 0) { + encoding = GRN_ENC_UTF8; + } else if 
(strncasecmp(encoding_name, "sjis", strlen("sjis")) == 0 || + strncasecmp(encoding_name, "Shift_JIS", strlen("Shift_JIS")) == 0) { + encoding = GRN_ENC_SJIS; + } else if (strncasecmp(encoding_name, "EUC-JP", strlen("EUC-JP")) == 0 || + strncasecmp(encoding_name, "euc_jp", strlen("euc_jp")) == 0) { + encoding = GRN_ENC_EUC_JP; + } else if (strncasecmp(encoding_name, "latin1", strlen("latin1")) == 0) { + encoding = GRN_ENC_LATIN1; + } else if (strncasecmp(encoding_name, "KOI8-R", strlen("KOI8-R")) == 0 || + strncasecmp(encoding_name, "koi8r", strlen("koi8r")) == 0) { + encoding = GRN_ENC_KOI8R; + } + } else { + encoding = ctx->encoding; + } + GRN_OBJ_FIN(ctx, &null_terminated_line_buffer); + + return encoding; +} + +static inline grn_encoding +guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length) +{ + const char bom[] = {0xef, 0xbb, 0xbf}; + size_t bom_length = sizeof(bom); + + if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) { + *line += bom_length; + *line_length -= bom_length; + return GRN_ENC_UTF8; + } + + if (!is_comment_mark((*line)[0])) { + return ctx->encoding; + } + + return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1); +} + +static void +parse_synonyms_file_line(grn_ctx *ctx, const char *line, int line_length, + grn_obj *key, grn_obj *value) +{ + size_t i = 0; + + if (is_comment_mark(line[i])) { + return; + } + + while (i < line_length) { + char character = line[i]; + i++; + if (character == '\t') { + break; + } + GRN_TEXT_PUTC(ctx, key, character); + } + + if (i == line_length) { + return; + } + + GRN_TEXT_PUTS(ctx, value, "(("); + while (i < line_length) { + char character = line[i]; + i++; + if (character == '\t') { + GRN_TEXT_PUTS(ctx, value, ") OR ("); + } else { + GRN_TEXT_PUTC(ctx, value, character); + } + } + GRN_TEXT_PUTS(ctx, value, "))"); + + { + grn_id id; + void *value_location = NULL; + + id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key), + &value_location, NULL); + if (id 
== GRN_ID_NIL) { + GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING, + "[plugin][query-expander][tsv] " + "failed to register key: <%.*s>", + (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key)); + return; + } + + grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1); + GRN_TEXT_PUTC(ctx, value, '\0'); + memcpy(value_location, GRN_TEXT_VALUE(value), MAX_SYNONYM_BYTES); + } +} + +static void +load_synonyms(grn_ctx *ctx) +{ + const char *path; + FILE *file; + int number_of_lines; + grn_encoding encoding; + grn_obj line, key, value; + + path = getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE"); + if (!path) { + path = get_system_synonyms_file(); + } + file = fopen(path, "r"); + if (!file) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "[plugin][query-expander][tsv] " + "synonyms file doesn't exist: <%s>", + path); + return; + } + + GRN_TEXT_INIT(&line, 0); + GRN_TEXT_INIT(&key, 0); + GRN_TEXT_INIT(&value, 0); + grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES); + number_of_lines = 0; + while (grn_text_fgets(ctx, &line, file) == GRN_SUCCESS) { + const char *line_value = GRN_TEXT_VALUE(&line); + size_t line_length = GRN_TEXT_LEN(&line); + + number_of_lines++; + if (number_of_lines == 1) { + encoding = guess_encoding(ctx, &line_value, &line_length); + } + GRN_BULK_REWIND(&key); + GRN_BULK_REWIND(&value); + parse_synonyms_file_line(ctx, line_value, line_length, &key, &value); + GRN_BULK_REWIND(&line); + } + GRN_OBJ_FIN(ctx, &line); + GRN_OBJ_FIN(ctx, &key); + GRN_OBJ_FIN(ctx, &value); + + fclose(file); +} + +static grn_obj * +func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + grn_rc rc = GRN_END_OF_DATA; + grn_id id; + grn_obj *term, *expanded_term; + void *value; + grn_obj *rc_object; + + term = args[0]; + expanded_term = args[1]; + id = grn_hash_get(ctx, synonyms, + GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term), + &value); + if (id != GRN_ID_NIL) { + const char *query = value; + GRN_TEXT_PUTS(ctx, expanded_term, query); + rc = GRN_SUCCESS; + } + + rc_object = 
grn_plugin_proc_alloc(ctx, user_data, GRN_DB_INT32, 0); + if (rc_object) { + GRN_INT32_SET(ctx, rc_object, rc); + } + + return rc_object; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + if (!synonyms) { + synonyms = grn_hash_create(ctx, NULL, + GRN_TABLE_MAX_KEY_SIZE, + MAX_SYNONYM_BYTES, + GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE); + if (!synonyms) { + return ctx->rc; + } + load_synonyms(ctx); + } + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_proc_create(ctx, "QueryExpanderTSV", strlen("QueryExpanderTSV"), + GRN_PROC_FUNCTION, + func_query_expander_tsv, NULL, NULL, + 0, NULL); + return GRN_SUCCESS; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + if (synonyms) { + grn_hash_close(ctx, synonyms); + synonyms = NULL; + } + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv_sources.am b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv_sources.am new file mode 100644 index 00000000000..f1bdabede85 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv_sources.am @@ -0,0 +1,2 @@ +tsv_la_SOURCES = \ + tsv.c diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/ruby/CMakeLists.txt new file mode 100644 index 00000000000..d82b154098c --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/CMakeLists.txt @@ -0,0 +1,44 @@ +# Copyright(C) 2013 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ${MRUBY_INCLUDE_DIRS}) + +if(GRN_WITH_MRUBY) + set(GRN_RELATIVE_RUBY_PLUGINS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/ruby") + + read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/eval_sources.am RUBY_EVAL_SOURCES) + add_library(eval MODULE ${RUBY_EVAL_SOURCES}) + set_source_files_properties(${RUBY_EVAL_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") + set_target_properties(eval PROPERTIES PREFIX "") + target_link_libraries(eval libgroonga) + if(NOT MRN_GROONGA_BUNDLED) + install(TARGETS eval DESTINATION "${GRN_RELATIVE_RUBY_PLUGINS_DIR}") + endif() + + read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/load_sources.am RUBY_LOAD_SOURCES) + add_library(load MODULE ${RUBY_LOAD_SOURCES}) + set_source_files_properties(${RUBY_LOAD_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") + set_target_properties(load PROPERTIES PREFIX "") + target_link_libraries(load libgroonga) + if(NOT MRN_GROONGA_BUNDLED) + install(TARGETS load DESTINATION "${GRN_RELATIVE_RUBY_PLUGINS_DIR}") + endif() +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/Makefile.am b/storage/mroonga/vendor/groonga/plugins/ruby/Makefile.am new file mode 100644 index 00000000000..381fb47160b --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/Makefile.am @@ -0,0 +1,29 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CFLAGS = \ + $(MESSAGE_PACK_CFLAGS) \ + $(MRUBY_CFLAGS) + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la \ + $(MESSAGE_PACK_LIBS) + +if WITH_MRUBY +ruby_plugins_LTLIBRARIES = \ + eval.la \ + load.la +endif + +include eval_sources.am +include 
load_sources.am diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/eval.c b/storage/mroonga/vendor/groonga/plugins/ruby/eval.c new file mode 100644 index 00000000000..ad1e7948249 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/eval.c @@ -0,0 +1,64 @@ +/* -*- c-basic-offset: 2; indent-tabs-mode: nil -*- */ +/* + Copyright(C) 2013 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "ruby_plugin.h" + +static grn_obj * +command_ruby_eval(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + mrb_state *mrb = ctx->impl->mrb.state; + grn_obj *script; + mrb_value result; + + script = VAR(0); + switch (script->header.domain) { + case GRN_DB_SHORT_TEXT : + case GRN_DB_TEXT : + case GRN_DB_LONG_TEXT : + break; + default : + { + grn_obj inspected; + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, script); + ERR(GRN_INVALID_ARGUMENT, "script must be a string: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + break; + } + + mrb->exc = NULL; + result = grn_mrb_eval(ctx, GRN_TEXT_VALUE(script), GRN_TEXT_LEN(script)); + output_result(ctx, result); + + return NULL; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_expr_var vars[1]; + + grn_plugin_expr_var_init(ctx, &vars[0], "script", -1); + grn_plugin_command_create(ctx, 
"ruby_eval", -1, command_ruby_eval, 1, vars); + + return ctx->rc; +} diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/eval_sources.am b/storage/mroonga/vendor/groonga/plugins/ruby/eval_sources.am new file mode 100644 index 00000000000..08543e43fb4 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/eval_sources.am @@ -0,0 +1,3 @@ +eval_la_SOURCES = \ + ruby_plugin.h \ + eval.c diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/load.c b/storage/mroonga/vendor/groonga/plugins/ruby/load.c new file mode 100644 index 00000000000..a4e60acc357 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/load.c @@ -0,0 +1,63 @@ +/* -*- c-basic-offset: 2; indent-tabs-mode: nil -*- */ +/* + Copyright(C) 2013 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "ruby_plugin.h" + +static grn_obj * +command_ruby_load(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *path; + mrb_value result; + + path = VAR(0); + switch (path->header.domain) { + case GRN_DB_SHORT_TEXT : + case GRN_DB_TEXT : + case GRN_DB_LONG_TEXT : + break; + default : + { + grn_obj inspected; + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, path); + ERR(GRN_INVALID_ARGUMENT, "path must be a string: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + break; + } + + GRN_TEXT_PUTC(ctx, path, '\0'); + result = grn_mrb_load(ctx, GRN_TEXT_VALUE(path)); + output_result(ctx, result); + + return NULL; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_expr_var vars[1]; + + grn_plugin_expr_var_init(ctx, &vars[0], "path", -1); + grn_plugin_command_create(ctx, "ruby_load", -1, command_ruby_load, 1, vars); + + return ctx->rc; +} diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/load_sources.am b/storage/mroonga/vendor/groonga/plugins/ruby/load_sources.am new file mode 100644 index 00000000000..d1cce258caa --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/load_sources.am @@ -0,0 +1,3 @@ +load_la_SOURCES = \ + ruby_plugin.h \ + load.c diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/ruby_plugin.h b/storage/mroonga/vendor/groonga/plugins/ruby/ruby_plugin.h new file mode 100644 index 00000000000..5314ea68fe5 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/ruby_plugin.h @@ -0,0 +1,76 @@ +/* -*- c-basic-offset: 2; indent-tabs-mode: nil -*- */ +/* + Copyright(C) 2013 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU 
Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <mrb.h> +#include <output.h> +#include <db.h> +#include <ctx_impl.h> +#include <util.h> + +#include <groonga/plugin.h> + +#include <mruby.h> + +#define VAR GRN_PROC_GET_VAR_BY_OFFSET + +static void +output_result(grn_ctx *ctx, mrb_value result) +{ + mrb_state *mrb = ctx->impl->mrb.state; + + GRN_OUTPUT_MAP_OPEN("result", 1); + if (mrb->exc) { + mrb_value mrb_message; + grn_obj grn_message; + GRN_OUTPUT_CSTR("exception"); + GRN_OUTPUT_MAP_OPEN("exception", 1); + GRN_OUTPUT_CSTR("message"); + mrb_message = mrb_funcall(mrb, mrb_obj_value(mrb->exc), "message", 0); + GRN_VOID_INIT(&grn_message); + if (grn_mrb_to_grn(ctx, mrb_message, &grn_message) == GRN_SUCCESS) { + GRN_OUTPUT_OBJ(&grn_message, NULL); + } else { + GRN_OUTPUT_CSTR("unsupported message type"); + } + grn_obj_unlink(ctx, &grn_message); + GRN_OUTPUT_MAP_CLOSE(); + } else { + grn_obj grn_result; + GRN_OUTPUT_CSTR("value"); + GRN_VOID_INIT(&grn_result); + if (grn_mrb_to_grn(ctx, result, &grn_result) == GRN_SUCCESS) { + GRN_OUTPUT_OBJ(&grn_result, NULL); + } else { + GRN_OUTPUT_CSTR("unsupported return value"); + } + grn_obj_unlink(ctx, &grn_result); + } + GRN_OUTPUT_MAP_CLOSE(); +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/suggest/CMakeLists.txt 
b/storage/mroonga/vendor/groonga/plugins/suggest/CMakeLists.txt new file mode 100644 index 00000000000..72b86362c55 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/suggest/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright(C) 2012-2013 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ${MRUBY_INCLUDE_DIRS}) + +read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/sources.am SUGGEST_SOURCES) +add_library(suggest MODULE ${SUGGEST_SOURCES}) +set_source_files_properties(${SUGGEST_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") +set_target_properties(suggest PROPERTIES PREFIX "") +target_link_libraries(suggest libgroonga) +if(NOT MRN_GROONGA_BUNDLED) + install(TARGETS suggest DESTINATION "${GRN_RELATIVE_PLUGINS_DIR}/suggest") +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/suggest/Makefile.am b/storage/mroonga/vendor/groonga/plugins/suggest/Makefile.am new file mode 100644 index 00000000000..7f321b6c482 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/suggest/Makefile.am @@ -0,0 +1,24 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CFLAGS = \ + $(MESSAGE_PACK_CFLAGS) \ + $(MRUBY_CFLAGS) + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + 
$(top_builddir)/lib/libgroonga.la \ + $(MESSAGE_PACK_LIBS) + +suggest_plugins_LTLIBRARIES = suggest.la + +include sources.am diff --git a/storage/mroonga/vendor/groonga/plugins/suggest/sources.am b/storage/mroonga/vendor/groonga/plugins/suggest/sources.am new file mode 100644 index 00000000000..798a431a8e8 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/suggest/sources.am @@ -0,0 +1,2 @@ +suggest_la_SOURCES = \ + suggest.c diff --git a/storage/mroonga/vendor/groonga/plugins/suggest/suggest.c b/storage/mroonga/vendor/groonga/plugins/suggest/suggest.c new file mode 100644 index 00000000000..ea7b6adbf6b --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/suggest/suggest.c @@ -0,0 +1,1022 @@ +/* -*- c-basic-offset: 2; indent-tabs-mode: nil -*- */ +/* Copyright(C) 2010-2013 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "ctx.h" +#include "db.h" +#include "ii.h" +#include "token.h" +#include "output.h" +#include <groonga/plugin.h> +#include <string.h> + +#ifdef HAVE__STRNICMP +# define strncasecmp(s1,s2,n) _strnicmp(s1,s2,n) +#endif /* HAVE__STRNICMP */ + +#define VAR GRN_PROC_GET_VAR_BY_OFFSET +#define CONST_STR_LEN(x) x, x ? 
sizeof(x) - 1 : 0 +#define TEXT_VALUE_LEN(x) GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x) + +#define MIN_LEARN_DISTANCE (60 * GRN_TIME_USEC_PER_SEC) + +#define COMPLETE 1 +#define CORRECT 2 +#define SUGGEST 4 + +typedef enum { + GRN_SUGGEST_SEARCH_YES, + GRN_SUGGEST_SEARCH_NO, + GRN_SUGGEST_SEARCH_AUTO +} grn_suggest_search_mode; + +typedef struct { + grn_obj *post_event; + grn_obj *post_type; + grn_obj *post_item; + grn_obj *seq; + grn_obj *post_time; + grn_obj *pairs; + + int learn_distance_in_seconds; + + grn_id post_event_id; + grn_id post_type_id; + grn_id post_item_id; + grn_id seq_id; + int64_t post_time_value; + + grn_obj *seqs; + grn_obj *seqs_events; + grn_obj *events; + grn_obj *events_item; + grn_obj *events_type; + grn_obj *events_time; + grn_obj *event_types; + grn_obj *items; + grn_obj *items_freq; + grn_obj *items_freq2; + grn_obj *items_last; + grn_obj *pairs_pre; + grn_obj *pairs_post; + grn_obj *pairs_freq0; + grn_obj *pairs_freq1; + grn_obj *pairs_freq2; + + grn_obj dataset_name; + + grn_obj *configuration; + + grn_obj weight; + grn_obj pre_events; + + uint64_t key_prefix; + grn_obj pre_item; +} grn_suggest_learner; + +static int +grn_parse_suggest_types(grn_obj *text) +{ + const char *nptr = GRN_TEXT_VALUE(text); + const char *end = GRN_BULK_CURR(text); + int types = 0; + while (nptr < end) { + if (*nptr == '|') { + nptr += 1; + continue; + } + { + const char string[] = "complete"; + size_t length = sizeof(string) - 1; + if (nptr + length <= end && memcmp(nptr, string, length) == 0) { + types |= COMPLETE; + nptr += length; + continue; + } + } + { + const char string[] = "correct"; + size_t length = sizeof(string) - 1; + if (nptr + length <= end && memcmp(nptr, string, length) == 0) { + types |= CORRECT; + nptr += length; + continue; + } + } + { + const char string[] = "suggest"; + size_t length = sizeof(string) - 1; + if (nptr + length <= end && memcmp(nptr, string, length) == 0) { + types |= SUGGEST; + nptr += length; + continue; + } + } + break; + } + 
return types; +} + +static int32_t +cooccurrence_search(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_id id, + grn_obj *res, int query_type, int frequency_threshold, + double conditional_probability_threshold) +{ + int32_t max_score = 0; + if (id) { + grn_ii_cursor *c; + grn_obj *co = grn_obj_column(ctx, items, CONST_STR_LEN("co")); + grn_obj *pairs = grn_ctx_at(ctx, grn_obj_get_range(ctx, co)); + grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq")); + grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2")); + grn_obj *pairs_freq, *pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post")); + switch (query_type) { + case COMPLETE : + pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0")); + break; + case CORRECT : + pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1")); + break; + case SUGGEST : + pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2")); + break; + default : + return max_score; + } + if ((c = grn_ii_cursor_open(ctx, (grn_ii *)co, id, GRN_ID_NIL, GRN_ID_MAX, + ((grn_ii *)co)->n_elements - 1, 0))) { + grn_ii_posting *p; + grn_obj post, pair_freq, item_freq, item_freq2, item_boost; + GRN_RECORD_INIT(&post, 0, grn_obj_id(ctx, items)); + GRN_INT32_INIT(&pair_freq, 0); + GRN_INT32_INIT(&item_freq, 0); + GRN_INT32_INIT(&item_freq2, 0); + GRN_INT32_INIT(&item_boost, 0); + while ((p = grn_ii_cursor_next(ctx, c))) { + grn_id post_id; + int pfreq, ifreq, ifreq2, boost; + double conditional_probability; + GRN_BULK_REWIND(&post); + GRN_BULK_REWIND(&pair_freq); + GRN_BULK_REWIND(&item_freq); + GRN_BULK_REWIND(&item_freq2); + GRN_BULK_REWIND(&item_boost); + grn_obj_get_value(ctx, pairs_post, p->rid, &post); + grn_obj_get_value(ctx, pairs_freq, p->rid, &pair_freq); + post_id = GRN_RECORD_VALUE(&post); + grn_obj_get_value(ctx, items_freq, post_id, &item_freq); + grn_obj_get_value(ctx, items_freq2, post_id, &item_freq2); + grn_obj_get_value(ctx, items_boost, post_id, &item_boost); + pfreq = 
GRN_INT32_VALUE(&pair_freq); + ifreq = GRN_INT32_VALUE(&item_freq); + ifreq2 = GRN_INT32_VALUE(&item_freq2); + if (ifreq2 > 0) { + conditional_probability = (double)pfreq / (double)ifreq2; + } else { + conditional_probability = 0.0; + } + boost = GRN_INT32_VALUE(&item_boost); + if (pfreq >= frequency_threshold && ifreq >= frequency_threshold && + conditional_probability >= conditional_probability_threshold && + boost >= 0) { + grn_rset_recinfo *ri; + void *value; + int32_t score = pfreq; + int added; + if (max_score < score + boost) { max_score = score + boost; } + /* put any formula if desired */ + if (grn_hash_add(ctx, (grn_hash *)res, + &post_id, sizeof(grn_id), &value, &added)) { + ri = value; + ri->score += score; + if (added) { + ri->score += boost; + } + } + } + } + GRN_OBJ_FIN(ctx, &post); + GRN_OBJ_FIN(ctx, &pair_freq); + GRN_OBJ_FIN(ctx, &item_freq); + GRN_OBJ_FIN(ctx, &item_freq2); + GRN_OBJ_FIN(ctx, &item_boost); + grn_ii_cursor_close(ctx, c); + } + } + return max_score; +} + +#define DEFAULT_LIMIT 10 +#define DEFAULT_SORTBY "-_score" +#define DEFAULT_OUTPUT_COLUMNS "_key,_score" +#define DEFAULT_FREQUENCY_THRESHOLD 100 +#define DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD 0.2 + +static void +output(grn_ctx *ctx, grn_obj *table, grn_obj *res, grn_id tid, + grn_obj *sortby, grn_obj *output_columns, int offset, int limit) +{ + grn_obj *sorted; + if ((sorted = grn_table_create(ctx, NULL, 0, NULL, GRN_OBJ_TABLE_NO_KEY, NULL, res))) { + uint32_t nkeys; + grn_obj_format format; + grn_table_sort_key *keys; + const char *sortby_val = GRN_TEXT_VALUE(sortby); + unsigned int sortby_len = GRN_TEXT_LEN(sortby); + const char *oc_val = GRN_TEXT_VALUE(output_columns); + unsigned int oc_len = GRN_TEXT_LEN(output_columns); + if (!sortby_val || !sortby_len) { + sortby_val = DEFAULT_SORTBY; + sortby_len = sizeof(DEFAULT_SORTBY) - 1; + } + if (!oc_val || !oc_len) { + oc_val = DEFAULT_OUTPUT_COLUMNS; + oc_len = sizeof(DEFAULT_OUTPUT_COLUMNS) - 1; + } + if ((keys = 
grn_table_sort_key_from_str(ctx, sortby_val, sortby_len, res, &nkeys))) { + grn_table_sort(ctx, res, offset, limit, sorted, keys, nkeys); + GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE, + ":", "sort(%d)", limit); + GRN_OBJ_FORMAT_INIT(&format, grn_table_size(ctx, res), 0, limit, offset); + format.flags = + GRN_OBJ_FORMAT_WITH_COLUMN_NAMES| + GRN_OBJ_FORMAT_XML_ELEMENT_RESULTSET; + grn_obj_columns(ctx, sorted, oc_val, oc_len, &format.columns); + GRN_OUTPUT_OBJ(sorted, &format); + GRN_OBJ_FORMAT_FIN(ctx, &format); + grn_table_sort_key_close(ctx, keys, nkeys); + } + grn_obj_unlink(ctx, sorted); + } else { + ERR(GRN_UNKNOWN_ERROR, "cannot create temporary sort table."); + } +} + +static inline void +complete_add_item(grn_ctx *ctx, grn_id id, grn_obj *res, int frequency_threshold, + grn_obj *items_freq, grn_obj *items_boost, + grn_obj *item_freq, grn_obj *item_boost) +{ + GRN_BULK_REWIND(item_freq); + GRN_BULK_REWIND(item_boost); + grn_obj_get_value(ctx, items_freq, id, item_freq); + grn_obj_get_value(ctx, items_boost, id, item_boost); + if (GRN_INT32_VALUE(item_boost) >= 0) { + int32_t score; + score = 1 + + GRN_INT32_VALUE(item_freq) + + GRN_INT32_VALUE(item_boost); + if (score >= frequency_threshold) { + void *value; + if (grn_hash_add(ctx, (grn_hash *)res, &id, sizeof(grn_id), + &value, NULL)) { + grn_rset_recinfo *ri; + ri = value; + ri->score += score; + } + } + } +} + +static void +complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col, + grn_obj *query, grn_obj *sortby, + grn_obj *output_columns, int offset, int limit, + int frequency_threshold, double conditional_probability_threshold, + grn_suggest_search_mode prefix_search_mode) +{ + grn_obj *res; + grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq")); + grn_obj item_freq, item_boost; + GRN_INT32_INIT(&item_freq, 0); + GRN_INT32_INIT(&item_boost, 0); + if ((res = grn_table_create(ctx, NULL, 0, NULL, + GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) { + grn_id tid = 
grn_table_get(ctx, items, TEXT_VALUE_LEN(query)); + if (GRN_TEXT_LEN(query)) { + grn_table_cursor *cur; + /* RK search + prefix search */ + grn_obj *index; + /* FIXME: support index selection */ + if (grn_column_index(ctx, col, GRN_OP_PREFIX, &index, 1, NULL)) { + if ((cur = grn_table_cursor_open(ctx, grn_ctx_at(ctx, index->header.domain), + GRN_TEXT_VALUE(query), + GRN_TEXT_LEN(query), + NULL, 0, 0, -1, + GRN_CURSOR_PREFIX|GRN_CURSOR_RK))) { + grn_id id; + while ((id = grn_table_cursor_next(ctx, cur))) { + grn_ii_cursor *icur; + if ((icur = grn_ii_cursor_open(ctx, (grn_ii *)index, id, + GRN_ID_NIL, GRN_ID_MAX, 1, 0))) { + grn_ii_posting *p; + while ((p = grn_ii_cursor_next(ctx, icur))) { + complete_add_item(ctx, p->rid, res, frequency_threshold, + items_freq, items_boost, + &item_freq, &item_boost); + } + grn_ii_cursor_close(ctx, icur); + } + } + grn_table_cursor_close(ctx, cur); + } else { + ERR(GRN_UNKNOWN_ERROR, "cannot open cursor for prefix RK search."); + } + } else { + ERR(GRN_UNKNOWN_ERROR, "cannot find index for prefix RK search."); + } + cooccurrence_search(ctx, items, items_boost, tid, res, COMPLETE, + frequency_threshold, + conditional_probability_threshold); + if (((prefix_search_mode == GRN_SUGGEST_SEARCH_YES) || + (prefix_search_mode == GRN_SUGGEST_SEARCH_AUTO && + !grn_table_size(ctx, res))) && + (cur = grn_table_cursor_open(ctx, items, + GRN_TEXT_VALUE(query), + GRN_TEXT_LEN(query), + NULL, 0, 0, -1, GRN_CURSOR_PREFIX))) { + grn_id id; + while ((id = grn_table_cursor_next(ctx, cur))) { + complete_add_item(ctx, id, res, frequency_threshold, + items_freq, items_boost, &item_freq, &item_boost); + } + grn_table_cursor_close(ctx, cur); + } + } + output(ctx, items, res, tid, sortby, output_columns, offset, limit); + grn_obj_close(ctx, res); + } else { + ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table."); + } + GRN_OBJ_FIN(ctx, &item_boost); + GRN_OBJ_FIN(ctx, &item_freq); +} + +static void +correct(grn_ctx *ctx, grn_obj *items, grn_obj 
*items_boost, + grn_obj *query, grn_obj *sortby, + grn_obj *output_columns, int offset, int limit, + int frequency_threshold, double conditional_probability_threshold, + grn_suggest_search_mode similar_search_mode) +{ + grn_obj *res; + grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2")); + grn_obj item_freq2, item_boost; + GRN_INT32_INIT(&item_freq2, 0); + GRN_INT32_INIT(&item_boost, 0); + if ((res = grn_table_create(ctx, NULL, 0, NULL, + GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) { + grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query)); + int32_t max_score; + max_score = cooccurrence_search(ctx, items, items_boost, tid, res, CORRECT, + frequency_threshold, + conditional_probability_threshold); + GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SCORE, + ":", "cooccur(%d)", max_score); + if (GRN_TEXT_LEN(query) && + ((similar_search_mode == GRN_SUGGEST_SEARCH_YES) || + (similar_search_mode == GRN_SUGGEST_SEARCH_AUTO && + max_score < frequency_threshold))) { + grn_obj *key, *index; + if ((key = grn_obj_column(ctx, items, + GRN_COLUMN_NAME_KEY, + GRN_COLUMN_NAME_KEY_LEN))) { + if (grn_column_index(ctx, key, GRN_OP_MATCH, &index, 1, NULL)) { + grn_select_optarg optarg; + memset(&optarg, 0, sizeof(grn_select_optarg)); + optarg.mode = GRN_OP_SIMILAR; + optarg.similarity_threshold = 0; + optarg.max_size = 2; + grn_ii_select(ctx, (grn_ii *)index, TEXT_VALUE_LEN(query), + (grn_hash *)res, GRN_OP_OR, &optarg); + grn_obj_unlink(ctx, index); + GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE, + ":", "similar(%d)", grn_table_size(ctx, res)); + { + grn_hash_cursor *hc = grn_hash_cursor_open(ctx, (grn_hash *)res, NULL, + 0, NULL, 0, 0, -1, 0); + if (hc) { + while (grn_hash_cursor_next(ctx, hc)) { + void *key, *value; + if (grn_hash_cursor_get_key_value(ctx, hc, &key, NULL, &value)) { + grn_id *rp; + rp = key; + GRN_BULK_REWIND(&item_freq2); + GRN_BULK_REWIND(&item_boost); + grn_obj_get_value(ctx, items_freq2, *rp, &item_freq2); + grn_obj_get_value(ctx, 
items_boost, *rp, &item_boost); + if (GRN_INT32_VALUE(&item_boost) >= 0) { + int32_t score; + grn_rset_recinfo *ri; + score = 1 + + (GRN_INT32_VALUE(&item_freq2) >> 4) + + GRN_INT32_VALUE(&item_boost); + ri = value; + ri->score += score; + if (score >= frequency_threshold) { continue; } + } + /* score < frequency_threshold || item_boost < 0 */ + grn_hash_cursor_delete(ctx, hc, NULL); + } + } + grn_hash_cursor_close(ctx, hc); + } + } + GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE, + ":", "filter(%d)", grn_table_size(ctx, res)); + { + /* exec _score -= edit_distance(_key, "query string") for all records */ + grn_obj *var; + grn_obj *expr; + + GRN_EXPR_CREATE_FOR_QUERY(ctx, res, expr, var); + if (expr) { + grn_table_cursor *tc; + grn_obj *score = grn_obj_column(ctx, res, + GRN_COLUMN_NAME_SCORE, + GRN_COLUMN_NAME_SCORE_LEN); + grn_obj *key = grn_obj_column(ctx, res, + GRN_COLUMN_NAME_KEY, + GRN_COLUMN_NAME_KEY_LEN); + grn_expr_append_obj(ctx, expr, + score, + GRN_OP_GET_VALUE, 1); + grn_expr_append_obj(ctx, expr, + grn_ctx_get(ctx, CONST_STR_LEN("edit_distance")), + GRN_OP_PUSH, 1); + grn_expr_append_obj(ctx, expr, + key, + GRN_OP_GET_VALUE, 1); + grn_expr_append_const(ctx, expr, query, GRN_OP_PUSH, 1); + grn_expr_append_op(ctx, expr, GRN_OP_CALL, 2); + grn_expr_append_op(ctx, expr, GRN_OP_MINUS_ASSIGN, 2); + + if ((tc = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, 0, -1, 0))) { + grn_id id; + grn_obj score_value; + GRN_INT32_INIT(&score_value, 0); + while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { + GRN_RECORD_SET(ctx, var, id); + grn_expr_exec(ctx, expr, 0); + GRN_BULK_REWIND(&score_value); + grn_obj_get_value(ctx, score, id, &score_value); + if (GRN_INT32_VALUE(&score_value) < frequency_threshold) { + grn_table_cursor_delete(ctx, tc); + } + } + grn_obj_unlink(ctx, &score_value); + grn_table_cursor_close(ctx, tc); + } + grn_obj_unlink(ctx, score); + grn_obj_unlink(ctx, key); + grn_obj_unlink(ctx, expr); + } else { + ERR(GRN_UNKNOWN_ERROR, + "error on 
building expr. for calicurating edit distance"); + } + } + } + grn_obj_unlink(ctx, key); + } + } + output(ctx, items, res, tid, sortby, output_columns, offset, limit); + grn_obj_close(ctx, res); + } else { + ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table."); + } + GRN_OBJ_FIN(ctx, &item_boost); + GRN_OBJ_FIN(ctx, &item_freq2); +} + +static void +suggest(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, + grn_obj *query, grn_obj *sortby, + grn_obj *output_columns, int offset, int limit, + int frequency_threshold, double conditional_probability_threshold) +{ + grn_obj *res; + if ((res = grn_table_create(ctx, NULL, 0, NULL, + GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) { + grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query)); + cooccurrence_search(ctx, items, items_boost, tid, res, SUGGEST, + frequency_threshold, conditional_probability_threshold); + output(ctx, items, res, tid, sortby, output_columns, offset, limit); + grn_obj_close(ctx, res); + } else { + ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table."); + } +} + +static grn_suggest_search_mode +parse_search_mode(grn_ctx *ctx, grn_obj *mode_text) +{ + grn_suggest_search_mode mode; + int mode_length; + + mode_length = GRN_TEXT_LEN(mode_text); + if (mode_length == 3 && + strncasecmp("yes", GRN_TEXT_VALUE(mode_text), 3) == 0) { + mode = GRN_SUGGEST_SEARCH_YES; + } else if (mode_length == 2 && + strncasecmp("no", GRN_TEXT_VALUE(mode_text), 2) == 0) { + mode = GRN_SUGGEST_SEARCH_NO; + } else { + mode = GRN_SUGGEST_SEARCH_AUTO; + } + + return mode; +} + +static grn_obj * +command_suggest(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_obj *items, *col, *items_boost; + int types; + int offset = 0; + int limit = DEFAULT_LIMIT; + int frequency_threshold = DEFAULT_FREQUENCY_THRESHOLD; + double conditional_probability_threshold = + DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD; + grn_suggest_search_mode prefix_search_mode; + grn_suggest_search_mode 
similar_search_mode; + + types = grn_parse_suggest_types(VAR(0)); + if (GRN_TEXT_LEN(VAR(6)) > 0) { + offset = grn_atoi(GRN_TEXT_VALUE(VAR(6)), GRN_BULK_CURR(VAR(6)), NULL); + } + if (GRN_TEXT_LEN(VAR(7)) > 0) { + limit = grn_atoi(GRN_TEXT_VALUE(VAR(7)), GRN_BULK_CURR(VAR(7)), NULL); + } + if (GRN_TEXT_LEN(VAR(8)) > 0) { + frequency_threshold = grn_atoi(GRN_TEXT_VALUE(VAR(8)), GRN_BULK_CURR(VAR(8)), NULL); + } + if (GRN_TEXT_LEN(VAR(9)) > 0) { + GRN_TEXT_PUTC(ctx, VAR(9), '\0'); + conditional_probability_threshold = strtod(GRN_TEXT_VALUE(VAR(9)), NULL); + } + + prefix_search_mode = parse_search_mode(ctx, VAR(10)); + similar_search_mode = parse_search_mode(ctx, VAR(11)); + + if ((items = grn_ctx_get(ctx, TEXT_VALUE_LEN(VAR(1))))) { + if ((items_boost = grn_obj_column(ctx, items, CONST_STR_LEN("boost")))) { + GRN_OUTPUT_MAP_OPEN("RESULT_SET", -1); + if (types & COMPLETE) { + if ((col = grn_obj_column(ctx, items, TEXT_VALUE_LEN(VAR(2))))) { + GRN_OUTPUT_CSTR("complete"); + complete(ctx, items, items_boost, col, VAR(3), VAR(4), + VAR(5), offset, limit, + frequency_threshold, conditional_probability_threshold, + prefix_search_mode); + } else { + ERR(GRN_INVALID_ARGUMENT, "invalid column."); + } + } + if (types & CORRECT) { + GRN_OUTPUT_CSTR("correct"); + correct(ctx, items, items_boost, VAR(3), VAR(4), + VAR(5), offset, limit, + frequency_threshold, conditional_probability_threshold, + similar_search_mode); + } + if (types & SUGGEST) { + GRN_OUTPUT_CSTR("suggest"); + suggest(ctx, items, items_boost, VAR(3), VAR(4), + VAR(5), offset, limit, + frequency_threshold, conditional_probability_threshold); + } + GRN_OUTPUT_MAP_CLOSE(); + } else { + ERR(GRN_INVALID_ARGUMENT, "nonexistent column: <%.*s.boost>", + (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1))); + } + grn_obj_unlink(ctx, items); + } else { + ERR(GRN_INVALID_ARGUMENT, "nonexistent table: <%.*s>", + (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1))); + } + return NULL; +} + +static void 
+learner_init_values(grn_ctx *ctx, grn_suggest_learner *learner) +{ + learner->post_event_id = GRN_RECORD_VALUE(learner->post_event); + learner->post_type_id = GRN_RECORD_VALUE(learner->post_type); + learner->post_item_id = GRN_RECORD_VALUE(learner->post_item); + learner->seq_id = GRN_RECORD_VALUE(learner->seq); + learner->post_time_value = GRN_TIME_VALUE(learner->post_time); +} + +static void +learner_init(grn_ctx *ctx, grn_suggest_learner *learner, + grn_obj *post_event, grn_obj *post_type, grn_obj *post_item, + grn_obj *seq, grn_obj *post_time, grn_obj *pairs) +{ + learner->post_event = post_event; + learner->post_type = post_type; + learner->post_item = post_item; + learner->seq = seq; + learner->post_time = post_time; + learner->pairs = pairs; + + learner->learn_distance_in_seconds = 0; + + learner_init_values(ctx, learner); +} + +static void +learner_init_columns(grn_ctx *ctx, grn_suggest_learner *learner) +{ + grn_id events_id, event_types_id; + grn_obj *seqs, *events, *post_item, *items, *pairs; + + learner->seqs = seqs = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(learner->seq)); + learner->seqs_events = grn_obj_column(ctx, seqs, CONST_STR_LEN("events")); + + events_id = grn_obj_get_range(ctx, learner->seqs_events); + learner->events = events = grn_ctx_at(ctx, events_id); + learner->events_item = grn_obj_column(ctx, events, CONST_STR_LEN("item")); + learner->events_type = grn_obj_column(ctx, events, CONST_STR_LEN("type")); + learner->events_time = grn_obj_column(ctx, events, CONST_STR_LEN("time")); + + event_types_id = grn_obj_get_range(ctx, learner->events_type); + learner->event_types = grn_obj_column(ctx, events, CONST_STR_LEN("time")); + + post_item = learner->post_item; + learner->items = items = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(post_item)); + learner->items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq")); + learner->items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2")); + learner->items_last = grn_obj_column(ctx, items, 
CONST_STR_LEN("last")); + + pairs = learner->pairs; + learner->pairs_pre = grn_obj_column(ctx, pairs, CONST_STR_LEN("pre")); + learner->pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post")); + learner->pairs_freq0 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0")); + learner->pairs_freq1 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1")); + learner->pairs_freq2 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2")); +} + +static void +learner_fin_columns(grn_ctx *ctx, grn_suggest_learner *learner) +{ + grn_obj_unlink(ctx, learner->seqs); + grn_obj_unlink(ctx, learner->seqs_events); + + grn_obj_unlink(ctx, learner->events); + grn_obj_unlink(ctx, learner->events_item); + grn_obj_unlink(ctx, learner->events_type); + grn_obj_unlink(ctx, learner->events_time); + + grn_obj_unlink(ctx, learner->event_types); + + grn_obj_unlink(ctx, learner->items); + grn_obj_unlink(ctx, learner->items_freq); + grn_obj_unlink(ctx, learner->items_freq2); + grn_obj_unlink(ctx, learner->items_last); + + grn_obj_unlink(ctx, learner->pairs_pre); + grn_obj_unlink(ctx, learner->pairs_post); + grn_obj_unlink(ctx, learner->pairs_freq0); + grn_obj_unlink(ctx, learner->pairs_freq1); + grn_obj_unlink(ctx, learner->pairs_freq2); +} + +static void +learner_init_weight(grn_ctx *ctx, grn_suggest_learner *learner) +{ + grn_obj *weight_column = NULL; + unsigned int weight = 1; + + if (learner->configuration) { + weight_column = grn_obj_column(ctx, + learner->configuration, + CONST_STR_LEN("weight")); + } + if (weight_column) { + grn_id id; + id = grn_table_get(ctx, learner->configuration, + GRN_TEXT_VALUE(&(learner->dataset_name)), + GRN_TEXT_LEN(&(learner->dataset_name))); + if (id != GRN_ID_NIL) { + grn_obj weight_value; + GRN_UINT32_INIT(&weight_value, 0); + grn_obj_get_value(ctx, weight_column, id, &weight_value); + weight = GRN_UINT32_VALUE(&weight_value); + GRN_OBJ_FIN(ctx, &weight_value); + } + grn_obj_unlink(ctx, weight_column); + } + + GRN_UINT32_INIT(&(learner->weight), 0); + 
GRN_UINT32_SET(ctx, &(learner->weight), weight); +} + +static void +learner_init_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner) +{ + char events_name[GRN_TABLE_MAX_KEY_SIZE]; + unsigned int events_name_size; + unsigned int events_name_prefix_size; + + events_name_size = grn_obj_name(ctx, learner->events, + events_name, GRN_TABLE_MAX_KEY_SIZE); + GRN_TEXT_INIT(&(learner->dataset_name), 0); + events_name_prefix_size = strlen("event_"); + if (events_name_size > events_name_prefix_size) { + GRN_TEXT_PUT(ctx, + &(learner->dataset_name), + events_name + events_name_prefix_size, + events_name_size - events_name_prefix_size); + } +} + +static void +learner_fin_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner) +{ + GRN_OBJ_FIN(ctx, &(learner->dataset_name)); +} + +static void +learner_init_configuration(grn_ctx *ctx, grn_suggest_learner *learner) +{ + learner->configuration = grn_ctx_get(ctx, "configuration", -1); +} + +static void +learner_fin_configuration(grn_ctx *ctx, grn_suggest_learner *learner) +{ + if (learner->configuration) { + grn_obj_unlink(ctx, learner->configuration); + } +} + +static void +learner_init_buffers(grn_ctx *ctx, grn_suggest_learner *learner) +{ + learner_init_weight(ctx, learner); + GRN_RECORD_INIT(&(learner->pre_events), 0, grn_obj_id(ctx, learner->events)); +} + +static void +learner_fin_buffers(grn_ctx *ctx, grn_suggest_learner *learner) +{ + grn_obj_unlink(ctx, &(learner->weight)); + grn_obj_unlink(ctx, &(learner->pre_events)); +} + +static void +learner_init_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner) +{ + grn_id items_id; + + learner->key_prefix = ((uint64_t)learner->post_item_id) << 32; + + items_id = grn_obj_get_range(ctx, learner->events_item); + GRN_RECORD_INIT(&(learner->pre_item), 0, items_id); + + grn_obj_get_value(ctx, learner->seqs_events, learner->seq_id, + &(learner->pre_events)); +} + +static void +learner_fin_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner) +{ + grn_obj_unlink(ctx, 
&(learner->pre_item)); + GRN_BULK_REWIND(&(learner->pre_events)); +} + +static grn_bool +learner_is_valid_input(grn_ctx *ctx, grn_suggest_learner *learner) +{ + return learner->post_event_id && learner->post_item_id && learner->seq_id; +} + +static void +learner_increment(grn_ctx *ctx, grn_suggest_learner *learner, + grn_obj *column, grn_id record_id) +{ + grn_obj_set_value(ctx, column, record_id, &(learner->weight), GRN_OBJ_INCR); +} + +static void +learner_increment_item_freq(grn_ctx *ctx, grn_suggest_learner *learner, + grn_obj *column) +{ + learner_increment(ctx, learner, column, learner->post_item_id); +} + +static void +learner_set_last_post_time(grn_ctx *ctx, grn_suggest_learner *learner) +{ + grn_obj_set_value(ctx, learner->items_last, learner->post_item_id, + learner->post_time, GRN_OBJ_SET); +} + +static void +learner_learn_for_complete_and_correcnt(grn_ctx *ctx, + grn_suggest_learner *learner) +{ + grn_obj *pre_item, *post_item, *pre_events; + grn_obj pre_type, pre_time; + grn_id *ep, *es; + uint64_t key; + int64_t post_time_value; + + pre_item = &(learner->pre_item); + post_item = learner->post_item; + pre_events = &(learner->pre_events); + post_time_value = learner->post_time_value; + GRN_RECORD_INIT(&pre_type, 0, grn_obj_get_range(ctx, learner->events_type)); + GRN_TIME_INIT(&pre_time, 0); + ep = (grn_id *)GRN_BULK_CURR(pre_events); + es = (grn_id *)GRN_BULK_HEAD(pre_events); + while (es < ep--) { + grn_id pair_id; + int added; + int64_t learn_distance; + + GRN_BULK_REWIND(&pre_type); + GRN_BULK_REWIND(&pre_time); + GRN_BULK_REWIND(pre_item); + grn_obj_get_value(ctx, learner->events_type, *ep, &pre_type); + grn_obj_get_value(ctx, learner->events_time, *ep, &pre_time); + grn_obj_get_value(ctx, learner->events_item, *ep, pre_item); + learn_distance = post_time_value - GRN_TIME_VALUE(&pre_time); + if (learn_distance >= MIN_LEARN_DISTANCE) { + learner->learn_distance_in_seconds = + (int)(learn_distance / GRN_TIME_USEC_PER_SEC); + break; + } + key = 
learner->key_prefix + GRN_RECORD_VALUE(pre_item); + pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t), + &added); + if (added) { + grn_obj_set_value(ctx, learner->pairs_pre, pair_id, pre_item, + GRN_OBJ_SET); + grn_obj_set_value(ctx, learner->pairs_post, pair_id, post_item, + GRN_OBJ_SET); + } + if (GRN_RECORD_VALUE(&pre_type)) { + learner_increment(ctx, learner, learner->pairs_freq1, pair_id); + break; + } else { + learner_increment(ctx, learner, learner->pairs_freq0, pair_id); + } + } + GRN_OBJ_FIN(ctx, &pre_type); + GRN_OBJ_FIN(ctx, &pre_time); +} + +static void +learner_learn_for_suggest(grn_ctx *ctx, grn_suggest_learner *learner) +{ + char keybuf[GRN_TABLE_MAX_KEY_SIZE]; + int keylen = grn_table_get_key(ctx, learner->items, learner->post_item_id, + keybuf, GRN_TABLE_MAX_KEY_SIZE); + unsigned int token_flags = 0; + grn_token *token = grn_token_open(ctx, learner->items, keybuf, keylen, + GRN_TOKEN_ADD, token_flags); + if (token) { + grn_id tid; + grn_obj *pre_item = &(learner->pre_item); + grn_obj *post_item = learner->post_item; + grn_hash *token_ids = NULL; + while ((tid = grn_token_next(ctx, token)) && tid != learner->post_item_id) { + uint64_t key; + int added; + grn_id pair_id; + key = learner->key_prefix + tid; + pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t), + &added); + if (added) { + GRN_RECORD_SET(ctx, pre_item, tid); + grn_obj_set_value(ctx, learner->pairs_pre, pair_id, + pre_item, GRN_OBJ_SET); + grn_obj_set_value(ctx, learner->pairs_post, pair_id, + post_item, GRN_OBJ_SET); + } + if (!token_ids) { + token_ids = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, + GRN_OBJ_TABLE_HASH_KEY|GRN_HASH_TINY); + } + if (token_ids) { + int token_added; + grn_hash_add(ctx, token_ids, &tid, sizeof(grn_id), NULL, &token_added); + if (token_added) { + learner_increment(ctx, learner, learner->pairs_freq2, pair_id); + } + } + } + if (token_ids) { + grn_hash_close(ctx, token_ids); + } + grn_token_close(ctx, token); + } +} + +static 
void +learner_append_post_event(grn_ctx *ctx, grn_suggest_learner *learner) +{ + GRN_RECORD_SET(ctx, &(learner->pre_events), learner->post_event_id); + grn_obj_set_value(ctx, learner->seqs_events, learner->seq_id, + &(learner->pre_events), GRN_OBJ_APPEND); +} + +static void +learner_learn(grn_ctx *ctx, grn_suggest_learner *learner) +{ + if (learner_is_valid_input(ctx, learner)) { + learner_init_columns(ctx, learner); + learner_init_dataset_name(ctx, learner); + learner_init_configuration(ctx, learner); + learner_init_buffers(ctx, learner); + learner_increment_item_freq(ctx, learner, learner->items_freq); + learner_set_last_post_time(ctx, learner); + if (learner->post_type_id) { + learner_init_submit_learn(ctx, learner); + learner_increment_item_freq(ctx, learner, learner->items_freq2); + learner_learn_for_complete_and_correcnt(ctx, learner); + learner_learn_for_suggest(ctx, learner); + learner_fin_submit_learn(ctx, learner); + } + learner_append_post_event(ctx, learner); + learner_fin_buffers(ctx, learner); + learner_fin_configuration(ctx, learner); + learner_fin_dataset_name(ctx, learner); + learner_fin_columns(ctx, learner); + } +} + +static grn_obj * +func_suggest_preparer(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + int learn_distance_in_seconds = 0; + grn_obj *obj; + if (nargs == 6) { + grn_obj *post_event = args[0]; + grn_obj *post_type = args[1]; + grn_obj *post_item = args[2]; + grn_obj *seq = args[3]; + grn_obj *post_time = args[4]; + grn_obj *pairs = args[5]; + grn_suggest_learner learner; + learner_init(ctx, &learner, + post_event, post_type, post_item, seq, post_time, pairs); + learner_learn(ctx, &learner); + learn_distance_in_seconds = learner.learn_distance_in_seconds; + } + if ((obj = GRN_PROC_ALLOC(GRN_DB_UINT32, 0))) { + GRN_UINT32_SET(ctx, obj, learn_distance_in_seconds); + } + return obj; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + 
grn_expr_var vars[12]; + + grn_plugin_expr_var_init(ctx, &vars[0], "types", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "column", -1); + grn_plugin_expr_var_init(ctx, &vars[3], "query", -1); + grn_plugin_expr_var_init(ctx, &vars[4], "sortby", -1); + grn_plugin_expr_var_init(ctx, &vars[5], "output_columns", -1); + grn_plugin_expr_var_init(ctx, &vars[6], "offset", -1); + grn_plugin_expr_var_init(ctx, &vars[7], "limit", -1); + grn_plugin_expr_var_init(ctx, &vars[8], "frequency_threshold", -1); + grn_plugin_expr_var_init(ctx, &vars[9], "conditional_probability_threshold", -1); + grn_plugin_expr_var_init(ctx, &vars[10], "prefix_search", -1); + grn_plugin_expr_var_init(ctx, &vars[11], "similar_search", -1); + grn_plugin_command_create(ctx, "suggest", -1, command_suggest, 12, vars); + + grn_proc_create(ctx, CONST_STR_LEN("suggest_preparer"), GRN_PROC_FUNCTION, + func_suggest_preparer, NULL, NULL, 0, NULL); + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/table/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/table/CMakeLists.txt new file mode 100644 index 00000000000..cba4697f042 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/table/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright(C) 2012-2013 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ${MRUBY_INCLUDE_DIRS}) + +read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/sources.am TABLE_SOURCES) +add_library(table MODULE ${TABLE_SOURCES}) +set_source_files_properties(${TABLE_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") +set_target_properties(table PROPERTIES PREFIX "") +target_link_libraries(table libgroonga) +if(NOT MRN_GROONGA_BUNDLED) + install(TARGETS table DESTINATION "${GRN_RELATIVE_PLUGINS_DIR}/table") +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/table/Makefile.am b/storage/mroonga/vendor/groonga/plugins/table/Makefile.am new file mode 100644 index 00000000000..4b49b0a3224 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/table/Makefile.am @@ -0,0 +1,24 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CFLAGS = \ + $(MESSAGE_PACK_CFLAGS) \ + $(MRUBY_CFLAGS) + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la \ + $(MESSAGE_PACK_LIBS) + +table_plugins_LTLIBRARIES = table.la + +include sources.am diff --git a/storage/mroonga/vendor/groonga/plugins/table/sources.am b/storage/mroonga/vendor/groonga/plugins/table/sources.am new file mode 100644 index 00000000000..943e79b1f5c --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/table/sources.am @@ -0,0 +1,2 @@ +table_la_SOURCES = \ + table.c diff --git a/storage/mroonga/vendor/groonga/plugins/table/table.c b/storage/mroonga/vendor/groonga/plugins/table/table.c new file mode 100644 index 00000000000..a4fcd17bf90 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/table/table.c @@ -0,0 +1,747 @@ +/* -*- c-basic-offset: 2; 
indent-tabs-mode: nil -*- */ +/* Copyright(C) 2012 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <string.h> + +#include "ctx.h" +#include "db.h" +#include "output.h" +#include "util.h" +#include <groonga/plugin.h> + +#define VAR GRN_PROC_GET_VAR_BY_OFFSET +#define TEXT_VALUE_LEN(x) GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x) + +static grn_obj * +grn_ctx_get_table_by_name_or_id(grn_ctx *ctx, + const char *name, unsigned int name_len) +{ + grn_obj *table; + const char *end = name + name_len; + const char *rest = NULL; + grn_id id = grn_atoui(name, end, &rest); + if (rest == end) { + table = grn_ctx_at(ctx, id); + } else { + table = grn_ctx_get(ctx, name, name_len); + } + if (!GRN_OBJ_TABLEP(table)) { + ERR(GRN_INVALID_ARGUMENT, "invalid table name: <%.*s>", name_len, name); + if (table) { + grn_obj_unlink(ctx, table); + table = NULL; + } + } + return table; +} + +static void +grn_output_table_name_or_id(grn_ctx *ctx, grn_obj *table) +{ + if (table) { + if (((grn_db_obj *)table)->id & GRN_OBJ_TMP_OBJECT) { + GRN_OUTPUT_INT64(((grn_db_obj *)table)->id); + } else { + int name_len; + char name_buf[GRN_TABLE_MAX_KEY_SIZE]; + name_len = grn_obj_name(ctx, table, name_buf, GRN_TABLE_MAX_KEY_SIZE); + GRN_OUTPUT_STR(name_buf, name_len); + } + } else { + GRN_OUTPUT_INT64(0); + } +} + +static grn_bool +parse_bool_value(grn_ctx *ctx, grn_obj *text) +{ + 
grn_bool value = GRN_FALSE; + if (GRN_TEXT_LEN(text) == 3 && + memcmp("yes", GRN_TEXT_VALUE(text), 3) == 0) { + value = GRN_TRUE; + } + return value; +} + +static grn_operator +parse_set_operator_value(grn_ctx *ctx, grn_obj *text) +{ + grn_operator value = GRN_OP_OR; + if (GRN_TEXT_LEN(text) == 3) { + if (memcmp("and", GRN_TEXT_VALUE(text), 3) == 0) { + value = GRN_OP_AND; + } else if (memcmp("but", GRN_TEXT_VALUE(text), 3) == 0) { + value = GRN_OP_AND_NOT; + } + } else if (GRN_TEXT_LEN(text) == 6 && + memcmp("adjust", GRN_TEXT_VALUE(text), 6) == 0) { + value = GRN_OP_ADJUST; + } else if (GRN_TEXT_LEN(text) == 7 && + memcmp("and_not", GRN_TEXT_VALUE(text), 7) == 0) { + value = GRN_OP_AND_NOT; + } + return value; +} + +static grn_obj * +command_match(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_obj *result_set = NULL; + grn_obj *table = grn_ctx_get_table_by_name_or_id(ctx, TEXT_VALUE_LEN(VAR(0))); + if (table) { + grn_expr_flags flags = GRN_EXPR_SYNTAX_QUERY; + grn_obj *v, *query, *columns = NULL; + GRN_EXPR_CREATE_FOR_QUERY(ctx, table, query, v); + if (query) { + if (GRN_TEXT_LEN(VAR(1))) { + GRN_EXPR_CREATE_FOR_QUERY(ctx, table, columns, v); + if (columns) { + grn_expr_parse(ctx, columns, TEXT_VALUE_LEN(VAR(1)), + NULL, GRN_OP_MATCH, GRN_OP_AND, + GRN_EXPR_SYNTAX_SCRIPT); + } + } + if (parse_bool_value(ctx, VAR(5))) { + flags |= GRN_EXPR_ALLOW_COLUMN; + } + if (parse_bool_value(ctx, VAR(6))) { + flags |= GRN_EXPR_ALLOW_PRAGMA; + } + grn_expr_parse(ctx, query, TEXT_VALUE_LEN(VAR(2)), + columns, GRN_OP_MATCH, GRN_OP_AND, flags); + if (GRN_TEXT_LEN(VAR(3))) { + result_set = grn_ctx_get_table_by_name_or_id(ctx, TEXT_VALUE_LEN(VAR(3))); + } else { + result_set = grn_table_create(ctx, NULL, 0, NULL, + GRN_TABLE_HASH_KEY| + GRN_OBJ_WITH_SUBREC, + table, NULL); + } + if (result_set) { + grn_table_select(ctx, table, query, result_set, + parse_set_operator_value(ctx, VAR(4))); + } + grn_obj_unlink(ctx, columns); + grn_obj_unlink(ctx, query); + 
} + } + grn_output_table_name_or_id(ctx, result_set); + return NULL; +} + +static grn_obj * +command_filter_by_script(grn_ctx *ctx, int nargs, + grn_obj **args, grn_user_data *user_data) +{ + grn_obj *result_set = NULL; + grn_obj *table = grn_ctx_get_table_by_name_or_id(ctx, TEXT_VALUE_LEN(VAR(0))); + if (table) { + grn_expr_flags flags = GRN_EXPR_SYNTAX_SCRIPT; + grn_obj *v, *query; + GRN_EXPR_CREATE_FOR_QUERY(ctx, table, query, v); + if (query) { + if (parse_bool_value(ctx, VAR(4))) { + flags |= GRN_EXPR_ALLOW_UPDATE; + } + grn_expr_parse(ctx, query, TEXT_VALUE_LEN(VAR(1)), + NULL, GRN_OP_MATCH, GRN_OP_AND, flags); + if (GRN_TEXT_LEN(VAR(2))) { + result_set = grn_ctx_get_table_by_name_or_id(ctx, TEXT_VALUE_LEN(VAR(2))); + } else { + result_set = grn_table_create(ctx, NULL, 0, NULL, + GRN_TABLE_HASH_KEY| + GRN_OBJ_WITH_SUBREC, + table, NULL); + } + if (result_set) { + grn_table_select(ctx, table, query, result_set, + parse_set_operator_value(ctx, VAR(3))); + } + grn_obj_unlink(ctx, query); + } + } + grn_output_table_name_or_id(ctx, result_set); + return NULL; +} + +static grn_obj * +command_filter(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_operator operator = GRN_OP_NOP; + grn_obj *table, *column, *result_set = NULL; + if (!(table = grn_ctx_get_table_by_name_or_id(ctx, TEXT_VALUE_LEN(VAR(0))))) { + goto exit; + } + if (!(column = grn_obj_column(ctx, table, TEXT_VALUE_LEN(VAR(1))))) { + ERR(GRN_INVALID_ARGUMENT, "invalid column name: <%.*s>", + (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1))); + goto exit; + } + if (GRN_TEXT_LEN(VAR(2)) == 0) { + ERR(GRN_INVALID_ARGUMENT, "missing mandatory argument: operator"); + goto exit; + } else { + uint32_t operator_len = GRN_TEXT_LEN(VAR(2)); + const char *operator_text = GRN_TEXT_VALUE(VAR(2)); + switch (operator_text[0]) { + case '<' : + if (operator_len == 1) { + operator = GRN_OP_LESS; + } + break; + } + if (operator == GRN_OP_NOP) { + ERR(GRN_INVALID_ARGUMENT, "invalid operator: 
<%.*s>", + operator_len, operator_text); + goto exit; + } + } + if (GRN_TEXT_LEN(VAR(4))) { + result_set = grn_ctx_get_table_by_name_or_id(ctx, TEXT_VALUE_LEN(VAR(4))); + } else { + result_set = grn_table_create(ctx, NULL, 0, NULL, + GRN_TABLE_HASH_KEY| + GRN_OBJ_WITH_SUBREC, + table, NULL); + } + if (result_set) { + grn_column_filter(ctx, column, operator, VAR(3), result_set, + parse_set_operator_value(ctx, VAR(5))); + } +exit : + grn_output_table_name_or_id(ctx, result_set); + return NULL; +} + +static grn_obj * +command_group(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + const char *table = GRN_TEXT_VALUE(VAR(0)); + unsigned int table_len = GRN_TEXT_LEN(VAR(0)); + const char *key = GRN_TEXT_VALUE(VAR(1)); + unsigned int key_len = GRN_TEXT_LEN(VAR(1)); + const char *set = GRN_TEXT_VALUE(VAR(2)); + unsigned int set_len = GRN_TEXT_LEN(VAR(2)); + grn_obj *table_ = grn_ctx_get_table_by_name_or_id(ctx, table, table_len); + grn_obj *set_ = NULL; + if (table_) { + uint32_t ngkeys; + grn_table_sort_key *gkeys; + gkeys = grn_table_sort_key_from_str(ctx, key, key_len, table_, &ngkeys); + if (gkeys) { + if (set_len) { + set_ = grn_ctx_get_table_by_name_or_id(ctx, set, set_len); + } else { + set_ = grn_table_create_for_group(ctx, NULL, 0, NULL, + gkeys[0].key, table_, 0); + } + if (set_) { + if (GRN_TEXT_LEN(VAR(3))) { + uint32_t gap = grn_atoui(GRN_TEXT_VALUE(VAR(3)), + GRN_BULK_CURR(VAR(3)), NULL); + grn_table_group_with_range_gap(ctx, table_, gkeys, set_, gap); + } else { + grn_table_group_result g = { + set_, 0, 0, 1, + GRN_TABLE_GROUP_CALC_COUNT, 0 + }; + grn_table_group(ctx, table_, gkeys, 1, &g, 1); + } + } + grn_table_sort_key_close(ctx, gkeys, ngkeys); + } + } + grn_output_table_name_or_id(ctx, set_); + return NULL; +} + +#define DEFAULT_LIMIT 10 + +static grn_obj * +command_sort(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + const char *table = GRN_TEXT_VALUE(VAR(0)); + unsigned int table_len = 
GRN_TEXT_LEN(VAR(0)); + const char *keys = GRN_TEXT_VALUE(VAR(1)); + unsigned int keys_len = GRN_TEXT_LEN(VAR(1)); + int offset = GRN_TEXT_LEN(VAR(2)) + ? grn_atoi(GRN_TEXT_VALUE(VAR(2)), GRN_BULK_CURR(VAR(2)), NULL) + : 0; + int limit = GRN_TEXT_LEN(VAR(3)) + ? grn_atoi(GRN_TEXT_VALUE(VAR(3)), GRN_BULK_CURR(VAR(3)), NULL) + : DEFAULT_LIMIT; + grn_obj *table_ = grn_ctx_get_table_by_name_or_id(ctx, table, table_len); + grn_obj *sorted = NULL; + if (table_) { + uint32_t nkeys; + grn_table_sort_key *keys_; + /* An empty "keys" argument, or keys that fail to parse, leave `sorted' NULL. */ + if (keys_len && + (keys_ = grn_table_sort_key_from_str(ctx, keys, keys_len, + table_, &nkeys))) { + if ((sorted = grn_table_create(ctx, NULL, 0, NULL, + GRN_OBJ_TABLE_NO_KEY, NULL, table_))) { + int table_size = (int)grn_table_size(ctx, table_); + grn_normalize_offset_and_limit(ctx, table_size, &offset, &limit); + grn_table_sort(ctx, table_, offset, limit, sorted, keys_, nkeys); + } + /* Release the parsed sort keys on every path, including the one where the result table could not be created (previously leaked). */ + grn_table_sort_key_close(ctx, keys_, nkeys); + } + } + grn_output_table_name_or_id(ctx, sorted); + return NULL; +} + +static grn_obj * +command_output(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + const char *table = GRN_TEXT_VALUE(VAR(0)); + unsigned int table_len = GRN_TEXT_LEN(VAR(0)); + const char *columns = GRN_TEXT_VALUE(VAR(1)); + unsigned int columns_len = GRN_TEXT_LEN(VAR(1)); + int offset = GRN_TEXT_LEN(VAR(2)) + ? grn_atoi(GRN_TEXT_VALUE(VAR(2)), GRN_BULK_CURR(VAR(2)), NULL) + : 0; + int limit = GRN_TEXT_LEN(VAR(3)) + ? 
grn_atoi(GRN_TEXT_VALUE(VAR(3)), GRN_BULK_CURR(VAR(3)), NULL) + : DEFAULT_LIMIT; + grn_obj *table_ = grn_ctx_get_table_by_name_or_id(ctx, table, table_len); + if (table_) { + grn_obj_format format; + int table_size = (int)grn_table_size(ctx, table_); + GRN_OBJ_FORMAT_INIT(&format, table_size, 0, limit, offset); + format.flags = + GRN_OBJ_FORMAT_WITH_COLUMN_NAMES| + GRN_OBJ_FORMAT_XML_ELEMENT_RESULTSET; + /* TODO: accept only comma separated expr as columns */ + grn_obj_columns(ctx, table_, columns, columns_len, &format.columns); + GRN_OUTPUT_OBJ(table_, &format); + GRN_OBJ_FORMAT_FIN(ctx, &format); + } + return NULL; +} + +static grn_obj * +command_each(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + const char *table = GRN_TEXT_VALUE(VAR(0)); + unsigned int table_len = GRN_TEXT_LEN(VAR(0)); + const char *expr = GRN_TEXT_VALUE(VAR(1)); + unsigned int expr_len = GRN_TEXT_LEN(VAR(1)); + grn_obj *table_ = grn_ctx_get_table_by_name_or_id(ctx, table, table_len); + if (table_) { + grn_obj *v, *expr_; + GRN_EXPR_CREATE_FOR_QUERY(ctx, table_, expr_, v); + if (expr_ && v) { + grn_table_cursor *tc; + grn_expr_parse(ctx, expr_, expr, expr_len, + NULL, GRN_OP_MATCH, GRN_OP_AND, + GRN_EXPR_SYNTAX_SCRIPT|GRN_EXPR_ALLOW_UPDATE); + if ((tc = grn_table_cursor_open(ctx, table_, NULL, 0, + NULL, 0, 0, -1, 0))) { + grn_id id; + while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { + GRN_RECORD_SET(ctx, v, id); + grn_expr_exec(ctx, expr_, 0); + } + grn_table_cursor_close(ctx, tc); + } + grn_obj_unlink(ctx, expr_); + } + } + GRN_OUTPUT_BOOL(!ctx->rc); + return NULL; +} + +static grn_obj * +command_unlink(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + const char *table = GRN_TEXT_VALUE(VAR(0)); + unsigned int table_len = GRN_TEXT_LEN(VAR(0)); + grn_obj *table_ = grn_ctx_get_table_by_name_or_id(ctx, table, table_len); + if (table_) { + grn_obj_unlink(ctx, table_); + } + GRN_OUTPUT_BOOL(!ctx->rc); + return NULL; +} + +static 
grn_obj * +command_add(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_load_(ctx, GRN_CONTENT_JSON, + GRN_TEXT_VALUE(VAR(0)), GRN_TEXT_LEN(VAR(0)), + NULL, 0, + GRN_TEXT_VALUE(VAR(1)), GRN_TEXT_LEN(VAR(1)), + NULL, 0, NULL, 0, 0); + GRN_OUTPUT_BOOL(ctx->impl->loader.nrecords); + if (ctx->impl->loader.table) { + grn_db_touch(ctx, DB_OBJ(ctx->impl->loader.table)->db); + } + return NULL; +} + +static grn_obj * +command_set(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + int table_name_len = GRN_TEXT_LEN(VAR(0)); + const char *table_name = GRN_TEXT_VALUE(VAR(0)); + grn_obj *table = grn_ctx_get(ctx, table_name, table_name_len); + if (table) { + grn_id id = GRN_ID_NIL; + int key_len = GRN_TEXT_LEN(VAR(2)); + int id_len = GRN_TEXT_LEN(VAR(5)); + if (key_len) { + const char *key = GRN_TEXT_VALUE(VAR(2)); + id = grn_table_get(ctx, table, key, key_len); + } else { + if (id_len) { + id = grn_atoui(GRN_TEXT_VALUE(VAR(5)), GRN_BULK_CURR(VAR(5)), NULL); + } + id = grn_table_at(ctx, table, id); + } + if (id) { + grn_obj obj; + grn_obj_format format; + GRN_RECORD_INIT(&obj, 0, ((grn_db_obj *)table)->id); + GRN_OBJ_FORMAT_INIT(&format, 1, 0, 1, 0); + GRN_RECORD_SET(ctx, &obj, id); + grn_obj_columns(ctx, table, + GRN_TEXT_VALUE(VAR(4)), + GRN_TEXT_LEN(VAR(4)), &format.columns); + format.flags = 0 /* GRN_OBJ_FORMAT_WITH_COLUMN_NAMES */; + GRN_OUTPUT_OBJ(&obj, &format); + GRN_OBJ_FORMAT_FIN(ctx, &format); + } + } else { + ERR(GRN_INVALID_ARGUMENT, + "nonexistent table name: <%.*s>", table_name_len, table_name); + } + return NULL; +} + +static grn_rc +command_get_resolve_parameters(grn_ctx *ctx, grn_user_data *user_data, + grn_obj **table, grn_id *id) +{ + const char *table_text, *id_text, *key_text; + int table_length, id_length, key_length; + + table_text = GRN_TEXT_VALUE(VAR(0)); + table_length = GRN_TEXT_LEN(VAR(0)); + if (table_length == 0) { + ERR(GRN_INVALID_ARGUMENT, "[table][get] table isn't specified"); + return ctx->rc; + } 
+ + *table = grn_ctx_get(ctx, table_text, table_length); + if (!*table) { + ERR(GRN_INVALID_ARGUMENT, + "[table][get] table doesn't exist: <%.*s>", table_length, table_text); + return ctx->rc; + } + + key_text = GRN_TEXT_VALUE(VAR(1)); + key_length = GRN_TEXT_LEN(VAR(1)); + id_text = GRN_TEXT_VALUE(VAR(3)); + id_length = GRN_TEXT_LEN(VAR(3)); + switch ((*table)->header.type) { + case GRN_TABLE_NO_KEY: + if (key_length) { + ERR(GRN_INVALID_ARGUMENT, + "[table][get] should not specify key for NO_KEY table: <%.*s>: " + "table: <%.*s>", + key_length, key_text, + table_length, table_text); + return ctx->rc; + } + if (id_length) { + const char *rest = NULL; + *id = grn_atoi(id_text, id_text + id_length, &rest); + if (rest == id_text) { + ERR(GRN_INVALID_ARGUMENT, + "[table][get] ID should be a number: <%.*s>: table: <%.*s>", + id_length, id_text, + table_length, table_text); + } + } else { + ERR(GRN_INVALID_ARGUMENT, + "[table][get] ID isn't specified: table: <%.*s>", + table_length, table_text); + } + break; + case GRN_TABLE_HASH_KEY: + case GRN_TABLE_PAT_KEY: + case GRN_TABLE_DAT_KEY: + if (key_length && id_length) { + ERR(GRN_INVALID_ARGUMENT, + "[table][get] should not specify both key and ID: " + "key: <%.*s>: ID: <%.*s>: table: <%.*s>", + key_length, key_text, + id_length, id_text, + table_length, table_text); + return ctx->rc; + } + if (key_length) { + *id = grn_table_get(ctx, *table, key_text, key_length); + if (!*id) { + ERR(GRN_INVALID_ARGUMENT, + "[table][get] nonexistent key: <%.*s>: table: <%.*s>", + key_length, key_text, + table_length, table_text); + } + } else { + if (id_length) { + const char *rest = NULL; + *id = grn_atoi(id_text, id_text + id_length, &rest); + if (rest == id_text) { + ERR(GRN_INVALID_ARGUMENT, + "[table][get] ID should be a number: <%.*s>: table: <%.*s>", + id_length, id_text, + table_length, table_text); + } + } else { + ERR(GRN_INVALID_ARGUMENT, + "[table][get] key nor ID isn't specified: table: <%.*s>", + table_length, table_text); 
+ } + } + break; + default: + ERR(GRN_INVALID_ARGUMENT, + "[table][get] not a table: <%.*s>", table_length, table_text); + break; + } + + return ctx->rc; +} + +static grn_obj * +command_get(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_id id = GRN_ID_NIL; + grn_obj *table = NULL; + if (!command_get_resolve_parameters(ctx, user_data, &table, &id)) { + grn_obj obj; + grn_obj_format format; + GRN_OUTPUT_ARRAY_OPEN("RESULT", 2); + GRN_RECORD_INIT(&obj, 0, ((grn_db_obj *)table)->id); + GRN_OBJ_FORMAT_INIT(&format, 1, 0, 1, 0); + GRN_RECORD_SET(ctx, &obj, id); + grn_obj_columns(ctx, table, GRN_TEXT_VALUE(VAR(2)), GRN_TEXT_LEN(VAR(2)), + &format.columns); + format.flags = + GRN_OBJ_FORMAT_WITH_COLUMN_NAMES | + GRN_OBJ_FORMAT_XML_ELEMENT_RESULTSET; + GRN_OUTPUT_OBJ(&obj, &format); + GRN_OBJ_FORMAT_FIN(ctx, &format); + GRN_OUTPUT_ARRAY_CLOSE(); + } + return NULL; +} + +static grn_obj * +command_push(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_obj *table = grn_ctx_get(ctx, GRN_TEXT_VALUE(VAR(0)), GRN_TEXT_LEN(VAR(0))); + if (table) { + switch (table->header.type) { + case GRN_TABLE_NO_KEY: + { + grn_array *array = (grn_array *)table; + grn_table_queue *queue = grn_array_queue(ctx, array); + if (queue) { + MUTEX_LOCK(queue->mutex); + if (grn_table_queue_head(queue) == queue->cap) { + grn_array_clear_curr_rec(ctx, array); + } + grn_load_(ctx, GRN_CONTENT_JSON, + GRN_TEXT_VALUE(VAR(0)), GRN_TEXT_LEN(VAR(0)), + NULL, 0, + GRN_TEXT_VALUE(VAR(1)), GRN_TEXT_LEN(VAR(1)), + NULL, 0, NULL, 0, 0); + if (grn_table_queue_size(queue) == queue->cap) { + grn_table_queue_tail_increment(queue); + } + grn_table_queue_head_increment(queue); + COND_SIGNAL(queue->cond); + MUTEX_UNLOCK(queue->mutex); + GRN_OUTPUT_BOOL(ctx->impl->loader.nrecords); + if (ctx->impl->loader.table) { + grn_db_touch(ctx, DB_OBJ(ctx->impl->loader.table)->db); + } + } else { + ERR(GRN_OPERATION_NOT_SUPPORTED, "table '%.*s' doesn't support push", + 
(int)GRN_TEXT_LEN(VAR(0)), GRN_TEXT_VALUE(VAR(0))); + } + } + break; + default : + ERR(GRN_OPERATION_NOT_SUPPORTED, "table '%.*s' doesn't support push", + (int)GRN_TEXT_LEN(VAR(0)), GRN_TEXT_VALUE(VAR(0))); + } + } else { + ERR(GRN_INVALID_ARGUMENT, "table '%.*s' does not exist.", + (int)GRN_TEXT_LEN(VAR(0)), GRN_TEXT_VALUE(VAR(0))); + } + return NULL; +} + +static grn_obj * +command_pull(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_obj *table = grn_ctx_get(ctx, GRN_TEXT_VALUE(VAR(0)), GRN_TEXT_LEN(VAR(0))); + if (table) { + switch (table->header.type) { + case GRN_TABLE_NO_KEY: + { + grn_array *array = (grn_array *)table; + grn_table_queue *queue = grn_array_queue(ctx, array); + if (queue) { + MUTEX_LOCK(queue->mutex); + while (grn_table_queue_size(queue) == 0) { + if (GRN_TEXT_LEN(VAR(2))) { + MUTEX_UNLOCK(queue->mutex); + GRN_OUTPUT_BOOL(0); + return NULL; + } + COND_WAIT(queue->cond, queue->mutex); + } + grn_table_queue_tail_increment(queue); + { + grn_obj obj; + grn_obj_format format; + GRN_RECORD_INIT(&obj, 0, ((grn_db_obj *)table)->id); + GRN_OBJ_FORMAT_INIT(&format, 1, 0, 1, 0); + GRN_RECORD_SET(ctx, &obj, grn_table_queue_tail(queue)); + grn_obj_columns(ctx, table, GRN_TEXT_VALUE(VAR(1)), GRN_TEXT_LEN(VAR(1)), + &format.columns); + format.flags = 0 /* GRN_OBJ_FORMAT_WITH_COLUMN_NAMES */; + GRN_OUTPUT_OBJ(&obj, &format); + GRN_OBJ_FORMAT_FIN(ctx, &format); + } + MUTEX_UNLOCK(queue->mutex); + } else { + ERR(GRN_OPERATION_NOT_SUPPORTED, "table '%.*s' doesn't support pull", + (int)GRN_TEXT_LEN(VAR(0)), GRN_TEXT_VALUE(VAR(0))); + } + } + break; + default : + ERR(GRN_OPERATION_NOT_SUPPORTED, "table '%.*s' doesn't support pull", + (int)GRN_TEXT_LEN(VAR(0)), GRN_TEXT_VALUE(VAR(0))); + } + } else { + ERR(GRN_INVALID_ARGUMENT, "table '%.*s' does not exist.", + (int)GRN_TEXT_LEN(VAR(0)), GRN_TEXT_VALUE(VAR(0))); + } + return NULL; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} + +grn_rc 
+GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_expr_var vars[18]; + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "expression", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "result_set", -1); + grn_plugin_expr_var_init(ctx, &vars[3], "set_operation", -1); + grn_plugin_expr_var_init(ctx, &vars[4], "allow_update", -1); + grn_plugin_command_create(ctx, "filter_by_script", -1, command_filter_by_script, 5, vars); + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "column", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "operator", -1); + grn_plugin_expr_var_init(ctx, &vars[3], "value", -1); + grn_plugin_expr_var_init(ctx, &vars[4], "result_set", -1); + grn_plugin_expr_var_init(ctx, &vars[5], "set_operation", -1); + grn_plugin_command_create(ctx, "filter", -1, command_filter, 6, vars); + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "key", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "result_set", -1); + grn_plugin_expr_var_init(ctx, &vars[3], "range_gap", -1); + grn_plugin_command_create(ctx, "group", -1, command_group, 4, vars); + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "keys", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "offset", -1); + grn_plugin_expr_var_init(ctx, &vars[3], "limit", -1); + grn_plugin_command_create(ctx, "sort", -1, command_sort, 4, vars); + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "columns", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "offset", -1); + grn_plugin_expr_var_init(ctx, &vars[3], "limit", -1); + grn_plugin_command_create(ctx, "output", -1, command_output, 4, vars); + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "expression", -1); + grn_plugin_command_create(ctx, "each", -1, command_each, 2, vars); + + 
grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_command_create(ctx, "unlink", -1, command_unlink, 1, vars); + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "values", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "key", -1); + grn_plugin_expr_var_init(ctx, &vars[3], "columns", -1); + grn_plugin_expr_var_init(ctx, &vars[4], "output_columns", -1); + grn_plugin_expr_var_init(ctx, &vars[5], "id", -1); + grn_plugin_command_create(ctx, "add", -1, command_add, 2, vars); + grn_plugin_command_create(ctx, "push", -1, command_push, 2, vars); + grn_plugin_command_create(ctx, "set", -1, command_set, 6, vars); + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "key", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "output_columns", -1); + grn_plugin_expr_var_init(ctx, &vars[3], "id", -1); + grn_plugin_command_create(ctx, "get", -1, command_get, 4, vars); + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "output_columns", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "non_block", -1); + grn_plugin_command_create(ctx, "pull", -1, command_pull, 3, vars); + + grn_plugin_expr_var_init(ctx, &vars[0], "table", -1); + grn_plugin_expr_var_init(ctx, &vars[1], "columns", -1); + grn_plugin_expr_var_init(ctx, &vars[2], "query", -1); + grn_plugin_expr_var_init(ctx, &vars[3], "result_set", -1); + grn_plugin_expr_var_init(ctx, &vars[4], "set_operation", -1); + grn_plugin_expr_var_init(ctx, &vars[5], "allow_column_expression", -1); + grn_plugin_expr_var_init(ctx, &vars[6], "allow_pragma", -1); + grn_plugin_command_create(ctx, "match", -1, command_match, 7, vars); + + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt new file mode 100644 index 
00000000000..e044c1fbb93 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt @@ -0,0 +1,53 @@ +# Copyright(C) 2012-2013 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ) + +set(TOKENIZERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/tokenizers") +if(GRN_WITH_MECAB) + read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/mecab_sources.am MECAB_SOURCES) + include_directories(${MECAB_INCLUDE_DIRS}) + link_directories(${MECAB_LIBRARY_DIRS}) + add_library(mecab_tokenizer MODULE ${MECAB_SOURCES}) + set_source_files_properties(${MECAB_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") + set_target_properties(mecab_tokenizer PROPERTIES + PREFIX "" + OUTPUT_NAME "mecab") + target_link_libraries(mecab_tokenizer libgroonga ${MECAB_LIBRARIES}) + if(NOT MRN_GROONGA_BUNDLED) + install(TARGETS mecab_tokenizer DESTINATION "${TOKENIZERS_DIR}") + endif() +endif() + +if(GRN_WITH_KYTEA) + read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/kytea_sources.am KYTEA_SOURCES) + include_directories(${KYTEA_INCLUDE_DIRS}) + link_directories(${KYTEA_LIBRARY_DIRS}) + add_library(kytea_tokenizer MODULE ${KYTEA_SOURCES}) + set_source_files_properties(${KYTEA_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_CXX_COMPILE_FLAGS}") + set_target_properties(kytea_tokenizer PROPERTIES + PREFIX "" + 
OUTPUT_NAME "kytea") + target_link_libraries(kytea_tokenizer libgroonga ${KYTEA_LIBRARIES}) + if(NOT MRN_GROONGA_BUNDLED) + install(TARGETS kytea_tokenizer DESTINATION "${TOKENIZERS_DIR}") + endif() +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am new file mode 100644 index 00000000000..386b1554f73 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am @@ -0,0 +1,33 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la + +tokenizers_plugins_LTLIBRARIES = +if WITH_MECAB +tokenizers_plugins_LTLIBRARIES += mecab.la +endif +if WITH_KYTEA +tokenizers_plugins_LTLIBRARIES += kytea.la +endif + +include mecab_sources.am +mecab_la_CPPFLAGS = $(AM_CPPFLAGS) $(MECAB_CPPFLAGS) +mecab_la_LIBADD = $(LIBS) $(MECAB_LIBS) +mecab_la_LDFLAGS = $(AM_LDFLAGS) $(MECAB_LDFLAGS) + +include kytea_sources.am +kytea_la_CPPFLAGS = $(AM_CPPFLAGS) $(KYTEA_CFLAGS) +kytea_la_LIBADD = $(LIBS) $(KYTEA_LIBS) +kytea_la_LDFLAGS = $(AM_LDFLAGS) $(KYTEA_LDFLAGS) diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp new file mode 100644 index 00000000000..a7ee4104592 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp @@ -0,0 +1,354 @@ +/* -*- c-basic-offset: 2 -*- */ +/* Copyright(C) 2012 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <groonga/tokenizer.h> + +#include <kytea/kytea.h> +#include <kytea/string-util.h> + +#include <string.h> + +#include <string> +#include <vector> + +namespace { + +grn_plugin_mutex *kytea_mutex = NULL; +kytea::KyteaConfig *kytea_config = NULL; +kytea::Kytea *kytea_tagger = NULL; +kytea::StringUtil *kytea_util = NULL; + +void kytea_init(grn_ctx *ctx); +void kytea_fin(grn_ctx *ctx); + +void kytea_init(grn_ctx *ctx) { + if (kytea_mutex || kytea_config || kytea_tagger || kytea_util) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "TokenKytea is already initialized"); + return; + } + + kytea_mutex = grn_plugin_mutex_open(ctx); + if (!kytea_mutex) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][kytea] " + "grn_plugin_mutex_open() failed"); + return; + } + + kytea::KyteaConfig * const config = static_cast<kytea::KyteaConfig *>( + GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::KyteaConfig))); + if (!config) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][kytea] " + "memory allocation to kytea::KyteaConfig failed"); + return; + } + + try { + new (config) kytea::KyteaConfig; + kytea_config = config; + try { + kytea_config->setDebug(0); + kytea_config->setOnTraining(false); + kytea_config->parseRunCommandLine(0, NULL); + } catch (...) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::KyteaConfig settings failed"); + return; + } + } catch (...) 
{ + GRN_PLUGIN_FREE(ctx, config); + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::KyteaConfig initialization failed"); + return; + } + + kytea::Kytea * const tagger = static_cast<kytea::Kytea *>( + GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::Kytea))); + if (!tagger) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][kytea] " + "memory allocation to kytea::Kytea failed"); + return; + } + + try { + new (tagger) kytea::Kytea; + kytea_tagger = tagger; + try { + kytea_tagger->readModel(kytea_config->getModelFile().c_str()); + } catch (...) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::Kytea::readModel() failed"); + return; + } + } catch (...) { + GRN_PLUGIN_FREE(ctx, tagger); + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::Kytea initialization failed"); + return; + } + + try { + kytea_util = kytea_tagger->getStringUtil(); + } catch (...) 
{ + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::Kytea::getStringUtil() failed"); + return; + } +} + +void kytea_fin(grn_ctx *ctx) { + kytea_util = NULL; + + if (kytea_tagger) { + kytea_tagger->~Kytea(); + GRN_PLUGIN_FREE(ctx, kytea_tagger); + kytea_tagger = NULL; + } + + if (kytea_config) { + kytea_config->~KyteaConfig(); + GRN_PLUGIN_FREE(ctx, kytea_config); + kytea_config = NULL; + } + + if (kytea_mutex) { + grn_plugin_mutex_close(ctx, kytea_mutex); + kytea_mutex = NULL; + } +} + +/* + State kept for one tokenization request. `tokens' and `id' (index of the + next token to emit) are used on the KyTea path; `rest_query_string' and + `rest_query_string_length' track the unread input on the + tokenized-delimiter path. + */ +struct grn_tokenizer_kytea { + grn_tokenizer_query *query; + kytea::KyteaSentence sentence; + std::vector<std::string> tokens; + std::size_t id; + grn_tokenizer_token token; + const char *rest_query_string; + unsigned int rest_query_string_length; + + grn_tokenizer_kytea() : + query(NULL), + sentence(), + tokens(), + id(0), + token(), + rest_query_string(NULL), + rest_query_string_length(0) /* was left indeterminate; keep it consistent with rest_query_string */ + { + } + ~grn_tokenizer_kytea() {} +}; + +void grn_tokenizer_kytea_init(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) { + new (tokenizer) grn_tokenizer_kytea; + grn_tokenizer_token_init(ctx, &tokenizer->token); +} + +void grn_tokenizer_kytea_fin(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) { + grn_tokenizer_token_fin(ctx, &tokenizer->token); + if (tokenizer->query) { + grn_tokenizer_query_close(ctx, tokenizer->query); + } + tokenizer->~grn_tokenizer_kytea(); +} + +grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args, + grn_user_data *user_data) { + unsigned int normalizer_flags = 0; + grn_tokenizer_query * const query = + grn_tokenizer_query_open(ctx, num_args, args, normalizer_flags); + if (!query) { + return NULL; + } + + grn_tokenizer_kytea * const tokenizer = static_cast<grn_tokenizer_kytea *>( + GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_kytea))); + if (!tokenizer) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][kytea] " + "memory allocation to grn_tokenizer_kytea failed"); + return NULL; + } + + try 
{ + grn_tokenizer_kytea_init(ctx, tokenizer); + } catch (...) { + grn_tokenizer_query_close(ctx, query); + /* The object was never constructed: only the raw allocation has to be returned. */ + GRN_PLUGIN_FREE(ctx, tokenizer); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "tokenizer initialization failed"); + return NULL; + } + + tokenizer->query = query; + + grn_obj *normalized_query = query->normalized_query; + const char *normalized_string; + unsigned int normalized_string_length; + grn_string_get_normalized(ctx, + normalized_query, + &normalized_string, + &normalized_string_length, + NULL); + if (tokenizer->query->have_tokenized_delimiter) { + tokenizer->rest_query_string = normalized_string; + tokenizer->rest_query_string_length = normalized_string_length; + } else { + grn_plugin_mutex_lock(ctx, kytea_mutex); + try { + const std::string str(normalized_string, normalized_string_length); + const kytea::KyteaString &surface_str = kytea_util->mapString(str); + const kytea::KyteaString &normalized_str = kytea_util->normalize(surface_str); + tokenizer->sentence = kytea::KyteaSentence(surface_str, normalized_str); + kytea_tagger->calculateWS(tokenizer->sentence); + } catch (...) { + grn_plugin_mutex_unlock(ctx, kytea_mutex); + /* user_data->ptr is not set yet, so grn_kytea_fin() cannot reclaim the tokenizer: tear it down here (this also closes the query it owns). */ + grn_tokenizer_kytea_fin(ctx, tokenizer); + GRN_PLUGIN_FREE(ctx, tokenizer); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "tokenization failed"); + return NULL; + } + grn_plugin_mutex_unlock(ctx, kytea_mutex); + + try { + for (std::size_t i = 0; i < tokenizer->sentence.words.size(); ++i) { + const std::string &token = + kytea_util->showString(tokenizer->sentence.words[i].surface); + const char *ptr = token.c_str(); + unsigned int left = static_cast<unsigned int>(token.length()); + while (left > 0) { + const int char_length = + grn_tokenizer_charlen(ctx, ptr, left, query->encoding); + if ((char_length == 0) || + (grn_tokenizer_isspace(ctx, ptr, left, query->encoding) != 0)) { + break; + } + ptr += char_length; + left -= char_length; + } + if (left == 0) { + tokenizer->tokens.push_back(token); + } + } + } catch (...) 
{ + /* Same ownership rule as above: release the tokenizer (and its query) before reporting the error. */ + grn_tokenizer_kytea_fin(ctx, tokenizer); + GRN_PLUGIN_FREE(ctx, tokenizer); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "adjustment failed"); + return NULL; + } + } + + user_data->ptr = tokenizer; + return NULL; +} + +grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args, + grn_user_data *user_data) { + grn_tokenizer_kytea * const tokenizer = + static_cast<grn_tokenizer_kytea *>(user_data->ptr); + + if (tokenizer->query->have_tokenized_delimiter) { + unsigned int rest_query_string_length = + tokenizer->rest_query_string_length; + const char *rest_query_string = + grn_tokenizer_tokenized_delimiter_next(ctx, + &(tokenizer->token), + tokenizer->rest_query_string, + rest_query_string_length, + tokenizer->query->encoding); + if (rest_query_string) { + tokenizer->rest_query_string_length -= + rest_query_string - tokenizer->rest_query_string; + } + tokenizer->rest_query_string = rest_query_string; + } else { + const grn_tokenizer_status status = + ((tokenizer->id + 1) < tokenizer->tokens.size()) ? + GRN_TOKENIZER_CONTINUE : GRN_TOKENIZER_LAST; + if (tokenizer->id < tokenizer->tokens.size()) { + const std::string &token = tokenizer->tokens[tokenizer->id++]; + grn_tokenizer_token_push(ctx, &tokenizer->token, + token.c_str(), token.length(), status); + } else { + grn_tokenizer_token_push(ctx, &tokenizer->token, "", 0, status); + } + } + + return NULL; +} + +grn_obj *grn_kytea_fin(grn_ctx *ctx, int num_args, grn_obj **args, + grn_user_data *user_data) { + grn_tokenizer_kytea * const tokenizer = + static_cast<grn_tokenizer_kytea *>(user_data->ptr); + if (tokenizer) { + grn_tokenizer_kytea_fin(ctx, tokenizer); + GRN_PLUGIN_FREE(ctx, tokenizer); + } + return NULL; +} + +} // namespace + +extern "C" { + +/* + GRN_PLUGIN_INIT() is called to initialize this plugin. Note that an error + code must be set in `ctx->rc' on failure. + */ +grn_rc GRN_PLUGIN_INIT(grn_ctx *ctx) { + kytea_init(ctx); + return ctx->rc; +} + +/* + GRN_PLUGIN_REGISTER() registers this plugin to the database associated with + `ctx'. 
The registration requires the plugin name and the functions to be + called for tokenization. + */ +grn_rc GRN_PLUGIN_REGISTER(grn_ctx *ctx) { + return grn_tokenizer_register(ctx, "TokenKytea", 10, grn_kytea_init, + grn_kytea_next, grn_kytea_fin); +} + +/* + GRN_PLUGIN_FIN() is called to finalize the plugin that was initialized by + GRN_PLUGIN_INIT(). + */ +grn_rc GRN_PLUGIN_FIN(grn_ctx *ctx) { + kytea_fin(ctx); + return GRN_SUCCESS; +} + +} // extern "C" diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am new file mode 100644 index 00000000000..182f38577e3 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am @@ -0,0 +1,2 @@ +kytea_la_SOURCES = \ + kytea.cpp diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c new file mode 100644 index 00000000000..4ac99f9a9e8 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c @@ -0,0 +1,338 @@ +/* -*- c-basic-offset: 2 -*- */ +/* Copyright(C) 2009-2012 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <str.h> + +#include <groonga.h> +#include <groonga/tokenizer.h> + +#include <mecab.h> + +#include <string.h> +#include <ctype.h> + +static mecab_t *sole_mecab = NULL; +static grn_plugin_mutex *sole_mecab_mutex = NULL; +static grn_encoding sole_mecab_encoding = GRN_ENC_NONE; + +typedef struct { + mecab_t *mecab; + grn_obj buf; + const char *next; + const char *end; + grn_tokenizer_query *query; + grn_tokenizer_token token; +} grn_mecab_tokenizer; + +static grn_encoding +translate_mecab_charset_to_grn_encoding(const char *charset) +{ + if (strcasecmp(charset, "euc-jp") == 0) { + return GRN_ENC_EUC_JP; + } else if (strcasecmp(charset, "utf-8") == 0 || + strcasecmp(charset, "utf8") == 0) { + return GRN_ENC_UTF8; + } else if (strcasecmp(charset, "shift_jis") == 0 || + strcasecmp(charset, "shift-jis") == 0 || + strcasecmp(charset, "sjis") == 0) { + return GRN_ENC_SJIS; + } + return GRN_ENC_NONE; +} + +static grn_encoding +get_mecab_encoding(mecab_t *mecab) +{ + grn_encoding encoding = GRN_ENC_NONE; + const mecab_dictionary_info_t *dictionary_info; + dictionary_info = mecab_dictionary_info(mecab); + if (dictionary_info) { + const char *charset = dictionary_info->charset; + encoding = translate_mecab_charset_to_grn_encoding(charset); + } + return encoding; +} + +/* + This function is called for a full text search query or a document to be + indexed. This means that both short/long strings are given. + The return value of this function is ignored. When an error occurs in this + function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS). 
+ */ +static grn_obj * +mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + const char *s; + grn_mecab_tokenizer *tokenizer; + unsigned int normalizer_flags = 0; + grn_tokenizer_query *query; + grn_obj *normalized_query; + const char *normalized_string; + unsigned int normalized_string_length; + + query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags); + if (!query) { + return NULL; + } + if (!sole_mecab) { + grn_plugin_mutex_lock(ctx, sole_mecab_mutex); + if (!sole_mecab) { + sole_mecab = mecab_new2("-Owakati"); + if (!sole_mecab) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "mecab_new2() failed on mecab_init(): %s", + mecab_strerror(NULL)); + } else { + sole_mecab_encoding = get_mecab_encoding(sole_mecab); + } + } + grn_plugin_mutex_unlock(ctx, sole_mecab_mutex); + } + if (!sole_mecab) { + grn_tokenizer_query_close(ctx, query); + return NULL; + } + + if (query->encoding != sole_mecab_encoding) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "MeCab dictionary charset (%s) does not match " + "the table encoding: <%s>", + grn_encoding_to_string(sole_mecab_encoding), + grn_encoding_to_string(query->encoding)); + return NULL; + } + + if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][mecab] " + "memory allocation to grn_mecab_tokenizer failed"); + return NULL; + } + tokenizer->mecab = sole_mecab; + tokenizer->query = query; + + normalized_query = query->normalized_query; + grn_string_get_normalized(ctx, + normalized_query, + &normalized_string, + &normalized_string_length, + NULL); + GRN_TEXT_INIT(&(tokenizer->buf), 0); + if (query->have_tokenized_delimiter) { + tokenizer->next = normalized_string; + tokenizer->end = tokenizer->next + normalized_string_length; + } else if (normalized_string_length == 0) { 
+ tokenizer->next = ""; + tokenizer->end = tokenizer->next; + } else { + grn_plugin_mutex_lock(ctx, sole_mecab_mutex); + s = mecab_sparse_tostr2(tokenizer->mecab, + normalized_string, + normalized_string_length); + if (!s) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "mecab_sparse_tostr() failed len=%d err=%s", + normalized_string_length, + mecab_strerror(tokenizer->mecab)); + } else { + GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s); + } + grn_plugin_mutex_unlock(ctx, sole_mecab_mutex); + if (!s) { + grn_tokenizer_query_close(ctx, tokenizer->query); + GRN_PLUGIN_FREE(ctx, tokenizer); + return NULL; + } + { + char *buf, *p; + unsigned int bufsize; + + buf = GRN_TEXT_VALUE(&(tokenizer->buf)); + bufsize = GRN_TEXT_LEN(&(tokenizer->buf)); + /* A certain version of mecab returns trailing lf or spaces. */ + for (p = buf + bufsize - 2; + buf <= p && isspace(*(unsigned char *)p); + p--) { *p = '\0'; } + tokenizer->next = buf; + tokenizer->end = p + 1; + } + } + user_data->ptr = tokenizer; + + grn_tokenizer_token_init(ctx, &(tokenizer->token)); + + return NULL; +} + +/* + This function returns tokens one by one. 
+ */ +static grn_obj * +mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + /* grn_obj *table = args[0]; */ + grn_mecab_tokenizer *tokenizer = user_data->ptr; + grn_encoding encoding = tokenizer->query->encoding; + + if (tokenizer->query->have_tokenized_delimiter) { + tokenizer->next = + grn_tokenizer_tokenized_delimiter_next(ctx, + &(tokenizer->token), + tokenizer->next, + tokenizer->end - tokenizer->next, + encoding); + } else { + size_t cl; + const char *p = tokenizer->next, *r; + const char *e = tokenizer->end; + grn_tokenizer_status status; + + for (r = p; r < e; r += cl) { + if (!(cl = grn_charlen_(ctx, r, e, encoding))) { + tokenizer->next = e; + break; + } + if (grn_isspace(r, encoding)) { + const char *q = r; + while ((cl = grn_isspace(q, encoding))) { q += cl; } + tokenizer->next = q; + break; + } + } + + if (r == e) { + status = GRN_TOKENIZER_LAST; + } else { + status = GRN_TOKENIZER_CONTINUE; + } + grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status); + } + + return NULL; +} + +/* + This function finalizes a tokenization. 
+ */ +static grn_obj * +mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_mecab_tokenizer *tokenizer = user_data->ptr; + if (!tokenizer) { + return NULL; + } + grn_tokenizer_token_fin(ctx, &(tokenizer->token)); + grn_tokenizer_query_close(ctx, tokenizer->query); + grn_obj_unlink(ctx, &(tokenizer->buf)); + GRN_PLUGIN_FREE(ctx, tokenizer); + return NULL; +} + +static void +check_mecab_dictionary_encoding(grn_ctx *ctx) +{ +#ifdef HAVE_MECAB_DICTIONARY_INFO_T + mecab_t *mecab; + + mecab = mecab_new2("-Owakati"); + if (mecab) { + grn_encoding encoding; + int have_same_encoding_dictionary = 0; + + encoding = GRN_CTX_GET_ENCODING(ctx); + have_same_encoding_dictionary = encoding == get_mecab_encoding(mecab); + mecab_destroy(mecab); + + if (!have_same_encoding_dictionary) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "MeCab has no dictionary that uses the context encoding" + ": <%s>", + grn_encoding_to_string(encoding)); + } + } else { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "mecab_new2 failed in check_mecab_dictionary_encoding: %s", + mecab_strerror(NULL)); + } +#endif +} + +/* + This function initializes a plugin. This function fails if there is no + dictionary that uses the context encoding of groonga. + */ +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + sole_mecab = NULL; + sole_mecab_mutex = grn_plugin_mutex_open(ctx); + if (!sole_mecab_mutex) { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][mecab] grn_plugin_mutex_open() failed"); + return ctx->rc; + } + + check_mecab_dictionary_encoding(ctx); + return ctx->rc; +} + +/* + This function registers a plugin to a database. + */ +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc; + + rc = grn_tokenizer_register(ctx, "TokenMecab", 10, + mecab_init, mecab_next, mecab_fin); + if (rc == GRN_SUCCESS) { + grn_obj *token_mecab; + token_mecab = grn_ctx_get(ctx, "TokenMecab", 10); + /* Just for backward compatibility. 
TokenMecab was built-in not plugin. */ + if (token_mecab && grn_obj_id(ctx, token_mecab) != GRN_DB_MECAB) { + rc = GRN_FILE_CORRUPT; + } + } + + return rc; +} + +/* + This function finalizes a plugin. + */ +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + if (sole_mecab) { + mecab_destroy(sole_mecab); + sole_mecab = NULL; + } + if (sole_mecab_mutex) { + grn_plugin_mutex_close(ctx, sole_mecab_mutex); + sole_mecab_mutex = NULL; + } + + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am new file mode 100644 index 00000000000..56912727c3d --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am @@ -0,0 +1,2 @@ +mecab_la_SOURCES = \ + mecab.c |