summary | refs | log | tree | commit | diff
path: root/storage/mroonga/vendor/groonga/lib/token_cursor.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/mroonga/vendor/groonga/lib/token_cursor.c')
-rw-r--r-- storage/mroonga/vendor/groonga/lib/token_cursor.c 359
1 files changed, 359 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/lib/token_cursor.c b/storage/mroonga/vendor/groonga/lib/token_cursor.c
new file mode 100644
index 00000000000..d98d1d46342
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/lib/token_cursor.c
@@ -0,0 +1,359 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2009-2014 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include "grn_token_cursor.h"
+#include "grn_string.h"
+#include "grn_pat.h"
+#include "grn_dat.h"
+
+/*
+ * Calls the init callback of every token filter attached to the cursor,
+ * passing the cursor's table and mode, and stores each callback's return
+ * value in the user_data of that filter's proc.
+ * Nothing happens when the cursor has no token filter list.
+ */
+static void
+grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx,
+                                               grn_token_cursor *token_cursor)
+{
+  grn_obj *filters = token_cursor->token_filters;
+  unsigned int n_filters;
+  unsigned int nth;
+
+  if (!filters) {
+    return;
+  }
+
+  /* The filter list is read as a packed array of grn_obj pointers. */
+  n_filters = GRN_BULK_VSIZE(filters) / sizeof(grn_obj *);
+  for (nth = 0; nth < n_filters; nth++) {
+    grn_proc *filter = (grn_proc *)GRN_PTR_VALUE_AT(filters, nth);
+
+    filter->user_data =
+      filter->callbacks.token_filter.init(ctx,
+                                          token_cursor->table,
+                                          token_cursor->mode);
+  }
+}
+
+/*
+ * Creates a token cursor over str (str_len bytes) for table.
+ *
+ * The encoding, tokenizer, normalizer and token filters come from
+ * grn_table_get_info(). When the table has a tokenizer, its PROC_INIT
+ * function is called with mode, the text and flags pushed onto ctx (in
+ * that order). Otherwise the text is opened with grn_string_open_() and
+ * the normalized result becomes the cursor's current token. The init
+ * callback of every token filter is called afterwards.
+ *
+ * Returns the new cursor, or NULL when grn_table_get_info() returns
+ * non-zero, when the allocation fails, or when ctx->rc is set at the end
+ * (the cursor is closed before returning in that last case).
+ */
+grn_token_cursor *
+grn_token_cursor_open(grn_ctx *ctx, grn_obj *table,
+                      const char *str, size_t str_len,
+                      grn_tokenize_mode mode, unsigned int flags)
+{
+  grn_token_cursor *cursor;
+  grn_obj_flags table_flags;
+  grn_encoding encoding;
+  grn_obj *tokenizer;
+  grn_obj *normalizer;
+  grn_obj *token_filters;
+
+  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
+                         &normalizer, &token_filters)) {
+    return NULL;
+  }
+
+  cursor = GRN_MALLOC(sizeof(grn_token_cursor));
+  if (!cursor) {
+    return NULL;
+  }
+
+  /* What is tokenized, and with what. */
+  cursor->table = table;
+  cursor->mode = mode;
+  cursor->encoding = encoding;
+  cursor->tokenizer = tokenizer;
+  cursor->token_filters = token_filters;
+  cursor->orig = (const unsigned char *)str;
+  cursor->orig_blen = str_len;
+
+  /* Iteration state: no token has been produced yet. */
+  cursor->curr = NULL;
+  cursor->nstr = NULL;
+  cursor->curr_size = 0;
+  cursor->pos = -1;
+  cursor->status = GRN_TOKEN_CURSOR_DOING;
+  cursor->force_prefix = GRN_FALSE;
+
+  if (!tokenizer) {
+    /* No tokenizer: the whole normalized text is the current token. */
+    cursor->nstr = grn_string_open_(ctx, str, str_len,
+                                    normalizer,
+                                    0,
+                                    cursor->encoding);
+    if (!cursor->nstr) {
+      ERR(GRN_TOKENIZER_ERROR,
+          "[token-cursor][open] failed to grn_string_open()");
+    } else {
+      const char *normalized;
+      grn_string_get_normalized(ctx, cursor->nstr,
+                                &normalized, &(cursor->curr_size), NULL);
+      cursor->curr = (const unsigned char *)normalized;
+    }
+  } else {
+    grn_proc *tokenizer_proc = (grn_proc *)tokenizer;
+    grn_obj text_arg, flags_arg, mode_arg;
+
+    GRN_TEXT_INIT(&text_arg, GRN_OBJ_DO_SHALLOW_COPY);
+    GRN_TEXT_SET_REF(&text_arg, str, str_len);
+    GRN_UINT32_INIT(&flags_arg, 0);
+    GRN_UINT32_SET(ctx, &flags_arg, flags);
+    GRN_UINT32_INIT(&mode_arg, 0);
+    GRN_UINT32_SET(ctx, &mode_arg, mode);
+
+    cursor->pctx.caller = NULL;
+    cursor->pctx.user_data.ptr = NULL;
+    cursor->pctx.proc = tokenizer_proc;
+    cursor->pctx.hooks = NULL;
+    cursor->pctx.currh = NULL;
+    cursor->pctx.phase = PROC_INIT;
+
+    /* Arguments are pushed as mode, text, flags and released in reverse. */
+    grn_ctx_push(ctx, &mode_arg);
+    grn_ctx_push(ctx, &text_arg);
+    grn_ctx_push(ctx, &flags_arg);
+    tokenizer_proc->funcs[PROC_INIT](ctx, 1, &table, &cursor->pctx.user_data);
+    grn_obj_close(ctx, &flags_arg);
+    grn_obj_close(ctx, &text_arg);
+    grn_obj_close(ctx, &mode_arg);
+  }
+
+  grn_token_cursor_open_initialize_token_filters(ctx, cursor);
+
+  /* Any error recorded in ctx invalidates the cursor. */
+  if (ctx->rc) {
+    grn_token_cursor_close(ctx, cursor);
+    cursor = NULL;
+  }
+  return cursor;
+}
+
+/*
+ * Runs one tokenizer result through the cursor's token filters.
+ *
+ * current_token_data holds the token text and status holds the token
+ * status produced by the tokenizer. Each filter's filter callback receives
+ * the current token and writes the next one; the next token then becomes
+ * the current token for the following filter. Filtering stops as soon as
+ * the current status has GRN_TOKEN_SKIP or GRN_TOKEN_SKIP_WITH_POSITION set.
+ *
+ * On return token_cursor->curr and token_cursor->curr_size describe the
+ * resulting token text, and the resulting status is returned.
+ */
+static int
+grn_token_cursor_next_apply_token_filters(grn_ctx *ctx,
+                                          grn_token_cursor *token_cursor,
+                                          grn_obj *current_token_data,
+                                          grn_obj *status)
+{
+  grn_obj *filters = token_cursor->token_filters;
+  unsigned int n_filters = 0;
+  unsigned int nth;
+  grn_token input;
+  grn_token output;
+
+  if (filters) {
+    n_filters = GRN_BULK_VSIZE(filters) / sizeof(grn_obj *);
+  }
+
+  /* Both tokens start out describing the tokenizer's text and status. */
+  GRN_TEXT_INIT(&(input.data), GRN_OBJ_DO_SHALLOW_COPY);
+  GRN_TEXT_SET(ctx, &(input.data),
+               GRN_TEXT_VALUE(current_token_data),
+               GRN_TEXT_LEN(current_token_data));
+  input.status = GRN_INT32_VALUE(status);
+
+  GRN_TEXT_INIT(&(output.data), GRN_OBJ_DO_SHALLOW_COPY);
+  GRN_TEXT_SET(ctx, &(output.data),
+               GRN_TEXT_VALUE(&(input.data)),
+               GRN_TEXT_LEN(&(input.data)));
+  output.status = input.status;
+
+#define SKIP_FLAGS (GRN_TOKEN_SKIP | GRN_TOKEN_SKIP_WITH_POSITION)
+  for (nth = 0; nth < n_filters && !(input.status & SKIP_FLAGS); nth++) {
+    grn_proc *filter = (grn_proc *)GRN_PTR_VALUE_AT(filters, nth);
+
+    filter->callbacks.token_filter.filter(ctx,
+                                          &input,
+                                          &output,
+                                          filter->user_data);
+
+    /* The output of this filter is the input of the next one. */
+    GRN_TEXT_SET(ctx, &(input.data),
+                 GRN_TEXT_VALUE(&(output.data)),
+                 GRN_TEXT_LEN(&(output.data)));
+    input.status = output.status;
+  }
+#undef SKIP_FLAGS
+
+  token_cursor->curr = (const unsigned char *)GRN_TEXT_VALUE(&(input.data));
+  token_cursor->curr_size = GRN_TEXT_LEN(&(input.data));
+
+  return input.status;
+}
+/*
+ * Advances token_cursor by one token and returns that token's ID in
+ * token_cursor->table.
+ *
+ * With a tokenizer, its PROC_NEXT function is called, the token status and
+ * the token text are popped from ctx, and the token filters are applied.
+ * Tokens flagged as skipped, empty tokens, tokens longer than
+ * GRN_TABLE_MAX_KEY_SIZE and, in GRN_TOKENIZE_GET mode, tokens that are
+ * both unmatured and overlapping are passed over and the next token is
+ * tried. Without a tokenizer, token_cursor->curr is used as it stands and
+ * the cursor is marked GRN_TOKEN_CURSOR_DONE.
+ *
+ * In GRN_TOKENIZE_ADD mode the token is added to a pat/dat/hash table
+ * between grn_io_lock() and grn_io_unlock(); in every other mode it is only
+ * looked up. For GRN_TABLE_NO_KEY the token bytes themselves are read as a
+ * grn_id when their size equals sizeof(grn_id).
+ *
+ * Returns the token ID, or GRN_ID_NIL when no ID was obtained. On return
+ * token_cursor->status may have become GRN_TOKEN_CURSOR_DONE,
+ * GRN_TOKEN_CURSOR_DONE_SKIP (the final token was skipped) or
+ * GRN_TOKEN_CURSOR_NOT_FOUND (GRN_ID_NIL was obtained before the cursor
+ * was done). token_cursor->pos and token_cursor->force_prefix are updated
+ * as tokens are consumed.
+ */
+grn_id
+grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor)
+{
+ int status;
+ grn_id tid = GRN_ID_NIL;
+ grn_obj *table = token_cursor->table;
+ grn_obj *tokenizer = token_cursor->tokenizer;
+ while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) { /* one pass per token; "continue" moves on to the next token */
+ if (tokenizer) {
+ grn_obj *curr_, *stat_;
+ ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table, &token_cursor->pctx.user_data);
+ stat_ = grn_ctx_pop(ctx); /* popped first: the token status */
+ curr_ = grn_ctx_pop(ctx); /* popped second: the token text */
+ status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor,
+ curr_, stat_);
+ token_cursor->status =
+ ((status & GRN_TOKEN_LAST) ||
+ (token_cursor->mode == GRN_TOKENIZE_GET &&
+ (status & GRN_TOKEN_REACH_END)))
+ ? GRN_TOKEN_CURSOR_DONE : GRN_TOKEN_CURSOR_DOING; /* done on GRN_TOKEN_LAST, or in GET mode on GRN_TOKEN_REACH_END */
+ token_cursor->force_prefix = GRN_FALSE;
+#define SKIP_FLAGS \
+ (GRN_TOKEN_SKIP | GRN_TOKEN_SKIP_WITH_POSITION)
+ if (status & SKIP_FLAGS) {
+ if (status & GRN_TOKEN_SKIP) { /* only GRN_TOKEN_SKIP advances pos here */
+ token_cursor->pos++;
+ }
+ if (token_cursor->status == GRN_TOKEN_CURSOR_DONE && tid == GRN_ID_NIL) { /* the final token was skipped and there is no ID to return */
+ token_cursor->status = GRN_TOKEN_CURSOR_DONE_SKIP;
+ break;
+ } else {
+ continue;
+ }
+ }
+#undef SKIP_FLAGS
+ if (status & GRN_TOKEN_FORCE_PREFIX) {
+ token_cursor->force_prefix = GRN_TRUE;
+ }
+ if (token_cursor->curr_size == 0) { /* empty token: ignored, with a warning unless the cursor is already done */
+ if (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
+ char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
+ int tokenizer_name_length;
+ tokenizer_name_length =
+ grn_obj_name(ctx, token_cursor->tokenizer,
+ tokenizer_name, GRN_TABLE_MAX_KEY_SIZE);
+ GRN_LOG(ctx, GRN_WARN,
+ "[token_next] ignore an empty token: <%.*s>: <%.*s>",
+ tokenizer_name_length, tokenizer_name,
+ token_cursor->orig_blen, token_cursor->orig);
+ }
+ continue;
+ }
+ if (token_cursor->curr_size > GRN_TABLE_MAX_KEY_SIZE) { /* too long for a table key: ignored with a warning */
+ GRN_LOG(ctx, GRN_WARN,
+ "[token_next] ignore too long token. "
+ "Token must be less than or equal to %d: <%d>(<%.*s>)",
+ GRN_TABLE_MAX_KEY_SIZE,
+ token_cursor->curr_size,
+ token_cursor->curr_size, token_cursor->curr);
+ continue;
+ }
+ if (status & GRN_TOKEN_UNMATURED) {
+ if (status & GRN_TOKEN_OVERLAP) {
+ if (token_cursor->mode == GRN_TOKENIZE_GET) { /* GET mode: pass over this token but still count its position */
+ token_cursor->pos++;
+ continue;
+ }
+ } else {
+ if (status & GRN_TOKEN_LAST) { /* unmatured, non-overlapping last token: flag it for prefix handling */
+ token_cursor->force_prefix = GRN_TRUE;
+ }
+ }
+ }
+ } else { /* no tokenizer: token_cursor->curr is used as it stands, and only once */
+ token_cursor->status = GRN_TOKEN_CURSOR_DONE;
+ }
+ if (token_cursor->mode == GRN_TOKENIZE_ADD) { /* ADD mode: register the token in the table */
+ switch (table->header.type) {
+ case GRN_TABLE_PAT_KEY :
+ if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) { /* non-zero from grn_io_lock(): the token is not added */
+ tid = GRN_ID_NIL;
+ } else {
+ tid = grn_pat_add(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size,
+ NULL, NULL);
+ grn_io_unlock(((grn_pat *)table)->io);
+ }
+ break;
+ case GRN_TABLE_DAT_KEY :
+ if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) {
+ tid = GRN_ID_NIL;
+ } else {
+ tid = grn_dat_add(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size,
+ NULL, NULL);
+ grn_io_unlock(((grn_dat *)table)->io);
+ }
+ break;
+ case GRN_TABLE_HASH_KEY :
+ if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) {
+ tid = GRN_ID_NIL;
+ } else {
+ tid = grn_hash_add(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size,
+ NULL, NULL);
+ grn_io_unlock(((grn_hash *)table)->io);
+ }
+ break;
+ case GRN_TABLE_NO_KEY :
+ if (token_cursor->curr_size == sizeof(grn_id)) { /* the token bytes are themselves the ID */
+ tid = *((grn_id *)token_cursor->curr);
+ } else {
+ tid = GRN_ID_NIL;
+ }
+ break;
+ }
+ } else { /* any other mode: look the token up without adding it */
+ switch (table->header.type) {
+ case GRN_TABLE_PAT_KEY :
+ tid = grn_pat_get(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, NULL);
+ break;
+ case GRN_TABLE_DAT_KEY :
+ tid = grn_dat_get(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, NULL);
+ break;
+ case GRN_TABLE_HASH_KEY :
+ tid = grn_hash_get(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, NULL);
+ break;
+ case GRN_TABLE_NO_KEY :
+ if (token_cursor->curr_size == sizeof(grn_id)) { /* the token bytes are themselves the ID */
+ tid = *((grn_id *)token_cursor->curr);
+ } else {
+ tid = GRN_ID_NIL;
+ }
+ break;
+ }
+ }
+ if (tid == GRN_ID_NIL && token_cursor->status != GRN_TOKEN_CURSOR_DONE) { /* no ID although tokens remain */
+ token_cursor->status = GRN_TOKEN_CURSOR_NOT_FOUND;
+ }
+ token_cursor->pos++;
+ break; /* one token per call */
+ }
+ return tid;
+}
+
+/*
+ * Calls the fin callback of every token filter attached to the cursor,
+ * handing each one the user_data stored in its proc.
+ * Nothing happens when the cursor has no token filter list.
+ */
+static void
+grn_token_cursor_close_token_filters(grn_ctx *ctx,
+                                     grn_token_cursor *token_cursor)
+{
+  grn_obj *filters = token_cursor->token_filters;
+  unsigned int n_filters;
+  unsigned int nth;
+
+  if (!filters) {
+    return;
+  }
+
+  /* The filter list is read as a packed array of grn_obj pointers. */
+  n_filters = GRN_BULK_VSIZE(filters) / sizeof(grn_obj *);
+  for (nth = 0; nth < n_filters; nth++) {
+    grn_proc *filter = (grn_proc *)GRN_PTR_VALUE_AT(filters, nth);
+
+    filter->callbacks.token_filter.fin(ctx, filter->user_data);
+  }
+}
+
+/*
+ * Finalizes and frees a token cursor.
+ *
+ * Calls the tokenizer's PROC_FIN function when the cursor has a tokenizer,
+ * then the fin callback of every token filter, then closes the normalized
+ * string if one was opened, and finally frees the cursor itself.
+ *
+ * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when token_cursor is NULL.
+ */
+grn_rc
+grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor)
+{
+  grn_obj *tokenizer;
+
+  if (!token_cursor) {
+    return GRN_INVALID_ARGUMENT;
+  }
+
+  tokenizer = token_cursor->tokenizer;
+  if (tokenizer) {
+    ((grn_proc *)tokenizer)->funcs[PROC_FIN](ctx, 1, &token_cursor->table,
+                                             &token_cursor->pctx.user_data);
+  }
+
+  grn_token_cursor_close_token_filters(ctx, token_cursor);
+
+  if (token_cursor->nstr) {
+    grn_obj_close(ctx, token_cursor->nstr);
+  }
+
+  GRN_FREE(token_cursor);
+  return GRN_SUCCESS;
+}