/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2009-2017 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
*/

#include "grn_token_cursor.h"
#include "grn_string.h"
#include "grn_pat.h"
#include "grn_dat.h"

/*
 * Prepares the per-filter private data of a token cursor.
 *
 * One `void *` slot is allocated per token filter stored in
 * token_cursor->token_filter.objects, and each slot receives the value
 * returned by that filter's init callback.
 *
 * token_cursor->token_filter.data stays NULL when there is no token filter
 * or when the allocation fails.
 */
static void
grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx,
                                               grn_token_cursor *token_cursor)
{
  grn_obj *token_filters = token_cursor->token_filter.objects;
  unsigned int i, n_token_filters;

  token_cursor->token_filter.data = NULL;

  if (token_filters) {
    n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
  } else {
    n_token_filters = 0;
  }

  if (n_token_filters == 0) {
    return;
  }

  token_cursor->token_filter.data =
    GRN_CALLOC(sizeof(void *) * n_token_filters);
  if (!token_cursor->token_filter.data) {
    return;
  }

  for (i = 0; i < n_token_filters; i++) {
    grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
    grn_proc *token_filter = (grn_proc *)token_filter_object;

    token_cursor->token_filter.data[i] =
      token_filter->callbacks.token_filter.init(ctx,
                                                token_cursor->table,
                                                token_cursor->mode);
  }
}

/*
 * Opens a token cursor over `str` (`str_len` bytes) for `table`.
 *
 * The encoding, tokenizer, normalizer and token filters are taken from
 * `table` via grn_table_get_info().
 *
 * - With a tokenizer: `mode`, `str` and `flags` are pushed on the context
 *   stack (in that order) and the tokenizer's PROC_INIT function is called.
 * - Without a tokenizer: the whole input is normalized once with
 *   grn_string_open_() and the normalized text becomes the current token.
 *
 * Returns the new cursor, or NULL when the table information cannot be
 * obtained, the cursor cannot be allocated, or ctx->rc is not GRN_SUCCESS
 * after initialization (the partially built cursor is closed in that case).
 */
grn_token_cursor *
grn_token_cursor_open(grn_ctx *ctx, grn_obj *table,
                      const char *str, size_t str_len,
                      grn_tokenize_mode mode, unsigned int flags)
{
  grn_token_cursor *token_cursor;
  grn_encoding encoding;
  grn_obj *tokenizer;
  grn_obj *normalizer;
  grn_obj *token_filters;
  grn_table_flags table_flags;

  if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer,
                         &normalizer, &token_filters)) {
    return NULL;
  }
  if (!(token_cursor = GRN_MALLOC(sizeof(grn_token_cursor)))) {
    return NULL;
  }

  token_cursor->table = table;
  token_cursor->mode = mode;
  token_cursor->encoding = encoding;
  token_cursor->tokenizer = tokenizer;
  token_cursor->token_filter.objects = token_filters;
  token_cursor->token_filter.data = NULL;
  token_cursor->orig = (const unsigned char *)str;
  token_cursor->orig_blen = str_len;
  token_cursor->curr = NULL;
  token_cursor->nstr = NULL;
  token_cursor->curr_size = 0;
  token_cursor->pos = -1;
  token_cursor->status = GRN_TOKEN_CURSOR_DOING;
  token_cursor->force_prefix = GRN_FALSE;

  if (tokenizer) {
    grn_obj str_, flags_, mode_;

    GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
    GRN_TEXT_SET_REF(&str_, str, str_len);
    GRN_UINT32_INIT(&flags_, 0);
    GRN_UINT32_SET(ctx, &flags_, flags);
    GRN_UINT32_INIT(&mode_, 0);
    GRN_UINT32_SET(ctx, &mode_, mode);

    token_cursor->pctx.caller = NULL;
    token_cursor->pctx.user_data.ptr = NULL;
    token_cursor->pctx.proc = (grn_proc *)tokenizer;
    token_cursor->pctx.hooks = NULL;
    token_cursor->pctx.currh = NULL;
    token_cursor->pctx.phase = PROC_INIT;

    /* The tokenizer's init function receives its arguments on the stack. */
    grn_ctx_push(ctx, &mode_);
    grn_ctx_push(ctx, &str_);
    grn_ctx_push(ctx, &flags_);
    ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table,
                                              &token_cursor->pctx.user_data);

    grn_obj_close(ctx, &flags_);
    grn_obj_close(ctx, &str_);
    grn_obj_close(ctx, &mode_);
  } else {
    int nflags = 0;

    token_cursor->nstr = grn_string_open_(ctx, str, str_len,
                                          normalizer,
                                          nflags,
                                          token_cursor->encoding);
    if (token_cursor->nstr) {
      const char *normalized;

      grn_string_get_normalized(ctx, token_cursor->nstr,
                                &normalized, &(token_cursor->curr_size), NULL);
      token_cursor->curr = (const unsigned char *)normalized;
    } else {
      ERR(GRN_TOKENIZER_ERROR,
          "[token-cursor][open] failed to grn_string_open()");
    }
  }

  if (ctx->rc == GRN_SUCCESS) {
    grn_token_cursor_open_initialize_token_filters(ctx, token_cursor);
  }

  if (ctx->rc) {
    grn_token_cursor_close(ctx, token_cursor);
    token_cursor = NULL;
  }

  return token_cursor;
}

/*
 * Runs the token produced by the tokenizer through every token filter of
 * the cursor, in order.
 *
 * `current_token_data` and `status` are the token text and the token status
 * produced by the tokenizer. Filtering stops before the next filter as soon
 * as the current status carries GRN_TOKEN_SKIP or
 * GRN_TOKEN_SKIP_WITH_POSITION.
 *
 * On return token_cursor->curr / token_cursor->curr_size describe the
 * resulting token text. The resulting status is returned.
 */
static int
grn_token_cursor_next_apply_token_filters(grn_ctx *ctx,
                                          grn_token_cursor *token_cursor,
                                          grn_obj *current_token_data,
                                          grn_obj *status)
{
  grn_obj *token_filters = token_cursor->token_filter.objects;
  unsigned int i, n_token_filters;
  grn_token current_token;
  grn_token next_token;

  if (token_filters) {
    n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
  } else {
    n_token_filters = 0;
  }

  GRN_TEXT_INIT(&(current_token.data), GRN_OBJ_DO_SHALLOW_COPY);
  GRN_TEXT_SET(ctx, &(current_token.data),
               GRN_TEXT_VALUE(current_token_data),
               GRN_TEXT_LEN(current_token_data));
  current_token.status = GRN_INT32_VALUE(status);

  /* next_token starts out as a copy of current_token. */
  GRN_TEXT_INIT(&(next_token.data), GRN_OBJ_DO_SHALLOW_COPY);
  GRN_TEXT_SET(ctx, &(next_token.data),
               GRN_TEXT_VALUE(&(current_token.data)),
               GRN_TEXT_LEN(&(current_token.data)));
  next_token.status = current_token.status;

  for (i = 0; i < n_token_filters; i++) {
    grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
    grn_proc *token_filter = (grn_proc *)token_filter_object;
    void *data = token_cursor->token_filter.data[i];

#define SKIP_FLAGS\
    (GRN_TOKEN_SKIP |\
     GRN_TOKEN_SKIP_WITH_POSITION)
    if (current_token.status & SKIP_FLAGS) {
      break;
    }
#undef SKIP_FLAGS

    token_filter->callbacks.token_filter.filter(ctx,
                                                &current_token,
                                                &next_token,
                                                data);

    /* The output of this filter is the input of the next one. */
    GRN_TEXT_SET(ctx, &(current_token.data),
                 GRN_TEXT_VALUE(&(next_token.data)),
                 GRN_TEXT_LEN(&(next_token.data)));
    current_token.status = next_token.status;
  }

  token_cursor->curr =
    (const unsigned char *)GRN_TEXT_VALUE(&(current_token.data));
  token_cursor->curr_size = GRN_TEXT_LEN(&(current_token.data));

  return current_token.status;
}

/*
 * Advances the cursor to the next usable token and returns its ID.
 *
 * With a tokenizer, PROC_NEXT is called repeatedly; tokens that are
 * skipped, empty, or longer than GRN_TABLE_MAX_KEY_SIZE are ignored.
 * Without a tokenizer the normalized input is used as a single token and
 * the cursor is marked GRN_TOKEN_CURSOR_DONE.
 *
 * The token is then resolved against the table according to the mode:
 *   - GRN_TOKENIZE_ADD:  the token is added to the table (under the table's
 *                        IO lock for pat/dat/hash tables).
 *   - GRN_TOKENIZE_ONLY: no table access; GRN_ID_NIL is returned.
 *   - otherwise:         the token is looked up in the table.
 * For GRN_TABLE_NO_KEY tables the token bytes themselves are read as a
 * grn_id when they are exactly sizeof(grn_id) bytes long.
 *
 * Returns the token ID, or GRN_ID_NIL. token_cursor->status tells apart
 * "done", "done because the last token was skipped" and "not found".
 */
grn_id
grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor)
{
  int status;
  grn_id tid = GRN_ID_NIL;
  grn_obj *table = token_cursor->table;
  grn_obj *tokenizer = token_cursor->tokenizer;

  while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
    if (tokenizer) {
      grn_obj *curr_, *stat_;

      ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table,
                                                &token_cursor->pctx.user_data);
      /* The status is popped first, then the token text. */
      stat_ = grn_ctx_pop(ctx);
      curr_ = grn_ctx_pop(ctx);
      status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor,
                                                         curr_, stat_);
      token_cursor->status =
        ((status & GRN_TOKEN_LAST) ||
         (token_cursor->mode == GRN_TOKENIZE_GET &&
          (status & GRN_TOKEN_REACH_END)))
        ? GRN_TOKEN_CURSOR_DONE : GRN_TOKEN_CURSOR_DOING;
      token_cursor->force_prefix = GRN_FALSE;

#define SKIP_FLAGS \
      (GRN_TOKEN_SKIP | GRN_TOKEN_SKIP_WITH_POSITION)
      if (status & SKIP_FLAGS) {
        /* Only GRN_TOKEN_SKIP advances the position here. */
        if (status & GRN_TOKEN_SKIP) {
          token_cursor->pos++;
        }
        if (token_cursor->status == GRN_TOKEN_CURSOR_DONE &&
            tid == GRN_ID_NIL) {
          token_cursor->status = GRN_TOKEN_CURSOR_DONE_SKIP;
          break;
        } else {
          continue;
        }
      }
#undef SKIP_FLAGS

      if (status & GRN_TOKEN_FORCE_PREFIX) {
        token_cursor->force_prefix = GRN_TRUE;
      }

      if (token_cursor->curr_size == 0) {
        if (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
          char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
          int tokenizer_name_length;

          tokenizer_name_length =
            grn_obj_name(ctx, token_cursor->tokenizer,
                         tokenizer_name, GRN_TABLE_MAX_KEY_SIZE);
          /* The precision argument of "%.*s" must be an int. */
          GRN_LOG(ctx, GRN_WARN,
                  "[token_next] ignore an empty token: <%.*s>: <%.*s>",
                  tokenizer_name_length, tokenizer_name,
                  (int)token_cursor->orig_blen, token_cursor->orig);
        }
        continue;
      }

      if (token_cursor->curr_size > GRN_TABLE_MAX_KEY_SIZE) {
        /* The precision argument of "%.*s" must be an int. */
        GRN_LOG(ctx, GRN_WARN,
                "[token_next] ignore too long token. "
                "Token must be less than or equal to %d: <%d>(<%.*s>)",
                GRN_TABLE_MAX_KEY_SIZE,
                token_cursor->curr_size,
                (int)token_cursor->curr_size, token_cursor->curr);
        continue;
      }

      if (status & GRN_TOKEN_UNMATURED) {
        if (status & GRN_TOKEN_OVERLAP) {
          if (token_cursor->mode == GRN_TOKENIZE_GET) {
            token_cursor->pos++;
            continue;
          }
        } else {
          if (status & GRN_TOKEN_REACH_END) {
            token_cursor->force_prefix = GRN_TRUE;
          }
        }
      }
    } else {
      /* No tokenizer: the normalized input is the one and only token. */
      token_cursor->status = GRN_TOKEN_CURSOR_DONE;
    }

    if (token_cursor->mode == GRN_TOKENIZE_ADD) {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_pat_add(ctx, (grn_pat *)table,
                            token_cursor->curr, token_cursor->curr_size,
                            NULL, NULL);
          grn_io_unlock(((grn_pat *)table)->io);
        }
        break;
      case GRN_TABLE_DAT_KEY :
        if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_dat_add(ctx, (grn_dat *)table,
                            token_cursor->curr, token_cursor->curr_size,
                            NULL, NULL);
          grn_io_unlock(((grn_dat *)table)->io);
        }
        break;
      case GRN_TABLE_HASH_KEY :
        if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) {
          tid = GRN_ID_NIL;
        } else {
          tid = grn_hash_add(ctx, (grn_hash *)table,
                             token_cursor->curr, token_cursor->curr_size,
                             NULL, NULL);
          grn_io_unlock(((grn_hash *)table)->io);
        }
        break;
      case GRN_TABLE_NO_KEY :
        if (token_cursor->curr_size == sizeof(grn_id)) {
          tid = *((const grn_id *)token_cursor->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      default :
        /* Other table types: tid keeps its initial GRN_ID_NIL. */
        break;
      }
    } else if (token_cursor->mode != GRN_TOKENIZE_ONLY) {
      switch (table->header.type) {
      case GRN_TABLE_PAT_KEY :
        tid = grn_pat_get(ctx, (grn_pat *)table,
                          token_cursor->curr, token_cursor->curr_size, NULL);
        break;
      case GRN_TABLE_DAT_KEY :
        tid = grn_dat_get(ctx, (grn_dat *)table,
                          token_cursor->curr, token_cursor->curr_size, NULL);
        break;
      case GRN_TABLE_HASH_KEY :
        tid = grn_hash_get(ctx, (grn_hash *)table,
                           token_cursor->curr, token_cursor->curr_size, NULL);
        break;
      case GRN_TABLE_NO_KEY :
        if (token_cursor->curr_size == sizeof(grn_id)) {
          tid = *((const grn_id *)token_cursor->curr);
        } else {
          tid = GRN_ID_NIL;
        }
        break;
      default :
        /* Other table types: tid keeps its initial GRN_ID_NIL. */
        break;
      }
    }

    if (token_cursor->mode != GRN_TOKENIZE_ONLY &&
        tid == GRN_ID_NIL &&
        token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
      token_cursor->status = GRN_TOKEN_CURSOR_NOT_FOUND;
    }
    token_cursor->pos++;
    break;
  }

  return tid;
}

/*
 * Calls the fin callback of every token filter with the private data
 * stored for it, then frees the private data array.
 *
 * Does nothing when no private data array was allocated or when the cursor
 * has no token filter.
 */
static void
grn_token_cursor_close_token_filters(grn_ctx *ctx,
                                     grn_token_cursor *token_cursor)
{
  grn_obj *token_filters = token_cursor->token_filter.objects;
  unsigned int i, n_token_filters;

  if (!token_cursor->token_filter.data) {
    return;
  }

  if (token_filters) {
    n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
  } else {
    n_token_filters = 0;
  }

  if (n_token_filters == 0) {
    return;
  }

  for (i = 0; i < n_token_filters; i++) {
    grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
    grn_proc *token_filter = (grn_proc *)token_filter_object;
    void *data = token_cursor->token_filter.data[i];

    token_filter->callbacks.token_filter.fin(ctx, data);
  }

  GRN_FREE(token_cursor->token_filter.data);
}

/*
 * Closes a token cursor: calls the tokenizer's PROC_FIN function (if the
 * cursor has a tokenizer), finalizes the token filters, closes the
 * normalized string (if any) and frees the cursor itself.
 *
 * Returns GRN_SUCCESS, or GRN_INVALID_ARGUMENT when `token_cursor` is NULL.
 */
grn_rc
grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor)
{
  if (token_cursor) {
    if (token_cursor->tokenizer) {
      ((grn_proc *)token_cursor->tokenizer)->funcs[PROC_FIN](
        ctx, 1, &token_cursor->table, &token_cursor->pctx.user_data);
    }
    grn_token_cursor_close_token_filters(ctx, token_cursor);
    if (token_cursor->nstr) {
      grn_obj_close(ctx, token_cursor->nstr);
    }
    GRN_FREE(token_cursor);
    return GRN_SUCCESS;
  } else {
    return GRN_INVALID_ARGUMENT;
  }
}