summary | refs | log | tree | commit | diff
path: root/storage/mroonga/vendor/groonga/lib/tokenizers.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/mroonga/vendor/groonga/lib/tokenizers.c')
-rw-r--r-- storage/mroonga/vendor/groonga/lib/tokenizers.c 21
1 files changed, 19 insertions, 2 deletions
diff --git a/storage/mroonga/vendor/groonga/lib/tokenizers.c b/storage/mroonga/vendor/groonga/lib/tokenizers.c
index 28fd13c33c4..6bd0a1b9e18 100644
--- a/storage/mroonga/vendor/groonga/lib/tokenizers.c
+++ b/storage/mroonga/vendor/groonga/lib/tokenizers.c
@@ -1,6 +1,6 @@
/* -*- c-basic-offset: 2 -*- */
/*
- Copyright(C) 2009-2015 Brazil
+ Copyright(C) 2009-2017 Brazil
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -237,6 +237,8 @@ delimit_null_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_d
/* ngram tokenizer */
+static grn_bool grn_ngram_tokenizer_remove_blank_disable = GRN_FALSE;
+
typedef struct {
grn_tokenizer_token token;
grn_tokenizer_query *query;
@@ -268,6 +270,9 @@ ngram_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data, ui
unsigned int normalized_length_in_bytes;
grn_ngram_tokenizer *tokenizer;
+ if (grn_ngram_tokenizer_remove_blank_disable) {
+ normalize_flags &= ~GRN_STRING_REMOVE_BLANK;
+ }
query = grn_tokenizer_query_open(ctx, nargs, args, normalize_flags);
if (!query) {
return NULL;
@@ -544,7 +549,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
grn_obj *buffer = &(tokenizer->buffer);
const char *current = tokenizer->next;
const char *end = tokenizer->end;
- const const uint_least8_t *char_types = tokenizer->char_types;
+ const uint_least8_t *char_types = tokenizer->char_types;
grn_tokenize_mode mode = tokenizer->query->tokenize_mode;
grn_bool is_begin = tokenizer->is_begin;
grn_bool is_start_token = tokenizer->is_start_token;
@@ -598,6 +603,7 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
if (is_begin &&
char_len == GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN &&
memcmp(current, GRN_TOKENIZER_BEGIN_MARK_UTF8, char_len) == 0) {
+ tokenizer->is_start_token = GRN_TRUE;
n_characters++;
GRN_TEXT_PUT(ctx, buffer, current, char_len);
current += char_len;
@@ -808,6 +814,17 @@ grn_db_init_builtin_tokenizers(grn_ctx *ctx)
GRN_TEXT_INIT(&vars[1].value, 0);
GRN_UINT32_INIT(&vars[2].value, 0);
+ {
+ char grn_ngram_tokenizer_remove_blank_disable_env[GRN_ENV_BUFFER_SIZE];
+
+ grn_getenv("GRN_NGRAM_TOKENIZER_REMOVE_BLANK_DISABLE",
+ grn_ngram_tokenizer_remove_blank_disable_env,
+ GRN_ENV_BUFFER_SIZE);
+ if (grn_ngram_tokenizer_remove_blank_disable_env[0]) {
+ grn_ngram_tokenizer_remove_blank_disable = GRN_TRUE;
+ }
+ }
+
obj = DEF_TOKENIZER("TokenDelimit",
delimit_init, delimited_next, delimited_fin, vars);
if (!obj || ((grn_db_obj *)obj)->id != GRN_DB_DELIMIT) { return GRN_FILE_CORRUPT; }