diff options
Diffstat (limited to 'storage/mroonga/vendor/groonga/lib/ii.c')
-rw-r--r-- | storage/mroonga/vendor/groonga/lib/ii.c | 1168 |
1 files changed, 869 insertions, 299 deletions
diff --git a/storage/mroonga/vendor/groonga/lib/ii.c b/storage/mroonga/vendor/groonga/lib/ii.c index 5dc203865ad..2d32df2fd4d 100644 --- a/storage/mroonga/vendor/groonga/lib/ii.c +++ b/storage/mroonga/vendor/groonga/lib/ii.c @@ -1,5 +1,5 @@ /* -*- c-basic-offset: 2 -*- */ -/* Copyright(C) 2009-2014 Brazil +/* Copyright(C) 2009-2015 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -14,27 +14,42 @@ License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "groonga_in.h" +#include "grn.h" #include <stdio.h> #include <fcntl.h> #include <string.h> #include <sys/stat.h> +#include <math.h> -#include "ii.h" -#include "ctx_impl.h" -#include "token.h" -#include "pat.h" -#include "db.h" -#include "output.h" -#include "util.h" +#ifdef WIN32 +# include <io.h> +# include <share.h> +#endif /* WIN32 */ + +#include "grn_ii.h" +#include "grn_ctx_impl.h" +#include "grn_token_cursor.h" +#include "grn_pat.h" +#include "grn_db.h" +#include "grn_output.h" +#include "grn_scorer.h" +#include "grn_util.h" + +#ifdef GRN_WITH_ONIGMO +# define GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH +#endif + +#ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH +# include "grn_string.h" +# include <oniguruma.h> +#endif #define MAX_PSEG 0x20000 #define S_CHUNK (1 << GRN_II_W_CHUNK) #define W_SEGMENT 18 #define S_SEGMENT (1 << W_SEGMENT) -#define N_CHUNKS_PER_FILE (GRN_IO_FILE_SIZE >> W_SEGMENT) -#define W_ARRAY_ELEMENT 3 -#define S_ARRAY_ELEMENT (1 << W_ARRAY_ELEMENT) +#define W_ARRAY_ELEMENT 3 +#define S_ARRAY_ELEMENT (1 << W_ARRAY_ELEMENT) #define W_ARRAY (W_SEGMENT - W_ARRAY_ELEMENT) #define ARRAY_MASK_IN_A_SEGMENT ((1 << W_ARRAY) - 1) #define NOT_ASSIGNED 0xffffffff @@ -230,11 +245,11 @@ typedef struct { uint32_t recs[N_GARBAGES]; } grn_ii_ginfo; -#define WIN_MAP2(chunk,ctx,iw,seg,pos,size,mode)\ - grn_io_win_map2(chunk, ctx, iw,\ - 
((seg) >> GRN_II_N_CHUNK_VARIATION),\ - (((seg) & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << GRN_II_W_LEAST_CHUNK) + (pos),\ - size,mode) +#define WIN_MAP(chunk,ctx,iw,seg,pos,size,mode)\ + grn_io_win_map(chunk, ctx, iw,\ + ((seg) >> GRN_II_N_CHUNK_VARIATION),\ + (((seg) & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << GRN_II_W_LEAST_CHUNK) + (pos),\ + size,mode) /* static int new_histogram[32]; static int free_histogram[32]; @@ -285,10 +300,10 @@ chunk_new(grn_ctx *ctx, grn_ii *ii, uint32_t *res, uint32_t size) iw_.addr = NULL; gseg = &ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; while (*gseg != NOT_ASSIGNED) { - ginfo = WIN_MAP2(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); + ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); //GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); if (!ginfo) { - if (iw_.addr) { grn_io_win_unmap2(&iw_); } + if (iw_.addr) { grn_io_win_unmap(&iw_); } return GRN_NO_MEMORY_AVAILABLE; } if (ginfo->next != NOT_ASSIGNED || ginfo->nrecs > N_GARBAGES_TH) { @@ -300,15 +315,15 @@ chunk_new(grn_ctx *ctx, grn_ii *ii, uint32_t *res, uint32_t size) HEADER_CHUNK_OFF(ii, *gseg); *gseg = ginfo->next; } - if (iw_.addr) { grn_io_win_unmap2(&iw_); } - grn_io_win_unmap2(&iw); + if (iw_.addr) { grn_io_win_unmap(&iw_); } + grn_io_win_unmap(&iw); return GRN_SUCCESS; } - if (iw_.addr) { grn_io_win_unmap2(&iw_); } + if (iw_.addr) { grn_io_win_unmap(&iw_); } iw_ = iw; gseg = &ginfo->next; } - if (iw_.addr) { grn_io_win_unmap2(&iw_); } + if (iw_.addr) { grn_io_win_unmap(&iw_); } } vp = &ii->header->free_chunks[m - GRN_II_W_LEAST_CHUNK]; if (*vp == NOT_ASSIGNED) { @@ -358,24 +373,24 @@ chunk_free(grn_ctx *ctx, grn_ii *ii, uint32_t offset, uint32_t dummy, uint32_t s gseg = &ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; iw_.addr = NULL; while (*gseg != NOT_ASSIGNED) { - ginfo = WIN_MAP2(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); + ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); // GRN_IO_SEG_MAP2(ii->chunk, 
*gseg, ginfo); if (!ginfo) { - if (iw_.addr) { grn_io_win_unmap2(&iw_); } + if (iw_.addr) { grn_io_win_unmap(&iw_); } return GRN_NO_MEMORY_AVAILABLE; } if (ginfo->nrecs < N_GARBAGES) { break; } - if (iw_.addr) { grn_io_win_unmap2(&iw_); } + if (iw_.addr) { grn_io_win_unmap(&iw_); } iw_ = iw; gseg = &ginfo->next; } if (*gseg == NOT_ASSIGNED) { grn_rc rc; if ((rc = chunk_new(ctx, ii, gseg, S_GARBAGE))) { - if (iw_.addr) { grn_io_win_unmap2(&iw_); } + if (iw_.addr) { grn_io_win_unmap(&iw_); } return rc; } - ginfo = WIN_MAP2(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); + ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); /* uint32_t i = 0; while (HEADER_CHUNK_AT(ii, i)) { @@ -386,7 +401,7 @@ chunk_free(grn_ctx *ctx, grn_ii *ii, uint32_t offset, uint32_t dummy, uint32_t s GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); */ if (!ginfo) { - if (iw_.addr) { grn_io_win_unmap2(&iw_); } + if (iw_.addr) { grn_io_win_unmap(&iw_); } return GRN_NO_MEMORY_AVAILABLE; } ginfo->head = 0; @@ -394,50 +409,15 @@ chunk_free(grn_ctx *ctx, grn_ii *ii, uint32_t offset, uint32_t dummy, uint32_t s ginfo->nrecs = 0; ginfo->next = NOT_ASSIGNED; } - if (iw_.addr) { grn_io_win_unmap2(&iw_); } + if (iw_.addr) { grn_io_win_unmap(&iw_); } ginfo->recs[ginfo->head] = offset; if (++ginfo->head == N_GARBAGES) { ginfo->head = 0; } ginfo->nrecs++; - grn_io_win_unmap2(&iw); + grn_io_win_unmap(&iw); ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK]++; return GRN_SUCCESS; } -/* -inline static grn_rc -chunk_new(grn_ii *ii, uint32_t *res, uint32_t size) -{ - int i, j; - uint32_t n = (size + S_CHUNK - 1) >> GRN_II_W_CHUNK; - uint32_t base_seg = grn_io_base_seg(ii->chunk); - for (i = 0, j = -1; i < GRN_II_MAX_CHUNK; i++) { - if (HEADER_CHUNK_AT(ii, i)) { - j = i; - } else { - if (i == j + n) { - j++; - if (res) { *res = j; } - for (; j <= i; j++) { HEADER_CHUNK_ON(ii, j); } - return GRN_SUCCESS; - } - // todo : cut off - if ((i + base_seg)/ N_CHUNKS_PER_FILE != - (i + base_seg + 1) / 
N_CHUNKS_PER_FILE) { j = i; } - } - } - GRN_LOG(ctx, GRN_LOG_CRIT, "index full."); - return GRN_NO_MEMORY_AVAILABLE; -} - -static void -chunk_free(grn_ii *ii, int offset, uint32_t size1, uint32_t size2) -{ - uint32_t i = offset + ((size1 + S_CHUNK - 1) >> GRN_II_W_CHUNK); - uint32_t n = offset + ((size2 + S_CHUNK - 1) >> GRN_II_W_CHUNK); - for (; i < n; i++) { HEADER_CHUNK_OFF(ii, i); } -} -*/ - #define UNIT_SIZE 0x80 #define UNIT_MASK (UNIT_SIZE - 1) @@ -1434,7 +1414,7 @@ pack(uint32_t *p, uint32_t i, uint8_t *freq, uint8_t *rp) } } rp = pack_(p - i, i, w, rp); - memcpy(rp, ebuf, ep - ebuf); + grn_memcpy(rp, ebuf, ep - ebuf); return rp + (ep - ebuf); } @@ -1586,7 +1566,7 @@ grn_p_encv(grn_ctx *ctx, datavec *dv, uint32_t dvlen, uint8_t *res) case 0x08 : \ if (_v == 0x8f) { \ if (_p + sizeof(uint32_t) > pe) { return 0; } \ - memcpy(&_v, _p, sizeof(uint32_t)); \ + grn_memcpy(&_v, _p, sizeof(uint32_t)); \ _p += sizeof(uint32_t); \ } \ break; \ @@ -2030,7 +2010,7 @@ buffer_put(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_term *bt, buffer_rec *r_curr, *r_start = NULL; uint16_t last = 0, *lastp = &bt->pos_in_buffer, pos = BUFFER_REC_POS(b, rnew); int vdelta = 0, delta, delta0 = 0, vhops = 0, nhops = 0, reset = 1; - memcpy(NEXT_ADDR(rnew), bs, size - sizeof(buffer_rec)); + grn_memcpy(NEXT_ADDR(rnew), bs, size - sizeof(buffer_rec)); for (;;) { if (!*lastp) { rnew->step = 0; @@ -2487,21 +2467,25 @@ typedef struct { static grn_rc chunk_flush(grn_ctx *ctx, grn_ii *ii, chunk_info *cinfo, uint8_t *enc, uint32_t encsize) { - grn_rc rc; + grn_rc rc = GRN_SUCCESS; uint8_t *dc; uint32_t dcn; grn_io_win dw; - if (!(rc = chunk_new(ctx, ii, &dcn, encsize))) { - if ((dc = WIN_MAP2(ii->chunk, ctx, &dw, dcn, 0, encsize, grn_io_wronly))) { - memcpy(dc, enc, encsize); - grn_io_win_unmap2(&dw); - cinfo->segno = dcn; - cinfo->size = encsize; - rc = GRN_SUCCESS; - } else { - chunk_free(ctx, ii, dcn, 0, encsize); - rc = GRN_NO_MEMORY_AVAILABLE; + if (encsize) { + if (!(rc = chunk_new(ctx, ii, 
&dcn, encsize))) { + if ((dc = WIN_MAP(ii->chunk, ctx, &dw, dcn, 0, encsize, grn_io_wronly))) { + grn_memcpy(dc, enc, encsize); + grn_io_win_unmap(&dw); + cinfo->segno = dcn; + cinfo->size = encsize; + } else { + chunk_free(ctx, ii, dcn, 0, encsize); + rc = GRN_NO_MEMORY_AVAILABLE; + } } + } else { + cinfo->segno = 0; + cinfo->size = 0; } return rc; } @@ -2517,7 +2501,7 @@ chunk_merge(grn_ctx *ctx, grn_ii *ii, buffer *sb, buffer_term *bt, uint32_t segno = cinfo->segno, size = cinfo->size, sdf = 0, ndf = 0; uint32_t *ridp = NULL, *sidp = NULL, *tfp, *weightp = NULL, *posp = NULL; docinfo cid = {0, 0, 0, 0, 0}, lid = {0, 0, 0, 0, 0}, bid = *bidp; - uint8_t *scp = WIN_MAP2(ii->chunk, ctx, &sw, segno, 0, size, grn_io_rdonly); + uint8_t *scp = WIN_MAP(ii->chunk, ctx, &sw, segno, 0, size, grn_io_rdonly); if (scp) { uint16_t nextb = *nextbp; uint32_t snn = 0, *srp, *ssp = NULL, *stp, *sop = NULL, *snp; @@ -2558,7 +2542,7 @@ chunk_merge(grn_ctx *ctx, grn_ii *ii, buffer *sb, buffer_term *bt, ndf = ridp - dv[0].data; } datavec_fin(ctx, rdv); - grn_io_win_unmap2(&sw); + grn_io_win_unmap(&sw); } else { rc = GRN_NO_MEMORY_AVAILABLE; } @@ -2621,6 +2605,7 @@ buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, uint64_t spos = 0; int32_t balance = 0; uint32_t *ridp, *sidp = NULL, *tfp, *weightp = NULL, *posp, nchunks = 0; + uint32_t nvchunks = 0; chunk_info *cinfo = NULL; grn_id crid = GRN_ID_NIL; docinfo cid = {0, 0, 0, 0, 0}, lid = {0, 0, 0, 0, 0}, bid = {0, 0}; @@ -2633,7 +2618,7 @@ buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, if (!bt->pos_in_buffer) { GRN_ASSERT(!bt->size_in_buffer); if (bt->size_in_chunk) { - memcpy(dcp, sc + bt->pos_in_chunk, bt->size_in_chunk); + grn_memcpy(dcp, sc + bt->pos_in_chunk, bt->size_in_chunk); bt->pos_in_chunk = (uint32_t)(dcp - dc); dcp += bt->size_in_chunk; } @@ -2667,6 +2652,12 @@ buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, return rc; } } + if (cinfo[i].size) { + nvchunks++; + } else { + 
crid -= cinfo[i].dgap; + cinfo[i + 1].dgap += cinfo[i].dgap; + } } } if (sce > scp) { @@ -2739,21 +2730,21 @@ buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, memset(bt, 0, sizeof(buffer_term)); nterms_void++; } else { - if (!ndf && !nchunks) { + if (!ndf && !nvchunks) { a[0] = 0; a[1] = 0; lexicon_delete(ctx, ii, tid, h); memset(bt, 0, sizeof(buffer_term)); nterms_void++; } else if ((ii->header->flags & GRN_OBJ_WITH_SECTION) - && !nchunks && ndf == 1 && lid.rid < 0x100000 && + && !nvchunks && ndf == 1 && lid.rid < 0x100000 && lid.sid < 0x800 && lid.tf == 1 && lid.weight == 0) { a[0] = (lid.rid << 12) + (lid.sid << 1) + 1; a[1] = (ii->header->flags & GRN_OBJ_WITH_POSITION) ? posp[-1] : 0; memset(bt, 0, sizeof(buffer_term)); nterms_void++; } else if (!(ii->header->flags & GRN_OBJ_WITH_SECTION) - && !nchunks && ndf == 1 && lid.tf == 1 && lid.weight == 0) { + && !nvchunks && ndf == 1 && lid.tf == 1 && lid.weight == 0) { a[0] = (lid.rid << 1) + 1; a[1] = (ii->header->flags & GRN_OBJ_WITH_POSITION) ? posp[-1] : 0; memset(bt, 0, sizeof(buffer_term)); @@ -2779,20 +2770,25 @@ buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, } dcp0 = dcp; a[1] = (bt->size_in_chunk ? 
a[1] : 0) + (ndf - sdf) + balance; - if (nchunks) { + if (nvchunks) { int i; - GRN_B_ENC(nchunks, dcp); + GRN_B_ENC(nvchunks, dcp); for (i = 0; i < nchunks; i++) { - GRN_B_ENC(cinfo[i].segno, dcp); - GRN_B_ENC(cinfo[i].size, dcp); - GRN_B_ENC(cinfo[i].dgap, dcp); + if (cinfo[i].size) { + GRN_B_ENC(cinfo[i].segno, dcp); + GRN_B_ENC(cinfo[i].size, dcp); + GRN_B_ENC(cinfo[i].dgap, dcp); + } } } encsize = grn_p_encv(ctx, dv, ii->n_elements, dcp); if (sb->header.chunk_size + S_SEGMENT <= (dcp - dc) + encsize) { int i; - char buf[255], *bufp; +#define BUF_SIZE 255 + char buf[BUF_SIZE], *bufp, *buf_end; + buf_end = buf + BUF_SIZE; +#undef BUF_SIZE GRN_LOG(ctx, GRN_LOG_NOTICE, "cs(%d)+(%d)=(%d)<=(%" GRN_FMT_LLD ")+(%d)=(%" GRN_FMT_LLD ")", sb->header.chunk_size, S_SEGMENT, sb->header.chunk_size + S_SEGMENT, @@ -2801,7 +2797,10 @@ buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, GRN_LOG(ctx, GRN_LOG_NOTICE, "rdv[%d] data_size=%d, flags=%d", j, rdv[j].data_size, rdv[j].flags); for (i = 0, bufp = buf; i < rdv[j].data_size;) { - bufp += sprintf(bufp, " %d", rdv[j].data[i]); + bufp += grn_snprintf(bufp, + buf_end - bufp, + buf_end - bufp, + " %d", rdv[j].data[i]); i++; if (!(i % 32) || i == rdv[j].data_size) { GRN_LOG(ctx, GRN_LOG_NOTICE, "rdv[%d].data[%d]%s", j, i, buf); @@ -2814,7 +2813,10 @@ buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, GRN_LOG(ctx, GRN_LOG_NOTICE, "dv[%d] data_size=%d, flags=%d", j, dv[j].data_size, dv[j].flags); for (i = 0, bufp = buf; i < dv[j].data_size;) { - bufp += sprintf(bufp, " %d", dv[j].data[i]); + bufp += grn_snprintf(bufp, + buf_end - bufp, + buf_end - bufp, + " %d", dv[j].data[i]); i++; if (!(i % 32) || i == dv[j].data_size) { GRN_LOG(ctx, GRN_LOG_NOTICE, "dv[%d].data[%d]%s", j, i, buf); @@ -2830,18 +2832,23 @@ buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, !chunk_flush(ctx, ii, &cinfo[nchunks], dcp, encsize)) { int i; cinfo[nchunks].dgap = lid.rid - crid; - nchunks++; + nvchunks++; dcp = 
dcp0; - GRN_B_ENC(nchunks, dcp); - for (i = 0; i < nchunks; i++) { - GRN_B_ENC(cinfo[i].segno, dcp); - GRN_B_ENC(cinfo[i].size, dcp); - GRN_B_ENC(cinfo[i].dgap, dcp); + GRN_B_ENC(nvchunks, dcp); + for (i = 0; i <= nchunks; i++) { + if (cinfo[i].size) { + GRN_B_ENC(cinfo[i].segno, dcp); + GRN_B_ENC(cinfo[i].size, dcp); + GRN_B_ENC(cinfo[i].dgap, dcp); + } } GRN_LOG(ctx, GRN_LOG_NOTICE, "split (%d) encsize=%d", tid, encsize); bt->tid |= CHUNK_SPLIT; } else { dcp += encsize; + if (!nvchunks) { + bt->tid &= ~CHUNK_SPLIT; + } } bt->pos_in_chunk = (uint32_t)(dcp0 - dc); bt->size_in_chunk = (uint32_t)(dcp - dcp0); @@ -2863,7 +2870,7 @@ buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h, } static void -fake_map2(grn_ctx *ctx, grn_io *io, grn_io_win *iw, void *addr, uint32_t seg, uint32_t size) +fake_map(grn_ctx *ctx, grn_io *io, grn_io_win *iw, void *addr, uint32_t seg, uint32_t size) { iw->ctx = ctx; iw->diff = 0; @@ -2894,11 +2901,11 @@ buffer_flush(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) uint32_t max_dest_chunk_size = sb->header.chunk_size + S_SEGMENT; if ((dc = GRN_MALLOC(max_dest_chunk_size * 2))) { if ((scn = sb->header.chunk) == NOT_ASSIGNED || - (sc = WIN_MAP2(ii->chunk, ctx, &sw, scn, 0, - sb->header.chunk_size, grn_io_rdonly))) { + (sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0, + sb->header.chunk_size, grn_io_rdonly))) { uint16_t n = sb->header.nterms; memset(db, 0, S_SEGMENT); - memcpy(db->terms, sb->terms, n * sizeof(buffer_term)); + grn_memcpy(db->terms, sb->terms, n * sizeof(buffer_term)); db->header.nterms = n; if (!(rc = buffer_merge(ctx, ii, seg, h, sb, sc, db, dc))) { actual_chunk_size = db->header.chunk_size; @@ -2908,12 +2915,12 @@ buffer_flush(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) } if (!actual_chunk_size || !(rc = chunk_new(ctx, ii, &dcn, actual_chunk_size))) { db->header.chunk = actual_chunk_size ? 
dcn : NOT_ASSIGNED; - fake_map2(ctx, ii->chunk, &dw, dc, dcn, actual_chunk_size); - if (!(rc = grn_io_win_unmap2(&dw))) { + fake_map(ctx, ii->chunk, &dw, dc, dcn, actual_chunk_size); + if (!(rc = grn_io_win_unmap(&dw))) { buffer_segment_update(ii, seg, ds); ii->header->total_chunk_size += actual_chunk_size; if (scn != NOT_ASSIGNED) { - grn_io_win_unmap2(&sw); + grn_io_win_unmap(&sw); chunk_free(ctx, ii, scn, 0, sb->header.chunk_size); ii->header->total_chunk_size -= sb->header.chunk_size; } @@ -2922,15 +2929,15 @@ buffer_flush(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) if (actual_chunk_size) { chunk_free(ctx, ii, dcn, 0, actual_chunk_size); } - if (scn != NOT_ASSIGNED) { grn_io_win_unmap2(&sw); } + if (scn != NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { GRN_FREE(dc); - if (scn != NOT_ASSIGNED) { grn_io_win_unmap2(&sw); } + if (scn != NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { GRN_FREE(dc); - if (scn != NOT_ASSIGNED) { grn_io_win_unmap2(&sw); } + if (scn != NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { GRN_FREE(dc); @@ -2991,7 +2998,7 @@ grn_ii_buffer_check(grn_ctx *ctx, grn_ii *ii, uint32_t seg) GRN_OUTPUT_CSTR("void chunk size"); GRN_OUTPUT_INT64(sb->header.chunk_size); } else { - if ((sc = WIN_MAP2(ii->chunk, ctx, &sw, scn, 0, sb->header.chunk_size, grn_io_rdonly))) { + if ((sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0, sb->header.chunk_size, grn_io_rdonly))) { GRN_OUTPUT_CSTR("chunk size"); GRN_OUTPUT_INT64(sb->header.chunk_size); } else { @@ -3133,7 +3140,7 @@ grn_ii_buffer_check(grn_ctx *ctx, grn_ii *ii, uint32_t seg) } GRN_OUTPUT_MAP_CLOSE(); datavec_fin(ctx, rdv); - if (sc) { grn_io_win_unmap2(&sw); } + if (sc) { grn_io_win_unmap(&sw); } buffer_close(ctx, ii, pseg); } @@ -3176,7 +3183,7 @@ term_split(grn_ctx *ctx, grn_obj *lexicon, buffer *sb, buffer *db0, buffer *db1) bt = db0->terms; nt = &db0->header.nterms; for (s = 0; n + 1 < i && s <= th; n++, bt++) { - memcpy(bt, ts[n].bt, sizeof(buffer_term)); + grn_memcpy(bt, ts[n].bt, 
sizeof(buffer_term)); (*nt)++; s += ts[n].bt->size_in_chunk + 1; } @@ -3184,7 +3191,7 @@ term_split(grn_ctx *ctx, grn_obj *lexicon, buffer *sb, buffer *db0, buffer *db1) bt = db1->terms; nt = &db1->header.nterms; for (; n < i; n++, bt++) { - memcpy(bt, ts[n].bt, sizeof(buffer_term)); + grn_memcpy(bt, ts[n].bt, sizeof(buffer_term)); (*nt)++; } GRN_FREE(ts); @@ -3236,8 +3243,8 @@ buffer_split(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) if ((dc0 = GRN_MALLOC(max_dest_chunk_size * 2))) { if ((dc1 = GRN_MALLOC(max_dest_chunk_size * 2))) { if ((scn = sb->header.chunk) == NOT_ASSIGNED || - (sc = WIN_MAP2(ii->chunk, ctx, &sw, scn, 0, - sb->header.chunk_size, grn_io_rdonly))) { + (sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0, + sb->header.chunk_size, grn_io_rdonly))) { term_split(ctx, ii->lexicon, sb, db0, db1); if (!(rc = buffer_merge(ctx, ii, seg, h, sb, sc, db0, dc0))) { actual_db0_chunk_size = db0->header.chunk_size; @@ -3249,8 +3256,8 @@ buffer_split(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) if (!actual_db0_chunk_size || !(rc = chunk_new(ctx, ii, &dcn0, actual_db0_chunk_size))) { db0->header.chunk = actual_db0_chunk_size ? dcn0 : NOT_ASSIGNED; - fake_map2(ctx, ii->chunk, &dw0, dc0, dcn0, actual_db0_chunk_size); - if (!(rc = grn_io_win_unmap2(&dw0))) { + fake_map(ctx, ii->chunk, &dw0, dc0, dcn0, actual_db0_chunk_size); + if (!(rc = grn_io_win_unmap(&dw0))) { if (!(rc = buffer_merge(ctx, ii, seg, h, sb, sc, db1, dc1))) { actual_db1_chunk_size = db1->header.chunk_size; if (actual_db1_chunk_size >= max_dest_chunk_size) { @@ -3260,8 +3267,8 @@ buffer_split(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) } if (!actual_db1_chunk_size || !(rc = chunk_new(ctx, ii, &dcn1, actual_db1_chunk_size))) { - fake_map2(ctx, ii->chunk, &dw1, dc1, dcn1, actual_db1_chunk_size); - if (!(rc = grn_io_win_unmap2(&dw1))) { + fake_map(ctx, ii->chunk, &dw1, dc1, dcn1, actual_db1_chunk_size); + if (!(rc = grn_io_win_unmap(&dw1))) { db1->header.chunk = actual_db1_chunk_size ? 
dcn1 : NOT_ASSIGNED; buffer_segment_update(ii, dls0, dps0); buffer_segment_update(ii, dls1, dps1); @@ -3271,7 +3278,7 @@ buffer_split(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) ii->header->total_chunk_size += actual_db0_chunk_size; ii->header->total_chunk_size += actual_db1_chunk_size; if (scn != NOT_ASSIGNED) { - grn_io_win_unmap2(&sw); + grn_io_win_unmap(&sw); chunk_free(ctx, ii, scn, 0, sb->header.chunk_size); ii->header->total_chunk_size -= sb->header.chunk_size; } @@ -3283,21 +3290,21 @@ buffer_split(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); } GRN_FREE(dc1); - if (scn != NOT_ASSIGNED) { grn_io_win_unmap2(&sw); } + if (scn != NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { if (actual_db0_chunk_size) { chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); } GRN_FREE(dc1); - if (scn != NOT_ASSIGNED) { grn_io_win_unmap2(&sw); } + if (scn != NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { if (actual_db0_chunk_size) { chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); } GRN_FREE(dc1); - if (scn != NOT_ASSIGNED) { grn_io_win_unmap2(&sw); } + if (scn != NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { if (actual_db0_chunk_size) { @@ -3305,17 +3312,17 @@ buffer_split(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) } GRN_FREE(dc1); GRN_FREE(dc0); - if (scn != NOT_ASSIGNED) { grn_io_win_unmap2(&sw); } + if (scn != NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { GRN_FREE(dc1); GRN_FREE(dc0); - if (scn != NOT_ASSIGNED) { grn_io_win_unmap2(&sw); } + if (scn != NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { GRN_FREE(dc1); GRN_FREE(dc0); - if (scn != NOT_ASSIGNED) { grn_io_win_unmap2(&sw); } + if (scn != NOT_ASSIGNED) { grn_io_win_unmap(&sw); } } } else { GRN_FREE(dc1); @@ -3476,8 +3483,8 @@ _grn_ii_create(grn_ctx *ctx, grn_ii *ii, const char *path, grn_obj *lexicon, uin S_SEGMENT, MAX_PSEG, grn_io_auto, GRN_IO_EXPIRE_SEGMENT); if (!seg) { return NULL; } if (path) { - 
strcpy(path2, path); - strcat(path2, ".c"); + grn_strcpy(path2, PATH_MAX, path); + grn_strcat(path2, PATH_MAX, ".c"); chunk = grn_io_create(ctx, path2, 0, S_CHUNK, GRN_II_MAX_CHUNK, grn_io_auto, GRN_IO_EXPIRE_SEGMENT); } else { @@ -3533,7 +3540,8 @@ grn_ii_remove(grn_ctx *ctx, const char *path) char buffer[PATH_MAX]; if (!path || strlen(path) > PATH_MAX - 4) { return GRN_INVALID_ARGUMENT; } if ((rc = grn_io_remove(ctx, path))) { goto exit; } - snprintf(buffer, PATH_MAX, "%s.c", path); + grn_snprintf(buffer, PATH_MAX, PATH_MAX, + "%s.c", path); rc = grn_io_remove(ctx, buffer); exit : return rc; @@ -3595,8 +3603,8 @@ grn_ii_open(grn_ctx *ctx, const char *path, grn_obj *lexicon) return NULL; } if (strlen(path) + 6 >= PATH_MAX) { return NULL; } - strcpy(path2, path); - strcat(path2, ".c"); + grn_strcpy(path2, PATH_MAX, path); + grn_strcat(path2, PATH_MAX, ".c"); seg = grn_io_open(ctx, path, grn_io_auto); if (!seg) { return NULL; } chunk = grn_io_open(ctx, path2, grn_io_auto); @@ -3984,16 +3992,16 @@ chunk_is_reused(grn_ctx *ctx, grn_ii *ii, grn_ii_cursor *c, uint32_t offset, uin gseg = ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; while (gseg != NOT_ASSIGNED) { grn_io_win iw; - grn_ii_ginfo *ginfo = WIN_MAP2(ii->chunk, ctx, &iw, gseg, 0, S_GARBAGE, grn_io_rdwr); + grn_ii_ginfo *ginfo = WIN_MAP(ii->chunk, ctx, &iw, gseg, 0, S_GARBAGE, grn_io_rdwr); if (!ginfo) { break; } for (i = 0; i < ginfo->nrecs; i++) { if (ginfo->recs[i] == offset) { - grn_io_win_unmap2(&iw); + grn_io_win_unmap(&iw); return 0; } } gseg = ginfo->next; - grn_io_win_unmap2(&iw); + grn_io_win_unmap(&iw); } return 1; } @@ -4047,8 +4055,8 @@ grn_ii_cursor_open(grn_ctx *ctx, grn_ii *ii, grn_id tid, } c->ppseg = &ii->header->binfo[LSEG(pos)]; if (bt->size_in_chunk && (chunk = c->buf->header.chunk) != NOT_ASSIGNED) { - if (!(c->cp = WIN_MAP2(ii->chunk, ctx, &c->iw, chunk, bt->pos_in_chunk, - bt->size_in_chunk, grn_io_rdonly))) { + if (!(c->cp = WIN_MAP(ii->chunk, ctx, &c->iw, chunk, bt->pos_in_chunk, + 
bt->size_in_chunk, grn_io_rdonly))) { buffer_close(ctx, ii, c->buffer_pseg); GRN_FREE(c); c = NULL; @@ -4069,7 +4077,7 @@ grn_ii_cursor_open(grn_ctx *ctx, grn_ii *ii, grn_id tid, } if (!(c->cinfo = GRN_MALLOCN(chunk_info, c->nchunks))) { buffer_close(ctx, ii, c->buffer_pseg); - grn_io_win_unmap2(&c->iw); + grn_io_win_unmap(&c->iw); GRN_FREE(c); c = NULL; goto exit; @@ -4101,6 +4109,41 @@ exit : return c; } +static inline void +grn_ii_cursor_set_min(grn_ctx *ctx, grn_ii_cursor *c, grn_id min) +{ + char grn_ii_cursor_set_min_enable_env[GRN_ENV_BUFFER_SIZE]; + + if (c->min >= min) { + return; + } + + grn_getenv("GRN_II_CURSOR_SET_MIN_ENABLE", + grn_ii_cursor_set_min_enable_env, + GRN_ENV_BUFFER_SIZE); + if (grn_ii_cursor_set_min_enable_env[0]) { + c->min = min; + if (c->buf && c->pc.rid < c->min && c->curr_chunk < c->nchunks) { + uint32_t i, skip_chunk = 0; + grn_id rid; + for (i = 0, rid = GRN_ID_NIL; i < c->nchunks; i++) { + rid += c->cinfo[i].dgap; + if (rid < c->min) { + skip_chunk = i + 1; + } else { + rid -= c->cinfo[i].dgap; + break; + } + } + if (skip_chunk > c->curr_chunk) { + c->pc.rid = rid; + c->curr_chunk = skip_chunk; + c->crp = c->cdp + c->cdf; + } + } + } +} + grn_ii_posting * grn_ii_cursor_next(grn_ctx *ctx, grn_ii_cursor *c) { @@ -4162,11 +4205,11 @@ grn_ii_cursor_next(grn_ctx *ctx, grn_ii_cursor *c) uint8_t *cp; grn_io_win iw; uint32_t size = c->cinfo[c->curr_chunk].size; - if (size && (cp = WIN_MAP2(c->ii->chunk, ctx, &iw, - c->cinfo[c->curr_chunk].segno, 0, - size, grn_io_rdonly))) { + if (size && (cp = WIN_MAP(c->ii->chunk, ctx, &iw, + c->cinfo[c->curr_chunk].segno, 0, + size, grn_io_rdonly))) { grn_p_decv(ctx, cp, size, c->rdv, c->ii->n_elements); - grn_io_win_unmap2(&iw); + grn_io_win_unmap(&iw); if (chunk_is_reused(ctx, c->ii, c, c->cinfo[c->curr_chunk].segno, size)) { GRN_LOG(ctx, GRN_LOG_WARNING, @@ -4206,34 +4249,55 @@ grn_ii_cursor_next(grn_ctx *ctx, grn_ii_cursor *c) } } if (c->stat & BUFFER_USED) { - if (c->nextb) { - uint32_t lrid = 
c->pb.rid, lsid = c->pb.sid; /* for check */ - buffer_rec *br = BUFFER_REC_AT(c->buf, c->nextb); - if (buffer_is_reused(ctx, c->ii, c)) { - GRN_LOG(ctx, GRN_LOG_NOTICE, "buffer reused(%d,%d)", c->buffer_pseg, *c->ppseg); - // todo : rewind; - } - c->bp = NEXT_ADDR(br); - GRN_B_DEC(c->pb.rid, c->bp); - if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { - GRN_B_DEC(c->pb.sid, c->bp); - } else { - c->pb.sid = 1; - } - if (lrid > c->pb.rid || (lrid == c->pb.rid && lsid >= c->pb.sid)) { - ERR(GRN_FILE_CORRUPT, "brokend!! (%d:%d) -> (%d:%d) (%d->%d)", lrid, lsid, c->pb.rid, c->pb.sid, c->buffer_pseg, *c->ppseg); - } - c->nextb = br->step; - GRN_B_DEC(c->pb.tf, c->bp); - if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { - GRN_B_DEC(c->pb.weight, c->bp); + for (;;) { + if (c->nextb) { + uint32_t lrid = c->pb.rid, lsid = c->pb.sid; /* for check */ + buffer_rec *br = BUFFER_REC_AT(c->buf, c->nextb); + if (buffer_is_reused(ctx, c->ii, c)) { + GRN_LOG(ctx, GRN_LOG_NOTICE, "buffer reused(%d,%d)", c->buffer_pseg, *c->ppseg); + // todo : rewind; + } + c->bp = NEXT_ADDR(br); + GRN_B_DEC(c->pb.rid, c->bp); + if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { + GRN_B_DEC(c->pb.sid, c->bp); + } else { + c->pb.sid = 1; + } + if (lrid > c->pb.rid || (lrid == c->pb.rid && lsid >= c->pb.sid)) { + ERR(GRN_FILE_CORRUPT, "brokend!! 
(%d:%d) -> (%d:%d) (%d->%d)", lrid, lsid, c->pb.rid, c->pb.sid, c->buffer_pseg, *c->ppseg); + } + if (c->pb.rid < c->min) { + c->pb.rid = 0; + if (br->jump > 0) { + buffer_rec *jump_br = BUFFER_REC_AT(c->buf, br->jump); + uint8_t *jump_bp; + uint32_t jump_rid; + jump_bp = NEXT_ADDR(jump_br); + GRN_B_DEC(jump_rid, jump_bp); + if (jump_rid < c->min) { + c->nextb = br->jump; + } else { + c->nextb = br->step; + } + } else { + c->nextb = br->step; + } + continue; + } + c->nextb = br->step; + GRN_B_DEC(c->pb.tf, c->bp); + if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { + GRN_B_DEC(c->pb.weight, c->bp); + } else { + c->pb.weight = 0; + } + c->pb.rest = c->pb.tf; + c->pb.pos = 0; } else { - c->pb.weight = 0; + c->pb.rid = 0; } - c->pb.rest = c->pb.tf; - c->pb.pos = 0; - } else { - c->pb.rid = 0; + break; } } if (c->pb.rid) { @@ -4281,6 +4345,10 @@ grn_ii_cursor_next(grn_ctx *ctx, grn_ii_cursor *c) } else { c->post = &c->pb; c->stat |= SOLE_DOC_USED; + if (c->post->rid < c->min) { + c->post = NULL; + return NULL; + } } } return c->post; @@ -4340,7 +4408,7 @@ grn_ii_cursor_close(grn_ctx *ctx, grn_ii_cursor *c) datavec_fin(ctx, c->rdv); if (c->cinfo) { GRN_FREE(c->cinfo); } if (c->buf) { buffer_close(ctx, c->ii, c->buffer_pseg); } - if (c->cp) { grn_io_win_unmap2(&c->iw); } + if (c->cp) { grn_io_win_unmap(&c->iw); } GRN_FREE(c); return GRN_SUCCESS; } @@ -4560,10 +4628,11 @@ cursor_heap_recalc_min(cursor_heap *h) } static inline void -cursor_heap_pop(grn_ctx *ctx, cursor_heap *h) +cursor_heap_pop(grn_ctx *ctx, cursor_heap *h, grn_id min) { if (h->n_entries) { grn_ii_cursor *c = h->bins[0]; + grn_ii_cursor_set_min(ctx, c, min); if (!grn_ii_cursor_next(ctx, c)) { grn_ii_cursor_close(ctx, c); h->bins[0] = h->bins[--h->n_entries]; @@ -4670,6 +4739,7 @@ inline static grn_rc index_del(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgram, const char *value, size_t value_len) { + grn_rc rc = GRN_SUCCESS; grn_hash *h; unsigned int token_flags = 0; 
grn_token_cursor *token_cursor; @@ -4702,12 +4772,16 @@ index_del(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgr grn_token_cursor_close(ctx, token_cursor); GRN_HASH_EACH(ctx, h, id, &tp, NULL, &u, { if (*tp) { - grn_ii_delete_one(ctx, ii, *tp, *u, NULL); + grn_rc r; + r = grn_ii_delete_one(ctx, ii, *tp, *u, NULL); + if (r) { + rc = r; + } } grn_ii_updspec_close(ctx, *u); }); grn_hash_close(ctx, h); - return GRN_SUCCESS; + return rc; } grn_rc @@ -4842,7 +4916,11 @@ grn_ii_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, unsigned i grn_hash_delete_by_id(ctx, new, eid, NULL); } } else { - grn_ii_delete_one(ctx, ii, *tp, *u, new); + grn_rc r; + r = grn_ii_delete_one(ctx, ii, *tp, *u, new); + if (r) { + rc = r; + } } grn_ii_updspec_close(ctx, *u); }); @@ -4867,7 +4945,8 @@ exit : static grn_rc grn_vector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, - grn_obj *in, grn_obj *out, grn_token_mode mode, grn_obj *posting) + grn_obj *in, grn_obj *out, grn_tokenize_mode mode, + grn_obj *posting) { int j; grn_id tid; @@ -4911,8 +4990,65 @@ grn_vector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, } static grn_rc -grn_uvector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, - grn_obj *in, grn_obj *out) +grn_uvector2updspecs_data(grn_ctx *ctx, grn_ii *ii, grn_id rid, + unsigned int section, grn_obj *in, grn_obj *out, + grn_tokenize_mode mode, grn_obj *posting) +{ + int i, n; + grn_hash *h = (grn_hash *)out; + grn_obj *lexicon = ii->lexicon; + unsigned int element_size; + + n = grn_uvector_size(ctx, in); + element_size = grn_uvector_element_size(ctx, in); + for (i = 0; i < n; i++) { + grn_token_cursor *token_cursor; + unsigned int token_flags = 0; + const char *element; + + element = GRN_BULK_HEAD(in) + (element_size * i); + token_cursor = grn_token_cursor_open(ctx, lexicon, + element, element_size, + mode, token_flags); + if (!token_cursor) { + continue; + } + + while 
(!token_cursor->status) { + grn_id tid; + if ((tid = grn_token_cursor_next(ctx, token_cursor))) { + grn_ii_updspec **u; + + if (posting) { GRN_RECORD_PUT(ctx, posting, tid); } + if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **)&u, NULL)) { + break; + } + if (!*u) { + if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_open on grn_uvector2updspecs_data failed!"); + grn_token_cursor_close(ctx, token_cursor); + return GRN_NO_MEMORY_AVAILABLE; + } + } + if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, 0)) { + GRN_LOG(ctx, GRN_LOG_ALERT, + "grn_ii_updspec_add on grn_uvector2updspecs failed!"); + grn_token_cursor_close(ctx, token_cursor); + return GRN_NO_MEMORY_AVAILABLE; + } + } + } + + grn_token_cursor_close(ctx, token_cursor); + } + + return GRN_SUCCESS; +} + +static grn_rc +grn_uvector2updspecs_id(grn_ctx *ctx, grn_ii *ii, grn_id rid, + unsigned int section, grn_obj *in, grn_obj *out) { int i, n; grn_ii_updspec **u; @@ -4941,6 +5077,19 @@ grn_uvector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, return GRN_SUCCESS; } +static grn_rc +grn_uvector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, + unsigned int section, grn_obj *in, grn_obj *out, + grn_tokenize_mode mode, grn_obj *posting) +{ + if (in->header.domain < GRN_N_RESERVED_TYPES) { + return grn_uvector2updspecs_data(ctx, ii, rid, section, in, out, + mode, posting); + } else { + return grn_uvector2updspecs_id(ctx, ii, rid, section, in, out); + } +} + grn_rc grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, grn_obj *oldvalue, grn_obj *newvalue, grn_obj *posting) @@ -5001,7 +5150,8 @@ grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, rc = GRN_NO_MEMORY_AVAILABLE; } else { if (new_->header.type == GRN_UVECTOR) { - rc = grn_uvector2updspecs(ctx, ii, rid, section, new_, new); + rc = grn_uvector2updspecs(ctx, ii, rid, section, new_, new, + GRN_TOKEN_ADD, post); } else { grn_obj 
uvector; unsigned int weight = 0; @@ -5010,7 +5160,8 @@ grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, uvector.header.impl_flags |= GRN_OBJ_WITH_WEIGHT; } grn_uvector_add_element(ctx, &uvector, GRN_RECORD_VALUE(new_), weight); - rc = grn_uvector2updspecs(ctx, ii, rid, section, &uvector, new); + rc = grn_uvector2updspecs(ctx, ii, rid, section, &uvector, new, + GRN_TOKEN_ADD, post); GRN_OBJ_FIN(ctx, &uvector); } } @@ -5095,7 +5246,8 @@ grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, rc = GRN_NO_MEMORY_AVAILABLE; } else { if (old_->header.type == GRN_UVECTOR) { - rc = grn_uvector2updspecs(ctx, ii, rid, section, old_, old); + rc = grn_uvector2updspecs(ctx, ii, rid, section, old_, old, + GRN_TOKEN_DEL, NULL); } else { grn_obj uvector; unsigned int weight = 0; @@ -5104,7 +5256,8 @@ grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, uvector.header.impl_flags |= GRN_OBJ_WITH_WEIGHT; } grn_uvector_add_element(ctx, &uvector, GRN_RECORD_VALUE(old_), weight); - rc = grn_uvector2updspecs(ctx, ii, rid, section, &uvector, old); + rc = grn_uvector2updspecs(ctx, ii, rid, section, &uvector, old, + GRN_TOKEN_DEL, NULL); GRN_OBJ_FIN(ctx, &uvector); } } @@ -5130,7 +5283,11 @@ grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, grn_hash_delete_by_id(ctx, n, eid, NULL); } } else { - grn_ii_delete_one(ctx, ii, *tp, *u, n); + grn_rc r; + r = grn_ii_delete_one(ctx, ii, *tp, *u, n); + if (r) { + rc = r; + } } grn_ii_updspec_close(ctx, *u); }); @@ -5322,7 +5479,7 @@ token_info_skip(grn_ctx *ctx, token_info *ti, uint32_t rid, uint32_t sid) if (!(c = cursor_heap_min(ti->cursors))) { return GRN_END_OF_DATA; } p = c->post; if (p->rid > rid || (p->rid == rid && p->sid >= sid)) { break; } - cursor_heap_pop(ctx, ti->cursors); + cursor_heap_pop(ctx, ti->cursors, rid); } ti->pos = p->pos - ti->offset; ti->p = p; @@ -5362,7 +5519,7 @@ token_info_build(grn_ctx *ctx, grn_obj 
*lexicon, grn_ii *ii, const char *string, const char *key; uint32_t size; grn_rc rc = GRN_END_OF_DATA; - unsigned int token_flags = GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER; + unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; grn_token_cursor *token_cursor = grn_token_cursor_open(ctx, lexicon, string, string_len, GRN_TOKEN_GET, @@ -5394,11 +5551,11 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, tid = grn_token_cursor_next(ctx, token_cursor); if (token_cursor->force_prefix) { ef |= EX_PREFIX; } switch (token_cursor->status) { - case GRN_TOKEN_DOING : + case GRN_TOKEN_CURSOR_DOING : key = _grn_table_key(ctx, lexicon, tid, &size); ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, ef & EX_SUFFIX); break; - case GRN_TOKEN_DONE : + case GRN_TOKEN_CURSOR_DONE : ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, token_cursor->curr_size, 0, ef); /* @@ -5408,11 +5565,11 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, token_cursor->orig_blen, token_cursor->pos, ef); */ break; - case GRN_TOKEN_NOT_FOUND : + case GRN_TOKEN_CURSOR_NOT_FOUND : ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, token_cursor->orig_blen, 0, ef); break; - case GRN_TOKEN_DONE_SKIP : + case GRN_TOKEN_CURSOR_DONE_SKIP : *only_skip_token = GRN_TRUE; goto exit; default : @@ -5420,16 +5577,17 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, } if (!ti) { goto exit ; } tis[(*n)++] = ti; - while (token_cursor->status == GRN_TOKEN_DOING) { + while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { tid = grn_token_cursor_next(ctx, token_cursor); + if (token_cursor->force_prefix) { ef |= EX_PREFIX; } switch (token_cursor->status) { - case GRN_TOKEN_DONE_SKIP : + case GRN_TOKEN_CURSOR_DONE_SKIP : continue; - case GRN_TOKEN_DOING : + case GRN_TOKEN_CURSOR_DOING : key = _grn_table_key(ctx, lexicon, tid, &size); ti = 
token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, EX_NONE); break; - case GRN_TOKEN_DONE : + case GRN_TOKEN_CURSOR_DONE : if (tid) { key = _grn_table_key(ctx, lexicon, tid, &size); ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, ef & EX_PREFIX); @@ -5440,7 +5598,9 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, token_cursor->curr_size, token_cursor->pos, ef & EX_PREFIX); break; } - if (!ti) { goto exit; } + if (!ti) { + goto exit; + } tis[(*n)++] = ti; } rc = GRN_SUCCESS; @@ -5460,7 +5620,7 @@ token_info_clear_offset(token_info **tis, uint32_t n) /* select */ inline static void -res_add(grn_ctx *ctx, grn_hash *s, grn_rset_posinfo *pi, uint32_t score, +res_add(grn_ctx *ctx, grn_hash *s, grn_rset_posinfo *pi, double score, grn_operator op) { grn_rset_recinfo *ri; @@ -5619,7 +5779,7 @@ typedef enum { grn_wv_constant } grn_wv_mode; -inline static int +inline static double get_weight(grn_ctx *ctx, grn_hash *s, grn_id rid, int sid, grn_wv_mode wvm, grn_select_optarg *optarg) { @@ -5656,7 +5816,7 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii, grn_rc rc = GRN_SUCCESS; grn_hash *h; grn_token_cursor *token_cursor; - unsigned int token_flags = GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER; + unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; grn_obj *lexicon = ii->lexicon; if (!lexicon || !ii || !string || !string_len || !s || !optarg) { return GRN_INVALID_ARGUMENT; } if (!(h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(int), 0))) { @@ -5668,8 +5828,8 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii, return GRN_NO_MEMORY_AVAILABLE; } if (!(max_size = optarg->max_size)) { max_size = 1048576; } - while (token_cursor->status != GRN_TOKEN_DONE && - token_cursor->status != GRN_TOKEN_DONE_SKIP) { + while (token_cursor->status != GRN_TOKEN_CURSOR_DONE && + token_cursor->status != GRN_TOKEN_CURSOR_DONE_SKIP) { if ((tid = grn_token_cursor_next(ctx, token_cursor))) { if (grn_hash_add(ctx, h, 
&tid, sizeof(grn_id), (void **)&w1, NULL)) { (*w1)++; } } @@ -5749,7 +5909,7 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii, if (rep) { while (grn_ii_cursor_next(ctx, c)) { pos = c->post; - if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg))) { + if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg)) > 0) { while (grn_ii_cursor_next_pos(ctx, c)) { res_add(ctx, s, (grn_rset_posinfo *) pos, *w1 * w2 * (1 + pos->weight), op); } @@ -5758,7 +5918,7 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii, } else { while (grn_ii_cursor_next(ctx, c)) { pos = c->post; - if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg))) { + if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg)) > 0) { res_add(ctx, s, (grn_rset_posinfo *) pos, *w1 * w2 * (pos->tf + pos->weight), op); } } @@ -5859,6 +6019,215 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string, return rc; } +#ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH +static grn_bool +grn_ii_select_sequential_search_should_use(grn_ctx *ctx, + grn_ii *ii, + const char *raw_query, + unsigned int raw_query_len, + grn_hash *result, + grn_operator op, + grn_wv_mode wvm, + grn_select_optarg *optarg, + token_info **token_infos, + uint32_t n_token_infos, + double too_many_index_match_ratio) +{ + int n_sources; + + if (too_many_index_match_ratio < 0.0) { + return GRN_FALSE; + } + + if (op != GRN_OP_AND) { + return GRN_FALSE; + } + + if (optarg->mode != GRN_OP_EXACT) { + return GRN_FALSE; + } + + n_sources = ii->obj.source_size / sizeof(grn_id); + if (n_sources == 0) { + return GRN_FALSE; + } + + { + uint32_t i; + int n_existing_records; + + n_existing_records = GRN_HASH_SIZE(result); + for (i = 0; i < n_token_infos; i++) { + token_info *info = token_infos[i]; + if (n_existing_records <= (info->size * too_many_index_match_ratio)) { + return GRN_TRUE; + } + } + return GRN_FALSE; + } +} + +static void +grn_ii_select_sequential_search_body(grn_ctx *ctx, + grn_ii *ii, + grn_obj *normalizer, + 
grn_encoding encoding, + OnigRegex regex, + grn_hash *result, + grn_operator op, + grn_wv_mode wvm, + grn_select_optarg *optarg) +{ + int i, n_sources; + grn_id *source_ids = ii->obj.source; + grn_obj buffer; + + GRN_TEXT_INIT(&buffer, 0); + n_sources = ii->obj.source_size / sizeof(grn_id); + for (i = 0; i < n_sources; i++) { + grn_id source_id = source_ids[i]; + grn_obj *source; + char column_name[GRN_TABLE_MAX_KEY_SIZE]; + int column_name_size; + grn_obj *accessor; + + source = grn_ctx_at(ctx, source_id); + column_name_size = grn_column_name(ctx, source, + column_name, + GRN_TABLE_MAX_KEY_SIZE); + accessor = grn_obj_column(ctx, (grn_obj *)result, column_name, + column_name_size); + { + grn_hash_cursor *cursor; + grn_id id; + cursor = grn_hash_cursor_open(ctx, result, NULL, 0, NULL, 0, 0, -1, 0); + while ((id = grn_hash_cursor_next(ctx, cursor)) != GRN_ID_NIL) { + OnigPosition position; + grn_obj *value; + const char *normalized_value; + unsigned int normalized_value_length; + + GRN_BULK_REWIND(&buffer); + grn_obj_get_value(ctx, accessor, id, &buffer); + value = grn_string_open_(ctx, + GRN_TEXT_VALUE(&buffer), + GRN_TEXT_LEN(&buffer), + normalizer, 0, encoding); + grn_string_get_normalized(ctx, value, + &normalized_value, &normalized_value_length, + NULL); + position = onig_search(regex, + normalized_value, + normalized_value + normalized_value_length, + normalized_value, + normalized_value + normalized_value_length, + NULL, + 0); + if (position != ONIG_MISMATCH) { + grn_rset_posinfo info; + double score; + info.rid = id; + info.sid = i + 1; + info.pos = 0; + score = get_weight(ctx, result, info.rid, info.sid, wvm, optarg); + res_add(ctx, result, &info, score, op); + } + grn_obj_unlink(ctx, value); + } + grn_hash_cursor_close(ctx, cursor); + } + grn_obj_unlink(ctx, accessor); + } + grn_obj_unlink(ctx, &buffer); +} + +static grn_bool +grn_ii_select_sequential_search(grn_ctx *ctx, + grn_ii *ii, + const char *raw_query, + unsigned int raw_query_len, + grn_hash 
*result, + grn_operator op, + grn_wv_mode wvm, + grn_select_optarg *optarg, + token_info **token_infos, + uint32_t n_token_infos) +{ + grn_bool processed = GRN_TRUE; + + { + /* Disabled by default. */ + double too_many_index_match_ratio = -1; + char too_many_index_match_ratio_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_II_SELECT_TOO_MANY_INDEX_MATCH_RATIO", + too_many_index_match_ratio_env, + GRN_ENV_BUFFER_SIZE); + if (too_many_index_match_ratio_env[0]) { + too_many_index_match_ratio = atof(too_many_index_match_ratio_env); + } + + if (!grn_ii_select_sequential_search_should_use(ctx, + ii, + raw_query, + raw_query_len, + result, + op, + wvm, + optarg, + token_infos, + n_token_infos, + too_many_index_match_ratio)) { + return GRN_FALSE; + } + } + + { + grn_encoding encoding; + grn_obj *normalizer; + int nflags = 0; + grn_obj *query; + const char *normalized_query; + unsigned int normalized_query_length; + + grn_table_get_info(ctx, ii->lexicon, + NULL, &encoding, NULL, &normalizer, NULL); + query = grn_string_open_(ctx, raw_query, raw_query_len, + normalizer, nflags, encoding); + grn_string_get_normalized(ctx, query, + &normalized_query, &normalized_query_length, + NULL); + { + OnigRegex regex; + int onig_result; + OnigErrorInfo error_info; + onig_result = onig_new(&regex, + normalized_query, + normalized_query + normalized_query_length, + ONIG_OPTION_NONE, + ONIG_ENCODING_UTF8, + ONIG_SYNTAX_ASIS, + &error_info); + if (onig_result == ONIG_NORMAL) { + grn_ii_select_sequential_search_body(ctx, ii, normalizer, encoding, + regex, result, op, wvm, optarg); + onig_free(regex); + } else { + char message[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(message, onig_result, error_info); + GRN_LOG(ctx, GRN_LOG_WARNING, + "[ii][select][sequential] " + "failed to create regular expression object: %s", + message); + processed = GRN_FALSE; + } + } + grn_obj_unlink(ctx, query); + } + + return processed; +} +#endif + grn_rc grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char 
*string, unsigned int string_len, grn_hash *s, grn_operator op, grn_select_optarg *optarg) @@ -5872,6 +6241,9 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ grn_operator mode = GRN_OP_EXACT; grn_wv_mode wvm = grn_wv_none; grn_obj *lexicon = ii->lexicon; + grn_scorer_score_func *score_func = NULL; + grn_scorer_matched_record record; + if (!lexicon || !ii || !s) { return GRN_INVALID_ARGUMENT; } if (optarg) { mode = optarg->mode; @@ -5938,6 +6310,31 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ goto exit; } */ +#ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH + if (grn_ii_select_sequential_search(ctx, ii, string, string_len, + s, op, wvm, optarg, tis, n)) { + goto exit; + } +#endif + + if (optarg && optarg->scorer) { + grn_proc *scorer = (grn_proc *)(optarg->scorer); + score_func = scorer->callbacks.scorer.score; + record.table = grn_ctx_at(ctx, s->obj.header.domain); + record.lexicon = lexicon; + record.id = GRN_ID_NIL; + GRN_RECORD_INIT(&(record.terms), GRN_OBJ_VECTOR, lexicon->header.domain); + GRN_UINT32_INIT(&(record.term_weights), GRN_OBJ_VECTOR); + record.total_term_weights = 0; + record.n_documents = grn_table_size(ctx, record.table); + record.n_occurrences = 0; + record.n_candidates = 0; + record.n_tokens = 0; + record.weight = 0; + record.args_expr = optarg->scorer_args_expr; + record.args_expr_offset = optarg->scorer_args_expr_offset; + } + for (;;) { rid = (*tis)->p->rid; sid = (*tis)->p->sid; @@ -5951,11 +6348,18 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ } } weight = get_weight(ctx, s, rid, sid, wvm, optarg); - if (tip == tie && weight) { + if (tip == tie && weight > 0) { grn_rset_posinfo pi = {rid, sid, 0}; if (orp || grn_hash_get(ctx, s, &pi, s->key_size, NULL)) { int count = 0, noccur = 0, pos = 0, score = 0, tscore = 0, min, max; + if (score_func) { + GRN_BULK_REWIND(&(record.terms)); + GRN_BULK_REWIND(&(record.term_weights)); + 
record.n_candidates = 0; + record.n_tokens = 0; + } + #define SKIP_OR_BREAK(pos) {\ if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \ if (ti->p->rid != rid || ti->p->sid != sid) { \ @@ -5967,6 +6371,13 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ if (n == 1 && !rep) { noccur = (*tis)->p->tf; tscore = (*tis)->p->weight; + if (score_func) { + GRN_RECORD_PUT(ctx, &(record.terms), (*tis)->cursors->bins[0]->id); + GRN_UINT32_PUT(ctx, &(record.term_weights), tscore); + record.n_occurrences = noccur; + record.n_candidates = (*tis)->size; + record.n_tokens = (*tis)->ntoken; + } } else if (mode == GRN_OP_NEAR) { bt_zap(bt); for (tip = tis; tip < tie; tip++) { @@ -6003,6 +6414,18 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ score += ti->p->weight; count++; } else { score = ti->p->weight; count = 1; pos = ti->pos; + if (noccur == 0 && score_func) { + GRN_BULK_REWIND(&(record.terms)); + GRN_BULK_REWIND(&(record.term_weights)); + record.n_candidates = 0; + record.n_tokens = 0; + } + } + if (noccur == 0 && score_func) { + GRN_RECORD_PUT(ctx, &(record.terms), ti->cursors->bins[0]->id); + GRN_UINT32_PUT(ctx, &(record.term_weights), ti->p->weight); + record.n_candidates += ti->size; + record.n_tokens += ti->ntoken; } if (count == n) { if (rep) { pi.pos = pos; res_add(ctx, s, &pi, (score + 1) * weight, op); } @@ -6012,13 +6435,29 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ } } } - if (noccur && !rep) { res_add(ctx, s, &pi, (noccur + tscore) * weight, op); } + if (noccur && !rep) { + double record_score; + if (score_func) { + record.id = rid; + record.weight = weight; + record.n_occurrences = noccur; + record.total_term_weights = tscore; + record_score = score_func(ctx, &record) * weight; + } else { + record_score = (noccur + tscore) * weight; + } + res_add(ctx, s, &pi, record_score, op); + } #undef SKIP_OR_BREAK } } if (token_info_skip(ctx, *tis, nrid, 
nsid)) { goto exit; } } exit : + if (score_func) { + GRN_OBJ_FIN(ctx, &(record.terms)); + GRN_OBJ_FIN(ctx, &(record.term_weights)); + } for (tip = tis; tip < tis + n; tip++) { if (*tip) { token_info_close(ctx, *tip); } } @@ -6039,6 +6478,93 @@ exit : return rc; } +uint32_t +grn_ii_estimate_size_for_query(grn_ctx *ctx, grn_ii *ii, + const char *query, unsigned int query_len, + grn_search_optarg *optarg) +{ + grn_rc rc; + grn_obj *lexicon = ii->lexicon; + token_info **tis = NULL; + uint32_t i; + uint32_t n_tis = 0; + grn_bool only_skip_token = GRN_FALSE; + grn_operator mode = GRN_OP_EXACT; + double estimated_size = 0; + + if (query_len == 0) { + return 0; + } + + tis = GRN_MALLOC(sizeof(token_info *) * query_len * 2); + if (!tis) { + return 0; + } + + if (optarg) { + switch (optarg->mode) { + case GRN_OP_NEAR : + case GRN_OP_NEAR2 : + mode = optarg->mode; + break; + case GRN_OP_SIMILAR : + mode = optarg->mode; + break; + case GRN_OP_REGEXP : + mode = optarg->mode; + break; + default : + break; + } + } + + rc = token_info_build(ctx, lexicon, ii, query, query_len, + tis, &n_tis, &only_skip_token, mode); + if (rc != GRN_SUCCESS) { + goto exit; + } + + for (i = 0; i < n_tis; i++) { + token_info *ti = tis[i]; + double term_estimated_size; + term_estimated_size = ((double)ti->size / ti->ntoken); + if (i == 0) { + estimated_size = term_estimated_size; + } else { + estimated_size = fmin(estimated_size, term_estimated_size); + } + } + +exit : + for (i = 0; i < n_tis; i++) { + token_info *ti = tis[i]; + if (ti) { + token_info_close(ctx, ti); + } + } + if (tis) { + GRN_FREE(tis); + } + + return estimated_size; +} + +uint32_t +grn_ii_estimate_size_for_lexicon_cursor(grn_ctx *ctx, grn_ii *ii, + grn_table_cursor *lexicon_cursor) +{ + grn_id term_id; + uint32_t estimated_size = 0; + + while ((term_id = grn_table_cursor_next(ctx, lexicon_cursor)) != GRN_ID_NIL) { + uint32_t term_estimated_size; + term_estimated_size = grn_ii_estimate_size(ctx, ii, term_id); + estimated_size += 
term_estimated_size; + } + + return estimated_size; +} + grn_rc grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len, grn_hash *s, grn_operator op, grn_search_optarg *optarg) @@ -6046,7 +6572,7 @@ grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len ERRCLR(ctx); GRN_LOG(ctx, GRN_LOG_INFO, "grn_ii_sel > (%.*s)", string_len, string); { - grn_select_optarg arg = {GRN_OP_EXACT, 0, 0, NULL, 0, NULL, NULL, 0}; + grn_select_optarg arg = {GRN_OP_EXACT, 0, 0, NULL, 0, NULL, NULL, 0, NULL}; if (!s) { return GRN_INVALID_ARGUMENT; } if (optarg) { switch (optarg->mode) { @@ -6059,6 +6585,9 @@ grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len arg.mode = optarg->mode; arg.similarity_threshold = optarg->similarity_threshold; break; + case GRN_OP_REGEXP : + arg.mode = optarg->mode; + break; default : break; } @@ -6066,6 +6595,9 @@ grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len arg.weight_vector = optarg->weight_vector; arg.vector_size = optarg->vector_size; } + arg.scorer = optarg->scorer; + arg.scorer_args_expr = optarg->scorer_args_expr; + arg.scorer_args_expr_offset = optarg->scorer_args_expr_offset; } /* todo : support subrec grn_rset_init(ctx, s, grn_rec_document, 0, grn_rec_none, 0, 0); @@ -6198,11 +6730,11 @@ grn_ii_cursor_next_all(grn_ctx *ctx, grn_ii_cursor *c) uint8_t *cp; grn_io_win iw; uint32_t size = c->cinfo[c->curr_chunk].size; - if (size && (cp = WIN_MAP2(c->ii->chunk, ctx, &iw, - c->cinfo[c->curr_chunk].segno, 0, - size, grn_io_rdonly))) { + if (size && (cp = WIN_MAP(c->ii->chunk, ctx, &iw, + c->cinfo[c->curr_chunk].segno, 0, + size, grn_io_rdonly))) { grn_p_decv(ctx, cp, size, c->rdv, c->ii->n_elements); - grn_io_win_unmap2(&iw); + grn_io_win_unmap(&iw); } else { c->pc.rid = 0; break; @@ -6367,7 +6899,7 @@ grn_ii_cursor_inspect(grn_ctx *ctx, grn_ii_cursor *c, grn_obj *buf) } void -grn_ii_inspect_elements(grn_ctx *ctx, grn_ii *ii, grn_obj *buf) 
+grn_ii_inspect_values(grn_ctx *ctx, grn_ii *ii, grn_obj *buf) { grn_table_cursor *tc; GRN_TEXT_PUTS(ctx, buf, "["); @@ -6396,8 +6928,12 @@ grn_ii_inspect_elements(grn_ctx *ctx, grn_ii *ii, grn_obj *buf) /********************** buffered index builder ***********************/ -const grn_id II_BUFFER_RID_FLAG = 0x80000000; -const grn_id II_BUFFER_WEIGHT_FLAG = 0x40000000; +const grn_id II_BUFFER_TYPE_MASK = 0xc0000000; +#define II_BUFFER_TYPE_RID 0x80000000 +#define II_BUFFER_TYPE_WEIGHT 0x40000000 +#define II_BUFFER_TYPE(id) (((id) & II_BUFFER_TYPE_MASK)) +#define II_BUFFER_PACK(value, type) ((value) | (type)) +#define II_BUFFER_UNPACK(id, type) ((id) & ~(type)) #ifdef II_BUFFER_ORDER_BY_ID const int II_BUFFER_ORDER = GRN_CURSOR_BY_ID; #else /* II_BUFFER_ORDER_BY_ID */ @@ -6426,8 +6962,8 @@ typedef struct { } ii_buffer_counter; typedef struct { - off_t head; - off_t tail; + off64_t head; + off64_t tail; uint32_t nextsize; uint8_t *buffer; uint32_t buffersize; @@ -6450,7 +6986,7 @@ struct _grn_ii_buffer { char tmpfpath[PATH_MAX]; uint64_t update_buffer_size; // stuff for parsing - off_t filepos; + off64_t filepos; grn_id *block_buf; size_t block_buf_size; size_t block_pos; @@ -6521,7 +7057,7 @@ allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer) bufsize_ = bufsize; } } - GRN_LOG(ctx, GRN_LOG_INFO, "flushing:%d bufsize:%zu", + GRN_LOG(ctx, GRN_LOG_INFO, "flushing:%d bufsize:%" GRN_FMT_SIZE, ii_buffer->nblocks, bufsize); return (uint8_t *)GRN_MALLOC(bufsize); } @@ -6573,7 +7109,7 @@ encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer, } if (outbufp_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < outbufp) { uint32_t size = outbufp - outbufp_ + sizeof(uint32_t); - memcpy(pnext, &size, sizeof(uint32_t)); + grn_memcpy(pnext, &size, sizeof(uint32_t)); pnext = outbufp; outbufp += sizeof(uint32_t); outbufp_ = outbufp; @@ -6582,7 +7118,7 @@ encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_table_cursor_close(ctx, tc); if (outbufp_ < outbufp) { uint32_t size = outbufp - 
outbufp_; - memcpy(pnext, &size, sizeof(uint32_t)); + grn_memcpy(pnext, &size, sizeof(uint32_t)); } return outbufp - outbuf; } @@ -6599,59 +7135,66 @@ encode_postings(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf) uint32_t flags = ii_buffer->ii->header->flags; for (rest = ii_buffer->block_pos; rest; bp++, rest--) { grn_id id = *bp; - if (id & II_BUFFER_RID_FLAG) { - rid = id - II_BUFFER_RID_FLAG; + switch (II_BUFFER_TYPE(id)) { + case II_BUFFER_TYPE_RID : + rid = II_BUFFER_UNPACK(id, II_BUFFER_TYPE_RID); if ((flags & GRN_OBJ_WITH_SECTION) && rest) { sid = *++bp; rest--; } weight = 0; pos = 0; - } else if (id & II_BUFFER_WEIGHT_FLAG) { - weight = id - II_BUFFER_WEIGHT_FLAG; - } else { - ii_buffer_counter *counter = &ii_buffer->counters[id - 1]; - if (counter->last_rid == rid && counter->last_sid == sid) { - counter->last_tf++; - counter->last_weight += weight; - } else { - if (counter->last_tf) { - uint8_t *p = outbuf + counter->offset_tf; - GRN_B_ENC(counter->last_tf - 1, p); - counter->offset_tf = p - outbuf; - if (flags & GRN_OBJ_WITH_WEIGHT) { - p = outbuf + counter->offset_weight; - GRN_B_ENC(counter->last_weight, p); - counter->offset_weight = p - outbuf; + break; + case II_BUFFER_TYPE_WEIGHT : + weight = II_BUFFER_UNPACK(id, II_BUFFER_TYPE_WEIGHT); + break; + default : + { + ii_buffer_counter *counter = &ii_buffer->counters[id - 1]; + if (counter->last_rid == rid && counter->last_sid == sid) { + counter->last_tf++; + counter->last_weight += weight; + } else { + if (counter->last_tf) { + uint8_t *p = outbuf + counter->offset_tf; + GRN_B_ENC(counter->last_tf - 1, p); + counter->offset_tf = p - outbuf; + if (flags & GRN_OBJ_WITH_WEIGHT) { + p = outbuf + counter->offset_weight; + GRN_B_ENC(counter->last_weight, p); + counter->offset_weight = p - outbuf; + } } - } - { - uint8_t *p = outbuf + counter->offset_rid; - GRN_B_ENC(rid - counter->last_rid, p); - counter->offset_rid = p - outbuf; - } - if (flags & GRN_OBJ_WITH_SECTION) { - uint8_t *p = outbuf + 
counter->offset_sid; - if (counter->last_rid != rid) { - GRN_B_ENC(sid - 1, p); - } else { - GRN_B_ENC(sid - counter->last_sid - 1, p); + { + uint8_t *p = outbuf + counter->offset_rid; + GRN_B_ENC(rid - counter->last_rid, p); + counter->offset_rid = p - outbuf; } - counter->offset_sid = p - outbuf; + if (flags & GRN_OBJ_WITH_SECTION) { + uint8_t *p = outbuf + counter->offset_sid; + if (counter->last_rid != rid) { + GRN_B_ENC(sid - 1, p); + } else { + GRN_B_ENC(sid - counter->last_sid - 1, p); + } + counter->offset_sid = p - outbuf; + } + counter->last_rid = rid; + counter->last_sid = sid; + counter->last_tf = 1; + counter->last_weight = weight; + counter->last_pos = 0; + } + if ((flags & GRN_OBJ_WITH_POSITION) && rest) { + uint8_t *p = outbuf + counter->offset_pos; + pos = *++bp; + rest--; + GRN_B_ENC(pos - counter->last_pos, p); + counter->offset_pos = p - outbuf; + counter->last_pos = pos; } - counter->last_rid = rid; - counter->last_sid = sid; - counter->last_tf = 1; - counter->last_weight = weight; - counter->last_pos = 0; - } - if (flags & GRN_OBJ_WITH_POSITION) { - uint8_t *p = outbuf + counter->offset_pos; - GRN_B_ENC(pos - counter->last_pos, p); - counter->offset_pos = p - outbuf; - counter->last_pos = pos; } - pos++; + break; } } } @@ -6679,7 +7222,7 @@ grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) size_t encsize; uint8_t *outbuf; ii_buffer_block *block; - GRN_LOG(ctx, GRN_LOG_NOTICE, "flushing:%d npostings:%zu", + GRN_LOG(ctx, GRN_LOG_NOTICE, "flushing:%d npostings:%" GRN_FMT_SIZE, ii_buffer->nblocks, ii_buffer->block_pos); if (!(block = block_new(ctx, ii_buffer))) { return; } if (!(outbuf = allocate_outbuf(ctx, ii_buffer))) { return; } @@ -6687,7 +7230,7 @@ grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) encode_postings(ctx, ii_buffer, outbuf); encode_last_tf(ctx, ii_buffer, outbuf); { - ssize_t r = GRN_WRITE(ii_buffer->tmpfd, outbuf, encsize); + ssize_t r = grn_write(ii_buffer->tmpfd, outbuf, encsize); if (r != encsize) { 
ERR(GRN_INPUT_OUTPUT_ERROR, "write returned %" GRN_FMT_LLD " != %" GRN_FMT_LLU, (long long int)r, (unsigned long long int)encsize); @@ -6701,7 +7244,7 @@ grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) grn_table_size(ctx, ii_buffer->tmp_lexicon) * sizeof(ii_buffer_counter)); grn_obj_close(ctx, ii_buffer->tmp_lexicon); - GRN_LOG(ctx, GRN_LOG_NOTICE, "flushed: %d encsize:%zu", + GRN_LOG(ctx, GRN_LOG_NOTICE, "flushed: %d encsize:%" GRN_FMT_SIZE, ii_buffer->nblocks, encsize); ii_buffer->tmp_lexicon = NULL; ii_buffer->nblocks++; @@ -6719,9 +7262,10 @@ get_tmp_lexicon(grn_ctx *ctx, grn_ii_buffer *ii_buffer) grn_obj *range = grn_ctx_at(ctx, DB_OBJ(ii_buffer->lexicon)->range); grn_obj *tokenizer; grn_obj *normalizer; + grn_obj *token_filters; grn_obj_flags flags; grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, - &tokenizer, &normalizer, NULL); + &tokenizer, &normalizer, &token_filters); flags &= ~GRN_OBJ_PERSISTENT; tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range); if (tmp_lexicon) { @@ -6730,6 +7274,8 @@ get_tmp_lexicon(grn_ctx *ctx, grn_ii_buffer *ii_buffer) GRN_INFO_DEFAULT_TOKENIZER, tokenizer); grn_obj_set_info(ctx, tmp_lexicon, GRN_INFO_NORMALIZER, normalizer); + grn_obj_set_info(ctx, tmp_lexicon, + GRN_INFO_TOKEN_FILTERS, token_filters); if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { grn_pat_cache_enable(ctx, (grn_pat *)tmp_lexicon, PAT_CACHE_SIZE); } @@ -6764,7 +7310,7 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid, { if (value_len) { grn_obj *tmp_lexicon; - uint32_t est_len = value_len + 2; + uint32_t est_len = value_len * 2 + 2; if (ii_buffer->block_buf_size < ii_buffer->block_pos + est_len) { grn_ii_buffer_flush(ctx, ii_buffer); } @@ -6780,24 +7326,27 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid, grn_token_cursor *token_cursor; grn_id *buffer = ii_buffer->block_buf; uint32_t block_pos = ii_buffer->block_pos; - buffer[block_pos++] = rid + 
II_BUFFER_RID_FLAG; - if ((ii_buffer->ii->header->flags & GRN_OBJ_WITH_SECTION)) { + uint32_t ii_flags = ii_buffer->ii->header->flags; + buffer[block_pos++] = II_BUFFER_PACK(rid, II_BUFFER_TYPE_RID); + if (ii_flags & GRN_OBJ_WITH_SECTION) { buffer[block_pos++] = sid; } if (weight) { - buffer[block_pos++] = weight + II_BUFFER_WEIGHT_FLAG; + buffer[block_pos++] = II_BUFFER_PACK(weight, II_BUFFER_TYPE_WEIGHT); } if ((token_cursor = grn_token_cursor_open(ctx, tmp_lexicon, value, value_len, GRN_TOKEN_ADD, token_flags))) { - uint32_t pos; - for (pos = 0; !token_cursor->status; pos++) { + while (!token_cursor->status) { grn_id tid; if ((tid = grn_token_cursor_next(ctx, token_cursor))) { ii_buffer_counter *counter; counter = get_buffer_counter(ctx, ii_buffer, tmp_lexicon, tid); if (!counter) { return; } buffer[block_pos++] = tid; + if (ii_flags & GRN_OBJ_WITH_POSITION) { + buffer[block_pos++] = token_cursor->pos; + } if (counter->last_rid != rid) { counter->offset_rid += GRN_B_ENC_SIZE(rid - counter->last_rid); counter->last_rid = rid; @@ -6825,8 +7374,9 @@ grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid, counter->last_pos = 0; counter->nrecs++; } - counter->offset_pos += GRN_B_ENC_SIZE(pos - counter->last_pos); - counter->last_pos = pos; + counter->offset_pos += + GRN_B_ENC_SIZE(token_cursor->pos - counter->last_pos); + counter->last_pos = token_cursor->pos; counter->last_tf++; counter->last_weight += weight; counter->nposts++; @@ -6857,11 +7407,20 @@ grn_ii_buffer_fetch(grn_ctx *ctx, grn_ii_buffer *ii_buffer, return; } } - if (lseek(ii_buffer->tmpfd, block->head, SEEK_SET) != block->head) { - SERR("lseek"); - return; + { + off64_t seeked_position; + seeked_position = grn_lseek(ii_buffer->tmpfd, block->head, SEEK_SET); + if (seeked_position != block->head) { + ERRNO_ERR("grn_lseek"); + GRN_LOG(ctx, GRN_LOG_ERROR, + "failed to " + "grn_lseek(%" GRN_FMT_OFF64_T ") -> %" GRN_FMT_OFF64_T, + block->head, + seeked_position); + return; + } } - if 
(read(ii_buffer->tmpfd, block->buffer, bytesize) != bytesize) { + if (grn_read(ii_buffer->tmpfd, block->buffer, bytesize) != bytesize) { SERR("read"); return; } @@ -6870,14 +7429,15 @@ grn_ii_buffer_fetch(grn_ctx *ctx, grn_ii_buffer *ii_buffer, if (block->head >= block->tail) { if (block->head > block->tail) { GRN_LOG(ctx, GRN_LOG_WARNING, - "fetch error: %jd > %jd", block->head, block->tail); + "fetch error: %" GRN_FMT_INT64D " > %" GRN_FMT_INT64D, + block->head, block->tail); } block->rest = block->nextsize; block->nextsize = 0; } else { block->rest = block->nextsize - sizeof(uint32_t); - memcpy(&block->nextsize, - &block->buffer[block->rest], sizeof(uint32_t)); + grn_memcpy(&block->nextsize, + &block->buffer[block->rest], sizeof(uint32_t)); } } } @@ -6899,11 +7459,11 @@ grn_ii_buffer_chunk_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) grn_io_win io_win; uint32_t chunk_number; chunk_new(ctx, ii_buffer->ii, &chunk_number, ii_buffer->packed_len); - GRN_LOG(ctx, GRN_LOG_INFO, "chunk:%d, packed_len:%zu", + GRN_LOG(ctx, GRN_LOG_INFO, "chunk:%d, packed_len:%" GRN_FMT_SIZE, chunk_number, ii_buffer->packed_len); - fake_map2(ctx, ii_buffer->ii->chunk, &io_win, ii_buffer->packed_buf, - chunk_number, ii_buffer->packed_len); - grn_io_win_unmap2(&io_win); + fake_map(ctx, ii_buffer->ii->chunk, &io_win, ii_buffer->packed_buf, + chunk_number, ii_buffer->packed_len); + grn_io_win_unmap(&io_win); ii_buffer->term_buffer->header.chunk = chunk_number; ii_buffer->term_buffer->header.chunk_size = ii_buffer->packed_len; ii_buffer->term_buffer->header.buffer_free = @@ -7165,12 +7725,10 @@ grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii, if (ii_buffer->counters) { ii_buffer->block_buf = GRN_MALLOCN(grn_id, II_BUFFER_BLOCK_SIZE); if (ii_buffer->block_buf) { - snprintf(ii_buffer->tmpfpath, PATH_MAX, - "%sXXXXXX", grn_io_path(ii->seg)); + grn_snprintf(ii_buffer->tmpfpath, PATH_MAX, PATH_MAX, + "%sXXXXXX", grn_io_path(ii->seg)); ii_buffer->block_buf_size = II_BUFFER_BLOCK_SIZE; - ii_buffer->tmpfd 
= GRN_MKOSTEMP(ii_buffer->tmpfpath, - O_WRONLY|O_CREAT|O_TRUNC, - S_IRUSR|S_IWUSR); + ii_buffer->tmpfd = grn_mkstemp(ii_buffer->tmpfpath); if (ii_buffer->tmpfd != -1) { grn_obj_flags flags; grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL, NULL, NULL); @@ -7210,7 +7768,7 @@ grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer) grn_ii_buffer_flush(ctx, ii_buffer); } if (ii_buffer->tmpfd != -1) { - GRN_CLOSE(ii_buffer->tmpfd); + grn_close(ii_buffer->tmpfd); } if (ii_buffer->block_buf) { GRN_FREE(ii_buffer->block_buf); @@ -7237,13 +7795,11 @@ grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer) ii_buffer->nblocks, ii_buffer->update_buffer_size); datavec_init(ctx, ii_buffer->data_vectors, ii_buffer->ii->n_elements, 0, 0); -#ifdef WIN32 - ii_buffer->tmpfd = GRN_OPEN(ii_buffer->tmpfpath, O_RDONLY|O_BINARY); -#else /* WIN32 */ - ii_buffer->tmpfd = GRN_OPEN(ii_buffer->tmpfpath, O_RDONLY); -#endif /* WIN32 */ + grn_open(ii_buffer->tmpfd, + ii_buffer->tmpfpath, + O_RDONLY | GRN_OPEN_FLAG_BINARY); if (ii_buffer->tmpfd == -1) { - SERR("oepn"); + ERRNO_ERR("oepn"); return ctx->rc; } { @@ -7283,10 +7839,10 @@ grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer) } datavec_fin(ctx, ii_buffer->data_vectors); GRN_LOG(ctx, GRN_LOG_NOTICE, - "tmpfile_size:%jd > total_chunk_size:%" GRN_FMT_INT64U, + "tmpfile_size:%" GRN_FMT_INT64D " > total_chunk_size:%" GRN_FMT_SIZE, ii_buffer->filepos, ii_buffer->total_chunk_size); - GRN_CLOSE(ii_buffer->tmpfd); - unlink(ii_buffer->tmpfpath); + grn_close(ii_buffer->tmpfd); + grn_unlink(ii_buffer->tmpfpath); ii_buffer->tmpfd = -1; return ctx->rc; } @@ -7304,8 +7860,8 @@ grn_ii_buffer_close(grn_ctx *ctx, grn_ii_buffer *ii_buffer) grn_obj_close(ctx, ii_buffer->tmp_lexicon); } if (ii_buffer->tmpfd != -1) { - GRN_CLOSE(ii_buffer->tmpfd); - unlink(ii_buffer->tmpfpath); + grn_close(ii_buffer->tmpfd); + grn_unlink(ii_buffer->tmpfpath); } if (ii_buffer->block_buf) { GRN_FREE(ii_buffer->block_buf); @@ -7382,7 +7938,21 @@ 
grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_rc grn_ii_build(grn_ctx *ctx, grn_ii *ii, uint64_t sparsity) { - grn_ii_buffer *ii_buffer = grn_ii_buffer_open(ctx, ii, sparsity); + grn_ii_buffer *ii_buffer; + + { + grn_obj *data_table; + + data_table = grn_ctx_at(ctx, DB_OBJ(ii)->range); + if (!data_table) { + return ctx->rc; + } + if (grn_table_size(ctx, data_table) == 0) { + return ctx->rc; + } + } + + ii_buffer = grn_ii_buffer_open(ctx, ii, sparsity); if (ii_buffer) { grn_id *s = ii->obj.source; if ((ii->obj.source_size) && s) { |