diff options
Diffstat (limited to 'src/backend/tsearch/wparser_def.c')
-rw-r--r-- | src/backend/tsearch/wparser_def.c | 173 |
1 files changed, 99 insertions, 74 deletions
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 9a351ac9da..e5c4d7cb04 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -1942,9 +1942,10 @@ prsd_end(PG_FUNCTION_ARGS) #define INTERESTINGWORD(j) \ (prs->words[j].item && !prs->words[j].repeated) -/* Don't want to end at a non-word or a short word */ +/* Don't want to end at a non-word or a short word, unless interesting */ #define BADENDPOINT(j) \ - (NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) + ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \ + !INTERESTINGWORD(j)) typedef struct { @@ -2003,75 +2004,97 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data) return false; } - -static bool -hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q) +/* + * hlFirstIndex: find first index >= pos containing any word used in query + * + * Returns -1 if no such index + */ +static int +hlFirstIndex(HeadlineParsedText *prs, TSQuery query, int pos) { - int i, - j; - QueryItem *item = GETQUERY(query); - int pos = *p; - - *q = -1; - *p = INT_MAX; + int i; - for (j = 0; j < query->size; j++) + /* For each word ... */ + for (i = pos; i < prs->curwords; i++) { - if (item->type != QI_VAL) + /* ... scan the query to see if this word matches any operand */ + QueryItem *item = GETQUERY(query); + int j; + + for (j = 0; j < query->size; j++) { + if (item->type == QI_VAL && + prs->words[i].item == &item->qoperand) + return i; item++; - continue; - } - for (i = pos; i < prs->curwords; i++) - { - if (prs->words[i].item == &item->qoperand) - { - if (i > *q) - *q = i; - break; - } } - item++; } + return -1; +} - if (*q < 0) - return false; +/* + * hlCover: try to find a substring of prs' word list that satisfies query + * + * At entry, *p must be the first word index to consider (initialize this to + * zero, or to the next index after a previous successful search). + * + * On success, sets *p to first word index and *q to last word index of the + * cover substring, and returns true. + * + * The result is a minimal cover, in the sense that both *p and *q will be + * words used in the query. + */ +static bool +hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q) +{ + int pmin, + pmax, + nextpmin, + nextpmax; + hlCheck ch; - item = GETQUERY(query); - for (j = 0; j < query->size; j++) + /* + * We look for the earliest, shortest substring of prs->words that + * satisfies the query. Both the pmin and pmax indices must be words + * appearing in the query; there's no point in trying endpoints in between + * such points. + */ + pmin = hlFirstIndex(prs, query, *p); + while (pmin >= 0) { - if (item->type != QI_VAL) + /* This useless assignment just keeps stupider compilers quiet */ + nextpmin = -1; + /* Consider substrings starting at pmin */ + ch.words = &(prs->words[pmin]); + /* Consider the length-one substring first, then longer substrings */ + pmax = pmin; + do { - item++; - continue; - } - for (i = *q; i >= pos; i--) - { - if (prs->words[i].item == &item->qoperand) + /* Try to match query against pmin .. pmax substring */ + ch.len = pmax - pmin + 1; + if (TS_execute(GETQUERY(query), &ch, + TS_EXEC_EMPTY, checkcondition_HL)) { - if (i < *p) - *p = i; - break; + *p = pmin; + *q = pmax; + return true; } - } - item++; - } - - if (*p <= *q) - { - hlCheck ch; + /* Nope, so advance pmax to next feasible endpoint */ + nextpmax = hlFirstIndex(prs, query, pmax + 1); - ch.words = &(prs->words[*p]); - ch.len = *q - *p + 1; - if (TS_execute(GETQUERY(query), &ch, TS_EXEC_EMPTY, checkcondition_HL)) - return true; - else - { - (*p)++; - return hlCover(prs, query, p, q); + /* + * If this is our first advance past pmin, then the result is also + * the next feasible value of pmin; remember it to save a + * redundant search. + */ + if (pmax == pmin) + nextpmin = nextpmax; + pmax = nextpmax; } + while (pmax >= 0); + /* No luck here, so try next feasible startpoint */ + pmin = nextpmin; } - return false; } @@ -2357,11 +2380,12 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall, int bestb = -1, beste = -1; int bestlen = -1; + bool bestcover = false; int pose, posb, poslen, curlen; - + bool poscover; int i; if (!highlightall) @@ -2387,14 +2411,6 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall, pose = i; } - /* XXX this optimization seems unnecessary and wrong */ - if (poslen < bestlen && !BADENDPOINT(beste)) - { - /* better cover already found, so try next cover */ - p++; - continue; - } - if (curlen < max_words) { /* @@ -2449,29 +2465,38 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall, i = q; for (; curlen > min_words; i--) { + if (!BADENDPOINT(i)) + break; if (!NONWORDTOKEN(prs->words[i].type)) curlen--; if (INTERESTINGWORD(i)) poslen--; - pose = i; - if (!BADENDPOINT(i)) - break; + pose = i - 1; } } /* - * Adopt this headline if it's the first, or if it has more - * interesting words and isn't ending at a bad endpoint, or if it - * replaces a bad endpoint with a good one (XXX even if it has - * fewer interesting words? Really?) + * Check whether the proposed headline includes the original + * cover; it might not if we trimmed it due to max_words. + */ + poscover = (posb <= p && pose >= q); + + /* + * Adopt this headline if it's better than the last one, giving + * highest priority to headlines including the cover, then to + * headlines with more interesting words, then to headlines with + * good stopping points. (Since bestlen is initially -1, we will + * certainly adopt the first headline.) */ - if (bestlen < 0 || - (poslen > bestlen && !BADENDPOINT(pose)) || - (!BADENDPOINT(pose) && BADENDPOINT(beste))) + if (poscover > bestcover || + (poscover == bestcover && poslen > bestlen) || + (poscover == bestcover && poslen == bestlen && + !BADENDPOINT(pose) && BADENDPOINT(beste))) { bestb = posb; beste = pose; bestlen = poslen; + bestcover = poscover; } /* move p to generate the next cover */ |