diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2014-06-30 21:46:29 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2014-06-30 21:46:29 -0400 |
commit | 1b2488731cc2c87cc9a4cb8d654e4d9981fdf9ac (patch) | |
tree | 45cbe297cbcfcdd475193642056484e7c49c4dbe /contrib/unaccent | |
parent | 97c40ce61465582b96944e41ed6ec06c2016b95c (diff) | |
download | postgresql-1b2488731cc2c87cc9a4cb8d654e4d9981fdf9ac.tar.gz |
Allow multi-character source strings in contrib/unaccent.
This could be useful in languages where diacritic signs are represented as
separate characters; more generally it supports using unaccent dictionaries
for substring substitutions beyond narrowly conceived "diacritic removal".
In any case, since the rule-file parser doesn't complain about
multi-character source strings, it behooves us to do something unsurprising
with them.
Diffstat (limited to 'contrib/unaccent')
-rw-r--r-- | contrib/unaccent/unaccent.c | 91 |
1 file changed, 59 insertions, 32 deletions
```diff
diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c
index 5a31f85a13..0101506b45 100644
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -23,9 +23,16 @@
 PG_MODULE_MAGIC;
 
 /*
- * Unaccent dictionary uses a trie to find a character to replace. Each node of
- * the trie is an array of 256 TrieChar structs (n-th element of array
- * corresponds to byte)
+ * An unaccent dictionary uses a trie to find a string to replace. Each node
+ * of the trie is an array of 256 TrieChar structs; the N-th element of the
+ * array corresponds to next byte value N. That element can contain both a
+ * replacement string (to be used if the source string ends with this byte)
+ * and a link to another trie node (to be followed if there are more bytes).
+ *
+ * Note that the trie search logic pays no attention to multibyte character
+ * boundaries. This is OK as long as both the data entered into the trie and
+ * the data we're trying to look up are validly encoded; no partial-character
+ * matches will occur.
  */
 typedef struct TrieChar
 {
@@ -36,34 +43,38 @@ typedef struct TrieChar
 
 /*
  * placeChar - put str into trie's structure, byte by byte.
+ *
+ * If node is NULL, we need to make a new node, which will be returned;
+ * otherwise the return value is the same as node.
  */
 static TrieChar *
-placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+placeChar(TrieChar *node, const unsigned char *str, int lenstr,
+		  const char *replaceTo, int replacelen)
 {
 	TrieChar *curnode;
 
 	if (!node)
-	{
-		node = palloc(sizeof(TrieChar) * 256);
-		memset(node, 0, sizeof(TrieChar) * 256);
-	}
+		node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
+
+	Assert(lenstr > 0); /* else str[0] doesn't exist */
 
 	curnode = node + *str;
 
-	if (lenstr == 1)
+	if (lenstr <= 1)
 	{
 		if (curnode->replaceTo)
-			elog(WARNING, "duplicate TO argument, use first one");
+			elog(WARNING, "duplicate source strings, first one will be used");
 		else
 		{
 			curnode->replacelen = replacelen;
-			curnode->replaceTo = palloc(replacelen);
+			curnode->replaceTo = (char *) palloc(replacelen);
 			memcpy(curnode->replaceTo, replaceTo, replacelen);
 		}
 	}
 	else
 	{
-		curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
+		curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
+									  replaceTo, replacelen);
 	}
 
 	return node;
@@ -213,23 +224,35 @@ initTrie(char *filename)
 }
 
 /*
- * findReplaceTo - find multibyte character in trie
+ * findReplaceTo - find longest possible match in trie
+ *
+ * On success, returns pointer to ending subnode, plus length of matched
+ * source string in *p_matchlen. On failure, returns NULL.
  */
 static TrieChar *
-findReplaceTo(TrieChar *node, unsigned char *src, int srclen)
+findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
+			  int *p_matchlen)
 {
-	while (node)
+	TrieChar *result = NULL;
+	int matchlen = 0;
+
+	*p_matchlen = 0; /* prevent uninitialized-variable warnings */
+
+	while (node && matchlen < srclen)
 	{
-		node = node + *src;
-		if (srclen == 1)
-			return node;
+		node = node + src[matchlen];
+		matchlen++;
+
+		if (node->replaceTo)
+		{
+			result = node;
+			*p_matchlen = matchlen;
+		}
 
-		src++;
-		srclen--;
 		node = node->nextChar;
 	}
 
-	return NULL;
+	return result;
 }
 
 PG_FUNCTION_INFO_V1(unaccent_init);
@@ -280,18 +303,17 @@ unaccent_lexize(PG_FUNCTION_ARGS)
 	TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
 	char *srcchar = (char *) PG_GETARG_POINTER(1);
 	int32 len = PG_GETARG_INT32(2);
-	char *srcstart,
+	char *srcstart = srcchar,
 	*trgchar = NULL;
-	int charlen;
 	TSLexeme *res = NULL;
-	TrieChar *node;
 
-	srcstart = srcchar;
-	while (srcchar - srcstart < len)
+	while (len > 0)
 	{
-		charlen = pg_mblen(srcchar);
+		TrieChar *node;
+		int matchlen;
 
-		node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
+		node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
+							 &matchlen);
 		if (node && node->replaceTo)
 		{
 			if (!res)
@@ -309,13 +331,18 @@ unaccent_lexize(PG_FUNCTION_ARGS)
 			memcpy(trgchar, node->replaceTo, node->replacelen);
 			trgchar += node->replacelen;
 		}
-		else if (res)
+		else
 		{
-			memcpy(trgchar, srcchar, charlen);
-			trgchar += charlen;
+			matchlen = pg_mblen(srcchar);
+			if (res)
+			{
+				memcpy(trgchar, srcchar, matchlen);
+				trgchar += matchlen;
+			}
 		}
 
-		srcchar += charlen;
+		srcchar += matchlen;
+		len -= matchlen;
 	}
 
 	if (res)
```