diff options
-rw-r--r-- | include/m_ctype.h | 8 | ||||
-rw-r--r-- | strings/ctype-uca.c | 230 | ||||
-rw-r--r-- | strings/ctype-uca.ic | 2 |
3 files changed, 228 insertions, 12 deletions
diff --git a/include/m_ctype.h b/include/m_ctype.h index b3b3a8093ea..8b9e878f960 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -190,6 +190,13 @@ typedef struct my_uca_level_booster_t } MY_UCA_LEVEL_BOOSTER; +typedef struct my_uca_level_booster_contractions_t +{ + size_t nitems_alloced; + MY_CONTRACTION *item; +} MY_UCA_LEVEL_BOOSTER_CONTRACTIONS; + + /* Collation weights on a single level (e.g. primary, secondary, tertiarty) */ typedef struct my_uca_level_info_st { @@ -199,6 +206,7 @@ typedef struct my_uca_level_info_st MY_CONTRACTIONS contractions; uint levelno; MY_UCA_LEVEL_BOOSTER *booster; + MY_UCA_LEVEL_BOOSTER_CONTRACTIONS contractions_booster; } MY_UCA_WEIGHT_LEVEL; diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index b8bc48eb6fc..7f913dfffa2 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -6541,7 +6541,8 @@ MY_UCA_INFO my_uca_v400= NULL /* flags */ }, 0, /* levelno */ - NULL /* weights_2bytes */ + NULL, /* weights_2bytes */ + {0} }, { 0, @@ -6553,7 +6554,8 @@ MY_UCA_INFO my_uca_v400= NULL }, 1, /* levelno */ - NULL /* weights_2bytes */ + NULL, /* weights_2bytes */ + {0} }, }, @@ -30100,7 +30102,8 @@ MY_UCA_INFO my_uca_v520_th= NULL /* flags */ }, 0, /* levelno */ - NULL /* weights_2bytes */ + NULL, /* weights_2bytes */ + {0} }, { 0x10FFFF, /* maxchar */ @@ -30112,7 +30115,8 @@ MY_UCA_INFO my_uca_v520_th= NULL /* flags */ }, 1, /* levelno */ - NULL /* weights_2bytes */ + NULL, /* weights_2bytes */ + {0} }, }, @@ -30148,7 +30152,8 @@ MY_UCA_INFO my_uca_v520= NULL /* flags */ }, 0, /* levelno */ - NULL /* weights_2bytes */ + NULL, /* weights_2bytes */ + {0} }, { @@ -30161,7 +30166,8 @@ MY_UCA_INFO my_uca_v520= NULL /* flags */ }, 1, /* levelno */ - NULL /* weights_2bytes */ + NULL, /* weights_2bytes */ + {0} }, }, @@ -31477,12 +31483,198 @@ my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc) */ static int -my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len) +my_wmemcmp(const my_wc_t *a, const my_wc_t *b, size_t len) { return memcmp(a, b, len * sizeof(my_wc_t)); } +/* + Test if the MY_CONTRACTION instance is equal to the wide + string with the given length. + Note, only true contractions are checked, + while previous context pairs always return FALSE. +*/ +static inline my_bool +my_uca_true_contraction_eq(const MY_CONTRACTION *c, + const my_wc_t *wc, size_t len) +{ + return (len >= MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) && + !c->with_context && + !my_wmemcmp(c->ch, wc, len); +} + + +/* + The number of elements must be a degree of 2. + This allows to use the faster & operator instead of the + slow % operator to find the remainder of the division: + pos= (start_pos + iteration) & MASK + instead of: + pos= (start_pos + iteration) % NUMBER_OF_PREALLOCED_HASH_ELEMENTS + + DUCET as of Unicode-14.0.0 has 939 default contractions. + CLDR-40 has around 2601 contractions (all collations total). + The built-in Myanmar collation tailoring has 912 contractions. + 4096 as the contraction prealloced hash size should be enough + for all collations. +*/ +#define MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS 4096 +#define MY_UCA_CONTRACTION_HASH_LSHIFT 2 +#define MY_UCA_CONTRACTION_HASH_MASK \ + ((MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS-1)>>MY_UCA_CONTRACTION_HASH_LSHIFT) +#define MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS \ + (MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS-1) + + +static uint16 +my_uca_level_booster_contractions_hash(my_wc_t a, my_wc_t b) +{ + return (uint16) (((a * 465 + b) & MY_UCA_CONTRACTION_HASH_MASK) << + MY_UCA_CONTRACTION_HASH_LSHIFT); +} + + +#define DBUG_UCA_CONTRACTIONS +#ifdef DBUG_UCA_CONTRACTIONS +static ulonglong collisions= 0; +static ulonglong collisions_eq= 0; +#endif + + +static my_bool +my_uca_level_booster_contractions_find_empty( + const MY_UCA_LEVEL_BOOSTER_CONTRACTIONS *cnt, + uint16 start, + uint16 *ppos) +{ + uint16 i; + for (i= 0; i < MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS; i++) + { + uint16 pos= (i + start) % cnt->nitems_alloced; + if (!cnt->item[pos].ch[0]) + { + *ppos= pos; + return FALSE; + } + } + return TRUE; +} + + +static inline const uint16 * +my_uca_level_booster_contractions_weight( + const MY_UCA_LEVEL_BOOSTER_CONTRACTIONS *cnt, + const my_wc_t *wc, size_t len) +{ + uint16 start= my_uca_level_booster_contractions_hash(wc[0], wc[1]); + uint16 i; + DBUG_ASSERT(len <= MY_UCA_MAX_CONTRACTION); + + for (i=0 ; i < MY_UCA_CONTRACTION_HASH_ALLOWED_COLLISIONS; i++) + { + uint16 pos= (i + start) % cnt->nitems_alloced; + const MY_CONTRACTION *c= &cnt->item[pos]; + if (!c->ch[0]) + break; + if (my_uca_true_contraction_eq(c, wc, len)) + return c->weight; + } + return NULL; +} + + +static my_bool +my_uca_level_booster_contractions_allocate( + MY_UCA_LEVEL_BOOSTER_CONTRACTIONS *dst, + MY_CHARSET_LOADER *loader) +{ + size_t nbytes= MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS * sizeof(MY_CONTRACTION); + bzero(dst, sizeof(*dst)); + if (!(dst->item= (MY_CONTRACTION*) (loader->once_alloc)(nbytes))) + return TRUE; + bzero(dst->item, nbytes); + dst->nitems_alloced= MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS; + return FALSE; +} + + +static my_bool +my_uca_level_booster_contractions_populate( + MY_UCA_LEVEL_BOOSTER_CONTRACTIONS *dst, + const MY_CONTRACTIONS *src) +{ + size_t i; + DBUG_ASSERT(dst->nitems_alloced > 0); + for (i= 0; i < src->nitems; i++) + { + const MY_CONTRACTION *c= &src->item[i]; + uint16 start= my_uca_level_booster_contractions_hash(c->ch[0], c->ch[1]); + if (!dst->item[start].ch[0]) + dst->item[start]= src->item[i]; + else + { + uint16 pos; +#ifdef DBUG_UCA_CONTRACTIONS + if (dst->item[start].ch[0] != c->ch[0] && + dst->item[start].ch[1] != c->ch[1]) + collisions++; + else + collisions_eq++; +#endif + if (my_uca_level_booster_contractions_find_empty(dst, start, &pos)) + return TRUE; + dst->item[pos]= src->item[i]; + } + } + return FALSE; +} + + +static const uint16 * +my_uca_scanner_booster_contractions_find(my_uca_scanner *scanner, my_wc_t currwc) +{ + size_t clen= 1; + int flag; + const uchar *s, *beg[MY_UCA_MAX_CONTRACTION]; + my_wc_t wc[MY_UCA_MAX_CONTRACTION]; + wc[0]= currwc; + + memset((void*) beg, 0, sizeof(beg)); + + /* Scan all contraction candidates */ + for (s= scanner->sbeg, flag= MY_UCA_CNT_MID1; + clen < MY_UCA_MAX_CONTRACTION; + flag<<= 1) + { + int mblen; + if ((mblen= my_ci_mb_wc(scanner->cs, &wc[clen], s, scanner->send)) <= 0) + break; + beg[clen]= s= s + mblen; + if (!my_uca_can_be_contraction_part(&scanner->level->contractions, + wc[clen++], flag)) + break; + } + + /* Find among candidates the longest real contraction */ + for ( ; clen > 1; clen--) + { + const uint16 *cweight; + if (my_uca_can_be_contraction_tail(&scanner->level->contractions, + wc[clen - 1]) && + (cweight= my_uca_level_booster_contractions_weight(&scanner->level->contractions_booster, + wc, clen))) + { + scanner->sbeg= beg[clen - 1]; + return cweight; + } + } + + return NULL; /* No contractions were found */ +} + + + /** Check if a string is a contraction, and return its weight array on success. @@ -31528,6 +31720,7 @@ my_uca_contraction_weight(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len) @retval ptr - contraction weight array */ +#if 0 static uint16 * my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc) { @@ -31569,7 +31762,7 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc) return NULL; /* No contractions were found */ } - +#endif /** Find weight for contraction with previous context @@ -31615,10 +31808,10 @@ my_uca_previous_context_find(const MY_CONTRACTIONS *list, @retval NULL if could not find any contextual weights for wc[0] @retval non null pointer to a zero-terminated weight string otherwise */ -static inline uint16 * +static inline const uint16 * my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc) { - uint16 *cweight; + const uint16 *cweight; my_wc_t prevwc; DBUG_ASSERT(scanner->level->contractions.nitems); /* @@ -31646,7 +31839,7 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc) currwc)) { /* Check if w[0] starts a contraction */ - if ((cweight= my_uca_scanner_contraction_find(scanner, currwc))) + if ((cweight= my_uca_scanner_booster_contractions_find(scanner, currwc))) return cweight; } return NULL; @@ -34206,6 +34399,21 @@ init_weight_level(MY_CHARSET_LOADER *loader, CHARSET_INFO *cs, if (cs->mbminlen == 1) dst->booster= my_uca_level_booster_new(loader, cs, dst); + + if (ncontractions) + { + if (ncontractions > MY_UCA_CONTRACTION_HASH_ALLOC_ELEMENTS || + my_uca_level_booster_contractions_allocate(&dst->contractions_booster, + loader) || + my_uca_level_booster_contractions_populate(&dst->contractions_booster, + &dst->contractions)) + { + my_snprintf(loader->error, sizeof(loader->error), + "Can't initialize %d contractions", (int) ncontractions); + return TRUE; + } + } + return FALSE; } diff --git a/strings/ctype-uca.ic b/strings/ctype-uca.ic index 5ae1696677d..ba6841284d7 100644 --- a/strings/ctype-uca.ic +++ b/strings/ctype-uca.ic @@ -143,7 +143,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) #if MY_UCA_COMPILE_CONTRACTIONS if (my_uca_needs_context_handling(scanner->level, currwc)) { - uint16 *cweight= my_uca_context_weight_find(scanner, currwc); + const uint16 *cweight= my_uca_context_weight_find(scanner, currwc); if (cweight) { if ((weight= my_uca_scanner_set_weight(scanner, cweight))) |