/* Copyright (c) 2004, 2006 MySQL AB Copyright (c) 2009-2011, Monty Program Ab Use is subject to license terms. Copyright (c) 2009-2011, Monty Program Ab This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ #include #include #include typedef unsigned char uchar; typedef unsigned short uint16; struct uca_item_st { uchar num; uint16 weight[4][9]; }; #if 0 #define MY_UCA_NPAGES 1024 #define MY_UCA_NCHARS 64 #define MY_UCA_CMASK 63 #define MY_UCA_PSHIFT 6 #else #define MY_UCA_NPAGES 4352 /* 0x110000 characters / 0x100 chars per page */ #define MY_UCA_NCHARS 256 #define MY_UCA_CMASK 255 #define MY_UCA_PSHIFT 8 #endif #define MAX_ALLOWED_CODE 0x10FFFF /* Name that goes into all array names */ static const char *global_name_prefix= "uca520"; /* Name prefix that goes into page weight array names after global_name_prefix */ static char *pname_prefix[]= {"_p", "_p", "_p"}; /* Name suffix that goes into page weight array names after page number */ static char *pname_suffix[]= {"", "_w2", "_w3"}; int main(int ac, char **av) { char str[256]; char *weights[64]; static struct uca_item_st uca[MAX_ALLOWED_CODE+1]; size_t code, w; int pageloaded[MY_UCA_NPAGES]; bzero(uca, sizeof(uca)); bzero(pageloaded, sizeof(pageloaded)); while (fgets(str,sizeof(str),stdin)) { char *comment; char *weight; char *s; size_t codenum; code= strtol(str,NULL,16); if (str[0]=='#' || (code > MAX_ALLOWED_CODE)) continue; if ((comment=strchr(str,'#'))) { *comment++= '\0'; for ( ; *comment==' ' ; comment++); }else continue; if ((weight=strchr(str,';'))) { *weight++= '\0'; for ( ; *weight==' ' ; weight++); } else continue; codenum= 0; s= strtok(str, " \t"); while (s) { s= strtok(NULL, " \t"); codenum++; } if (codenum>1) { /* Multi-character weight, i.e. contraction. Not supported yet. */ continue; } uca[code].num= 0; s= strtok(weight, " []"); while (s) { weights[uca[code].num]= s; s= strtok(NULL, " []"); uca[code].num++; } for (w=0; w < uca[code].num; w++) { size_t partnum; partnum= 0; s= weights[w]; while (*s) { char *endptr; size_t part; part= strtol(s+1,&endptr,16); uca[code].weight[partnum][w]= part; s= endptr; partnum++; } } /* Mark that a character from this page was loaded */ pageloaded[code >> MY_UCA_PSHIFT]++; } /* Now set implicit weights */ for (code=0; code <= MAX_ALLOWED_CODE; code++) { size_t base, aaaa, bbbb; if (uca[code].num) continue; /* 3400; 4DB5; 4E00; 9FA5; */ if (code >= 0x3400 && code <= 0x4DB5) base= 0xFB80; else if (code >= 0x4E00 && code <= 0x9FA5) base= 0xFB40; else base= 0xFBC0; aaaa= base + (code >> 15); bbbb= (code & 0x7FFF) | 0x8000; uca[code].weight[0][0]= aaaa; uca[code].weight[0][1]= bbbb; uca[code].weight[1][0]= 0x0020; uca[code].weight[1][1]= 0x0000; uca[code].weight[2][0]= 0x0002; uca[code].weight[2][1]= 0x0000; uca[code].weight[3][0]= 0x0001; uca[code].weight[3][2]= 0x0000; uca[code].num= 2; } printf("#include \"my_uca.h\"\n"); printf("#define MY_UCA_NPAGES %d\n",MY_UCA_NPAGES); printf("#define MY_UCA_NCHARS %d\n",MY_UCA_NCHARS); printf("#define MY_UCA_CMASK %d\n",MY_UCA_CMASK); printf("#define MY_UCA_PSHIFT %d\n",MY_UCA_PSHIFT); for (w=0; w<3; w++) { size_t page; int pagemaxlen[MY_UCA_NPAGES]; for (page=0; page < MY_UCA_NPAGES; page++) { size_t offs; size_t maxnum= 0; size_t nchars= 0; size_t mchars; size_t ndefs= 0; size_t code_line_start= page * MY_UCA_NCHARS; pagemaxlen[page]= 0; /* Skip this page if no weights were loaded */ if (!pageloaded[page]) continue; /* Calculate maximum weight length for this page */ for (offs=0; offs < MY_UCA_NCHARS; offs++) { size_t i, num; code= page*MY_UCA_NCHARS+offs; /* Calculate only non-zero weights */ for (num=0, i=0; i < uca[code].num; i++) if (uca[code].weight[w][i]) num++; maxnum= maxnum < num ? num : maxnum; /* Check if default weight */ if (w == 1 && num == 1) { /* 0020 0000 ... */ if (uca[code].weight[w][0] == 0x0020) ndefs++; } else if (w == 2 && num == 1) { /* 0002 0000 ... */ if (uca[code].weight[w][0] == 0x0002) ndefs++; } } maxnum++; /* If the page have only default weights then no needs to dump it, skip. */ if (ndefs == MY_UCA_NCHARS) { continue; } switch (maxnum) { case 0: mchars= 8; break; case 1: mchars= 8; break; case 2: mchars= 8; break; case 3: mchars= 9; break; case 4: mchars= 8; break; default: mchars= uca[code].num; } pagemaxlen[page]= maxnum; /* Now print this page */ printf("static const uint16 %s%s%03X%s[]= { /* %04X (%d weights per char) */\n", global_name_prefix, pname_prefix[w], (int) page, pname_suffix[w], (int) page*MY_UCA_NCHARS, (int) maxnum); for (offs=0; offs < MY_UCA_NCHARS; offs++) { uint16 weight[8]; size_t num, i; code= page*MY_UCA_NCHARS+offs; bzero(weight,sizeof(weight)); /* Copy non-zero weights */ for (num=0, i=0; i < uca[code].num; i++) { if (uca[code].weight[w][i]) { weight[num]= uca[code].weight[w][i]; num++; } } for (i=0; i < maxnum; i++) { /* Invert weights for secondary level to sort upper case letters before their lower case counter part. */ int tmp= weight[i]; if (w == 2 && tmp) tmp= (int)(0x20 - weight[i]); printf("0x%04X", tmp); if ((offs+1 != MY_UCA_NCHARS) || (i+1!=maxnum)) printf(","); else printf(" "); nchars++; } if (nchars >=mchars) { printf(" /* %04X */\n", (int) code_line_start); code_line_start= code + 1; nchars=0; } else { printf(" "); } } printf("};\n\n"); } printf("const uchar %s_length%s[%d]={\n", global_name_prefix, pname_suffix[w], MY_UCA_NPAGES); for (page=0; page < MY_UCA_NPAGES; page++) { printf("%d%s%s",pagemaxlen[page],page