diff options
Diffstat (limited to '3rdparty/clucene/src/CLucene/config/gunichartables.cpp')
-rw-r--r-- | 3rdparty/clucene/src/CLucene/config/gunichartables.cpp | 386 |
1 files changed, 386 insertions, 0 deletions
diff --git a/3rdparty/clucene/src/CLucene/config/gunichartables.cpp b/3rdparty/clucene/src/CLucene/config/gunichartables.cpp new file mode 100644 index 000000000..5463936f6 --- /dev/null +++ b/3rdparty/clucene/src/CLucene/config/gunichartables.cpp @@ -0,0 +1,386 @@ +/* + * Copyright (C) 1999 Tom Tromey + * Copyright (C) 2000 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * + ************************************************ + * Also licensed with permission from Tom Tromey + * and Owen Taylor under the Apache license. + * Original location: + * http://cvs.gnome.org/viewcvs/glib/glib/guniprop.c?view=log + ************************************************ + * + * Copyright 2003-2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Changes are Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +*/ + +#include "CLucene/StdHeader.h" + +typedef unsigned long gunichar; +typedef unsigned short guint16; +typedef short gint16; +typedef char gchar; +typedef unsigned char guchar; + +/* These are the possible character classifications. + * See http://www.unicode.org/Public/UNIDATA/UnicodeData.txt + or http://www.unicode.org/Public/UNIDATA/UCD.html. + + todo: i think there is a new version of the unicode, which we should use. + data is licensed like this: http://www.unicode.org/copyright.html... not sure but looks apache compatible + */ +typedef enum +{ + G_UNICODE_CONTROL, + G_UNICODE_FORMAT, + G_UNICODE_UNASSIGNED, + G_UNICODE_PRIVATE_USE, + G_UNICODE_SURROGATE, + G_UNICODE_LOWERCASE_LETTER, + G_UNICODE_MODIFIER_LETTER, + G_UNICODE_OTHER_LETTER, + G_UNICODE_TITLECASE_LETTER, + G_UNICODE_UPPERCASE_LETTER, + G_UNICODE_COMBINING_MARK, + G_UNICODE_ENCLOSING_MARK, + G_UNICODE_NON_SPACING_MARK, + G_UNICODE_DECIMAL_NUMBER, + G_UNICODE_LETTER_NUMBER, + G_UNICODE_OTHER_NUMBER, + G_UNICODE_CONNECT_PUNCTUATION, + G_UNICODE_DASH_PUNCTUATION, + G_UNICODE_CLOSE_PUNCTUATION, + G_UNICODE_FINAL_PUNCTUATION, + G_UNICODE_INITIAL_PUNCTUATION, + G_UNICODE_OTHER_PUNCTUATION, + G_UNICODE_OPEN_PUNCTUATION, + G_UNICODE_CURRENCY_SYMBOL, + G_UNICODE_MODIFIER_SYMBOL, + G_UNICODE_MATH_SYMBOL, + G_UNICODE_OTHER_SYMBOL, + G_UNICODE_LINE_SEPARATOR, + G_UNICODE_PARAGRAPH_SEPARATOR, + G_UNICODE_SPACE_SEPARATOR +} GUnicodeType; + + +#include "gunichartables.h" + +#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \ + ? attr_table_part1[Page] \ + : attr_table_part2[(Page) - 0xe00]) + +#define ATTTABLE(Page, Char) \ + ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char])) + + +#define TTYPE_PART1(Page, Char) \ + ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ + ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \ + : (type_data[type_table_part1[Page]][Char])) + +#define TTYPE_PART2(Page, Char) \ + ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ + ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \ + : (type_data[type_table_part2[Page]][Char])) + +#define TYPE(Char) \ + (((Char) <= G_UNICODE_LAST_CHAR_PART1) \ + ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \ + : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \ + ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ + : G_UNICODE_UNASSIGNED)) + +/* Count the number of elements in an array. The array must be defined + * as such; using this with a dynamically allocated array will give + * incorrect results. + */ +#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0])) + + + + +#if defined(LUCENE_USE_INTERNAL_CHAR_FUNCTIONS) +#ifdef _LUCENE_PRAGMA_WARNINGS + #pragma message ("===== Using internal character function =====") +#else +#if !(defined(Q_OS_SOLARIS) || defined(Q_CC_MIPS)) +#warning "===== Using internal character function =====" +#endif +#endif + +bool cl_isletter(gunichar c) +{ + int t = TYPE (c); + switch(t) + { + case G_UNICODE_LOWERCASE_LETTER: return true; + case G_UNICODE_TITLECASE_LETTER: return true; + case G_UNICODE_UPPERCASE_LETTER: return true; + case G_UNICODE_MODIFIER_LETTER: return true; + case G_UNICODE_OTHER_LETTER: return true; + default: return false; + } +} + +bool cl_isalnum(gunichar c) +{ + int t = TYPE (c); + switch(t) + { + case G_UNICODE_LOWERCASE_LETTER: return true; + case G_UNICODE_TITLECASE_LETTER: return true; + case G_UNICODE_UPPERCASE_LETTER: return true; + case G_UNICODE_MODIFIER_LETTER: return true; + case G_UNICODE_OTHER_LETTER: return true; + case G_UNICODE_DECIMAL_NUMBER: return true; + case G_UNICODE_LETTER_NUMBER: return true; + case G_UNICODE_OTHER_NUMBER: return true; + default: return false; + } +} + +bool cl_isdigit(gunichar c) +{ + int t = TYPE (c); + switch(t) + { + case G_UNICODE_DECIMAL_NUMBER: return true; + case G_UNICODE_LETTER_NUMBER: return true; + case G_UNICODE_OTHER_NUMBER: return true; + default: return false; + } +} + +/** + * cl_isspace: + * @c: a Unicode character + * + * Determines whether a character is a space, tab, or line separator + * (newline, carriage return, etc.). Given some UTF-8 text, obtain a + * character value with lucene_utf8towc(). + * + * (Note: don't use this to do word breaking; you have to use + * Pango or equivalent to get word breaking right, the algorithm + * is fairly complex.) + * + * Return value: %TRUE if @c is a punctuation character + **/ +bool cl_isspace (gunichar c) +{ + switch (c) + { + /* special-case these since Unicode thinks they are not spaces */ + case '\t': + case '\n': + case '\r': + case '\f': + return true; + + default: + { + int t = TYPE ((gunichar)c); + return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR + || t == G_UNICODE_PARAGRAPH_SEPARATOR); + } + } +} + + + +/** + * cl_tolower: + * @c: a Unicode character. + * + * Converts a character to lower case. + * + * Return value: the result of converting @c to lower case. + * If @c is not an upperlower or titlecase character, + * or has no lowercase equivalent @c is returned unchanged. + **/ +TCHAR cl_tolower (TCHAR ch) +{ + gunichar c=ch; + int t = TYPE ((gunichar)c); + if (t == G_UNICODE_UPPERCASE_LETTER) + { + gunichar val = ATTTABLE (c >> 8, c & 0xff); + if (val >= 0x1000000) + { + const gchar *p = special_case_table + val - 0x1000000; + int len=0; + wchar_t ret=0; + lucene_utf8towc(&ret,p,6); +#ifdef _UCS2 + return ret; +#else + return LUCENE_OOR_CHAR(ret); +#endif + //return cl_utf8_get_char (p, &len); + }else + return val ? val : c; + }else if (t == G_UNICODE_TITLECASE_LETTER){ + unsigned int i; + for (i = 0; i < G_N_ELEMENTS (title_table); ++i) + { + if (title_table[i][0] == c) + return title_table[i][2]; + } + } + return c; +} + +/** + * cl_toupper: + * @c: a Unicode character + * + * Converts a character to uppercase. + * + * Return value: the result of converting @c to uppercase. + * If @c is not an lowercase or titlecase character, + * or has no upper case equivalent @c is returned unchanged. + **/ +TCHAR cl_toupper (TCHAR ch) +{ + gunichar c=ch; + int t = TYPE (c); + if (t == G_UNICODE_LOWERCASE_LETTER) + { + gunichar val = ATTTABLE (c >> 8, c & 0xff); + if (val >= 0x1000000) + { + const gchar *p = special_case_table + val - 0x1000000; + + wchar_t ret=0; + lucene_utf8towc(&ret,p,6); +#ifdef _UCS2 + return ret; +#else + return LUCENE_OOR_CHAR(ret); +#endif + //return lucene_utf8towc (p); + } + else + return val ? val : c; + } + else if (t == G_UNICODE_TITLECASE_LETTER) + { + unsigned int i; + for (i = 0; i < G_N_ELEMENTS (title_table); ++i) + { + if (title_table[i][0] == c) + return title_table[i][1]; + } + } + return c; +} + + + +/** + * cl_tcasefold: + * @str: a unicode string + * + * Converts a string into a form that is independent of case. The + * result will not correspond to any particular case, but can be + * compared for equality or ordered with the results of calling + * cl_tcasefold() on other strings. + * + * Note that calling cl_tcasefold() followed by g_utf8_collate() is + * only an approximation to the correct linguistic case insensitive + * ordering, though it is a fairly good one. Getting this exactly + * right would require a more sophisticated collation function that + * takes case sensitivity into account. GLib does not currently + * provide such a function. + * + * Return value: a newly allocated string, that is a + * case independent form of @str. + **/ +TCHAR cl_tcasefold(const TCHAR ch){ + int start = 0; + int end = G_N_ELEMENTS (casefold_table); + + if (ch >= casefold_table[start].ch && + ch <= casefold_table[end - 1].ch) + { + while (1) + { + int half = (start + end) / 2; + if (ch == casefold_table[half].ch) + { + wchar_t ret=0; + lucene_utf8towc(&ret,casefold_table[half].data,6); + + #ifdef _UCS2 + return ret; + #else + LUCENE_OOR_CHAR(ret) + #endif + }else if (half == start){ + break; + }else if (ch > casefold_table[half].ch){ + start = half; + }else{ + end = half; + } + } + } + return cl_tolower(ch); + +} + + +//this function was not taken from gnome +TCHAR* cl_tcscasefold( TCHAR * str, int len ) //len default is -1 +{ + TCHAR *p = str; + while ((len < 0 || p < str + len) && *p) + { + *p = cl_tcasefold(*p); + p++; + } + return str; +} +//this function was not taken from gnome +int cl_tcscasefoldcmp(const TCHAR * dst, const TCHAR * src){ + TCHAR f,l; + + do{ + f = cl_tcasefold( (*(dst++)) ); + l = cl_tcasefold( (*(src++)) ); + } while ( (f) && (f == l) ); + + return (int)(f - l); +} + +#endif |