diff options
author | Choe Hwanjin <choe.hwanjin@gmail.com> | 2008-02-24 02:29:23 +0900 |
---|---|---|
committer | Choe Hwanjin <choe.hwanjin@gmail.com> | 2008-02-24 02:29:23 +0900 |
commit | 234795cf54ad7ab9376d204720e4f9ab3a851e99 (patch) | |
tree | 0b82311c352f21391928882d7c2d16af03ba61d4 | |
parent | ebb1debbbac572af8e0655b2071185e138ead403 (diff) | |
download | libhangul-234795cf54ad7ab9376d204720e4f9ab3a851e99.tar.gz |
한자 사전 파일을 바이너리 형태로 사용하는 기능 구현:
* 내부적으로 mmap을 이용하여 로딩, 메모리 사용량을 줄임
* txt 버전은 vector로 구현, 더 이상 slist를 사용하지 않음
* hanja.txt 파일을 hanja.bin 형태로 변환하여 사용함
* 파일 포맷 변환을 위한 API, hanja_table_txt_to_bin() 추가
* tools 디렉토리 추가
* 파일 포맷 변환을 위해 hanjac라는 도구를 제공
* 기본 한자 사전 파일을 hanja.txt에서 hanja.bin으로 변경
새로운 API 추가:
* hanja_list_get_nth_key()
git-svn-id: http://kldp.net/svn/hangul/libhangul/trunk@158 8f00fcd2-89fc-0310-932e-b01be5b65e01
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | configure.ac | 10 | ||||
-rw-r--r-- | data/hanja/Makefile.am | 8 | ||||
-rw-r--r-- | hangul/Makefile.am | 2 | ||||
-rw-r--r-- | hangul/hangul.h | 3 | ||||
-rw-r--r-- | hangul/hanja.c | 1032 | ||||
-rw-r--r-- | tools/Makefile.am | 6 | ||||
-rw-r--r-- | tools/hanjac.c | 15 |
8 files changed, 845 insertions, 233 deletions
diff --git a/Makefile.am b/Makefile.am index 6478f1a..f83858b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = hangul data bindings test +SUBDIRS = hangul data bindings test tools pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = libhangul.pc diff --git a/configure.ac b/configure.ac index ef28e77..f43e1f0 100644 --- a/configure.ac +++ b/configure.ac @@ -26,7 +26,7 @@ AC_PROG_INSTALL # Checks for header files. AC_HEADER_STDC -AC_CHECK_HEADERS([stdlib.h string.h]) +AC_CHECK_HEADERS([stdlib.h string.h limits.h]) # Checks for typedefs, structures, and compiler characteristics. AC_HEADER_STDBOOL @@ -36,9 +36,17 @@ AC_C_INLINE AC_TYPE_UINT32_T # Checks for library functions. +AC_FUNC_MEMCMP +AC_FUNC_MMAP +AC_FUNC_REALLOC +AC_CHECK_FUNCS([munmap]) +AC_CHECK_FUNCS([strcasecmp]) + + AC_CONFIG_FILES([ Makefile hangul/Makefile +tools/Makefile data/Makefile data/hanja/Makefile bindings/Makefile diff --git a/data/hanja/Makefile.am b/data/hanja/Makefile.am index 2e057d8..8b44034 100644 --- a/data/hanja/Makefile.am +++ b/data/hanja/Makefile.am @@ -2,4 +2,10 @@ hanjadicdir = $(datadir)/libhangul/hanja hanjadic_DATA = hanja.txt -EXTRA_DIST = $(hanjadic_DATA) +hanjadicbindir = $(datadir)/libhangul/hanja +hanjadicbin_DATA = hanja.bin + +$(hanjadicbin_DATA): + ../../tools/hanjac hanja.txt $@ + +EXTRA_DIST = $(hanjadic_DATA) $(hanjadicbin_DATA) diff --git a/hangul/Makefile.am b/hangul/Makefile.am index bd9770f..e3186c0 100644 --- a/hangul/Makefile.am +++ b/hangul/Makefile.am @@ -14,7 +14,7 @@ libhangul_la_SOURCES = \ hanja.c libhangul_la_CFLAGS = \ - -DLIBHANGUL_DEFAULT_HANJA_DIC=\"$(datadir)/libhangul/hanja/hanja.txt\" + -DLIBHANGUL_DEFAULT_HANJA_DIC=\"$(datadir)/libhangul/hanja/hanja.bin\" libhangul_la_LDFLAGS = -version-info $(LIBHANGUL_CURRENT):$(LIBHANGUL_REVISION):$(LIBHANGUL_AGE) libhangul_la_LIBADD = diff --git a/hangul/hangul.h b/hangul/hangul.h index 8103d26..33e1c92 100644 --- a/hangul/hangul.h +++ b/hangul/hangul.h @@ -128,6 +128,9 @@ HanjaList* 
hanja_table_match_prefix(const HanjaTable* table, const char *key); HanjaList* hanja_table_match_suffix(const HanjaTable* table, const char *key); void hanja_table_delete(HanjaTable *table); +int hanja_table_txt_to_bin(const char* txtfilename, + const char* binfilename); + int hanja_list_get_size(const HanjaList *list); const char* hanja_list_get_key(const HanjaList *list); const Hanja* hanja_list_get_nth(const HanjaList *list, unsigned int n); diff --git a/hangul/hanja.c b/hangul/hanja.c index eade6bc..8b1e85f 100644 --- a/hangul/hanja.c +++ b/hangul/hanja.c @@ -1,5 +1,5 @@ /* libhangul - * Copyright (C) 2005 Choe Hwanjin + * Copyright (C) 2005-2008 Choe Hwanjin * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -20,6 +20,12 @@ #include <config.h> #endif +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <sys/mman.h> + #include <limits.h> #include <stdio.h> #include <stdlib.h> @@ -28,29 +34,83 @@ #include "hangul.h" #include "hangulinternals.h" +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +enum { + HANGUL_ERROR_NOERROR, + HANGUL_ERROR_INVALID, + HANGUL_ERROR_RANGE, + HANGUL_ERROR_CANTOPEN, +}; + +enum { + HANJA_TABLE_TYPE_VECTOR, + HANJA_TABLE_TYPE_MMAP +}; + +typedef struct _PtrVector PtrVector; +typedef struct _HanjaKeyEntry HanjaKeyEntry; + +typedef struct _HanjaVectorTable HanjaVectorTable; +typedef struct _HanjaMMapTable HanjaMMapTable; + typedef struct _HanjaPair HanjaPair; typedef struct _HanjaPairArray HanjaPairArray; struct _Hanja { - char *key; - char *value; - char *comment; + uint32_t key_offset; + uint32_t value_offset; + uint32_t comment_offset; }; struct _HanjaList { - char *key; - unsigned int nitems; - Hanja **items; + char* key; + size_t len; + size_t alloc; + const Hanja** items; }; +typedef void (*HanjaTableDelete)(HanjaTable*); +typedef void (*HanjaTableMatch)(const HanjaTable*, const char*, 
HanjaList**); + struct _HanjaTable { - unsigned int nmember; - HanjaList **base; + int type; + HanjaTableDelete destroy; + HanjaTableMatch match; }; -struct slist { - void *data; - struct slist *next; +struct _PtrVector { + void** ptrs; + size_t len; + size_t alloc; +}; + +struct _HanjaVectorTable { + HanjaTable parent; + + PtrVector* keytable; +}; + +struct _HanjaKeyEntry { + uint32_t hanja_offset; + uint32_t nitems; +}; + +struct _HanjaMMapTable { + HanjaTable parent; + + HanjaKeyEntry* keytable; + unsigned int nkeys; + unsigned int ndata; + + void* map; + size_t map_length; }; struct _HanjaPair { @@ -65,11 +125,110 @@ struct _HanjaPairArray { #include "hanjacompatible.h" -/* utility functions */ -static inline void h_free(void *ptr) +enum { + HANJA_STREAM_MEMORY, + HANJA_STREAM_FILE +}; + +typedef struct { + int type; + unsigned char* data; + unsigned char* current; + size_t length; +} HanjaMemoryStream; + +typedef struct { + int type; + FILE* file; +} HanjaFileStream; + +typedef union { + int type; + HanjaMemoryStream memory; + HanjaFileStream file; +} HangulStream; + +static void hanja_vector_table_delete(HanjaTable* hanja_table); +static void hanja_vector_table_match(const HanjaTable* hanja_table, + const char* key, HanjaList** list); + +static void hanja_mmap_table_delete(HanjaTable* hanja_table); +static void hanja_mmap_table_match(const HanjaTable* hanja_table, + const char* key, HanjaList** list); + +static inline int +hangul_stream_init_as_memory(HangulStream* stream, void* data, size_t length) +{ + stream->type = HANJA_STREAM_MEMORY; + stream->memory.data = data; + stream->memory.current = data; + stream->memory.length = length; + return 0; +} + +static inline int +hangul_stream_init_as_file(HangulStream* stream, FILE* file) +{ + stream->type = HANJA_STREAM_FILE; + stream->file.file = file; + return 0; +} + +static inline bool +hangul_stream_check_range(HanjaMemoryStream* stream, unsigned char* p) +{ + if (p >= stream->data && p < stream->data + 
stream->length) + return true; + else + return false; +} + +static inline int +hangul_stream_seek(HangulStream* stream, size_t offset) +{ + if (stream->type == HANJA_STREAM_MEMORY) { + HanjaMemoryStream* mstream = &stream->memory; + if (!hangul_stream_check_range(mstream, mstream->current + offset)) + return HANGUL_ERROR_RANGE; + + stream->memory.current += offset; + return 0; + } + + return HANGUL_ERROR_INVALID; +} + +static inline int +hangul_stream_read_uint32(HangulStream* stream, uint32_t* value) { - if (ptr) - free(ptr); + if (stream->type == HANJA_STREAM_MEMORY) { + HanjaMemoryStream* mstream = &stream->memory; + if (!hangul_stream_check_range(mstream, mstream->current + sizeof(*value))) + return HANGUL_ERROR_RANGE; + + memcpy(value, mstream->current, sizeof(*value)); + mstream->current += sizeof(*value); + return 0; + } + + return HANGUL_ERROR_INVALID; +} + +static inline int +hangul_stream_write(HangulStream* stream, const void* ptr, size_t len) +{ + if (stream->type == HANJA_STREAM_MEMORY) { + HanjaMemoryStream* mstream = &stream->memory; + if (!hangul_stream_check_range(mstream, mstream->current + len)) + return HANGUL_ERROR_RANGE; + + memcpy(mstream->current, ptr, len); + mstream->current += len; + } else if (stream->type == HANJA_STREAM_FILE) { + fwrite(ptr, len, 1, stream->file.file); + } + + return 0; } static const char utf8_skip_table[256] = { @@ -83,130 +242,186 @@ static const char utf8_skip_table[256] = { 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 }; -static inline int h_char_len(const char *p) +static inline int utf8_char_len(const char *p) { return utf8_skip_table[*(const unsigned char*)p]; } -static struct slist * -slist_append(struct slist *head, void *data) +static inline const char* utf8_next(const char *str) { - struct slist *tail; + int n = utf8_char_len(str); - if (data == NULL) - return head; + while (n > 0) { + str++; + if (*str == '\0') + return str; + n--; + } - if (head == NULL) { - head = malloc(sizeof(struct 
slist)); - if (head != NULL) { - head->data = data; - head->next = NULL; - } + return str; +} - return head; +static inline char* utf8_prev(const char *str, const char *p) +{ + for (--p; p >= str; --p) { + if ((*p & 0xc0) != 0x80) + break; } + return (char*)p; +} - for (tail = head; tail->next != NULL; tail = tail->next) - continue; +static PtrVector* +ptr_vector_new(size_t initial_size) +{ + PtrVector* vector; - tail->next = malloc(sizeof(struct slist)); - if (tail->next != NULL) { - tail->next->data = data; - tail->next->next = NULL; + if (initial_size == 0) + initial_size = 2; + + vector = malloc(sizeof(*vector)); + vector->len = 0; + vector->alloc = initial_size; + vector->ptrs = malloc(initial_size * sizeof(vector->ptrs[0]));; + + if (vector->ptrs == NULL) { + free(vector); + return NULL; } - return head; + return vector; } static void -slist_delete(struct slist *head) +ptr_vector_delete(PtrVector* vector) { - struct slist *item; - while (head != NULL) { - item = head; - head = head->next; - free(item); + if (vector != NULL) { + free(vector->ptrs); + free(vector); } } -static unsigned int -slist_length(struct slist *head) +static inline size_t +ptr_vector_get_length(PtrVector* vector) +{ + return vector->len; +} + +static void +ptr_vector_append(PtrVector* vector, void* data) { - unsigned int n = 0; - while (head != NULL) { - head = head->next; - n++; + if (vector->alloc < vector->len + 1) { + size_t alloc = vector->alloc * 2; + void** ptrs; + + ptrs = realloc(vector->ptrs, alloc * sizeof(vector->ptrs[0])); + if (ptrs != NULL) { + vector->alloc = alloc; + vector->ptrs = ptrs; + } + } + + if (vector->len + 1 <= vector->alloc) { + vector->ptrs[vector->len] = data; + vector->len++; } - return n; } /* hanja searching functions */ static Hanja * hanja_new(const char *key, const char *value, const char *comment) { - Hanja *item; + Hanja* hanja; + size_t size; + size_t keylen; + size_t valuelen; + size_t commentlen; + char* p; + + keylen = strlen(key) + 1; + valuelen 
= strlen(value) + 1; + if (comment != NULL) + commentlen = strlen(comment) + 1; + else + commentlen = 1; + + size = sizeof(*hanja) + keylen + valuelen + commentlen; + hanja = malloc(size); + if (hanja == NULL) + return NULL; - item = malloc(sizeof(Hanja)); - if (item != NULL) { - item->key = strdup(key); - item->value = strdup(value); - if (comment != NULL) - item->comment = strdup(comment); - else - item->comment = strdup(""); - } + p = (char*)hanja + sizeof(*hanja); + strcpy(p, key); + p += keylen; + strcpy(p, value); + p += valuelen; + if (comment != NULL) + strcpy(p, comment); + else + *p = '\0'; + p += valuelen; + + hanja->key_offset = sizeof(*hanja); + hanja->value_offset = sizeof(*hanja) + keylen; + hanja->comment_offset = sizeof(*hanja) + keylen + valuelen; + + return hanja; +} - return item; +static void +hanja_delete(Hanja* hanja) +{ + free(hanja); } const char* hanja_get_key(const Hanja* hanja) { - if (hanja != NULL) - return hanja->key; + if (hanja != NULL) { + const char* p = (const char*)hanja; + return p + hanja->key_offset; + } return NULL; } const char* hanja_get_value(const Hanja* hanja) { - if (hanja != NULL) - return hanja->value; + if (hanja != NULL) { + const char* p = (const char*)hanja; + return p + hanja->value_offset; + } return NULL; } const char* hanja_get_comment(const Hanja* hanja) { - if (hanja != NULL) - return hanja->comment; + if (hanja != NULL) { + const char* p = (const char*)hanja; + return p + hanja->comment_offset; + } return NULL; } +static const Hanja* +hanja_keyentry_get_hanja(const HanjaKeyEntry* entry) +{ + const char* p = (const char*)entry; + return (const Hanja*)(p + entry->hanja_offset); +} + static HanjaList * -hanja_list_new_from_slist(const char *key, struct slist *items) +hanja_list_new(const char *key) { - unsigned int nitems; HanjaList *list; - nitems = slist_length(items); - if (nitems > ULONG_MAX / sizeof(Hanja*)) - return NULL; - - list = malloc(sizeof(HanjaList)); + list = malloc(sizeof(*list)); if (list != 
NULL) { - int i; list->key = strdup(key); - list->nitems = nitems; - list->items = malloc(sizeof(Hanja*) * list->nitems); - if (list->items != NULL) { - for (i = 0; i < list->nitems; i++) { - list->items[i] = items->data; - items = items->next; - } - } else { - if (list->key != NULL) - free(list->key); + list->len = 0; + list->alloc = 1; + list->items = malloc(list->alloc * sizeof(list->items[0])); + if (list->items == NULL) { free(list); list = NULL; } @@ -215,61 +430,76 @@ hanja_list_new_from_slist(const char *key, struct slist *items) return list; } -static HanjaTable * -hanja_table_new_from_slist(struct slist *lists) +static void +hanja_list_reserve(HanjaList* list, size_t n) { - unsigned int nitems; - HanjaTable *table; + if (list->alloc < list->len + n) { + const Hanja** data; + size_t size = list->alloc; - nitems = slist_length(lists); - if (nitems > ULONG_MAX / sizeof(HanjaList*)) - return NULL; + while (size < list->len + n) + size *= 2; - table = malloc(sizeof(HanjaTable)); - if (table) { - int i; - table->nmember = nitems; - table->base = malloc(sizeof(HanjaList*) * table->nmember); - if (table->base != NULL) { - for (i = 0; i < table->nmember; i++) { - table->base[i] = lists->data; - lists = lists->next; - } - } else { - free(table); - table = NULL; + data = realloc(list->items, size * sizeof(list->items[0])); + if (data != NULL) { + list->alloc = size; + list->items = data; } } - return table; } -HanjaTable* -hanja_table_load(const char *filename) +static void +hanja_list_append_n(HanjaList* list, const Hanja* hanja, int n) +{ + hanja_list_reserve(list, n); + + if (list->alloc >= list->len + n) { + unsigned int i; + for (i = 0; i < n ; i++) + list->items[list->len + i] = hanja + i; + list->len += n; + } +} + +static void +hanja_list_append_nptrs(HanjaList* list, const Hanja** hanja, int n) +{ + hanja_list_reserve(list, n); + + if (list->alloc >= list->len + n) { + unsigned int i; + for (i = 0; i < n ; i++) + list->items[list->len + i] = hanja[i]; + 
list->len += n; + } +} + + +static PtrVector* +hanja_vectors_from_txt(const char *filename) { char *save_ptr = NULL; char *key; char *value; char *comment; - char listkey[64] = { 0, }; + char lastkey[64] = { 0, }; char buf[1024]; FILE *file; - HanjaTable *table; - HanjaList *list; - Hanja *item; - struct slist *items = NULL; - struct slist *lists = NULL; + PtrVector* keys = NULL; + PtrVector* data = NULL; if (filename == NULL) - filename = LIBHANGUL_DEFAULT_HANJA_DIC; + return NULL; file = fopen(filename, "r"); if (file == NULL) { - printf("cant open file: %s\n", filename); return NULL; } - + while (fgets(buf, sizeof(buf), file) != NULL) { + Hanja* hanja; + /* skip comments and empty lines */ if (buf[0] == '#' || buf[0] == '\r' || buf[0] == '\n' || buf[0] == '\0') continue; @@ -279,166 +509,503 @@ hanja_table_load(const char *filename) value = strtok_r(NULL, ":", &save_ptr); comment = strtok_r(NULL, "\r\n", &save_ptr); - if (strlen(listkey) == 0 || - strncmp(listkey, key, strlen(listkey)) != 0) { - if (items != NULL) { - list = hanja_list_new_from_slist(listkey, items); - slist_delete(items); - items = NULL; + if (key == NULL || strlen(key) == 0) + continue; - lists = slist_append(lists, list); - } + if (value == NULL || strlen(value) == 0) + continue; - strncpy(listkey, key, sizeof(listkey)); - } - - item = hanja_new(key, value, comment); - items = slist_append(items, item); - } + if (comment == NULL) + comment = ""; - if (items != NULL) { - list = hanja_list_new_from_slist(listkey, items); - slist_delete(items); - items = NULL; + if (data != NULL && strcmp(key, lastkey) != 0) { + if (keys == NULL) + keys = ptr_vector_new(32); - lists = slist_append(lists, list); + ptr_vector_append(keys, data); + strncpy(lastkey, key, sizeof(lastkey)); + data = NULL; + } + + hanja = hanja_new(key, value, comment); + if (hanja != NULL) { + if (data == NULL) + data = ptr_vector_new(1); + ptr_vector_append(data, hanja); + } } - table = hanja_table_new_from_slist(lists); - 
slist_delete(lists); - lists = NULL; + if (data != NULL) { + if (keys == NULL) + keys = ptr_vector_new(1); + ptr_vector_append(keys, data); + } fclose(file); - return table; + printf("%s: done\n", __func__); + + return keys; } static void -delete_last_char(char *str) +hanja_vectors_delete(PtrVector* vectors) { - char *end = strchr(str, '\0'); - for (--end; end >= str; --end) { - if ((*end & 0xc0) != 0x80) { - break; + unsigned int i, j; + for (i = 0; i < vectors->len; i++) { + PtrVector* vector = vectors->ptrs[i]; + + for (j = 0; j < vector->len; j++) + hanja_delete(vector->ptrs[j]); + + ptr_vector_delete(vector); + } + ptr_vector_delete(vectors); +} + +static int +hanja_vectors_save(PtrVector* vectors, HangulStream* stream) +{ + unsigned int i, j, k; + uint32_t nkeys; + uint32_t ndata; + uint32_t keytable_size; + uint32_t datatable_size; + uint32_t last_offset; + + /* signature */ + hangul_stream_write(stream, "HANJADB\x0", 8); + + nkeys = vectors->len; + hangul_stream_write(stream, &nkeys, sizeof(nkeys)); + + ndata = 0; + for (i = 0; i < nkeys; i++) + ndata += ptr_vector_get_length(vectors->ptrs[i]); + hangul_stream_write(stream, &ndata, sizeof(ndata)); + + keytable_size = nkeys * sizeof(HanjaKeyEntry); + datatable_size = ndata * sizeof(Hanja); + + /* key table */ + last_offset = keytable_size; + for (i = 0; i < nkeys; i++) { + HanjaKeyEntry entry; + + entry.hanja_offset = last_offset - i * sizeof(entry); + entry.nitems = ptr_vector_get_length(vectors->ptrs[i]); + + hangul_stream_write(stream, &entry, sizeof(entry)); + + last_offset += entry.nitems * sizeof(Hanja); + } + + /* data table */ + last_offset = datatable_size; + k = 0; + for (i = 0; i < nkeys; i++) { + PtrVector* items = vectors->ptrs[i]; + for (j = 0; j < items->len; j++) { + const char* key; + const char* value; + const char* comment; + size_t key_len; + size_t value_len; + size_t comment_len; + Hanja hanja; + Hanja* item = items->ptrs[j]; + + key = hanja_get_key(item); + value = 
hanja_get_value(item); + comment = hanja_get_comment(item); + + hanja.key_offset = last_offset - k * sizeof(hanja); + hanja.value_offset = last_offset - k * sizeof(hanja); + hanja.comment_offset = last_offset - k * sizeof(hanja); + + key_len = strlen(key) + 1; + value_len = strlen(value) + 1; + comment_len = strlen(comment) + 1; + + hanja.value_offset += key_len; + hanja.comment_offset += key_len + value_len; + + hangul_stream_write(stream, &hanja, sizeof(hanja)); + + last_offset += key_len + value_len + comment_len; + k++; } } - while (*end != '\0') { - *end++ = '\0'; + /* data */ + for (i = 0; i < nkeys; i++) { + PtrVector* items = vectors->ptrs[i]; + for (j = 0; j < items->len; j++) { + size_t len; + const char* key; + const char* value; + const char* comment; + Hanja* hanja = items->ptrs[j]; + + key = hanja_get_key(hanja); + value = hanja_get_value(hanja); + comment = hanja_get_comment(hanja); + + len = strlen(key) + 1; + hangul_stream_write(stream, key, len); + + len = strlen(value) + 1; + hangul_stream_write(stream, value, len); + + len = strlen(comment) + 1; + hangul_stream_write(stream, comment, len); + } + } + + return 0; +} + +int +hanja_table_txt_to_bin(const char *txtfilename, const char* binfilename) +{ + PtrVector* vectors; + + vectors = hanja_vectors_from_txt(txtfilename); + if (vectors != NULL) { + FILE* file; + HangulStream stream; + + file = fopen(binfilename, "w"); + if (file != NULL) { + hangul_stream_init_as_file(&stream, file); + hanja_vectors_save(vectors, &stream); + fclose(file); + } + + hanja_vectors_delete(vectors); + } + + return 0; +} + +HanjaTable* +hanja_vector_table_load(PtrVector* vector) +{ + HanjaVectorTable* table; + + table = malloc(sizeof(*table)); + if (table != NULL) { + table->parent.type = HANJA_TABLE_TYPE_VECTOR; + table->parent.destroy = hanja_vector_table_delete; + table->parent.match = hanja_vector_table_match; + table->keytable = vector; + } + + return (HanjaTable*)table; +} + +static void 
+hanja_vector_table_delete(HanjaTable* hanja_table) +{ + if (hanja_table != NULL) { + HanjaVectorTable* table = (HanjaVectorTable*)hanja_table; + hanja_vectors_delete(table->keytable); + free(table); } } static int -hanja_table_compare(const void *key, const void *item) +vector_table_cmp(const void* m1, const void* m2) { - return strncmp((const char*)key, (*((HanjaList**)item))->key, strlen(key)); + const char* key = m1; + const PtrVector* vector = *(const void**)m2; + const Hanja* hanja = vector->ptrs[0]; + const char* hanja_key = hanja_get_key(hanja); + + return strcmp(key, hanja_key); } -HanjaList* -hanja_table_match_prefix(const HanjaTable* table, const char *key) +static void +hanja_vector_table_match(const HanjaTable* hanja_table, + const char* key, HanjaList** list) { - char *p; - char newkey[64] = { '\0', }; - HanjaList **list; - HanjaList *ret; - struct slist *items = NULL; + const HanjaVectorTable* table; + const PtrVector** res; - strncpy(newkey, key, sizeof(newkey)); - p = newkey + h_char_len(newkey); - *p = '\0'; + table = (const HanjaVectorTable*)hanja_table; + res = bsearch(key, table->keytable->ptrs, table->keytable->len, + sizeof(PtrVector*), vector_table_cmp); + if (res != NULL && *res != NULL) { + const Hanja** hanja; - list = bsearch(newkey, - table->base, table->nmember, - sizeof(HanjaList*), - hanja_table_compare); - if (list != NULL) { - int i; - strncpy(newkey, key, sizeof(newkey)); - for (; strlen(newkey) > 0; delete_last_char(newkey)) { - for (i = 0; i < (*list)->nitems; i++) { - if (strcmp(newkey, (*list)->items[i]->key) == 0) { - items = slist_append(items, (*list)->items[i]); - } - } - } + if (*list == NULL) + *list = hanja_list_new(key); - if (items) { - ret = hanja_list_new_from_slist(key, items); - slist_delete(items); - return ret; + hanja = (const Hanja**)res[0]->ptrs; + hanja_list_append_nptrs(*list, hanja, res[0]->len); + } +} + +HanjaMMapTable* +hanja_mmap_table_load(void* data, size_t length) +{ + unsigned int i, j; + uint32_t 
nkeys = 0; + uint32_t ndata = 0; + HanjaKeyEntry* keytable = NULL; + HanjaMMapTable* table = NULL; + int res = 0; + const char* end; + + HangulStream stream; + + /* signature */ + if (memcmp("HANJADB\x0", data, 8) != 0) + goto error; + + res = hangul_stream_init_as_memory(&stream, data, length); + + res = hangul_stream_seek(&stream, 8); + + res = hangul_stream_read_uint32(&stream, &nkeys); + if (res != 0) + goto error; + + res = hangul_stream_read_uint32(&stream, &ndata); + if (res != 0) + goto error; + + end = (const char*)data + length; + + keytable = (HanjaKeyEntry*)stream.memory.current; + if ((const char*)keytable > end) + goto error; + + /* check integrity here. + * If the data file is wrong, the program may access the wrong address + * and it will be killed by segmentation fault. + * So we check it here, before to use. */ + for (i = 0; i < nkeys; i++) { + const Hanja* hanja; + const HanjaKeyEntry* entry; + + entry = &keytable[i]; + if ((const char*)entry > end) + goto error; + + hanja = hanja_keyentry_get_hanja(entry); + if ((const char*)hanja > end) + goto error; + + for (j = 0; j < entry->nitems; j++) { + const char* key = hanja_get_key(hanja + j); + const char* value = hanja_get_value(hanja + j); + const char* comment = hanja_get_comment(hanja + j); + + if (key > end) + goto error; + + if (value > end) + goto error; + + if (comment > end) + goto error; } } - + + /* the last byte should be nul, or the last comment will be over the + * boundary */ + end--; + if (end[0] != '\0') + goto error; + + table = malloc(sizeof(*table)); + if (table == NULL) + goto error; + + table->parent.type = HANJA_TABLE_TYPE_MMAP; + table->parent.destroy = hanja_mmap_table_delete; + table->parent.match = hanja_mmap_table_match; + table->keytable = keytable; + table->nkeys = nkeys; + table->ndata = ndata; + table->map = data; + table->map_length = length; + + return table; + +error: return NULL; } -HanjaList* -hanja_table_match_suffix(const HanjaTable* table, const char *key) 
+static void +hanja_mmap_table_delete(HanjaTable* hanja_table) { - const char *p; - char newkey[64] = { '\0', }; - HanjaList **list = NULL; - HanjaList *ret; - struct slist *items = NULL; - - p = key; - strncpy(newkey, p, sizeof(newkey)); - newkey[h_char_len(newkey)] = '\0'; - while (strlen(newkey) > 0) { - list = bsearch(newkey, - table->base, table->nmember, - sizeof(HanjaList*), - hanja_table_compare); - - if (list != NULL) { - int i; - for (i = 0; i < (*list)->nitems; i++) { - if (strcmp(p, (*list)->items[i]->key) == 0) { - items = slist_append(items, (*list)->items[i]); - } - } + if (hanja_table != NULL) { + HanjaMMapTable* table = (HanjaMMapTable*)hanja_table; + if (table->map != NULL) { + munmap(table->map, table->map_length); } + free(table); + } +} + +static int +mmap_table_cmp(const void* m1, const void* m2) +{ + const char* key = m1; + const Hanja* hanja = hanja_keyentry_get_hanja(m2); + const char* hanja_key = hanja_get_key(hanja); - p += h_char_len(p); - strncpy(newkey, p, sizeof(newkey)); - newkey[h_char_len(newkey)] = '\0'; + return strcmp(key, hanja_key); +} + +static void +hanja_mmap_table_match(const HanjaTable* hanja_table, + const char* key, HanjaList** list) +{ + const HanjaKeyEntry* res; + const HanjaMMapTable* table; + + table = (const HanjaMMapTable*)hanja_table; + res = bsearch(key, table->keytable, table->nkeys, + sizeof(table->keytable[0]), mmap_table_cmp); + if (res != NULL) { + const Hanja* hanja = hanja_keyentry_get_hanja(res); + if (*list == NULL) + *list = hanja_list_new(key); + hanja_list_append_n(*list, hanja, res->nitems); } +} + +HanjaTable* +hanja_table_load_from_txt(const char *filename) +{ + PtrVector* vectors; + HanjaTable* table; + + vectors = hanja_vectors_from_txt(filename); + if (vectors == NULL) + return NULL; - if (items != NULL) { - ret = hanja_list_new_from_slist(key, items); - slist_delete(items); - return ret; + table = hanja_vector_table_load(vectors); + if (table == NULL) { + hanja_vectors_delete(vectors); + 
return NULL; } - return NULL; + return table; +} + +HanjaTable* +hanja_table_load_from_bin(const char *filename) +{ + struct stat buf; + FILE* file; + void* data; + size_t length; + HanjaTable *table = NULL; + + file = fopen(filename, "r"); + if (file == NULL) + return NULL; + + fstat(fileno(file), &buf); + + length = buf.st_size; + data = mmap(0, length, PROT_READ, MAP_SHARED, fileno(file), 0); + fclose(file); + + table = (HanjaTable*)hanja_mmap_table_load(data, length); + if (table == NULL) { + munmap(data, length); + return NULL; + } + + return table; +} + +HanjaTable* +hanja_table_load(const char* filename) +{ + size_t len; + HanjaTable* table = NULL; + + if (filename == NULL) + filename = LIBHANGUL_DEFAULT_HANJA_DIC; + + len = strlen(filename); + if (len > 4 && + filename[len - 1] == 't' && + filename[len - 2] == 'x' && + filename[len - 3] == 't' && + filename[len - 4] == '.') { + table = hanja_table_load_from_txt(filename); + } + + if (table == NULL) + table = hanja_table_load_from_bin(filename); + + if (table == NULL) + table = hanja_table_load_from_txt(filename); + + return table; } void hanja_table_delete(HanjaTable *table) { - if (table) { - int i, j; - for (j = 0; j < table->nmember; j++) { - for (i = 0; i < table->base[j]->nitems; i++) { - h_free((char*)table->base[j]->items[i]->key); - h_free((char*)table->base[j]->items[i]->value); - h_free((char*)table->base[j]->items[i]->comment); - h_free(table->base[j]->items[i]); - } - h_free((char*)table->base[j]->key); - h_free(table->base[j]->items); - h_free(table->base[j]); - } - h_free(table->base); - h_free(table); + if (table != NULL) { + table->destroy(table); + } +} + +HanjaList* +hanja_table_match_prefix(const HanjaTable* table, const char *key) +{ + char* p; + char* newkey; + HanjaList* ret = NULL; + + if (key == NULL || key[0] == '\0') + return NULL; + + newkey = strdup(key); + p = strchr(newkey, '\0'); + while (newkey[0] != '\0') { + printf("%s: %s\n", __func__, newkey); + table->match(table, 
newkey, &ret); + p = utf8_prev(newkey, p); + p[0] = '\0'; } + free(newkey); + + return ret; +} + +HanjaList* +hanja_table_match_suffix(const HanjaTable* table, const char *key) +{ + const char* p; + HanjaList* ret = NULL; + + if (key == NULL || key[0] == '\0') + return NULL; + + p = key; + while (p[0] != '\0') { + table->match(table, p, &ret); + p = utf8_next(p); + } + + return ret; } int hanja_list_get_size(const HanjaList *list) { if (list != NULL) - return list->nitems; + return list->len; return 0; } @@ -454,13 +1021,20 @@ const Hanja* hanja_list_get_nth(const HanjaList *list, unsigned int n) { if (list != NULL) { - if (n < list->nitems) + if (n < list->len) return list->items[n]; } return NULL; } const char* +hanja_list_get_nth_key(const HanjaList *list, unsigned int n) +{ + const Hanja* hanja = hanja_list_get_nth(list, n); + return hanja_get_key(hanja); +} + +const char* hanja_list_get_nth_value(const HanjaList *list, unsigned int n) { const Hanja* hanja = hanja_list_get_nth(list, n); @@ -478,9 +1052,9 @@ void hanja_list_delete(HanjaList *list) { if (list) { - h_free(list->items); - h_free((char*)list->key); - h_free(list); + free(list->items); + free(list->key); + free(list); } } diff --git a/tools/Makefile.am b/tools/Makefile.am new file mode 100644 index 0000000..fd0bc9b --- /dev/null +++ b/tools/Makefile.am @@ -0,0 +1,6 @@ + +bin_PROGRAMS = hanjac + +hanjac_CFLAGS = +hanjac_SOURCES = hanjac.c +hanjac_LDADD = ../hangul/libhangul.la diff --git a/tools/hanjac.c b/tools/hanjac.c new file mode 100644 index 0000000..7d6b67d --- /dev/null +++ b/tools/hanjac.c @@ -0,0 +1,15 @@ +#include <stdio.h> +#include <string.h> + +#include "../hangul/hangul.h" + +int +main(int argc, char *argv[]) +{ + if (argc != 3) + return 1; + + hanja_table_txt_to_bin(argv[1], argv[2]); + + return 0; +} |