summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Choe Hwanjin <choe.hwanjin@gmail.com> 2008-02-24 02:29:23 +0900
committer: Choe Hwanjin <choe.hwanjin@gmail.com> 2008-02-24 02:29:23 +0900
commit: 234795cf54ad7ab9376d204720e4f9ab3a851e99 (patch)
tree: 0b82311c352f21391928882d7c2d16af03ba61d4
parent: ebb1debbbac572af8e0655b2071185e138ead403 (diff)
download: libhangul-234795cf54ad7ab9376d204720e4f9ab3a851e99.tar.gz
한자 사전 파일을 바이너리 형태로 사용하는 기능 구현:
* 내부적으로 mmap을 이용하여 로딩, 메모리 사용량을 줄임
* txt 버젼은 vector로 구현, 더이상 slist를 사용하지 않음
* hanja.txt파일을 hanja.bin 형태로 변환하여 사용함
* 파일 포맷 변환을 위한 API, hanja_table_txt_to_bin() 추가
* tools 디렉토리 추가
* 파일 포맷 변환을 위해 hanjac라는 도구를 제공
* 기본 한자 사전 파일을 hanja.txt에서 hanja.bin으로 변경

새로운 api 추가
* hanja_list_get_nth_key()

git-svn-id: http://kldp.net/svn/hangul/libhangul/trunk@158 8f00fcd2-89fc-0310-932e-b01be5b65e01
-rw-r--r-- Makefile.am 2
-rw-r--r-- configure.ac 10
-rw-r--r-- data/hanja/Makefile.am 8
-rw-r--r-- hangul/Makefile.am 2
-rw-r--r-- hangul/hangul.h 3
-rw-r--r-- hangul/hanja.c 1032
-rw-r--r-- tools/Makefile.am 6
-rw-r--r-- tools/hanjac.c 15
8 files changed, 845 insertions, 233 deletions
diff --git a/Makefile.am b/Makefile.am
index 6478f1a..f83858b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,4 +1,4 @@
-SUBDIRS = hangul data bindings test
+SUBDIRS = hangul data bindings test tools
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = libhangul.pc
diff --git a/configure.ac b/configure.ac
index ef28e77..f43e1f0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -26,7 +26,7 @@ AC_PROG_INSTALL
# Checks for header files.
AC_HEADER_STDC
-AC_CHECK_HEADERS([stdlib.h string.h])
+AC_CHECK_HEADERS([stdlib.h string.h limits.h])
# Checks for typedefs, structures, and compiler characteristics.
AC_HEADER_STDBOOL
@@ -36,9 +36,17 @@ AC_C_INLINE
AC_TYPE_UINT32_T
# Checks for library functions.
+AC_FUNC_MEMCMP
+AC_FUNC_MMAP
+AC_FUNC_REALLOC
+AC_CHECK_FUNCS([munmap])
+AC_CHECK_FUNCS([strcasecmp])
+
+
AC_CONFIG_FILES([
Makefile
hangul/Makefile
+tools/Makefile
data/Makefile
data/hanja/Makefile
bindings/Makefile
diff --git a/data/hanja/Makefile.am b/data/hanja/Makefile.am
index 2e057d8..8b44034 100644
--- a/data/hanja/Makefile.am
+++ b/data/hanja/Makefile.am
@@ -2,4 +2,10 @@
hanjadicdir = $(datadir)/libhangul/hanja
hanjadic_DATA = hanja.txt
-EXTRA_DIST = $(hanjadic_DATA)
+hanjadicbindir = $(datadir)/libhangul/hanja
+hanjadicbin_DATA = hanja.bin
+
+$(hanjadicbin_DATA):
+ ../../tools/hanjac hanja.txt $@
+
+EXTRA_DIST = $(hanjadic_DATA) $(hanjadicbin_DATA)
diff --git a/hangul/Makefile.am b/hangul/Makefile.am
index bd9770f..e3186c0 100644
--- a/hangul/Makefile.am
+++ b/hangul/Makefile.am
@@ -14,7 +14,7 @@ libhangul_la_SOURCES = \
hanja.c
libhangul_la_CFLAGS = \
- -DLIBHANGUL_DEFAULT_HANJA_DIC=\"$(datadir)/libhangul/hanja/hanja.txt\"
+ -DLIBHANGUL_DEFAULT_HANJA_DIC=\"$(datadir)/libhangul/hanja/hanja.bin\"
libhangul_la_LDFLAGS = -version-info $(LIBHANGUL_CURRENT):$(LIBHANGUL_REVISION):$(LIBHANGUL_AGE)
libhangul_la_LIBADD =
diff --git a/hangul/hangul.h b/hangul/hangul.h
index 8103d26..33e1c92 100644
--- a/hangul/hangul.h
+++ b/hangul/hangul.h
@@ -128,6 +128,9 @@ HanjaList* hanja_table_match_prefix(const HanjaTable* table, const char *key);
HanjaList* hanja_table_match_suffix(const HanjaTable* table, const char *key);
void hanja_table_delete(HanjaTable *table);
+int hanja_table_txt_to_bin(const char* txtfilename,
+ const char* binfilename);
+
int hanja_list_get_size(const HanjaList *list);
const char* hanja_list_get_key(const HanjaList *list);
const Hanja* hanja_list_get_nth(const HanjaList *list, unsigned int n);
diff --git a/hangul/hanja.c b/hangul/hanja.c
index eade6bc..8b1e85f 100644
--- a/hangul/hanja.c
+++ b/hangul/hanja.c
@@ -1,5 +1,5 @@
/* libhangul
- * Copyright (C) 2005 Choe Hwanjin
+ * Copyright (C) 2005-2008 Choe Hwanjin
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -20,6 +20,12 @@
#include <config.h>
#endif
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
@@ -28,29 +34,83 @@
#include "hangul.h"
#include "hangulinternals.h"
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+enum {
+ HANGUL_ERROR_NOERROR,
+ HANGUL_ERROR_INVALID,
+ HANGUL_ERROR_RANGE,
+ HANGUL_ERROR_CANTOPEN,
+};
+
+enum {
+ HANJA_TABLE_TYPE_VECTOR,
+ HANJA_TABLE_TYPE_MMAP
+};
+
+typedef struct _PtrVector PtrVector;
+typedef struct _HanjaKeyEntry HanjaKeyEntry;
+
+typedef struct _HanjaVectorTable HanjaVectorTable;
+typedef struct _HanjaMMapTable HanjaMMapTable;
+
typedef struct _HanjaPair HanjaPair;
typedef struct _HanjaPairArray HanjaPairArray;
struct _Hanja {
- char *key;
- char *value;
- char *comment;
+ uint32_t key_offset;
+ uint32_t value_offset;
+ uint32_t comment_offset;
};
struct _HanjaList {
- char *key;
- unsigned int nitems;
- Hanja **items;
+ char* key;
+ size_t len;
+ size_t alloc;
+ const Hanja** items;
};
+typedef void (*HanjaTableDelete)(HanjaTable*);
+typedef void (*HanjaTableMatch)(const HanjaTable*, const char*, HanjaList**);
+
struct _HanjaTable {
- unsigned int nmember;
- HanjaList **base;
+ int type;
+ HanjaTableDelete destroy;
+ HanjaTableMatch match;
};
-struct slist {
- void *data;
- struct slist *next;
+struct _PtrVector {
+ void** ptrs;
+ size_t len;
+ size_t alloc;
+};
+
+struct _HanjaVectorTable {
+ HanjaTable parent;
+
+ PtrVector* keytable;
+};
+
+struct _HanjaKeyEntry {
+ uint32_t hanja_offset;
+ uint32_t nitems;
+};
+
+struct _HanjaMMapTable {
+ HanjaTable parent;
+
+ HanjaKeyEntry* keytable;
+ unsigned int nkeys;
+ unsigned int ndata;
+
+ void* map;
+ size_t map_length;
};
struct _HanjaPair {
@@ -65,11 +125,110 @@ struct _HanjaPairArray {
#include "hanjacompatible.h"
-/* utility functions */
-static inline void h_free(void *ptr)
+enum {
+ HANJA_STREAM_MEMORY,
+ HANJA_STREAM_FILE
+};
+
+typedef struct {
+ int type;
+ unsigned char* data;
+ unsigned char* current;
+ size_t length;
+} HanjaMemoryStream;
+
+typedef struct {
+ int type;
+ FILE* file;
+} HanjaFileStream;
+
+typedef union {
+ int type;
+ HanjaMemoryStream memory;
+ HanjaFileStream file;
+} HangulStream;
+
+static void hanja_vector_table_delete(HanjaTable* hanja_table);
+static void hanja_vector_table_match(const HanjaTable* hanja_table,
+ const char* key, HanjaList** list);
+
+static void hanja_mmap_table_delete(HanjaTable* hanja_table);
+static void hanja_mmap_table_match(const HanjaTable* hanja_table,
+ const char* key, HanjaList** list);
+
+static inline int
+hangul_stream_init_as_memory(HangulStream* stream, void* data, size_t length)
+{
+ stream->type = HANJA_STREAM_MEMORY;
+ stream->memory.data = data;
+ stream->memory.current = data;
+ stream->memory.length = length;
+ return 0;
+}
+
+static inline int
+hangul_stream_init_as_file(HangulStream* stream, FILE* file)
+{
+ stream->type = HANJA_STREAM_FILE;
+ stream->file.file = file;
+ return 0;
+}
+
+static inline bool
+hangul_stream_check_range(HanjaMemoryStream* stream, unsigned char* p)
+{
+ if (p >= stream->data && p < stream->data + stream->length)
+ return true;
+ else
+ return false;
+}
+
+static inline int
+hangul_stream_seek(HangulStream* stream, size_t offset)
+{
+ if (stream->type == HANJA_STREAM_MEMORY) {
+ HanjaMemoryStream* mstream = &stream->memory;
+ if (!hangul_stream_check_range(mstream, mstream->current + offset))
+ return HANGUL_ERROR_RANGE;
+
+ stream->memory.current += offset;
+ return 0;
+ }
+
+ return HANGUL_ERROR_INVALID;
+}
+
+static inline int
+hangul_stream_read_uint32(HangulStream* stream, uint32_t* value)
{
- if (ptr)
- free(ptr);
+ if (stream->type == HANJA_STREAM_MEMORY) {
+ HanjaMemoryStream* mstream = &stream->memory;
+ if (!hangul_stream_check_range(mstream, mstream->current + sizeof(*value)))
+ return HANGUL_ERROR_RANGE;
+
+ memcpy(value, mstream->current, sizeof(*value));
+ mstream->current += sizeof(*value);
+ return 0;
+ }
+
+ return HANGUL_ERROR_INVALID;
+}
+
+static inline int
+hangul_stream_write(HangulStream* stream, const void* ptr, size_t len)
+{
+ if (stream->type == HANJA_STREAM_MEMORY) {
+ HanjaMemoryStream* mstream = &stream->memory;
+ if (!hangul_stream_check_range(mstream, mstream->current + len))
+ return HANGUL_ERROR_RANGE;
+
+ memcpy(mstream->current, ptr, len);
+ mstream->current += len;
+ } else if (stream->type == HANJA_STREAM_FILE) {
+ fwrite(ptr, len, 1, stream->file.file);
+ }
+
+ return 0;
}
static const char utf8_skip_table[256] = {
@@ -83,130 +242,186 @@ static const char utf8_skip_table[256] = {
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
-static inline int h_char_len(const char *p)
+static inline int utf8_char_len(const char *p)
{
return utf8_skip_table[*(const unsigned char*)p];
}
-static struct slist *
-slist_append(struct slist *head, void *data)
+static inline const char* utf8_next(const char *str)
{
- struct slist *tail;
+ int n = utf8_char_len(str);
- if (data == NULL)
- return head;
+ while (n > 0) {
+ str++;
+ if (*str == '\0')
+ return str;
+ n--;
+ }
- if (head == NULL) {
- head = malloc(sizeof(struct slist));
- if (head != NULL) {
- head->data = data;
- head->next = NULL;
- }
+ return str;
+}
- return head;
+static inline char* utf8_prev(const char *str, const char *p)
+{
+ for (--p; p >= str; --p) {
+ if ((*p & 0xc0) != 0x80)
+ break;
}
+ return (char*)p;
+}
- for (tail = head; tail->next != NULL; tail = tail->next)
- continue;
+static PtrVector*
+ptr_vector_new(size_t initial_size)
+{
+ PtrVector* vector;
- tail->next = malloc(sizeof(struct slist));
- if (tail->next != NULL) {
- tail->next->data = data;
- tail->next->next = NULL;
+ if (initial_size == 0)
+ initial_size = 2;
+
+ vector = malloc(sizeof(*vector));
+ vector->len = 0;
+ vector->alloc = initial_size;
+ vector->ptrs = malloc(initial_size * sizeof(vector->ptrs[0]));;
+
+ if (vector->ptrs == NULL) {
+ free(vector);
+ return NULL;
}
- return head;
+ return vector;
}
static void
-slist_delete(struct slist *head)
+ptr_vector_delete(PtrVector* vector)
{
- struct slist *item;
- while (head != NULL) {
- item = head;
- head = head->next;
- free(item);
+ if (vector != NULL) {
+ free(vector->ptrs);
+ free(vector);
}
}
-static unsigned int
-slist_length(struct slist *head)
+static inline size_t
+ptr_vector_get_length(PtrVector* vector)
+{
+ return vector->len;
+}
+
+static void
+ptr_vector_append(PtrVector* vector, void* data)
{
- unsigned int n = 0;
- while (head != NULL) {
- head = head->next;
- n++;
+ if (vector->alloc < vector->len + 1) {
+ size_t alloc = vector->alloc * 2;
+ void** ptrs;
+
+ ptrs = realloc(vector->ptrs, alloc * sizeof(vector->ptrs[0]));
+ if (ptrs != NULL) {
+ vector->alloc = alloc;
+ vector->ptrs = ptrs;
+ }
+ }
+
+ if (vector->len + 1 <= vector->alloc) {
+ vector->ptrs[vector->len] = data;
+ vector->len++;
}
- return n;
}
/* hanja searching functions */
static Hanja *
hanja_new(const char *key, const char *value, const char *comment)
{
- Hanja *item;
+ Hanja* hanja;
+ size_t size;
+ size_t keylen;
+ size_t valuelen;
+ size_t commentlen;
+ char* p;
+
+ keylen = strlen(key) + 1;
+ valuelen = strlen(value) + 1;
+ if (comment != NULL)
+ commentlen = strlen(comment) + 1;
+ else
+ commentlen = 1;
+
+ size = sizeof(*hanja) + keylen + valuelen + commentlen;
+ hanja = malloc(size);
+ if (hanja == NULL)
+ return NULL;
- item = malloc(sizeof(Hanja));
- if (item != NULL) {
- item->key = strdup(key);
- item->value = strdup(value);
- if (comment != NULL)
- item->comment = strdup(comment);
- else
- item->comment = strdup("");
- }
+ p = (char*)hanja + sizeof(*hanja);
+ strcpy(p, key);
+ p += keylen;
+ strcpy(p, value);
+ p += valuelen;
+ if (comment != NULL)
+ strcpy(p, comment);
+ else
+ *p = '\0';
+ p += valuelen;
+
+ hanja->key_offset = sizeof(*hanja);
+ hanja->value_offset = sizeof(*hanja) + keylen;
+ hanja->comment_offset = sizeof(*hanja) + keylen + valuelen;
+
+ return hanja;
+}
- return item;
+static void
+hanja_delete(Hanja* hanja)
+{
+ free(hanja);
}
const char*
hanja_get_key(const Hanja* hanja)
{
- if (hanja != NULL)
- return hanja->key;
+ if (hanja != NULL) {
+ const char* p = (const char*)hanja;
+ return p + hanja->key_offset;
+ }
return NULL;
}
const char*
hanja_get_value(const Hanja* hanja)
{
- if (hanja != NULL)
- return hanja->value;
+ if (hanja != NULL) {
+ const char* p = (const char*)hanja;
+ return p + hanja->value_offset;
+ }
return NULL;
}
const char*
hanja_get_comment(const Hanja* hanja)
{
- if (hanja != NULL)
- return hanja->comment;
+ if (hanja != NULL) {
+ const char* p = (const char*)hanja;
+ return p + hanja->comment_offset;
+ }
return NULL;
}
+static const Hanja*
+hanja_keyentry_get_hanja(const HanjaKeyEntry* entry)
+{
+ const char* p = (const char*)entry;
+ return (const Hanja*)(p + entry->hanja_offset);
+}
+
static HanjaList *
-hanja_list_new_from_slist(const char *key, struct slist *items)
+hanja_list_new(const char *key)
{
- unsigned int nitems;
HanjaList *list;
- nitems = slist_length(items);
- if (nitems > ULONG_MAX / sizeof(Hanja*))
- return NULL;
-
- list = malloc(sizeof(HanjaList));
+ list = malloc(sizeof(*list));
if (list != NULL) {
- int i;
list->key = strdup(key);
- list->nitems = nitems;
- list->items = malloc(sizeof(Hanja*) * list->nitems);
- if (list->items != NULL) {
- for (i = 0; i < list->nitems; i++) {
- list->items[i] = items->data;
- items = items->next;
- }
- } else {
- if (list->key != NULL)
- free(list->key);
+ list->len = 0;
+ list->alloc = 1;
+ list->items = malloc(list->alloc * sizeof(list->items[0]));
+ if (list->items == NULL) {
free(list);
list = NULL;
}
@@ -215,61 +430,76 @@ hanja_list_new_from_slist(const char *key, struct slist *items)
return list;
}
-static HanjaTable *
-hanja_table_new_from_slist(struct slist *lists)
+static void
+hanja_list_reserve(HanjaList* list, size_t n)
{
- unsigned int nitems;
- HanjaTable *table;
+ if (list->alloc < list->len + n) {
+ const Hanja** data;
+ size_t size = list->alloc;
- nitems = slist_length(lists);
- if (nitems > ULONG_MAX / sizeof(HanjaList*))
- return NULL;
+ while (size < list->len + n)
+ size *= 2;
- table = malloc(sizeof(HanjaTable));
- if (table) {
- int i;
- table->nmember = nitems;
- table->base = malloc(sizeof(HanjaList*) * table->nmember);
- if (table->base != NULL) {
- for (i = 0; i < table->nmember; i++) {
- table->base[i] = lists->data;
- lists = lists->next;
- }
- } else {
- free(table);
- table = NULL;
+ data = realloc(list->items, size * sizeof(list->items[0]));
+ if (data != NULL) {
+ list->alloc = size;
+ list->items = data;
}
}
- return table;
}
-HanjaTable*
-hanja_table_load(const char *filename)
+static void
+hanja_list_append_n(HanjaList* list, const Hanja* hanja, int n)
+{
+ hanja_list_reserve(list, n);
+
+ if (list->alloc >= list->len + n) {
+ unsigned int i;
+ for (i = 0; i < n ; i++)
+ list->items[list->len + i] = hanja + i;
+ list->len += n;
+ }
+}
+
+static void
+hanja_list_append_nptrs(HanjaList* list, const Hanja** hanja, int n)
+{
+ hanja_list_reserve(list, n);
+
+ if (list->alloc >= list->len + n) {
+ unsigned int i;
+ for (i = 0; i < n ; i++)
+ list->items[list->len + i] = hanja[i];
+ list->len += n;
+ }
+}
+
+
+static PtrVector*
+hanja_vectors_from_txt(const char *filename)
{
char *save_ptr = NULL;
char *key;
char *value;
char *comment;
- char listkey[64] = { 0, };
+ char lastkey[64] = { 0, };
char buf[1024];
FILE *file;
- HanjaTable *table;
- HanjaList *list;
- Hanja *item;
- struct slist *items = NULL;
- struct slist *lists = NULL;
+ PtrVector* keys = NULL;
+ PtrVector* data = NULL;
if (filename == NULL)
- filename = LIBHANGUL_DEFAULT_HANJA_DIC;
+ return NULL;
file = fopen(filename, "r");
if (file == NULL) {
- printf("cant open file: %s\n", filename);
return NULL;
}
-
+
while (fgets(buf, sizeof(buf), file) != NULL) {
+ Hanja* hanja;
+
/* skip comments and empty lines */
if (buf[0] == '#' || buf[0] == '\r' || buf[0] == '\n' || buf[0] == '\0')
continue;
@@ -279,166 +509,503 @@ hanja_table_load(const char *filename)
value = strtok_r(NULL, ":", &save_ptr);
comment = strtok_r(NULL, "\r\n", &save_ptr);
- if (strlen(listkey) == 0 ||
- strncmp(listkey, key, strlen(listkey)) != 0) {
- if (items != NULL) {
- list = hanja_list_new_from_slist(listkey, items);
- slist_delete(items);
- items = NULL;
+ if (key == NULL || strlen(key) == 0)
+ continue;
- lists = slist_append(lists, list);
- }
+ if (value == NULL || strlen(value) == 0)
+ continue;
- strncpy(listkey, key, sizeof(listkey));
- }
-
- item = hanja_new(key, value, comment);
- items = slist_append(items, item);
- }
+ if (comment == NULL)
+ comment = "";
- if (items != NULL) {
- list = hanja_list_new_from_slist(listkey, items);
- slist_delete(items);
- items = NULL;
+ if (data != NULL && strcmp(key, lastkey) != 0) {
+ if (keys == NULL)
+ keys = ptr_vector_new(32);
- lists = slist_append(lists, list);
+ ptr_vector_append(keys, data);
+ strncpy(lastkey, key, sizeof(lastkey));
+ data = NULL;
+ }
+
+ hanja = hanja_new(key, value, comment);
+ if (hanja != NULL) {
+ if (data == NULL)
+ data = ptr_vector_new(1);
+ ptr_vector_append(data, hanja);
+ }
}
- table = hanja_table_new_from_slist(lists);
- slist_delete(lists);
- lists = NULL;
+ if (data != NULL) {
+ if (keys == NULL)
+ keys = ptr_vector_new(1);
+ ptr_vector_append(keys, data);
+ }
fclose(file);
- return table;
+ printf("%s: done\n", __func__);
+
+ return keys;
}
static void
-delete_last_char(char *str)
+hanja_vectors_delete(PtrVector* vectors)
{
- char *end = strchr(str, '\0');
- for (--end; end >= str; --end) {
- if ((*end & 0xc0) != 0x80) {
- break;
+ unsigned int i, j;
+ for (i = 0; i < vectors->len; i++) {
+ PtrVector* vector = vectors->ptrs[i];
+
+ for (j = 0; j < vector->len; j++)
+ hanja_delete(vector->ptrs[j]);
+
+ ptr_vector_delete(vector);
+ }
+ ptr_vector_delete(vectors);
+}
+
+static int
+hanja_vectors_save(PtrVector* vectors, HangulStream* stream)
+{
+ unsigned int i, j, k;
+ uint32_t nkeys;
+ uint32_t ndata;
+ uint32_t keytable_size;
+ uint32_t datatable_size;
+ uint32_t last_offset;
+
+ /* signature */
+ hangul_stream_write(stream, "HANJADB\x0", 8);
+
+ nkeys = vectors->len;
+ hangul_stream_write(stream, &nkeys, sizeof(nkeys));
+
+ ndata = 0;
+ for (i = 0; i < nkeys; i++)
+ ndata += ptr_vector_get_length(vectors->ptrs[i]);
+ hangul_stream_write(stream, &ndata, sizeof(ndata));
+
+ keytable_size = nkeys * sizeof(HanjaKeyEntry);
+ datatable_size = ndata * sizeof(Hanja);
+
+ /* key table */
+ last_offset = keytable_size;
+ for (i = 0; i < nkeys; i++) {
+ HanjaKeyEntry entry;
+
+ entry.hanja_offset = last_offset - i * sizeof(entry);
+ entry.nitems = ptr_vector_get_length(vectors->ptrs[i]);
+
+ hangul_stream_write(stream, &entry, sizeof(entry));
+
+ last_offset += entry.nitems * sizeof(Hanja);
+ }
+
+ /* data table */
+ last_offset = datatable_size;
+ k = 0;
+ for (i = 0; i < nkeys; i++) {
+ PtrVector* items = vectors->ptrs[i];
+ for (j = 0; j < items->len; j++) {
+ const char* key;
+ const char* value;
+ const char* comment;
+ size_t key_len;
+ size_t value_len;
+ size_t comment_len;
+ Hanja hanja;
+ Hanja* item = items->ptrs[j];
+
+ key = hanja_get_key(item);
+ value = hanja_get_value(item);
+ comment = hanja_get_comment(item);
+
+ hanja.key_offset = last_offset - k * sizeof(hanja);
+ hanja.value_offset = last_offset - k * sizeof(hanja);
+ hanja.comment_offset = last_offset - k * sizeof(hanja);
+
+ key_len = strlen(key) + 1;
+ value_len = strlen(value) + 1;
+ comment_len = strlen(comment) + 1;
+
+ hanja.value_offset += key_len;
+ hanja.comment_offset += key_len + value_len;
+
+ hangul_stream_write(stream, &hanja, sizeof(hanja));
+
+ last_offset += key_len + value_len + comment_len;
+ k++;
}
}
- while (*end != '\0') {
- *end++ = '\0';
+ /* data */
+ for (i = 0; i < nkeys; i++) {
+ PtrVector* items = vectors->ptrs[i];
+ for (j = 0; j < items->len; j++) {
+ size_t len;
+ const char* key;
+ const char* value;
+ const char* comment;
+ Hanja* hanja = items->ptrs[j];
+
+ key = hanja_get_key(hanja);
+ value = hanja_get_value(hanja);
+ comment = hanja_get_comment(hanja);
+
+ len = strlen(key) + 1;
+ hangul_stream_write(stream, key, len);
+
+ len = strlen(value) + 1;
+ hangul_stream_write(stream, value, len);
+
+ len = strlen(comment) + 1;
+ hangul_stream_write(stream, comment, len);
+ }
+ }
+
+ return 0;
+}
+
+int
+hanja_table_txt_to_bin(const char *txtfilename, const char* binfilename)
+{
+ PtrVector* vectors;
+
+ vectors = hanja_vectors_from_txt(txtfilename);
+ if (vectors != NULL) {
+ FILE* file;
+ HangulStream stream;
+
+ file = fopen(binfilename, "w");
+ if (file != NULL) {
+ hangul_stream_init_as_file(&stream, file);
+ hanja_vectors_save(vectors, &stream);
+ fclose(file);
+ }
+
+ hanja_vectors_delete(vectors);
+ }
+
+ return 0;
+}
+
+HanjaTable*
+hanja_vector_table_load(PtrVector* vector)
+{
+ HanjaVectorTable* table;
+
+ table = malloc(sizeof(*table));
+ if (table != NULL) {
+ table->parent.type = HANJA_TABLE_TYPE_VECTOR;
+ table->parent.destroy = hanja_vector_table_delete;
+ table->parent.match = hanja_vector_table_match;
+ table->keytable = vector;
+ }
+
+ return (HanjaTable*)table;
+}
+
+static void
+hanja_vector_table_delete(HanjaTable* hanja_table)
+{
+ if (hanja_table != NULL) {
+ HanjaVectorTable* table = (HanjaVectorTable*)hanja_table;
+ hanja_vectors_delete(table->keytable);
+ free(table);
}
}
static int
-hanja_table_compare(const void *key, const void *item)
+vector_table_cmp(const void* m1, const void* m2)
{
- return strncmp((const char*)key, (*((HanjaList**)item))->key, strlen(key));
+ const char* key = m1;
+ const PtrVector* vector = *(const void**)m2;
+ const Hanja* hanja = vector->ptrs[0];
+ const char* hanja_key = hanja_get_key(hanja);
+
+ return strcmp(key, hanja_key);
}
-HanjaList*
-hanja_table_match_prefix(const HanjaTable* table, const char *key)
+static void
+hanja_vector_table_match(const HanjaTable* hanja_table,
+ const char* key, HanjaList** list)
{
- char *p;
- char newkey[64] = { '\0', };
- HanjaList **list;
- HanjaList *ret;
- struct slist *items = NULL;
+ const HanjaVectorTable* table;
+ const PtrVector** res;
- strncpy(newkey, key, sizeof(newkey));
- p = newkey + h_char_len(newkey);
- *p = '\0';
+ table = (const HanjaVectorTable*)hanja_table;
+ res = bsearch(key, table->keytable->ptrs, table->keytable->len,
+ sizeof(PtrVector*), vector_table_cmp);
+ if (res != NULL && *res != NULL) {
+ const Hanja** hanja;
- list = bsearch(newkey,
- table->base, table->nmember,
- sizeof(HanjaList*),
- hanja_table_compare);
- if (list != NULL) {
- int i;
- strncpy(newkey, key, sizeof(newkey));
- for (; strlen(newkey) > 0; delete_last_char(newkey)) {
- for (i = 0; i < (*list)->nitems; i++) {
- if (strcmp(newkey, (*list)->items[i]->key) == 0) {
- items = slist_append(items, (*list)->items[i]);
- }
- }
- }
+ if (*list == NULL)
+ *list = hanja_list_new(key);
- if (items) {
- ret = hanja_list_new_from_slist(key, items);
- slist_delete(items);
- return ret;
+ hanja = (const Hanja**)res[0]->ptrs;
+ hanja_list_append_nptrs(*list, hanja, res[0]->len);
+ }
+}
+
+HanjaMMapTable*
+hanja_mmap_table_load(void* data, size_t length)
+{
+ unsigned int i, j;
+ uint32_t nkeys = 0;
+ uint32_t ndata = 0;
+ HanjaKeyEntry* keytable = NULL;
+ HanjaMMapTable* table = NULL;
+ int res = 0;
+ const char* end;
+
+ HangulStream stream;
+
+ /* signature */
+ if (memcmp("HANJADB\x0", data, 8) != 0)
+ goto error;
+
+ res = hangul_stream_init_as_memory(&stream, data, length);
+
+ res = hangul_stream_seek(&stream, 8);
+
+ res = hangul_stream_read_uint32(&stream, &nkeys);
+ if (res != 0)
+ goto error;
+
+ res = hangul_stream_read_uint32(&stream, &ndata);
+ if (res != 0)
+ goto error;
+
+ end = (const char*)data + length;
+
+ keytable = (HanjaKeyEntry*)stream.memory.current;
+ if ((const char*)keytable > end)
+ goto error;
+
+ /* check integrity here.
+ * If the data file is wrong, the program may access the wrong address
+ * and it will be killed by segmentation fault.
+ * So we check it here, before to use. */
+ for (i = 0; i < nkeys; i++) {
+ const Hanja* hanja;
+ const HanjaKeyEntry* entry;
+
+ entry = &keytable[i];
+ if ((const char*)entry > end)
+ goto error;
+
+ hanja = hanja_keyentry_get_hanja(entry);
+ if ((const char*)hanja > end)
+ goto error;
+
+ for (j = 0; j < entry->nitems; j++) {
+ const char* key = hanja_get_key(hanja + j);
+ const char* value = hanja_get_value(hanja + j);
+ const char* comment = hanja_get_comment(hanja + j);
+
+ if (key > end)
+ goto error;
+
+ if (value > end)
+ goto error;
+
+ if (comment > end)
+ goto error;
}
}
-
+
+ /* the last byte should be nul, or the last comment will be over the
+ * boundary */
+ end--;
+ if (end[0] != '\0')
+ goto error;
+
+ table = malloc(sizeof(*table));
+ if (table == NULL)
+ goto error;
+
+ table->parent.type = HANJA_TABLE_TYPE_MMAP;
+ table->parent.destroy = hanja_mmap_table_delete;
+ table->parent.match = hanja_mmap_table_match;
+ table->keytable = keytable;
+ table->nkeys = nkeys;
+ table->ndata = ndata;
+ table->map = data;
+ table->map_length = length;
+
+ return table;
+
+error:
return NULL;
}
-HanjaList*
-hanja_table_match_suffix(const HanjaTable* table, const char *key)
+static void
+hanja_mmap_table_delete(HanjaTable* hanja_table)
{
- const char *p;
- char newkey[64] = { '\0', };
- HanjaList **list = NULL;
- HanjaList *ret;
- struct slist *items = NULL;
-
- p = key;
- strncpy(newkey, p, sizeof(newkey));
- newkey[h_char_len(newkey)] = '\0';
- while (strlen(newkey) > 0) {
- list = bsearch(newkey,
- table->base, table->nmember,
- sizeof(HanjaList*),
- hanja_table_compare);
-
- if (list != NULL) {
- int i;
- for (i = 0; i < (*list)->nitems; i++) {
- if (strcmp(p, (*list)->items[i]->key) == 0) {
- items = slist_append(items, (*list)->items[i]);
- }
- }
+ if (hanja_table != NULL) {
+ HanjaMMapTable* table = (HanjaMMapTable*)hanja_table;
+ if (table->map != NULL) {
+ munmap(table->map, table->map_length);
}
+ free(table);
+ }
+}
+
+static int
+mmap_table_cmp(const void* m1, const void* m2)
+{
+ const char* key = m1;
+ const Hanja* hanja = hanja_keyentry_get_hanja(m2);
+ const char* hanja_key = hanja_get_key(hanja);
- p += h_char_len(p);
- strncpy(newkey, p, sizeof(newkey));
- newkey[h_char_len(newkey)] = '\0';
+ return strcmp(key, hanja_key);
+}
+
+static void
+hanja_mmap_table_match(const HanjaTable* hanja_table,
+ const char* key, HanjaList** list)
+{
+ const HanjaKeyEntry* res;
+ const HanjaMMapTable* table;
+
+ table = (const HanjaMMapTable*)hanja_table;
+ res = bsearch(key, table->keytable, table->nkeys,
+ sizeof(table->keytable[0]), mmap_table_cmp);
+ if (res != NULL) {
+ const Hanja* hanja = hanja_keyentry_get_hanja(res);
+ if (*list == NULL)
+ *list = hanja_list_new(key);
+ hanja_list_append_n(*list, hanja, res->nitems);
}
+}
+
+HanjaTable*
+hanja_table_load_from_txt(const char *filename)
+{
+ PtrVector* vectors;
+ HanjaTable* table;
+
+ vectors = hanja_vectors_from_txt(filename);
+ if (vectors == NULL)
+ return NULL;
- if (items != NULL) {
- ret = hanja_list_new_from_slist(key, items);
- slist_delete(items);
- return ret;
+ table = hanja_vector_table_load(vectors);
+ if (table == NULL) {
+ hanja_vectors_delete(vectors);
+ return NULL;
}
- return NULL;
+ return table;
+}
+
+HanjaTable*
+hanja_table_load_from_bin(const char *filename)
+{
+ struct stat buf;
+ FILE* file;
+ void* data;
+ size_t length;
+ HanjaTable *table = NULL;
+
+ file = fopen(filename, "r");
+ if (file == NULL)
+ return NULL;
+
+ fstat(fileno(file), &buf);
+
+ length = buf.st_size;
+ data = mmap(0, length, PROT_READ, MAP_SHARED, fileno(file), 0);
+ fclose(file);
+
+ table = (HanjaTable*)hanja_mmap_table_load(data, length);
+ if (table == NULL) {
+ munmap(data, length);
+ return NULL;
+ }
+
+ return table;
+}
+
+HanjaTable*
+hanja_table_load(const char* filename)
+{
+ size_t len;
+ HanjaTable* table = NULL;
+
+ if (filename == NULL)
+ filename = LIBHANGUL_DEFAULT_HANJA_DIC;
+
+ len = strlen(filename);
+ if (len > 4 &&
+ filename[len - 1] == 't' &&
+ filename[len - 2] == 'x' &&
+ filename[len - 3] == 't' &&
+ filename[len - 4] == '.') {
+ table = hanja_table_load_from_txt(filename);
+ }
+
+ if (table == NULL)
+ table = hanja_table_load_from_bin(filename);
+
+ if (table == NULL)
+ table = hanja_table_load_from_txt(filename);
+
+ return table;
}
void
hanja_table_delete(HanjaTable *table)
{
- if (table) {
- int i, j;
- for (j = 0; j < table->nmember; j++) {
- for (i = 0; i < table->base[j]->nitems; i++) {
- h_free((char*)table->base[j]->items[i]->key);
- h_free((char*)table->base[j]->items[i]->value);
- h_free((char*)table->base[j]->items[i]->comment);
- h_free(table->base[j]->items[i]);
- }
- h_free((char*)table->base[j]->key);
- h_free(table->base[j]->items);
- h_free(table->base[j]);
- }
- h_free(table->base);
- h_free(table);
+ if (table != NULL) {
+ table->destroy(table);
+ }
+}
+
+HanjaList*
+hanja_table_match_prefix(const HanjaTable* table, const char *key)
+{
+ char* p;
+ char* newkey;
+ HanjaList* ret = NULL;
+
+ if (key == NULL || key[0] == '\0')
+ return NULL;
+
+ newkey = strdup(key);
+ p = strchr(newkey, '\0');
+ while (newkey[0] != '\0') {
+ printf("%s: %s\n", __func__, newkey);
+ table->match(table, newkey, &ret);
+ p = utf8_prev(newkey, p);
+ p[0] = '\0';
}
+ free(newkey);
+
+ return ret;
+}
+
+HanjaList*
+hanja_table_match_suffix(const HanjaTable* table, const char *key)
+{
+ const char* p;
+ HanjaList* ret = NULL;
+
+ if (key == NULL || key[0] == '\0')
+ return NULL;
+
+ p = key;
+ while (p[0] != '\0') {
+ table->match(table, p, &ret);
+ p = utf8_next(p);
+ }
+
+ return ret;
}
int
hanja_list_get_size(const HanjaList *list)
{
if (list != NULL)
- return list->nitems;
+ return list->len;
return 0;
}
@@ -454,13 +1021,20 @@ const Hanja*
hanja_list_get_nth(const HanjaList *list, unsigned int n)
{
if (list != NULL) {
- if (n < list->nitems)
+ if (n < list->len)
return list->items[n];
}
return NULL;
}
const char*
+hanja_list_get_nth_key(const HanjaList *list, unsigned int n)
+{
+ const Hanja* hanja = hanja_list_get_nth(list, n);
+ return hanja_get_key(hanja);
+}
+
+const char*
hanja_list_get_nth_value(const HanjaList *list, unsigned int n)
{
const Hanja* hanja = hanja_list_get_nth(list, n);
@@ -478,9 +1052,9 @@ void
hanja_list_delete(HanjaList *list)
{
if (list) {
- h_free(list->items);
- h_free((char*)list->key);
- h_free(list);
+ free(list->items);
+ free(list->key);
+ free(list);
}
}
diff --git a/tools/Makefile.am b/tools/Makefile.am
new file mode 100644
index 0000000..fd0bc9b
--- /dev/null
+++ b/tools/Makefile.am
@@ -0,0 +1,6 @@
+
+bin_PROGRAMS = hanjac
+
+hanjac_CFLAGS =
+hanjac_SOURCES = hanjac.c
+hanjac_LDADD = ../hangul/libhangul.la
diff --git a/tools/hanjac.c b/tools/hanjac.c
new file mode 100644
index 0000000..7d6b67d
--- /dev/null
+++ b/tools/hanjac.c
@@ -0,0 +1,15 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "../hangul/hangul.h"
+
+int
+main(int argc, char *argv[])
+{
+ if (argc != 3)
+ return 1;
+
+ hanja_table_txt_to_bin(argv[1], argv[2]);
+
+ return 0;
+}