summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Bruno Haible <bruno@clisp.org> 2008-05-10 15:23:28 +0200
committer Bruno Haible <bruno@clisp.org> 2008-05-10 15:23:28 +0200
commit 3bcc3f3d73d014f863c561fd129b34c3e058feed (patch)
tree 1f3b51a46897c6f3aa002a116637988dc01cf6c5
parent 8abebfa303487917d282f2d423cfac8244c6ddc7 (diff)
download gnulib-3bcc3f3d73d014f863c561fd129b34c3e058feed.tar.gz
Use u8_conv_from_encoding instead of using special code for the conversion.
-rw-r--r-- ChangeLog 19
-rw-r--r-- lib/unilbrk/ulc-common.c 118
-rw-r--r-- lib/unilbrk/ulc-common.h 16
-rw-r--r-- lib/unilbrk/ulc-possible-linebreaks.c 138
-rw-r--r-- lib/unilbrk/ulc-width-linebreaks.c 165
-rw-r--r-- modules/unilbrk/ulc-common 1
-rw-r--r-- modules/unilbrk/ulc-possible-linebreaks 4
-rw-r--r-- modules/unilbrk/ulc-width-linebreaks 4
8 files changed, 148 insertions, 317 deletions
diff --git a/ChangeLog b/ChangeLog
index a533eead65..5a96a0fe2f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,24 @@
2008-05-10 Bruno Haible <bruno@clisp.org>
+ * lib/unilbrk/ulc-common.c: Don't include <stdlib.h>.
+ (iconv_string_length, iconv_string_keeping_offsets): Remove functions.
+ * lib/unilbrk/ulc-common.h (iconv_string_length,
+ iconv_string_keeping_offsets): Remove declarations.
+ * lib/unilbrk/ulc-possible-linebreaks.c: Include <string.h>, uniconv.h.
+ Don't include <iconv.h>, streq.h, xsize.h.
+ (ulc_possible_linebreaks): Use u8_conv_from_encoding for doing the
+ conversion.
+ * lib/unilbrk/ulc-width-linebreaks.c: Include uniconv.h. Don't include
+ <iconv.h>, streq.h, xsize.h.
+ (ulc_width_linebreaks): Use u8_conv_from_encoding for doing the
+ conversion.
+ * modules/unilbrk/ulc-common (Depends-on): Remove iconv.
+ * modules/unilbrk/ulc-possible-linebreaks (Depends-on): Add
+ uniconv/u8-conv-from-enc. Remove iconv_open, streq, xsize.
+ * modules/unilbrk/ulc-width-linebreaks (Depends-on): Likewise.
+
+2008-05-10 Bruno Haible <bruno@clisp.org>
+
* modules/unilbrk/ulc-width-linebreaks-tests: New file.
* tests/unilbrk/test-ulc-width-linebreaks.c: New file.
diff --git a/lib/unilbrk/ulc-common.c b/lib/unilbrk/ulc-common.c
index 3ab31c2d96..7bdfa44919 100644
--- a/lib/unilbrk/ulc-common.c
+++ b/lib/unilbrk/ulc-common.c
@@ -20,8 +20,6 @@
/* Specification. */
#include "unilbrk/ulc-common.h"
-#include <stdlib.h>
-
#include "c-ctype.h"
#include "streq.h"
@@ -33,122 +31,6 @@ is_utf8_encoding (const char *encoding)
return 0;
}
-#if HAVE_ICONV
-
-# include <errno.h>
-
-size_t
-iconv_string_length (iconv_t cd, const char *s, size_t n)
-{
-# define TMPBUFSIZE 4096
- size_t count = 0;
- char tmpbuf[TMPBUFSIZE];
- const char *inptr = s;
- size_t insize = n;
-
- while (insize > 0)
- {
- char *outptr = tmpbuf;
- size_t outsize = TMPBUFSIZE;
- size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
- if (res == (size_t)(-1) && errno != E2BIG
-# if !defined _LIBICONV_VERSION && !defined __GLIBC__
- /* Irix iconv() inserts a NUL byte if it cannot convert.
- NetBSD iconv() inserts a question mark if it cannot convert.
- Only GNU libiconv and GNU libc are known to prefer to fail rather
- than doing a lossy conversion. */
- || res > 0
-# endif
- )
- return (size_t)(-1);
- count += outptr - tmpbuf;
- }
- /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
-# if defined _LIBICONV_VERSION \
- || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
- {
- char *outptr = tmpbuf;
- size_t outsize = TMPBUFSIZE;
- size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
- if (res == (size_t)(-1))
- return (size_t)(-1);
- count += outptr - tmpbuf;
- }
- /* Return to the initial state. */
- iconv (cd, NULL, NULL, NULL, NULL);
-# endif
- return count;
-# undef TMPBUFSIZE
-}
-
-void
-iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
- size_t *offtable, char *t, size_t m)
-{
- size_t i;
- const char *s_end;
- const char *inptr;
- char *outptr;
- size_t outsize;
- /* Avoid glibc-2.1 bug. */
-# if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
- const size_t extra = 1;
-# else
- const size_t extra = 0;
-# endif
-
- for (i = 0; i < n; i++)
- offtable[i] = (size_t)(-1);
-
- s_end = s + n;
- inptr = s;
- outptr = t;
- outsize = m + extra;
- while (inptr < s_end)
- {
- const char *saved_inptr;
- size_t insize;
- size_t res;
-
- offtable[inptr - s] = outptr - t;
-
- saved_inptr = inptr;
- res = (size_t)(-1);
- for (insize = 1; inptr + insize <= s_end; insize++)
- {
- res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
- if (!(res == (size_t)(-1) && errno == EINVAL))
- break;
- /* We expect that no input bytes have been consumed so far. */
- if (inptr != saved_inptr)
- abort ();
- }
- /* After we verified the convertibility and computed the translation's
- size m, there shouldn't be any conversion error here. */
- if (res == (size_t)(-1)
-# if !defined _LIBICONV_VERSION && !defined __GLIBC__
- /* Irix iconv() inserts a NUL byte if it cannot convert.
- NetBSD iconv() inserts a question mark if it cannot convert.
- Only GNU libiconv and GNU libc are known to prefer to fail rather
- than doing a lossy conversion. */
- || res > 0
-# endif
- )
- abort ();
- }
- /* Avoid glibc-2.1 bug and Solaris 7 bug. */
-# if defined _LIBICONV_VERSION \
- || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
- if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
- abort ();
-# endif
- /* We should have produced exactly m output bytes. */
- if (outsize != extra)
- abort ();
-}
-
-#endif /* HAVE_ICONV */
-
#if C_CTYPE_ASCII
/* Tests whether a string is entirely ASCII. Returns 1 if yes.
diff --git a/lib/unilbrk/ulc-common.h b/lib/unilbrk/ulc-common.h
index bba8ec6d25..3b48188755 100644
--- a/lib/unilbrk/ulc-common.h
+++ b/lib/unilbrk/ulc-common.h
@@ -23,22 +23,6 @@
#define is_utf8_encoding unilbrk_is_utf8_encoding
extern int is_utf8_encoding (const char *encoding);
-#if HAVE_ICONV
-
-# include <iconv.h>
-
-/* Luckily, the encoding's name is platform independent. */
-# define UTF8_NAME "UTF-8"
-
-/* Return the length of a string after conversion through an iconv_t. */
-# define iconv_string_length unilbrk_iconv_string_length
-extern size_t iconv_string_length (iconv_t cd, const char *s, size_t n);
-
-# define iconv_string_keeping_offsets unilbrk_iconv_string_keeping_offsets
-extern void iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n, size_t *offtable, char *t, size_t m);
-
-#endif /* HAVE_ICONV */
-
#if C_CTYPE_ASCII
# define is_all_ascii unilbrk_is_all_ascii
diff --git a/lib/unilbrk/ulc-possible-linebreaks.c b/lib/unilbrk/ulc-possible-linebreaks.c
index 444e9a9461..74cd42a671 100644
--- a/lib/unilbrk/ulc-possible-linebreaks.c
+++ b/lib/unilbrk/ulc-possible-linebreaks.c
@@ -21,13 +21,10 @@
#include "unilbrk.h"
#include <stdlib.h>
-#if HAVE_ICONV
-# include <iconv.h>
-#endif
+#include <string.h>
#include "c-ctype.h"
-#include "streq.h"
-#include "xsize.h"
+#include "uniconv.h"
#include "unilbrk/ulc-common.h"
/* Line breaking of a string in an arbitrary encoding.
@@ -47,92 +44,73 @@ void
ulc_possible_linebreaks (const char *s, size_t n, const char *encoding,
char *p)
{
- if (n == 0)
- return;
- if (is_utf8_encoding (encoding))
- u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
- else
+ if (n > 0)
{
-#if HAVE_ICONV
- iconv_t to_utf8;
- /* Avoid glibc-2.1 bug with EUC-KR. */
-# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
- if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
- to_utf8 = (iconv_t)(-1);
+ if (is_utf8_encoding (encoding))
+ u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
else
-# endif
- /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
- GB18030. */
-# if defined __sun && !defined _LIBICONV_VERSION
- if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
- || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
- || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
- || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
- || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
- || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
- to_utf8 = (iconv_t)(-1);
- else
-# endif
- to_utf8 = iconv_open (UTF8_NAME, encoding);
- if (to_utf8 != (iconv_t)(-1))
{
- /* Determine the length of the resulting UTF-8 string. */
- size_t m = iconv_string_length (to_utf8, s, n);
- if (m != (size_t)(-1))
+ /* Convert the string to UTF-8 and build a translation table
+ from offsets into s to offsets into the translated string. */
+ size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
+
+ if (offsets != NULL)
{
- /* Convert the string to UTF-8 and build a translation table
- from offsets into s to offsets into the translated string. */
- size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
- char *memory =
- (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
- if (memory != NULL)
+ uint8_t *t = NULL;
+ size_t m;
+ if (u8_conv_from_encoding (encoding, iconveh_question_mark,
+ s, n, offsets, &t, &m)
+ == 0)
{
- size_t *offtable = (size_t *) memory;
- char *t = (char *) (offtable + n);
- char *q = (char *) (t + m);
- size_t i;
-
- iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
-
- /* Determine the possible line breaks of the UTF-8 string. */
- u8_possible_linebreaks ((const uint8_t *) t, m, encoding, q);
-
- /* Translate the result back to the original string. */
- memset (p, UC_BREAK_PROHIBITED, n);
- for (i = 0; i < n; i++)
- if (offtable[i] != (size_t)(-1))
- p[i] = q[offtable[i]];
-
- free (memory);
- iconv_close (to_utf8);
- return;
+ char *q = (char *) malloc (m);
+
+ if (q != NULL)
+ {
+ size_t i;
+
+ /* Determine the possible line breaks of the UTF-8
+ string. */
+ u8_possible_linebreaks (t, m, encoding, q);
+
+ /* Translate the result back to the original string. */
+ memset (p, UC_BREAK_PROHIBITED, n);
+ for (i = 0; i < n; i++)
+ if (offsets[i] != (size_t)(-1))
+ p[i] = q[offsets[i]];
+
+ free (q);
+ free (t);
+ free (offsets);
+ return;
+ }
+ free (t);
}
+ free (offsets);
}
- iconv_close (to_utf8);
- }
-#endif
- /* Impossible to convert. */
+
+ /* Impossible to convert. */
#if C_CTYPE_ASCII
- if (is_all_ascii (s, n))
- {
- /* ASCII is a subset of UTF-8. */
- u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
- return;
- }
+ if (is_all_ascii (s, n))
+ {
+ /* ASCII is a subset of UTF-8. */
+ u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
+ return;
+ }
#endif
- /* We have a non-ASCII string and cannot convert it.
- Don't produce line breaks except those already present in the
- input string. All we assume here is that the encoding is
- minimally ASCII compatible. */
- {
- const char *s_end = s + n;
- while (s < s_end)
+ /* We have a non-ASCII string and cannot convert it.
+ Don't produce line breaks except those already present in the
+ input string. All we assume here is that the encoding is
+ minimally ASCII compatible. */
{
- *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
- s++;
- p++;
+ const char *s_end = s + n;
+ while (s < s_end)
+ {
+ *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
+ s++;
+ p++;
+ }
}
- }
+ }
}
}
diff --git a/lib/unilbrk/ulc-width-linebreaks.c b/lib/unilbrk/ulc-width-linebreaks.c
index 892e01ff1e..5340a4deb5 100644
--- a/lib/unilbrk/ulc-width-linebreaks.c
+++ b/lib/unilbrk/ulc-width-linebreaks.c
@@ -22,13 +22,9 @@
#include <stdlib.h>
#include <string.h>
-#if HAVE_ICONV
-# include <iconv.h>
-#endif
#include "c-ctype.h"
-#include "streq.h"
-#include "xsize.h"
+#include "uniconv.h"
#include "unilbrk/ulc-common.h"
/* Line breaking of a string in an arbitrary encoding.
@@ -50,113 +46,90 @@ ulc_width_linebreaks (const char *s, size_t n,
const char *o, const char *encoding,
char *p)
{
- if (n == 0)
- return start_column;
- if (is_utf8_encoding (encoding))
- return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
- else
+ if (n > 0)
{
-#if HAVE_ICONV
- iconv_t to_utf8;
- /* Avoid glibc-2.1 bug with EUC-KR. */
-# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
- if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
- to_utf8 = (iconv_t)(-1);
+ if (is_utf8_encoding (encoding))
+ return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
else
-# endif
- /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
- GB18030. */
-# if defined __sun && !defined _LIBICONV_VERSION
- if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
- || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
- || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
- || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
- || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
- || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
- to_utf8 = (iconv_t)(-1);
- else
-# endif
- to_utf8 = iconv_open (UTF8_NAME, encoding);
- if (to_utf8 != (iconv_t)(-1))
{
- /* Determine the length of the resulting UTF-8 string. */
- size_t m = iconv_string_length (to_utf8, s, n);
- if (m != (size_t)(-1))
+ /* Convert the string to UTF-8 and build a translation table
+ from offsets into s to offsets into the translated string. */
+ size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
+
+ if (offsets != NULL)
{
- /* Convert the string to UTF-8 and build a translation table
- from offsets into s to offsets into the translated string. */
- size_t memory_size =
- xsum4 (xtimes (n, sizeof (size_t)), m, m,
- (o != NULL ? m : 0));
- char *memory =
- (char *)
- (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
- if (memory != NULL)
+ uint8_t *t = NULL;
+ size_t m;
+ if (u8_conv_from_encoding (encoding, iconveh_question_mark,
+ s, n, offsets, &t, &m)
+ == 0)
{
- size_t *offtable = (size_t *) memory;
- char *t = (char *) (offtable + n);
- char *q = (char *) (t + m);
- char *o8 = (o != NULL ? (char *) (q + m) : NULL);
- int res_column;
- size_t i;
+ char *memory = (char *) malloc (m + (o != NULL ? m : 0));
- iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
-
- /* Translate the overrides to the UTF-8 string. */
- if (o != NULL)
+ if (memory != NULL)
{
- memset (o8, UC_BREAK_UNDEFINED, m);
+ char *q = (char *) memory;
+ char *o8 = (o != NULL ? (char *) (q + m) : NULL);
+ int res_column;
+ size_t i;
+
+ /* Translate the overrides to the UTF-8 string. */
+ if (o != NULL)
+ {
+ memset (o8, UC_BREAK_UNDEFINED, m);
+ for (i = 0; i < n; i++)
+ if (offsets[i] != (size_t)(-1))
+ o8[offsets[i]] = o[i];
+ }
+
+ /* Determine the line breaks of the UTF-8 string. */
+ res_column =
+ u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q);
+
+ /* Translate the result back to the original string. */
+ memset (p, UC_BREAK_PROHIBITED, n);
for (i = 0; i < n; i++)
- if (offtable[i] != (size_t)(-1))
- o8[offtable[i]] = o[i];
- }
-
- /* Determine the line breaks of the UTF-8 string. */
- res_column =
- u8_width_linebreaks ((const uint8_t *) t, m, width, start_column, at_end_columns, o8, encoding, q);
-
- /* Translate the result back to the original string. */
- memset (p, UC_BREAK_PROHIBITED, n);
- for (i = 0; i < n; i++)
- if (offtable[i] != (size_t)(-1))
- p[i] = q[offtable[i]];
+ if (offsets[i] != (size_t)(-1))
+ p[i] = q[offsets[i]];
- free (memory);
- iconv_close (to_utf8);
- return res_column;
+ free (memory);
+ free (t);
+ free (offsets);
+ return res_column;
+ }
+ free (t);
}
+ free (offsets);
}
- iconv_close (to_utf8);
- }
-#endif
- /* Impossible to convert. */
+ /* Impossible to convert. */
#if C_CTYPE_ASCII
- if (is_all_ascii (s, n))
- {
- /* ASCII is a subset of UTF-8. */
- return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
- }
+ if (is_all_ascii (s, n))
+ {
+ /* ASCII is a subset of UTF-8. */
+ return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
+ }
#endif
- /* We have a non-ASCII string and cannot convert it.
- Don't produce line breaks except those already present in the
- input string. All we assume here is that the encoding is
- minimally ASCII compatible. */
- {
- const char *s_end = s + n;
- while (s < s_end)
+ /* We have a non-ASCII string and cannot convert it.
+ Don't produce line breaks except those already present in the
+ input string. All we assume here is that the encoding is
+ minimally ASCII compatible. */
{
- *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
- ? UC_BREAK_MANDATORY
- : UC_BREAK_PROHIBITED);
- s++;
- p++;
- if (o != NULL)
- o++;
+ const char *s_end = s + n;
+ while (s < s_end)
+ {
+ *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
+ ? UC_BREAK_MANDATORY
+ : UC_BREAK_PROHIBITED);
+ s++;
+ p++;
+ if (o != NULL)
+ o++;
+ }
+ /* We cannot compute widths in this case. */
}
- /* We cannot compute widths in this case. */
- return start_column;
- }
+ }
}
+ return start_column;
}
diff --git a/modules/unilbrk/ulc-common b/modules/unilbrk/ulc-common
index 2b6c00d24a..4729f662f8 100644
--- a/modules/unilbrk/ulc-common
+++ b/modules/unilbrk/ulc-common
@@ -7,7 +7,6 @@ lib/unilbrk/ulc-common.c
Depends-on:
c-ctype
-iconv
streq
configure.ac:
diff --git a/modules/unilbrk/ulc-possible-linebreaks b/modules/unilbrk/ulc-possible-linebreaks
index 2b3587a815..f7cc283e3f 100644
--- a/modules/unilbrk/ulc-possible-linebreaks
+++ b/modules/unilbrk/ulc-possible-linebreaks
@@ -8,10 +8,8 @@ Depends-on:
unilbrk/base
unilbrk/u8-possible-linebreaks
unilbrk/ulc-common
+uniconv/u8-conv-from-enc
c-ctype
-iconv_open
-streq
-xsize
configure.ac:
diff --git a/modules/unilbrk/ulc-width-linebreaks b/modules/unilbrk/ulc-width-linebreaks
index e84e1f76f0..5763e8508f 100644
--- a/modules/unilbrk/ulc-width-linebreaks
+++ b/modules/unilbrk/ulc-width-linebreaks
@@ -8,10 +8,8 @@ Depends-on:
unilbrk/base
unilbrk/u8-width-linebreaks
unilbrk/ulc-common
+uniconv/u8-conv-from-enc
c-ctype
-iconv_open
-streq
-xsize
configure.ac: