/* unicode.c - functions to convert unicode characters */
/* Copyright (C) 2010 Free Software Foundation, Inc.
This file is part of GNU Bash, the Bourne Again SHell.
Bash is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Bash is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Bash. If not, see .
*/
#include
#if defined (HANDLE_MULTIBYTE)
#include
#include
#include
#ifdef HAVE_UNISTD_H
#include
#endif
#include
#if HAVE_ICONV
# include
#endif
#include
#ifndef USHORT_MAX
# ifdef USHRT_MAX
# define USHORT_MAX USHRT_MAX
# else
# define USHORT_MAX ((unsigned short) ~(unsigned short)0)
# endif
#endif
#if !defined (STREQ)
# define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
#endif /* !STREQ */
#if defined (HAVE_LOCALE_CHARSET)
extern const char *locale_charset __P((void));
#else
extern char *get_locale_var __P((char *));
#endif
static int u32init = 0;
static int utf8locale = 0;
#if defined (HAVE_ICONV)
static iconv_t localconv;
#endif
#ifndef HAVE_LOCALE_CHARSET
static char *
stub_charset ()
{
char *locale, *s, *t;
locale = get_locale_var ("LC_CTYPE");
if (locale == 0 || *locale == 0)
return "ASCII";
s = strrchr (locale, '.');
if (s)
{
t = strchr (s, '@');
if (t)
*t = 0;
return ++s;
}
else if (STREQ (locale, "UTF-8"))
return "UTF-8";
else
return "ASCII";
}
#endif
/* u32toascii ? */
int
u32tochar (wc, s)
wchar_t wc;
char *s;
{
unsigned long x;
int l;
x = wc;
l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4);
if (x <= UCHAR_MAX)
s[0] = x & 0xFF;
else if (x <= USHORT_MAX) /* assume unsigned short = 16 bits */
{
s[0] = (x >> 8) & 0xFF;
s[1] = x & 0xFF;
}
else
{
s[0] = (x >> 24) & 0xFF;
s[1] = (x >> 16) & 0xFF;
s[2] = (x >> 8) & 0xFF;
s[3] = x & 0xFF;
}
s[l] = '\0';
return l;
}
int
u32toutf8 (wc, s)
wchar_t wc;
char *s;
{
int l;
l = (wc < 0x0080) ? 1 : ((wc < 0x0800) ? 2 : 3);
if (wc < 0x0080)
s[0] = (unsigned char)wc;
else if (wc < 0x0800)
{
s[0] = (wc >> 6) | 0xc0;
s[1] = (wc & 0x3f) | 0x80;
}
else
{
s[0] = (wc >> 12) | 0xe0;
s[1] = ((wc >> 6) & 0x3f) | 0x80;
s[2] = (wc & 0x3f) | 0x80;
}
s[l] = '\0';
return l;
}
/* convert a single unicode-32 character into a multibyte string and put the
result in S, which must be large enough (at least MB_LEN_MAX bytes) */
int
u32cconv (c, s)
unsigned long c;
char *s;
{
wchar_t wc;
int n;
#if HAVE_ICONV
const char *charset;
char obuf[25], *optr;
size_t obytesleft;
const char *iptr;
size_t sn;
#endif
wc = c;
#if __STDC_ISO_10646__
if (sizeof (wchar_t) == 4)
{
n = wctomb (wc, s);
return n;
}
#endif
#if HAVE_NL_LANGINFO
codeset = nl_langinfo (CODESET);
if (STREQ (codeset, "UTF-8"))
{
n = u32toutf8 (wc, s);
return n;
}
#endif
#if HAVE_ICONV
/* this is mostly from coreutils-8.5/lib/unicodeio.c */
if (u32init == 0)
{
# if HAVE_LOCALE_CHARSET
charset = locale_charset (); /* XXX - fix later */
# else
charset = stub_charset ();
# endif
if (STREQ (charset, "UTF-8"))
utf8locale = 1;
else
{
localconv = iconv_open (charset, "UTF-8");
if (localconv == (iconv_t)-1)
localconv = iconv_open (charset, "ASCII");
}
u32init = 1;
}
if (utf8locale)
{
n = u32toutf8 (wc, s);
return n;
}
if (localconv == (iconv_t)-1)
{
n = u32tochar (wc, s);
return n;
}
n = u32toutf8 (wc, s);
optr = obuf;
obytesleft = sizeof (obuf);
iptr = s;
sn = n;
iconv (localconv, NULL, NULL, NULL, NULL);
if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
return n; /* You get utf-8 if iconv fails */
*optr = '\0';
/* number of chars to be copied is optr - obuf if we want to do bounds
checking */
strcpy (s, obuf);
return (optr - obuf);
#endif
n = u32tochar (wc, s); /* fallback */
return n;
}
#endif /* HANDLE_MULTIBYTE */