diff options
author | simonmar <unknown> | 2005-03-14 12:18:08 +0000 |
---|---|---|
committer | simonmar <unknown> | 2005-03-14 12:18:08 +0000 |
commit | 2b179f023136bb22103856f74e44de281f31be82 (patch) | |
tree | d25c440f898312aad70dcfb49085f48ee84a43c9 /libraries/base/GHC/Unicode.hs | |
parent | 9cc3315f705cd70d6d1432377a8c2a5d9a81fef4 (diff) | |
download | haskell-2b179f023136bb22103856f74e44de281f31be82.tar.gz |
[project @ 2005-03-14 12:18:05 by simonmar]
Add Dimitry Golubovsky <dimitry@golubovsky.org>'s Unicode character
class implementation. This will remove the dependency on libc's
locale code and give us much more consistent support for Unicode
across platforms.
Diffstat (limited to 'libraries/base/GHC/Unicode.hs')
-rw-r--r-- | libraries/base/GHC/Unicode.hs | 187 |
1 files changed, 142 insertions, 45 deletions
diff --git a/libraries/base/GHC/Unicode.hs b/libraries/base/GHC/Unicode.hs index 52d14f37c8..8aefad5f2e 100644 --- a/libraries/base/GHC/Unicode.hs +++ b/libraries/base/GHC/Unicode.hs @@ -1,4 +1,5 @@ -{-# OPTIONS_GHC -fno-implicit-prelude #-} +{-# OPTIONS -fno-implicit-prelude #-} +{-# OPTIONS -#include "WCsubst.h" #-} ----------------------------------------------------------------------------- -- | -- Module : GHC.Unicde @@ -15,14 +16,21 @@ -- ----------------------------------------------------------------------------- --- #hide module GHC.Unicode ( + GeneralCategory (..), + generalCategory, isAscii, isLatin1, isControl, isAsciiUpper, isAsciiLower, isPrint, isSpace, isUpper, isLower, isAlpha, isDigit, isOctDigit, isHexDigit, isAlphaNum, - toUpper, toLower, + toUpper, toLower, toTitle, + isLetter, -- :: Char -> Bool + isMark, -- :: Char -> Bool + isNumber, -- :: Char -> Bool + isPunctuation, -- :: Char -> Bool + isSymbol, -- :: Char -> Bool + isSeparator, -- :: Char -> Bool ) where import GHC.Base @@ -30,9 +38,106 @@ import GHC.Real (fromIntegral) import GHC.Int import GHC.Word import GHC.Num (fromInteger) +import GHC.Read +import GHC.Show +import GHC.Enum #include "HsBaseConfig.h" +-- | Unicode General Categories (column 2 of the UnicodeData table) +-- in the order they are listed in the Unicode standard. + +data GeneralCategory + = UppercaseLetter -- Lu Letter, Uppercase + | LowercaseLetter -- Ll Letter, Lowercase + | TitlecaseLetter -- Lt Letter, Titlecase + | ModifierLetter -- Lm Letter, Modifier + | OtherLetter -- Lo Letter, Other + | NonSpacingMark -- Mn Mark, Non-Spacing + | SpacingCombiningMark -- Mc Mark, Spacing Combining + | EnclosingMark -- Me Mark, Enclosing + | DecimalNumber -- Nd Number, Decimal + | LetterNumber -- Nl Number, Letter + | OtherNumber -- No Number, Other + | ConnectorPunctuation -- Pc Punctuation, Connector + | DashPunctuation -- Pd Punctuation, Dash + | OpenPunctuation -- Ps Punctuation, Open + | ClosePunctuation -- Pe Punctuation, Close + | InitialQuote -- Pi Punctuation, Initial quote + | FinalQuote -- Pf Punctuation, Final quote + | OtherPunctuation -- Po Punctuation, Other + | MathSymbol -- Sm Symbol, Math + | CurrencySymbol -- Sc Symbol, Currency + | ModifierSymbol -- Sk Symbol, Modifier + | OtherSymbol -- So Symbol, Other + | Space -- Zs Separator, Space + | LineSeparator -- Zl Separator, Line + | ParagraphSeparator -- Zp Separator, Paragraph + | Control -- Cc Other, Control + | Format -- Cf Other, Format + | Surrogate -- Cs Other, Surrogate + | PrivateUse -- Co Other, Private Use + | NotAssigned -- Cn Other, Not Assigned + deriving (Eq, Ord, Enum, Read, Show, Bounded) + +-- | Retrieves the general Unicode category of the character. +generalCategory :: Char -> GeneralCategory +generalCategory c = toEnum (wgencat (fromIntegral (ord c))) + +-- ------------------------------------------------------------------------ +-- These are copied from Hugs Unicode.hs + +-- derived character classifiers + +isLetter :: Char -> Bool +isLetter c = case generalCategory c of + UppercaseLetter -> True + LowercaseLetter -> True + TitlecaseLetter -> True + ModifierLetter -> True + OtherLetter -> True + _ -> False + +isMark :: Char -> Bool +isMark c = case generalCategory c of + NonSpacingMark -> True + SpacingCombiningMark -> True + EnclosingMark -> True + _ -> False + +isNumber :: Char -> Bool +isNumber c = case generalCategory c of + DecimalNumber -> True + LetterNumber -> True + OtherNumber -> True + _ -> False + +isPunctuation :: Char -> Bool +isPunctuation c = case generalCategory c of + ConnectorPunctuation -> True + DashPunctuation -> True + OpenPunctuation -> True + ClosePunctuation -> True + InitialQuote -> True + FinalQuote -> True + OtherPunctuation -> True + _ -> False + +isSymbol :: Char -> Bool +isSymbol c = case generalCategory c of + MathSymbol -> True + CurrencySymbol -> True + ModifierSymbol -> True + OtherSymbol -> True + _ -> False + +isSeparator :: Char -> Bool +isSeparator c = case generalCategory c of + Space -> True + LineSeparator -> True + ParagraphSeparator -> True + _ -> False + -- | Selects the first 128 characters of the Unicode character set, -- corresponding to the ASCII character set. isAscii :: Char -> Bool @@ -67,7 +172,8 @@ isSpace c = c == ' ' || c == '\r' || c == '\f' || c == '\v' || - c == '\xa0' + c == '\xa0' || + iswspace (fromIntegral (ord c)) /= 0 -- | Selects alphabetic Unicode characters (letters) that are not lower-case. -- (In Unicode terms, this includes letters in upper and title cases, @@ -78,15 +184,6 @@ isUpper :: Char -> Bool isLower :: Char -> Bool -- | Selects alphabetic Unicode characters (letters). --- --- Note: the Haskell 98 definition of 'isAlpha' is: --- --- > isAlpha c = isUpper c || isLower c --- --- the implementation here diverges from the Haskell 98 --- definition in the sense that Unicode alphabetic characters which --- are neither upper nor lower case will still be identified as --- alphabetic by 'isAlpha'. isAlpha :: Char -> Bool -- | Selects alphabetic or numeric digit Unicode characters. @@ -120,21 +217,14 @@ toUpper :: Char -> Char toLower :: Char -> Char -- ----------------------------------------------------------------------------- --- Win32 implementation +-- Implementation with the supplied auto-generated Unicode character properties +-- table (default) -#if (defined(HAVE_WCTYPE_H) && HAVE_ISWSPACE && defined(HTYPE_WINT_T)) || mingw32_HOST_OS +#if 1 --- Use the wide-char classification functions if available. Glibc --- seems to implement these properly, even for chars > 0xffff, as long --- as you call setlocale() to set the locale to something other than --- "C". Therefore, we call setlocale() in hs_init(). - --- Win32 uses UTF-16, so presumably the system-supplied iswlower() and --- friends won't work properly with characters > 0xffff. These --- characters are represented as surrogate pairs in UTF-16. +-- Regardless of the O/S and Library, use the functions contained in WCsubst.c type WInt = HTYPE_WINT_T -type CInt = HTYPE_INT isDigit c = iswdigit (fromIntegral (ord c)) /= 0 isAlpha c = iswalpha (fromIntegral (ord c)) /= 0 @@ -147,39 +237,46 @@ isLower c = iswlower (fromIntegral (ord c)) /= 0 toLower c = chr (fromIntegral (towlower (fromIntegral (ord c)))) toUpper c = chr (fromIntegral (towupper (fromIntegral (ord c)))) +toTitle c = chr (fromIntegral (towtitle (fromIntegral (ord c)))) + +foreign import ccall unsafe "u_iswdigit" + iswdigit :: CInt -> CInt + +foreign import ccall unsafe "u_iswalpha" + iswalpha :: CInt -> CInt -foreign import ccall unsafe "iswdigit" - iswdigit :: WInt -> CInt +foreign import ccall unsafe "u_iswalnum" + iswalnum :: CInt -> CInt -foreign import ccall unsafe "iswalpha" - iswalpha :: WInt -> CInt +foreign import ccall unsafe "u_iswcntrl" + iswcntrl :: CInt -> CInt -foreign import ccall unsafe "iswalnum" - iswalnum :: WInt -> CInt +foreign import ccall unsafe "u_iswspace" + iswspace :: CInt -> CInt -foreign import ccall unsafe "iswcntrl" - iswcntrl :: WInt -> CInt +foreign import ccall unsafe "u_iswprint" + iswprint :: CInt -> CInt -foreign import ccall unsafe "iswspace" - iswspace :: WInt -> CInt +foreign import ccall unsafe "u_iswlower" + iswlower :: CInt -> CInt -foreign import ccall unsafe "iswprint" - iswprint :: WInt -> CInt +foreign import ccall unsafe "u_iswupper" + iswupper :: CInt -> CInt -foreign import ccall unsafe "iswlower" - iswlower :: WInt -> CInt +foreign import ccall unsafe "u_towlower" + towlower :: CInt -> CInt -foreign import ccall unsafe "iswupper" - iswupper :: WInt -> CInt +foreign import ccall unsafe "u_towupper" + towupper :: CInt -> CInt -foreign import ccall unsafe "towlower" - towlower :: WInt -> WInt +foreign import ccall unsafe "u_towtitle" + towtitle :: CInt -> CInt -foreign import ccall unsafe "towupper" - towupper :: WInt -> WInt +foreign import ccall unsafe "u_gencat" + wgencat :: CInt -> Int -- ----------------------------------------------------------------------------- --- No libunicode, so fall back to the ASCII-only implementation +-- No libunicode, so fall back to the ASCII-only implementation (never used, indeed) #else |