summaryrefslogtreecommitdiff
path: root/libraries/base/GHC/Unicode.hs
diff options
context:
space:
mode:
authorsimonmar <unknown>2005-03-14 12:18:08 +0000
committersimonmar <unknown>2005-03-14 12:18:08 +0000
commit2b179f023136bb22103856f74e44de281f31be82 (patch)
treed25c440f898312aad70dcfb49085f48ee84a43c9 /libraries/base/GHC/Unicode.hs
parent9cc3315f705cd70d6d1432377a8c2a5d9a81fef4 (diff)
downloadhaskell-2b179f023136bb22103856f74e44de281f31be82.tar.gz
[project @ 2005-03-14 12:18:05 by simonmar]
Add Dimitry Golubovsky <dimitry@golubovsky.org>'s Unicode character class implementation. This will remove the dependency on libc's locale code and give us much more consistent support for Unicode across platforms.
Diffstat (limited to 'libraries/base/GHC/Unicode.hs')
-rw-r--r--libraries/base/GHC/Unicode.hs187
1 files changed, 142 insertions, 45 deletions
diff --git a/libraries/base/GHC/Unicode.hs b/libraries/base/GHC/Unicode.hs
index 52d14f37c8..8aefad5f2e 100644
--- a/libraries/base/GHC/Unicode.hs
+++ b/libraries/base/GHC/Unicode.hs
@@ -1,4 +1,5 @@
-{-# OPTIONS_GHC -fno-implicit-prelude #-}
+{-# OPTIONS -fno-implicit-prelude #-}
+{-# OPTIONS -#include "WCsubst.h" #-}
-----------------------------------------------------------------------------
-- |
-- Module : GHC.Unicde
@@ -15,14 +16,21 @@
--
-----------------------------------------------------------------------------
--- #hide
module GHC.Unicode (
+ GeneralCategory (..),
+ generalCategory,
isAscii, isLatin1, isControl,
isAsciiUpper, isAsciiLower,
isPrint, isSpace, isUpper,
isLower, isAlpha, isDigit,
isOctDigit, isHexDigit, isAlphaNum,
- toUpper, toLower,
+ toUpper, toLower, toTitle,
+ isLetter, -- :: Char -> Bool
+ isMark, -- :: Char -> Bool
+ isNumber, -- :: Char -> Bool
+ isPunctuation, -- :: Char -> Bool
+ isSymbol, -- :: Char -> Bool
+ isSeparator, -- :: Char -> Bool
) where
import GHC.Base
@@ -30,9 +38,106 @@ import GHC.Real (fromIntegral)
import GHC.Int
import GHC.Word
import GHC.Num (fromInteger)
+import GHC.Read
+import GHC.Show
+import GHC.Enum
#include "HsBaseConfig.h"
+-- | Unicode General Categories (column 2 of the UnicodeData table)
+-- in the order they are listed in the Unicode standard.
+
+data GeneralCategory
+ = UppercaseLetter -- Lu Letter, Uppercase
+ | LowercaseLetter -- Ll Letter, Lowercase
+ | TitlecaseLetter -- Lt Letter, Titlecase
+ | ModifierLetter -- Lm Letter, Modifier
+ | OtherLetter -- Lo Letter, Other
+ | NonSpacingMark -- Mn Mark, Non-Spacing
+ | SpacingCombiningMark -- Mc Mark, Spacing Combining
+ | EnclosingMark -- Me Mark, Enclosing
+ | DecimalNumber -- Nd Number, Decimal
+ | LetterNumber -- Nl Number, Letter
+ | OtherNumber -- No Number, Other
+ | ConnectorPunctuation -- Pc Punctuation, Connector
+ | DashPunctuation -- Pd Punctuation, Dash
+ | OpenPunctuation -- Ps Punctuation, Open
+ | ClosePunctuation -- Pe Punctuation, Close
+ | InitialQuote -- Pi Punctuation, Initial quote
+ | FinalQuote -- Pf Punctuation, Final quote
+ | OtherPunctuation -- Po Punctuation, Other
+ | MathSymbol -- Sm Symbol, Math
+ | CurrencySymbol -- Sc Symbol, Currency
+ | ModifierSymbol -- Sk Symbol, Modifier
+ | OtherSymbol -- So Symbol, Other
+ | Space -- Zs Separator, Space
+ | LineSeparator -- Zl Separator, Line
+ | ParagraphSeparator -- Zp Separator, Paragraph
+ | Control -- Cc Other, Control
+ | Format -- Cf Other, Format
+ | Surrogate -- Cs Other, Surrogate
+ | PrivateUse -- Co Other, Private Use
+ | NotAssigned -- Cn Other, Not Assigned
+ deriving (Eq, Ord, Enum, Read, Show, Bounded)
+
+-- | Retrieves the general Unicode category of the character.
+generalCategory :: Char -> GeneralCategory
+generalCategory c = toEnum (wgencat (fromIntegral (ord c)))
+
+-- ------------------------------------------------------------------------
+-- These are copied from Hugs Unicode.hs
+
+-- derived character classifiers
+
+isLetter :: Char -> Bool
+isLetter c = case generalCategory c of
+ UppercaseLetter -> True
+ LowercaseLetter -> True
+ TitlecaseLetter -> True
+ ModifierLetter -> True
+ OtherLetter -> True
+ _ -> False
+
+isMark :: Char -> Bool
+isMark c = case generalCategory c of
+ NonSpacingMark -> True
+ SpacingCombiningMark -> True
+ EnclosingMark -> True
+ _ -> False
+
+isNumber :: Char -> Bool
+isNumber c = case generalCategory c of
+ DecimalNumber -> True
+ LetterNumber -> True
+ OtherNumber -> True
+ _ -> False
+
+isPunctuation :: Char -> Bool
+isPunctuation c = case generalCategory c of
+ ConnectorPunctuation -> True
+ DashPunctuation -> True
+ OpenPunctuation -> True
+ ClosePunctuation -> True
+ InitialQuote -> True
+ FinalQuote -> True
+ OtherPunctuation -> True
+ _ -> False
+
+isSymbol :: Char -> Bool
+isSymbol c = case generalCategory c of
+ MathSymbol -> True
+ CurrencySymbol -> True
+ ModifierSymbol -> True
+ OtherSymbol -> True
+ _ -> False
+
+isSeparator :: Char -> Bool
+isSeparator c = case generalCategory c of
+ Space -> True
+ LineSeparator -> True
+ ParagraphSeparator -> True
+ _ -> False
+
-- | Selects the first 128 characters of the Unicode character set,
-- corresponding to the ASCII character set.
isAscii :: Char -> Bool
@@ -67,7 +172,8 @@ isSpace c = c == ' ' ||
c == '\r' ||
c == '\f' ||
c == '\v' ||
- c == '\xa0'
+ c == '\xa0' ||
+ iswspace (fromIntegral (ord c)) /= 0
-- | Selects alphabetic Unicode characters (letters) that are not lower-case.
-- (In Unicode terms, this includes letters in upper and title cases,
@@ -78,15 +184,6 @@ isUpper :: Char -> Bool
isLower :: Char -> Bool
-- | Selects alphabetic Unicode characters (letters).
---
--- Note: the Haskell 98 definition of 'isAlpha' is:
---
--- > isAlpha c = isUpper c || isLower c
---
--- the implementation here diverges from the Haskell 98
--- definition in the sense that Unicode alphabetic characters which
--- are neither upper nor lower case will still be identified as
--- alphabetic by 'isAlpha'.
isAlpha :: Char -> Bool
-- | Selects alphabetic or numeric digit Unicode characters.
@@ -120,21 +217,14 @@ toUpper :: Char -> Char
toLower :: Char -> Char
-- -----------------------------------------------------------------------------
--- Win32 implementation
+-- Implementation with the supplied auto-generated Unicode character properties
+-- table (default)
-#if (defined(HAVE_WCTYPE_H) && HAVE_ISWSPACE && defined(HTYPE_WINT_T)) || mingw32_HOST_OS
+#if 1
--- Use the wide-char classification functions if available. Glibc
--- seems to implement these properly, even for chars > 0xffff, as long
--- as you call setlocale() to set the locale to something other than
--- "C". Therefore, we call setlocale() in hs_init().
-
--- Win32 uses UTF-16, so presumably the system-supplied iswlower() and
--- friends won't work properly with characters > 0xffff. These
--- characters are represented as surrogate pairs in UTF-16.
+-- Regardless of the O/S and Library, use the functions contained in WCsubst.c
type WInt = HTYPE_WINT_T
-type CInt = HTYPE_INT
isDigit c = iswdigit (fromIntegral (ord c)) /= 0
isAlpha c = iswalpha (fromIntegral (ord c)) /= 0
@@ -147,39 +237,46 @@ isLower c = iswlower (fromIntegral (ord c)) /= 0
toLower c = chr (fromIntegral (towlower (fromIntegral (ord c))))
toUpper c = chr (fromIntegral (towupper (fromIntegral (ord c))))
+toTitle c = chr (fromIntegral (towtitle (fromIntegral (ord c))))
+
+foreign import ccall unsafe "u_iswdigit"
+ iswdigit :: CInt -> CInt
+
+foreign import ccall unsafe "u_iswalpha"
+ iswalpha :: CInt -> CInt
-foreign import ccall unsafe "iswdigit"
- iswdigit :: WInt -> CInt
+foreign import ccall unsafe "u_iswalnum"
+ iswalnum :: CInt -> CInt
-foreign import ccall unsafe "iswalpha"
- iswalpha :: WInt -> CInt
+foreign import ccall unsafe "u_iswcntrl"
+ iswcntrl :: CInt -> CInt
-foreign import ccall unsafe "iswalnum"
- iswalnum :: WInt -> CInt
+foreign import ccall unsafe "u_iswspace"
+ iswspace :: CInt -> CInt
-foreign import ccall unsafe "iswcntrl"
- iswcntrl :: WInt -> CInt
+foreign import ccall unsafe "u_iswprint"
+ iswprint :: CInt -> CInt
-foreign import ccall unsafe "iswspace"
- iswspace :: WInt -> CInt
+foreign import ccall unsafe "u_iswlower"
+ iswlower :: CInt -> CInt
-foreign import ccall unsafe "iswprint"
- iswprint :: WInt -> CInt
+foreign import ccall unsafe "u_iswupper"
+ iswupper :: CInt -> CInt
-foreign import ccall unsafe "iswlower"
- iswlower :: WInt -> CInt
+foreign import ccall unsafe "u_towlower"
+ towlower :: CInt -> CInt
-foreign import ccall unsafe "iswupper"
- iswupper :: WInt -> CInt
+foreign import ccall unsafe "u_towupper"
+ towupper :: CInt -> CInt
-foreign import ccall unsafe "towlower"
- towlower :: WInt -> WInt
+foreign import ccall unsafe "u_towtitle"
+ towtitle :: CInt -> CInt
-foreign import ccall unsafe "towupper"
- towupper :: WInt -> WInt
+foreign import ccall unsafe "u_gencat"
+ wgencat :: CInt -> Int
-- -----------------------------------------------------------------------------
--- No libunicode, so fall back to the ASCII-only implementation
+-- No libunicode, so fall back to the ASCII-only implementation (never used, indeed)
#else