diff options
author | Ben Gamari <bgamari.foss@gmail.com> | 2015-09-08 08:38:40 -0500 |
---|---|---|
committer | Austin Seipp <austin@well-typed.com> | 2015-09-08 08:39:05 -0500 |
commit | e4a73f4fa1cc9681aee3ce13ee15073deed54635 (patch) | |
tree | 4ae445764e5c7615705238cafb8533de9c359f2b | |
parent | 8be43dd966c9c56e530eab266d6bf2710f9b07f4 (diff) | |
download | haskell-e4a73f4fa1cc9681aee3ce13ee15073deed54635.tar.gz |
Move GeneralCategory et al to GHC.Unicode
This allows these to be used from Text.Read.Lex import cycles.
Reviewed By: thomie, austin
Differential Revision: https://phabricator.haskell.org/D1121
GHC Trac Issues: #10444
-rw-r--r-- | libraries/base/Data/Char.hs | 209 | ||||
-rw-r--r-- | libraries/base/GHC/Read.hs | 4 | ||||
-rw-r--r-- | libraries/base/GHC/Show.hs | 2 | ||||
-rw-r--r-- | libraries/base/GHC/Unicode.hs | 215 |
4 files changed, 219 insertions, 211 deletions
diff --git a/libraries/base/Data/Char.hs b/libraries/base/Data/Char.hs index e4e7fbfcb8..c8dd9331c6 100644 --- a/libraries/base/Data/Char.hs +++ b/libraries/base/Data/Char.hs @@ -53,14 +53,12 @@ module Data.Char ) where import GHC.Base -import GHC.Arr (Ix) import GHC.Char import GHC.Real (fromIntegral) import GHC.Show -import GHC.Read (Read, readLitChar, lexLitChar) +import GHC.Read (readLitChar, lexLitChar) import GHC.Unicode import GHC.Num -import GHC.Enum -- $setup -- Allow the use of Prelude in doctests. @@ -105,121 +103,6 @@ digitToInt c hexl = ord c - ord 'a' hexu = ord c - ord 'A' --- | Unicode General Categories (column 2 of the UnicodeData table) in --- the order they are listed in the Unicode standard (the Unicode --- Character Database, in particular). --- --- ==== __Examples__ --- --- Basic usage: --- --- >>> :t OtherLetter --- OtherLetter :: GeneralCategory --- --- 'Eq' instance: --- --- >>> UppercaseLetter == UppercaseLetter --- True --- >>> UppercaseLetter == LowercaseLetter --- False --- --- 'Ord' instance: --- --- >>> NonSpacingMark <= MathSymbol --- True --- --- 'Enum' instance: --- --- >>> enumFromTo ModifierLetter SpacingCombiningMark --- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark] --- --- 'Read' instance: --- --- >>> read "DashPunctuation" :: GeneralCategory --- DashPunctuation --- >>> read "17" :: GeneralCategory --- *** Exception: Prelude.read: no parse --- --- 'Show' instance: --- --- >>> show EnclosingMark --- "EnclosingMark" --- --- 'Bounded' instance: --- --- >>> minBound :: GeneralCategory --- UppercaseLetter --- >>> maxBound :: GeneralCategory --- NotAssigned --- --- 'Ix' instance: --- --- >>> import Data.Ix ( index ) --- >>> index (OtherLetter,Control) FinalQuote --- 12 --- >>> index (OtherLetter,Control) Format --- *** Exception: Error in array index --- -data GeneralCategory - = UppercaseLetter -- ^ Lu: Letter, Uppercase - | LowercaseLetter -- ^ Ll: Letter, Lowercase - | TitlecaseLetter -- ^ Lt: Letter, Titlecase - | ModifierLetter -- ^ Lm: Letter, Modifier - | OtherLetter -- ^ Lo: Letter, Other - | NonSpacingMark -- ^ Mn: Mark, Non-Spacing - | SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining - | EnclosingMark -- ^ Me: Mark, Enclosing - | DecimalNumber -- ^ Nd: Number, Decimal - | LetterNumber -- ^ Nl: Number, Letter - | OtherNumber -- ^ No: Number, Other - | ConnectorPunctuation -- ^ Pc: Punctuation, Connector - | DashPunctuation -- ^ Pd: Punctuation, Dash - | OpenPunctuation -- ^ Ps: Punctuation, Open - | ClosePunctuation -- ^ Pe: Punctuation, Close - | InitialQuote -- ^ Pi: Punctuation, Initial quote - | FinalQuote -- ^ Pf: Punctuation, Final quote - | OtherPunctuation -- ^ Po: Punctuation, Other - | MathSymbol -- ^ Sm: Symbol, Math - | CurrencySymbol -- ^ Sc: Symbol, Currency - | ModifierSymbol -- ^ Sk: Symbol, Modifier - | OtherSymbol -- ^ So: Symbol, Other - | Space -- ^ Zs: Separator, Space - | LineSeparator -- ^ Zl: Separator, Line - | ParagraphSeparator -- ^ Zp: Separator, Paragraph - | Control -- ^ Cc: Other, Control - | Format -- ^ Cf: Other, Format - | Surrogate -- ^ Cs: Other, Surrogate - | PrivateUse -- ^ Co: Other, Private Use - | NotAssigned -- ^ Cn: Other, Not Assigned - deriving (Eq, Ord, Enum, Read, Show, Bounded, Ix) - --- | The Unicode general category of the character. This relies on the --- 'Enum' instance of 'GeneralCategory', which must remain in the --- same order as the categories are presented in the Unicode --- standard. --- --- ==== __Examples__ --- --- Basic usage: --- --- >>> generalCategory 'a' --- LowercaseLetter --- >>> generalCategory 'A' --- UppercaseLetter --- >>> generalCategory '0' --- DecimalNumber --- >>> generalCategory '%' --- OtherPunctuation --- >>> generalCategory '♥' --- OtherSymbol --- >>> generalCategory '\31' --- Control --- >>> generalCategory ' ' --- Space --- -generalCategory :: Char -> GeneralCategory -generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c - -- derived character classifiers -- | Selects alphabetic Unicode characters (lower-case, upper-case and @@ -360,96 +243,6 @@ isNumber c = case generalCategory c of OtherNumber -> True _ -> False --- | Selects Unicode punctuation characters, including various kinds --- of connectors, brackets and quotes. --- --- This function returns 'True' if its argument has one of the --- following 'GeneralCategory's, or 'False' otherwise: --- --- * 'ConnectorPunctuation' --- * 'DashPunctuation' --- * 'OpenPunctuation' --- * 'ClosePunctuation' --- * 'InitialQuote' --- * 'FinalQuote' --- * 'OtherPunctuation' --- --- These classes are defined in the --- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, --- part of the Unicode standard. The same document defines what is --- and is not a \"Punctuation\". --- --- ==== __Examples__ --- --- Basic usage: --- --- >>> isPunctuation 'a' --- False --- >>> isPunctuation '7' --- False --- >>> isPunctuation '♥' --- False --- >>> isPunctuation '"' --- True --- >>> isPunctuation '?' --- True --- >>> isPunctuation '—' --- True --- -isPunctuation :: Char -> Bool -isPunctuation c = case generalCategory c of - ConnectorPunctuation -> True - DashPunctuation -> True - OpenPunctuation -> True - ClosePunctuation -> True - InitialQuote -> True - FinalQuote -> True - OtherPunctuation -> True - _ -> False - --- | Selects Unicode symbol characters, including mathematical and --- currency symbols. --- --- This function returns 'True' if its argument has one of the --- following 'GeneralCategory's, or 'False' otherwise: --- --- * 'MathSymbol' --- * 'CurrencySymbol' --- * 'ModifierSymbol' --- * 'OtherSymbol' --- --- These classes are defined in the --- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, --- part of the Unicode standard. The same document defines what is --- and is not a \"Symbol\". --- --- ==== __Examples__ --- --- Basic usage: --- --- >>> isSymbol 'a' --- False --- >>> isSymbol '6' --- False --- >>> isSymbol '=' --- True --- --- The definition of \"math symbol\" may be a little --- counter-intuitive depending on one's background: --- --- >>> isSymbol '+' --- True --- >>> isSymbol '-' --- False --- -isSymbol :: Char -> Bool -isSymbol c = case generalCategory c of - MathSymbol -> True - CurrencySymbol -> True - ModifierSymbol -> True - OtherSymbol -> True - _ -> False - -- | Selects Unicode space and separator characters. -- -- This function returns 'True' if its argument has one of the diff --git a/libraries/base/GHC/Read.hs b/libraries/base/GHC/Read.hs index 298b068c7e..b4b88c0e9d 100644 --- a/libraries/base/GHC/Read.hs +++ b/libraries/base/GHC/Read.hs @@ -59,7 +59,7 @@ import Text.ParserCombinators.ReadPrec import Data.Maybe -import GHC.Unicode ( isDigit ) +import GHC.Unicode import GHC.Num import GHC.Real import GHC.Float @@ -312,6 +312,8 @@ choose sps = foldr ((+++) . try_one) pfail sps -- Simple instances of Read -------------------------------------------------------------- +deriving instance Read GeneralCategory + instance Read Char where readPrec = parens diff --git a/libraries/base/GHC/Show.hs b/libraries/base/GHC/Show.hs index 25d9b97b56..4aeecb15f3 100644 --- a/libraries/base/GHC/Show.hs +++ b/libraries/base/GHC/Show.hs @@ -50,8 +50,8 @@ module GHC.Show where import GHC.Base -import GHC.Num import GHC.List ((!!), foldr1, break) +import GHC.Num -- | The @shows@ functions return a function that prepends the -- output 'String' to an existing 'String'. This allows constant-time diff --git a/libraries/base/GHC/Unicode.hs b/libraries/base/GHC/Unicode.hs index 627780586f..0e2ce4c0ef 100644 --- a/libraries/base/GHC/Unicode.hs +++ b/libraries/base/GHC/Unicode.hs @@ -1,5 +1,5 @@ {-# LANGUAGE Trustworthy #-} -{-# LANGUAGE CPP, NoImplicitPrelude #-} +{-# LANGUAGE CPP, NoImplicitPrelude, StandaloneDeriving #-} {-# OPTIONS_HADDOCK hide #-} ----------------------------------------------------------------------------- @@ -19,11 +19,13 @@ ----------------------------------------------------------------------------- module GHC.Unicode ( + GeneralCategory (..), generalCategory, isAscii, isLatin1, isControl, isAsciiUpper, isAsciiLower, isPrint, isSpace, isUpper, isLower, isAlpha, isDigit, isOctDigit, isHexDigit, isAlphaNum, + isPunctuation, isSymbol, toUpper, toLower, toTitle, wgencat ) where @@ -31,10 +33,131 @@ module GHC.Unicode ( import GHC.Base import GHC.Char (chr) import GHC.Real +import GHC.Enum ( Enum (..), Bounded (..) ) +import GHC.Arr ( Ix (..) ) import GHC.Num +-- Data.Char.chr already imports this and we need to define a Show instance +-- for GeneralCategory +import GHC.Show ( Show ) + #include "HsBaseConfig.h" +-- | Unicode General Categories (column 2 of the UnicodeData table) in +-- the order they are listed in the Unicode standard (the Unicode +-- Character Database, in particular). +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> :t OtherLetter +-- OtherLetter :: GeneralCategory +-- +-- 'Eq' instance: +-- +-- >>> UppercaseLetter == UppercaseLetter +-- True +-- >>> UppercaseLetter == LowercaseLetter +-- False +-- +-- 'Ord' instance: +-- +-- >>> NonSpacingMark <= MathSymbol +-- True +-- +-- 'Enum' instance: +-- +-- >>> enumFromTo ModifierLetter SpacingCombiningMark +-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark] +-- +-- 'Read' instance: +-- +-- >>> read "DashPunctuation" :: GeneralCategory +-- DashPunctuation +-- >>> read "17" :: GeneralCategory +-- *** Exception: Prelude.read: no parse +-- +-- 'Show' instance: +-- +-- >>> show EnclosingMark +-- "EnclosingMark" +-- +-- 'Bounded' instance: +-- +-- >>> minBound :: GeneralCategory +-- UppercaseLetter +-- >>> maxBound :: GeneralCategory +-- NotAssigned +-- +-- 'Ix' instance: +-- +-- >>> import Data.Ix ( index ) +-- >>> index (OtherLetter,Control) FinalQuote +-- 12 +-- >>> index (OtherLetter,Control) Format +-- *** Exception: Error in array index +-- +data GeneralCategory + = UppercaseLetter -- ^ Lu: Letter, Uppercase + | LowercaseLetter -- ^ Ll: Letter, Lowercase + | TitlecaseLetter -- ^ Lt: Letter, Titlecase + | ModifierLetter -- ^ Lm: Letter, Modifier + | OtherLetter -- ^ Lo: Letter, Other + | NonSpacingMark -- ^ Mn: Mark, Non-Spacing + | SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining + | EnclosingMark -- ^ Me: Mark, Enclosing + | DecimalNumber -- ^ Nd: Number, Decimal + | LetterNumber -- ^ Nl: Number, Letter + | OtherNumber -- ^ No: Number, Other + | ConnectorPunctuation -- ^ Pc: Punctuation, Connector + | DashPunctuation -- ^ Pd: Punctuation, Dash + | OpenPunctuation -- ^ Ps: Punctuation, Open + | ClosePunctuation -- ^ Pe: Punctuation, Close + | InitialQuote -- ^ Pi: Punctuation, Initial quote + | FinalQuote -- ^ Pf: Punctuation, Final quote + | OtherPunctuation -- ^ Po: Punctuation, Other + | MathSymbol -- ^ Sm: Symbol, Math + | CurrencySymbol -- ^ Sc: Symbol, Currency + | ModifierSymbol -- ^ Sk: Symbol, Modifier + | OtherSymbol -- ^ So: Symbol, Other + | Space -- ^ Zs: Separator, Space + | LineSeparator -- ^ Zl: Separator, Line + | ParagraphSeparator -- ^ Zp: Separator, Paragraph + | Control -- ^ Cc: Other, Control + | Format -- ^ Cf: Other, Format + | Surrogate -- ^ Cs: Other, Surrogate + | PrivateUse -- ^ Co: Other, Private Use + | NotAssigned -- ^ Cn: Other, Not Assigned + deriving (Show, Eq, Ord, Enum, Bounded, Ix) + +-- | The Unicode general category of the character. This relies on the +-- 'Enum' instance of 'GeneralCategory', which must remain in the +-- same order as the categories are presented in the Unicode +-- standard. +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> generalCategory 'a' +-- LowercaseLetter +-- >>> generalCategory 'A' +-- UppercaseLetter +-- >>> generalCategory '0' +-- DecimalNumber +-- >>> generalCategory '%' +-- OtherPunctuation +-- >>> generalCategory '♥' +-- OtherSymbol +-- >>> generalCategory '\31' +-- Control +-- >>> generalCategory ' ' +-- Space +-- +generalCategory :: Char -> GeneralCategory +generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c + -- | Selects the first 128 characters of the Unicode character set, -- corresponding to the ASCII character set. isAscii :: Char -> Bool @@ -118,6 +241,96 @@ isHexDigit c = isDigit c || (fromIntegral (ord c - ord 'A')::Word) <= 5 || (fromIntegral (ord c - ord 'a')::Word) <= 5 +-- | Selects Unicode punctuation characters, including various kinds +-- of connectors, brackets and quotes. +-- +-- This function returns 'True' if its argument has one of the +-- following 'GeneralCategory's, or 'False' otherwise: +-- +-- * 'ConnectorPunctuation' +-- * 'DashPunctuation' +-- * 'OpenPunctuation' +-- * 'ClosePunctuation' +-- * 'InitialQuote' +-- * 'FinalQuote' +-- * 'OtherPunctuation' +-- +-- These classes are defined in the +-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, +-- part of the Unicode standard. The same document defines what is +-- and is not a \"Punctuation\". +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> isPunctuation 'a' +-- False +-- >>> isPunctuation '7' +-- False +-- >>> isPunctuation '♥' +-- False +-- >>> isPunctuation '"' +-- True +-- >>> isPunctuation '?' +-- True +-- >>> isPunctuation '—' +-- True +-- +isPunctuation :: Char -> Bool +isPunctuation c = case generalCategory c of + ConnectorPunctuation -> True + DashPunctuation -> True + OpenPunctuation -> True + ClosePunctuation -> True + InitialQuote -> True + FinalQuote -> True + OtherPunctuation -> True + _ -> False + +-- | Selects Unicode symbol characters, including mathematical and +-- currency symbols. +-- +-- This function returns 'True' if its argument has one of the +-- following 'GeneralCategory's, or 'False' otherwise: +-- +-- * 'MathSymbol' +-- * 'CurrencySymbol' +-- * 'ModifierSymbol' +-- * 'OtherSymbol' +-- +-- These classes are defined in the +-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>, +-- part of the Unicode standard. The same document defines what is +-- and is not a \"Symbol\". +-- +-- ==== __Examples__ +-- +-- Basic usage: +-- +-- >>> isSymbol 'a' +-- False +-- >>> isSymbol '6' +-- False +-- >>> isSymbol '=' +-- True +-- +-- The definition of \"math symbol\" may be a little +-- counter-intuitive depending on one's background: +-- +-- >>> isSymbol '+' +-- True +-- >>> isSymbol '-' +-- False +-- +isSymbol :: Char -> Bool +isSymbol c = case generalCategory c of + MathSymbol -> True + CurrencySymbol -> True + ModifierSymbol -> True + OtherSymbol -> True + _ -> False + -- | Convert a letter to the corresponding upper-case letter, if any. -- Any other character is returned unchanged. toUpper :: Char -> Char |