summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Gamari <bgamari.foss@gmail.com>2015-09-08 08:38:40 -0500
committerAustin Seipp <austin@well-typed.com>2015-09-08 08:39:05 -0500
commite4a73f4fa1cc9681aee3ce13ee15073deed54635 (patch)
tree4ae445764e5c7615705238cafb8533de9c359f2b
parent8be43dd966c9c56e530eab266d6bf2710f9b07f4 (diff)
downloadhaskell-e4a73f4fa1cc9681aee3ce13ee15073deed54635.tar.gz
Move GeneralCategory et al to GHC.Unicode
This allows these to be used from Text.Read.Lex import cycles. Reviewed By: thomie, austin Differential Revision: https://phabricator.haskell.org/D1121 GHC Trac Issues: #10444
-rw-r--r--libraries/base/Data/Char.hs209
-rw-r--r--libraries/base/GHC/Read.hs4
-rw-r--r--libraries/base/GHC/Show.hs2
-rw-r--r--libraries/base/GHC/Unicode.hs215
4 files changed, 219 insertions, 211 deletions
diff --git a/libraries/base/Data/Char.hs b/libraries/base/Data/Char.hs
index e4e7fbfcb8..c8dd9331c6 100644
--- a/libraries/base/Data/Char.hs
+++ b/libraries/base/Data/Char.hs
@@ -53,14 +53,12 @@ module Data.Char
) where
import GHC.Base
-import GHC.Arr (Ix)
import GHC.Char
import GHC.Real (fromIntegral)
import GHC.Show
-import GHC.Read (Read, readLitChar, lexLitChar)
+import GHC.Read (readLitChar, lexLitChar)
import GHC.Unicode
import GHC.Num
-import GHC.Enum
-- $setup
-- Allow the use of Prelude in doctests.
@@ -105,121 +103,6 @@ digitToInt c
hexl = ord c - ord 'a'
hexu = ord c - ord 'A'
--- | Unicode General Categories (column 2 of the UnicodeData table) in
--- the order they are listed in the Unicode standard (the Unicode
--- Character Database, in particular).
---
--- ==== __Examples__
---
--- Basic usage:
---
--- >>> :t OtherLetter
--- OtherLetter :: GeneralCategory
---
--- 'Eq' instance:
---
--- >>> UppercaseLetter == UppercaseLetter
--- True
--- >>> UppercaseLetter == LowercaseLetter
--- False
---
--- 'Ord' instance:
---
--- >>> NonSpacingMark <= MathSymbol
--- True
---
--- 'Enum' instance:
---
--- >>> enumFromTo ModifierLetter SpacingCombiningMark
--- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
---
--- 'Read' instance:
---
--- >>> read "DashPunctuation" :: GeneralCategory
--- DashPunctuation
--- >>> read "17" :: GeneralCategory
--- *** Exception: Prelude.read: no parse
---
--- 'Show' instance:
---
--- >>> show EnclosingMark
--- "EnclosingMark"
---
--- 'Bounded' instance:
---
--- >>> minBound :: GeneralCategory
--- UppercaseLetter
--- >>> maxBound :: GeneralCategory
--- NotAssigned
---
--- 'Ix' instance:
---
--- >>> import Data.Ix ( index )
--- >>> index (OtherLetter,Control) FinalQuote
--- 12
--- >>> index (OtherLetter,Control) Format
--- *** Exception: Error in array index
---
-data GeneralCategory
- = UppercaseLetter -- ^ Lu: Letter, Uppercase
- | LowercaseLetter -- ^ Ll: Letter, Lowercase
- | TitlecaseLetter -- ^ Lt: Letter, Titlecase
- | ModifierLetter -- ^ Lm: Letter, Modifier
- | OtherLetter -- ^ Lo: Letter, Other
- | NonSpacingMark -- ^ Mn: Mark, Non-Spacing
- | SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining
- | EnclosingMark -- ^ Me: Mark, Enclosing
- | DecimalNumber -- ^ Nd: Number, Decimal
- | LetterNumber -- ^ Nl: Number, Letter
- | OtherNumber -- ^ No: Number, Other
- | ConnectorPunctuation -- ^ Pc: Punctuation, Connector
- | DashPunctuation -- ^ Pd: Punctuation, Dash
- | OpenPunctuation -- ^ Ps: Punctuation, Open
- | ClosePunctuation -- ^ Pe: Punctuation, Close
- | InitialQuote -- ^ Pi: Punctuation, Initial quote
- | FinalQuote -- ^ Pf: Punctuation, Final quote
- | OtherPunctuation -- ^ Po: Punctuation, Other
- | MathSymbol -- ^ Sm: Symbol, Math
- | CurrencySymbol -- ^ Sc: Symbol, Currency
- | ModifierSymbol -- ^ Sk: Symbol, Modifier
- | OtherSymbol -- ^ So: Symbol, Other
- | Space -- ^ Zs: Separator, Space
- | LineSeparator -- ^ Zl: Separator, Line
- | ParagraphSeparator -- ^ Zp: Separator, Paragraph
- | Control -- ^ Cc: Other, Control
- | Format -- ^ Cf: Other, Format
- | Surrogate -- ^ Cs: Other, Surrogate
- | PrivateUse -- ^ Co: Other, Private Use
- | NotAssigned -- ^ Cn: Other, Not Assigned
- deriving (Eq, Ord, Enum, Read, Show, Bounded, Ix)
-
--- | The Unicode general category of the character. This relies on the
--- 'Enum' instance of 'GeneralCategory', which must remain in the
--- same order as the categories are presented in the Unicode
--- standard.
---
--- ==== __Examples__
---
--- Basic usage:
---
--- >>> generalCategory 'a'
--- LowercaseLetter
--- >>> generalCategory 'A'
--- UppercaseLetter
--- >>> generalCategory '0'
--- DecimalNumber
--- >>> generalCategory '%'
--- OtherPunctuation
--- >>> generalCategory '♥'
--- OtherSymbol
--- >>> generalCategory '\31'
--- Control
--- >>> generalCategory ' '
--- Space
---
-generalCategory :: Char -> GeneralCategory
-generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c
-
-- derived character classifiers
-- | Selects alphabetic Unicode characters (lower-case, upper-case and
@@ -360,96 +243,6 @@ isNumber c = case generalCategory c of
OtherNumber -> True
_ -> False
--- | Selects Unicode punctuation characters, including various kinds
--- of connectors, brackets and quotes.
---
--- This function returns 'True' if its argument has one of the
--- following 'GeneralCategory's, or 'False' otherwise:
---
--- * 'ConnectorPunctuation'
--- * 'DashPunctuation'
--- * 'OpenPunctuation'
--- * 'ClosePunctuation'
--- * 'InitialQuote'
--- * 'FinalQuote'
--- * 'OtherPunctuation'
---
--- These classes are defined in the
--- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
--- part of the Unicode standard. The same document defines what is
--- and is not a \"Punctuation\".
---
--- ==== __Examples__
---
--- Basic usage:
---
--- >>> isPunctuation 'a'
--- False
--- >>> isPunctuation '7'
--- False
--- >>> isPunctuation '♥'
--- False
--- >>> isPunctuation '"'
--- True
--- >>> isPunctuation '?'
--- True
--- >>> isPunctuation '—'
--- True
---
-isPunctuation :: Char -> Bool
-isPunctuation c = case generalCategory c of
- ConnectorPunctuation -> True
- DashPunctuation -> True
- OpenPunctuation -> True
- ClosePunctuation -> True
- InitialQuote -> True
- FinalQuote -> True
- OtherPunctuation -> True
- _ -> False
-
--- | Selects Unicode symbol characters, including mathematical and
--- currency symbols.
---
--- This function returns 'True' if its argument has one of the
--- following 'GeneralCategory's, or 'False' otherwise:
---
--- * 'MathSymbol'
--- * 'CurrencySymbol'
--- * 'ModifierSymbol'
--- * 'OtherSymbol'
---
--- These classes are defined in the
--- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
--- part of the Unicode standard. The same document defines what is
--- and is not a \"Symbol\".
---
--- ==== __Examples__
---
--- Basic usage:
---
--- >>> isSymbol 'a'
--- False
--- >>> isSymbol '6'
--- False
--- >>> isSymbol '='
--- True
---
--- The definition of \"math symbol\" may be a little
--- counter-intuitive depending on one's background:
---
--- >>> isSymbol '+'
--- True
--- >>> isSymbol '-'
--- False
---
-isSymbol :: Char -> Bool
-isSymbol c = case generalCategory c of
- MathSymbol -> True
- CurrencySymbol -> True
- ModifierSymbol -> True
- OtherSymbol -> True
- _ -> False
-
-- | Selects Unicode space and separator characters.
--
-- This function returns 'True' if its argument has one of the
diff --git a/libraries/base/GHC/Read.hs b/libraries/base/GHC/Read.hs
index 298b068c7e..b4b88c0e9d 100644
--- a/libraries/base/GHC/Read.hs
+++ b/libraries/base/GHC/Read.hs
@@ -59,7 +59,7 @@ import Text.ParserCombinators.ReadPrec
import Data.Maybe
-import GHC.Unicode ( isDigit )
+import GHC.Unicode
import GHC.Num
import GHC.Real
import GHC.Float
@@ -312,6 +312,8 @@ choose sps = foldr ((+++) . try_one) pfail sps
-- Simple instances of Read
--------------------------------------------------------------
+deriving instance Read GeneralCategory
+
instance Read Char where
readPrec =
parens
diff --git a/libraries/base/GHC/Show.hs b/libraries/base/GHC/Show.hs
index 25d9b97b56..4aeecb15f3 100644
--- a/libraries/base/GHC/Show.hs
+++ b/libraries/base/GHC/Show.hs
@@ -50,8 +50,8 @@ module GHC.Show
where
import GHC.Base
-import GHC.Num
import GHC.List ((!!), foldr1, break)
+import GHC.Num
-- | The @shows@ functions return a function that prepends the
-- output 'String' to an existing 'String'. This allows constant-time
diff --git a/libraries/base/GHC/Unicode.hs b/libraries/base/GHC/Unicode.hs
index 627780586f..0e2ce4c0ef 100644
--- a/libraries/base/GHC/Unicode.hs
+++ b/libraries/base/GHC/Unicode.hs
@@ -1,5 +1,5 @@
{-# LANGUAGE Trustworthy #-}
-{-# LANGUAGE CPP, NoImplicitPrelude #-}
+{-# LANGUAGE CPP, NoImplicitPrelude, StandaloneDeriving #-}
{-# OPTIONS_HADDOCK hide #-}
-----------------------------------------------------------------------------
@@ -19,11 +19,13 @@
-----------------------------------------------------------------------------
module GHC.Unicode (
+ GeneralCategory (..), generalCategory,
isAscii, isLatin1, isControl,
isAsciiUpper, isAsciiLower,
isPrint, isSpace, isUpper,
isLower, isAlpha, isDigit,
isOctDigit, isHexDigit, isAlphaNum,
+ isPunctuation, isSymbol,
toUpper, toLower, toTitle,
wgencat
) where
@@ -31,10 +33,131 @@ module GHC.Unicode (
import GHC.Base
import GHC.Char (chr)
import GHC.Real
+import GHC.Enum ( Enum (..), Bounded (..) )
+import GHC.Arr ( Ix (..) )
import GHC.Num
+-- Data.Char.chr already imports this and we need to define a Show instance
+-- for GeneralCategory
+import GHC.Show ( Show )
+
#include "HsBaseConfig.h"
+-- | Unicode General Categories (column 2 of the UnicodeData table) in
+-- the order they are listed in the Unicode standard (the Unicode
+-- Character Database, in particular).
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> :t OtherLetter
+-- OtherLetter :: GeneralCategory
+--
+-- 'Eq' instance:
+--
+-- >>> UppercaseLetter == UppercaseLetter
+-- True
+-- >>> UppercaseLetter == LowercaseLetter
+-- False
+--
+-- 'Ord' instance:
+--
+-- >>> NonSpacingMark <= MathSymbol
+-- True
+--
+-- 'Enum' instance:
+--
+-- >>> enumFromTo ModifierLetter SpacingCombiningMark
+-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
+--
+-- 'Read' instance:
+--
+-- >>> read "DashPunctuation" :: GeneralCategory
+-- DashPunctuation
+-- >>> read "17" :: GeneralCategory
+-- *** Exception: Prelude.read: no parse
+--
+-- 'Show' instance:
+--
+-- >>> show EnclosingMark
+-- "EnclosingMark"
+--
+-- 'Bounded' instance:
+--
+-- >>> minBound :: GeneralCategory
+-- UppercaseLetter
+-- >>> maxBound :: GeneralCategory
+-- NotAssigned
+--
+-- 'Ix' instance:
+--
+-- >>> import Data.Ix ( index )
+-- >>> index (OtherLetter,Control) FinalQuote
+-- 12
+-- >>> index (OtherLetter,Control) Format
+-- *** Exception: Error in array index
+--
+data GeneralCategory
+ = UppercaseLetter -- ^ Lu: Letter, Uppercase
+ | LowercaseLetter -- ^ Ll: Letter, Lowercase
+ | TitlecaseLetter -- ^ Lt: Letter, Titlecase
+ | ModifierLetter -- ^ Lm: Letter, Modifier
+ | OtherLetter -- ^ Lo: Letter, Other
+ | NonSpacingMark -- ^ Mn: Mark, Non-Spacing
+ | SpacingCombiningMark -- ^ Mc: Mark, Spacing Combining
+ | EnclosingMark -- ^ Me: Mark, Enclosing
+ | DecimalNumber -- ^ Nd: Number, Decimal
+ | LetterNumber -- ^ Nl: Number, Letter
+ | OtherNumber -- ^ No: Number, Other
+ | ConnectorPunctuation -- ^ Pc: Punctuation, Connector
+ | DashPunctuation -- ^ Pd: Punctuation, Dash
+ | OpenPunctuation -- ^ Ps: Punctuation, Open
+ | ClosePunctuation -- ^ Pe: Punctuation, Close
+ | InitialQuote -- ^ Pi: Punctuation, Initial quote
+ | FinalQuote -- ^ Pf: Punctuation, Final quote
+ | OtherPunctuation -- ^ Po: Punctuation, Other
+ | MathSymbol -- ^ Sm: Symbol, Math
+ | CurrencySymbol -- ^ Sc: Symbol, Currency
+ | ModifierSymbol -- ^ Sk: Symbol, Modifier
+ | OtherSymbol -- ^ So: Symbol, Other
+ | Space -- ^ Zs: Separator, Space
+ | LineSeparator -- ^ Zl: Separator, Line
+ | ParagraphSeparator -- ^ Zp: Separator, Paragraph
+ | Control -- ^ Cc: Other, Control
+ | Format -- ^ Cf: Other, Format
+ | Surrogate -- ^ Cs: Other, Surrogate
+ | PrivateUse -- ^ Co: Other, Private Use
+ | NotAssigned -- ^ Cn: Other, Not Assigned
+ deriving (Show, Eq, Ord, Enum, Bounded, Ix)
+
+-- | The Unicode general category of the character. This relies on the
+-- 'Enum' instance of 'GeneralCategory', which must remain in the
+-- same order as the categories are presented in the Unicode
+-- standard.
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> generalCategory 'a'
+-- LowercaseLetter
+-- >>> generalCategory 'A'
+-- UppercaseLetter
+-- >>> generalCategory '0'
+-- DecimalNumber
+-- >>> generalCategory '%'
+-- OtherPunctuation
+-- >>> generalCategory '♥'
+-- OtherSymbol
+-- >>> generalCategory '\31'
+-- Control
+-- >>> generalCategory ' '
+-- Space
+--
+generalCategory :: Char -> GeneralCategory
+generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c
+
-- | Selects the first 128 characters of the Unicode character set,
-- corresponding to the ASCII character set.
isAscii :: Char -> Bool
@@ -118,6 +241,96 @@ isHexDigit c = isDigit c ||
(fromIntegral (ord c - ord 'A')::Word) <= 5 ||
(fromIntegral (ord c - ord 'a')::Word) <= 5
+-- | Selects Unicode punctuation characters, including various kinds
+-- of connectors, brackets and quotes.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'ConnectorPunctuation'
+-- * 'DashPunctuation'
+-- * 'OpenPunctuation'
+-- * 'ClosePunctuation'
+-- * 'InitialQuote'
+-- * 'FinalQuote'
+-- * 'OtherPunctuation'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Punctuation\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isPunctuation 'a'
+-- False
+-- >>> isPunctuation '7'
+-- False
+-- >>> isPunctuation '♥'
+-- False
+-- >>> isPunctuation '"'
+-- True
+-- >>> isPunctuation '?'
+-- True
+-- >>> isPunctuation '—'
+-- True
+--
+isPunctuation :: Char -> Bool
+isPunctuation c = case generalCategory c of
+ ConnectorPunctuation -> True
+ DashPunctuation -> True
+ OpenPunctuation -> True
+ ClosePunctuation -> True
+ InitialQuote -> True
+ FinalQuote -> True
+ OtherPunctuation -> True
+ _ -> False
+
+-- | Selects Unicode symbol characters, including mathematical and
+-- currency symbols.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'MathSymbol'
+-- * 'CurrencySymbol'
+-- * 'ModifierSymbol'
+-- * 'OtherSymbol'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Symbol\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isSymbol 'a'
+-- False
+-- >>> isSymbol '6'
+-- False
+-- >>> isSymbol '='
+-- True
+--
+-- The definition of \"math symbol\" may be a little
+-- counter-intuitive depending on one's background:
+--
+-- >>> isSymbol '+'
+-- True
+-- >>> isSymbol '-'
+-- False
+--
+isSymbol :: Char -> Bool
+isSymbol c = case generalCategory c of
+ MathSymbol -> True
+ CurrencySymbol -> True
+ ModifierSymbol -> True
+ OtherSymbol -> True
+ _ -> False
+
-- | Convert a letter to the corresponding upper-case letter, if any.
-- Any other character is returned unchanged.
toUpper :: Char -> Char