Move GeneralCategory et al to GHC.Unicode

This allows these to be used from Text.Read.Lex import cycles. Reviewed By: thomie, austin Differential Revision: https://phabricator.haskell.org/D1121 GHC Trac Issues: #10444
author: Ben Gamari <bgamari.foss@gmail.com> 2015-09-08 08:38:40 -0500
committer: Austin Seipp <austin@well-typed.com> 2015-09-08 08:39:05 -0500
commit: e4a73f4fa1cc9681aee3ce13ee15073deed54635 (patch)
tree: 4ae445764e5c7615705238cafb8533de9c359f2b /libraries/base/GHC/Unicode.hs
parent: 8be43dd966c9c56e530eab266d6bf2710f9b07f4 (diff)
download: haskell-e4a73f4fa1cc9681aee3ce13ee15073deed54635.tar.gz
1 files changed, 214 insertions, 1 deletions
diff --git a/libraries/base/GHC/Unicode.hs b/libraries/base/GHC/Unicode.hs
index 627780586f..0e2ce4c0ef 100644
--- a/libraries/base/GHC/Unicode.hs
+++ b/libraries/base/GHC/Unicode.hs
@@ -1,5 +1,5 @@
 {-# LANGUAGE Trustworthy #-}
-{-# LANGUAGE CPP, NoImplicitPrelude #-}
+{-# LANGUAGE CPP, NoImplicitPrelude, StandaloneDeriving #-}
 {-# OPTIONS_HADDOCK hide #-}
 
 -----------------------------------------------------------------------------
@@ -19,11 +19,13 @@
 -----------------------------------------------------------------------------
 
 module GHC.Unicode (
+        GeneralCategory (..), generalCategory,
         isAscii, isLatin1, isControl,
         isAsciiUpper, isAsciiLower,
         isPrint, isSpace,  isUpper,
         isLower, isAlpha,  isDigit,
         isOctDigit, isHexDigit, isAlphaNum,
+        isPunctuation, isSymbol,
         toUpper, toLower, toTitle,
         wgencat
     ) where
@@ -31,10 +33,131 @@ module GHC.Unicode (
 import GHC.Base
 import GHC.Char        (chr)
 import GHC.Real
+import GHC.Enum ( Enum (..), Bounded (..) )
+import GHC.Arr ( Ix (..) )
 import GHC.Num
 
+-- Data.Char.chr already imports this and we need to define a Show instance
+-- for GeneralCategory
+import GHC.Show ( Show )
+
 #include "HsBaseConfig.h"
 
+-- | Unicode General Categories (column 2 of the UnicodeData table) in
+-- the order they are listed in the Unicode standard (the Unicode
+-- Character Database, in particular).
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> :t OtherLetter
+-- OtherLetter :: GeneralCategory
+--
+-- 'Eq' instance:
+--
+-- >>> UppercaseLetter == UppercaseLetter
+-- True
+-- >>> UppercaseLetter == LowercaseLetter
+-- False
+--
+-- 'Ord' instance:
+--
+-- >>> NonSpacingMark <= MathSymbol
+-- True
+--
+-- 'Enum' instance:
+--
+-- >>> enumFromTo ModifierLetter SpacingCombiningMark
+-- [ModifierLetter,OtherLetter,NonSpacingMark,SpacingCombiningMark]
+--
+-- 'Read' instance:
+--
+-- >>> read "DashPunctuation" :: GeneralCategory
+-- DashPunctuation
+-- >>> read "17" :: GeneralCategory
+-- *** Exception: Prelude.read: no parse
+--
+-- 'Show' instance:
+--
+-- >>> show EnclosingMark
+-- "EnclosingMark"
+--
+-- 'Bounded' instance:
+--
+-- >>> minBound :: GeneralCategory
+-- UppercaseLetter
+-- >>> maxBound :: GeneralCategory
+-- NotAssigned
+--
+-- 'Ix' instance:
+--
+--  >>> import Data.Ix ( index )
+--  >>> index (OtherLetter,Control) FinalQuote
+--  12
+--  >>> index (OtherLetter,Control) Format
+--  *** Exception: Error in array index
+--
+data GeneralCategory
+        = UppercaseLetter       -- ^ Lu: Letter, Uppercase
+        | LowercaseLetter       -- ^ Ll: Letter, Lowercase
+        | TitlecaseLetter       -- ^ Lt: Letter, Titlecase
+        | ModifierLetter        -- ^ Lm: Letter, Modifier
+        | OtherLetter           -- ^ Lo: Letter, Other
+        | NonSpacingMark        -- ^ Mn: Mark, Non-Spacing
+        | SpacingCombiningMark  -- ^ Mc: Mark, Spacing Combining
+        | EnclosingMark         -- ^ Me: Mark, Enclosing
+        | DecimalNumber         -- ^ Nd: Number, Decimal
+        | LetterNumber          -- ^ Nl: Number, Letter
+        | OtherNumber           -- ^ No: Number, Other
+        | ConnectorPunctuation  -- ^ Pc: Punctuation, Connector
+        | DashPunctuation       -- ^ Pd: Punctuation, Dash
+        | OpenPunctuation       -- ^ Ps: Punctuation, Open
+        | ClosePunctuation      -- ^ Pe: Punctuation, Close
+        | InitialQuote          -- ^ Pi: Punctuation, Initial quote
+        | FinalQuote            -- ^ Pf: Punctuation, Final quote
+        | OtherPunctuation      -- ^ Po: Punctuation, Other
+        | MathSymbol            -- ^ Sm: Symbol, Math
+        | CurrencySymbol        -- ^ Sc: Symbol, Currency
+        | ModifierSymbol        -- ^ Sk: Symbol, Modifier
+        | OtherSymbol           -- ^ So: Symbol, Other
+        | Space                 -- ^ Zs: Separator, Space
+        | LineSeparator         -- ^ Zl: Separator, Line
+        | ParagraphSeparator    -- ^ Zp: Separator, Paragraph
+        | Control               -- ^ Cc: Other, Control
+        | Format                -- ^ Cf: Other, Format
+        | Surrogate             -- ^ Cs: Other, Surrogate
+        | PrivateUse            -- ^ Co: Other, Private Use
+        | NotAssigned           -- ^ Cn: Other, Not Assigned
+        deriving (Show, Eq, Ord, Enum, Bounded, Ix)
+
+-- | The Unicode general category of the character. This relies on the
+-- 'Enum' instance of 'GeneralCategory', which must remain in the
+-- same order as the categories are presented in the Unicode
+-- standard.
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> generalCategory 'a'
+-- LowercaseLetter
+-- >>> generalCategory 'A'
+-- UppercaseLetter
+-- >>> generalCategory '0'
+-- DecimalNumber
+-- >>> generalCategory '%'
+-- OtherPunctuation
+-- >>> generalCategory '♥'
+-- OtherSymbol
+-- >>> generalCategory '\31'
+-- Control
+-- >>> generalCategory ' '
+-- Space
+--
+generalCategory :: Char -> GeneralCategory
+generalCategory c = toEnum $ fromIntegral $ wgencat $ fromIntegral $ ord c
+
 -- | Selects the first 128 characters of the Unicode character set,
 -- corresponding to the ASCII character set.
 isAscii                 :: Char -> Bool
@@ -118,6 +241,96 @@ isHexDigit c            =  isDigit c ||
                            (fromIntegral (ord c - ord 'A')::Word) <= 5 ||
                            (fromIntegral (ord c - ord 'a')::Word) <= 5
 
+-- | Selects Unicode punctuation characters, including various kinds
+-- of connectors, brackets and quotes.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'ConnectorPunctuation'
+-- * 'DashPunctuation'
+-- * 'OpenPunctuation'
+-- * 'ClosePunctuation'
+-- * 'InitialQuote'
+-- * 'FinalQuote'
+-- * 'OtherPunctuation'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Punctuation\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isPunctuation 'a'
+-- False
+-- >>> isPunctuation '7'
+-- False
+-- >>> isPunctuation '♥'
+-- False
+-- >>> isPunctuation '"'
+-- True
+-- >>> isPunctuation '?'
+-- True
+-- >>> isPunctuation '—'
+-- True
+--
+isPunctuation :: Char -> Bool
+isPunctuation c = case generalCategory c of
+        ConnectorPunctuation    -> True
+        DashPunctuation         -> True
+        OpenPunctuation         -> True
+        ClosePunctuation        -> True
+        InitialQuote            -> True
+        FinalQuote              -> True
+        OtherPunctuation        -> True
+        _                       -> False
+
+-- | Selects Unicode symbol characters, including mathematical and
+-- currency symbols.
+--
+-- This function returns 'True' if its argument has one of the
+-- following 'GeneralCategory's, or 'False' otherwise:
+--
+-- * 'MathSymbol'
+-- * 'CurrencySymbol'
+-- * 'ModifierSymbol'
+-- * 'OtherSymbol'
+--
+-- These classes are defined in the
+-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table Unicode Character Database>,
+-- part of the Unicode standard. The same document defines what is
+-- and is not a \"Symbol\".
+--
+-- ==== __Examples__
+--
+-- Basic usage:
+--
+-- >>> isSymbol 'a'
+-- False
+-- >>> isSymbol '6'
+-- False
+-- >>> isSymbol '='
+-- True
+--
+-- The definition of \"math symbol\" may be a little
+-- counter-intuitive depending on one's background:
+--
+-- >>> isSymbol '+'
+-- True
+-- >>> isSymbol '-'
+-- False
+--
+isSymbol :: Char -> Bool
+isSymbol c = case generalCategory c of
+        MathSymbol              -> True
+        CurrencySymbol          -> True
+        ModifierSymbol          -> True
+        OtherSymbol             -> True
+        _                       -> False
+
 -- | Convert a letter to the corresponding upper-case letter, if any.
 -- Any other character is returned unchanged.
 toUpper                 :: Char -> Char
author	Ben Gamari <bgamari.foss@gmail.com>	2015-09-08 08:38:40 -0500
committer	Austin Seipp <austin@well-typed.com>	2015-09-08 08:39:05 -0500
commit	e4a73f4fa1cc9681aee3ce13ee15073deed54635 (patch)
tree	4ae445764e5c7615705238cafb8533de9c359f2b /libraries/base/GHC/Unicode.hs
parent	8be43dd966c9c56e530eab266d6bf2710f9b07f4 (diff)
download	haskell-e4a73f4fa1cc9681aee3ce13ee15073deed54635.tar.gz