summaryrefslogtreecommitdiff
path: root/compiler/GHC/Parser/CharClass.hs
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/GHC/Parser/CharClass.hs')
-rw-r--r--compiler/GHC/Parser/CharClass.hs215
1 files changed, 215 insertions, 0 deletions
diff --git a/compiler/GHC/Parser/CharClass.hs b/compiler/GHC/Parser/CharClass.hs
new file mode 100644
index 0000000000..dc98d48f94
--- /dev/null
+++ b/compiler/GHC/Parser/CharClass.hs
@@ -0,0 +1,215 @@
+-- Character classification
+{-# LANGUAGE CPP #-}
+module GHC.Parser.CharClass
+ ( is_ident -- Char# -> Bool
+ , is_symbol -- Char# -> Bool
+ , is_any -- Char# -> Bool
+ , is_space -- Char# -> Bool
+ , is_lower -- Char# -> Bool
+ , is_upper -- Char# -> Bool
+ , is_digit -- Char# -> Bool
+ , is_alphanum -- Char# -> Bool
+
+ , is_decdigit, is_hexdigit, is_octdigit, is_bindigit
+ , hexDigit, octDecDigit
+ ) where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import Data.Bits ( Bits((.&.),(.|.)) )
+import Data.Char ( ord, chr )
+import Data.Word
+import Panic
+
+-- Bit masks
+
+cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Word8
+cIdent = 1
+cSymbol = 2
+cAny = 4
+cSpace = 8
+cLower = 16
+cUpper = 32
+cDigit = 64
+
+-- | The predicates below look costly, but aren't, GHC+GCC do a great job
+-- at the big case below.
+
+{-# INLINABLE is_ctype #-}
+is_ctype :: Word8 -> Char -> Bool
+is_ctype mask c = (charType c .&. mask) /= 0
+
+is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit,
+ is_alphanum :: Char -> Bool
+is_ident = is_ctype cIdent
+is_symbol = is_ctype cSymbol
+is_any = is_ctype cAny
+is_space = is_ctype cSpace
+is_lower = is_ctype cLower
+is_upper = is_ctype cUpper
+is_digit = is_ctype cDigit
+is_alphanum = is_ctype (cLower+cUpper+cDigit)
+
+-- Utils
+
+hexDigit :: Char -> Int
+hexDigit c | is_decdigit c = ord c - ord '0'
+ | otherwise = ord (to_lower c) - ord 'a' + 10
+
+octDecDigit :: Char -> Int
+octDecDigit c = ord c - ord '0'
+
+is_decdigit :: Char -> Bool
+is_decdigit c
+ = c >= '0' && c <= '9'
+
+is_hexdigit :: Char -> Bool
+is_hexdigit c
+ = is_decdigit c
+ || (c >= 'a' && c <= 'f')
+ || (c >= 'A' && c <= 'F')
+
+is_octdigit :: Char -> Bool
+is_octdigit c = c >= '0' && c <= '7'
+
+is_bindigit :: Char -> Bool
+is_bindigit c = c == '0' || c == '1'
+
+to_lower :: Char -> Char
+to_lower c
+ | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a'))
+ | otherwise = c
+
+charType :: Char -> Word8
+charType c = case c of
+ '\0' -> 0 -- \000
+ '\1' -> 0 -- \001
+ '\2' -> 0 -- \002
+ '\3' -> 0 -- \003
+ '\4' -> 0 -- \004
+ '\5' -> 0 -- \005
+ '\6' -> 0 -- \006
+ '\7' -> 0 -- \007
+ '\8' -> 0 -- \010
+ '\9' -> cSpace -- \t (not allowed in strings, so !cAny)
+ '\10' -> cSpace -- \n (ditto)
+ '\11' -> cSpace -- \v (ditto)
+ '\12' -> cSpace -- \f (ditto)
+ '\13' -> cSpace -- ^M (ditto)
+ '\14' -> 0 -- \016
+ '\15' -> 0 -- \017
+ '\16' -> 0 -- \020
+ '\17' -> 0 -- \021
+ '\18' -> 0 -- \022
+ '\19' -> 0 -- \023
+ '\20' -> 0 -- \024
+ '\21' -> 0 -- \025
+ '\22' -> 0 -- \026
+ '\23' -> 0 -- \027
+ '\24' -> 0 -- \030
+ '\25' -> 0 -- \031
+ '\26' -> 0 -- \032
+ '\27' -> 0 -- \033
+ '\28' -> 0 -- \034
+ '\29' -> 0 -- \035
+ '\30' -> 0 -- \036
+ '\31' -> 0 -- \037
+ '\32' -> cAny .|. cSpace --
+ '\33' -> cAny .|. cSymbol -- !
+ '\34' -> cAny -- "
+ '\35' -> cAny .|. cSymbol -- #
+ '\36' -> cAny .|. cSymbol -- $
+ '\37' -> cAny .|. cSymbol -- %
+ '\38' -> cAny .|. cSymbol -- &
+ '\39' -> cAny .|. cIdent -- '
+ '\40' -> cAny -- (
+ '\41' -> cAny -- )
+ '\42' -> cAny .|. cSymbol -- *
+ '\43' -> cAny .|. cSymbol -- +
+ '\44' -> cAny -- ,
+ '\45' -> cAny .|. cSymbol -- -
+ '\46' -> cAny .|. cSymbol -- .
+ '\47' -> cAny .|. cSymbol -- /
+ '\48' -> cAny .|. cIdent .|. cDigit -- 0
+ '\49' -> cAny .|. cIdent .|. cDigit -- 1
+ '\50' -> cAny .|. cIdent .|. cDigit -- 2
+ '\51' -> cAny .|. cIdent .|. cDigit -- 3
+ '\52' -> cAny .|. cIdent .|. cDigit -- 4
+ '\53' -> cAny .|. cIdent .|. cDigit -- 5
+ '\54' -> cAny .|. cIdent .|. cDigit -- 6
+ '\55' -> cAny .|. cIdent .|. cDigit -- 7
+ '\56' -> cAny .|. cIdent .|. cDigit -- 8
+ '\57' -> cAny .|. cIdent .|. cDigit -- 9
+ '\58' -> cAny .|. cSymbol -- :
+ '\59' -> cAny -- ;
+ '\60' -> cAny .|. cSymbol -- <
+ '\61' -> cAny .|. cSymbol -- =
+ '\62' -> cAny .|. cSymbol -- >
+ '\63' -> cAny .|. cSymbol -- ?
+ '\64' -> cAny .|. cSymbol -- @
+ '\65' -> cAny .|. cIdent .|. cUpper -- A
+ '\66' -> cAny .|. cIdent .|. cUpper -- B
+ '\67' -> cAny .|. cIdent .|. cUpper -- C
+ '\68' -> cAny .|. cIdent .|. cUpper -- D
+ '\69' -> cAny .|. cIdent .|. cUpper -- E
+ '\70' -> cAny .|. cIdent .|. cUpper -- F
+ '\71' -> cAny .|. cIdent .|. cUpper -- G
+ '\72' -> cAny .|. cIdent .|. cUpper -- H
+ '\73' -> cAny .|. cIdent .|. cUpper -- I
+ '\74' -> cAny .|. cIdent .|. cUpper -- J
+ '\75' -> cAny .|. cIdent .|. cUpper -- K
+ '\76' -> cAny .|. cIdent .|. cUpper -- L
+ '\77' -> cAny .|. cIdent .|. cUpper -- M
+ '\78' -> cAny .|. cIdent .|. cUpper -- N
+ '\79' -> cAny .|. cIdent .|. cUpper -- O
+ '\80' -> cAny .|. cIdent .|. cUpper -- P
+ '\81' -> cAny .|. cIdent .|. cUpper -- Q
+ '\82' -> cAny .|. cIdent .|. cUpper -- R
+ '\83' -> cAny .|. cIdent .|. cUpper -- S
+ '\84' -> cAny .|. cIdent .|. cUpper -- T
+ '\85' -> cAny .|. cIdent .|. cUpper -- U
+ '\86' -> cAny .|. cIdent .|. cUpper -- V
+ '\87' -> cAny .|. cIdent .|. cUpper -- W
+ '\88' -> cAny .|. cIdent .|. cUpper -- X
+ '\89' -> cAny .|. cIdent .|. cUpper -- Y
+ '\90' -> cAny .|. cIdent .|. cUpper -- Z
+ '\91' -> cAny -- [
+ '\92' -> cAny .|. cSymbol -- backslash
+ '\93' -> cAny -- ]
+ '\94' -> cAny .|. cSymbol -- ^
+ '\95' -> cAny .|. cIdent .|. cLower -- _
+ '\96' -> cAny -- `
+ '\97' -> cAny .|. cIdent .|. cLower -- a
+ '\98' -> cAny .|. cIdent .|. cLower -- b
+ '\99' -> cAny .|. cIdent .|. cLower -- c
+ '\100' -> cAny .|. cIdent .|. cLower -- d
+ '\101' -> cAny .|. cIdent .|. cLower -- e
+ '\102' -> cAny .|. cIdent .|. cLower -- f
+ '\103' -> cAny .|. cIdent .|. cLower -- g
+ '\104' -> cAny .|. cIdent .|. cLower -- h
+ '\105' -> cAny .|. cIdent .|. cLower -- i
+ '\106' -> cAny .|. cIdent .|. cLower -- j
+ '\107' -> cAny .|. cIdent .|. cLower -- k
+ '\108' -> cAny .|. cIdent .|. cLower -- l
+ '\109' -> cAny .|. cIdent .|. cLower -- m
+ '\110' -> cAny .|. cIdent .|. cLower -- n
+ '\111' -> cAny .|. cIdent .|. cLower -- o
+ '\112' -> cAny .|. cIdent .|. cLower -- p
+ '\113' -> cAny .|. cIdent .|. cLower -- q
+ '\114' -> cAny .|. cIdent .|. cLower -- r
+ '\115' -> cAny .|. cIdent .|. cLower -- s
+ '\116' -> cAny .|. cIdent .|. cLower -- t
+ '\117' -> cAny .|. cIdent .|. cLower -- u
+ '\118' -> cAny .|. cIdent .|. cLower -- v
+ '\119' -> cAny .|. cIdent .|. cLower -- w
+ '\120' -> cAny .|. cIdent .|. cLower -- x
+ '\121' -> cAny .|. cIdent .|. cLower -- y
+ '\122' -> cAny .|. cIdent .|. cLower -- z
+ '\123' -> cAny -- {
+ '\124' -> cAny .|. cSymbol -- |
+ '\125' -> cAny -- }
+ '\126' -> cAny .|. cSymbol -- ~
+ '\127' -> 0 -- \177
+ _ -> panic ("charType: " ++ show c)