diff options
Diffstat (limited to 'compiler/GHC/Parser/CharClass.hs')
-rw-r--r-- | compiler/GHC/Parser/CharClass.hs | 215 |
1 files changed, 215 insertions, 0 deletions
diff --git a/compiler/GHC/Parser/CharClass.hs b/compiler/GHC/Parser/CharClass.hs new file mode 100644 index 0000000000..dc98d48f94 --- /dev/null +++ b/compiler/GHC/Parser/CharClass.hs @@ -0,0 +1,215 @@ +-- Character classification +{-# LANGUAGE CPP #-} +module GHC.Parser.CharClass + ( is_ident -- Char# -> Bool + , is_symbol -- Char# -> Bool + , is_any -- Char# -> Bool + , is_space -- Char# -> Bool + , is_lower -- Char# -> Bool + , is_upper -- Char# -> Bool + , is_digit -- Char# -> Bool + , is_alphanum -- Char# -> Bool + + , is_decdigit, is_hexdigit, is_octdigit, is_bindigit + , hexDigit, octDecDigit + ) where + +#include "HsVersions.h" + +import GhcPrelude + +import Data.Bits ( Bits((.&.),(.|.)) ) +import Data.Char ( ord, chr ) +import Data.Word +import Panic + +-- Bit masks + +cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Word8 +cIdent = 1 +cSymbol = 2 +cAny = 4 +cSpace = 8 +cLower = 16 +cUpper = 32 +cDigit = 64 + +-- | The predicates below look costly, but aren't, GHC+GCC do a great job +-- at the big case below. + +{-# INLINABLE is_ctype #-} +is_ctype :: Word8 -> Char -> Bool +is_ctype mask c = (charType c .&. mask) /= 0 + +is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit, + is_alphanum :: Char -> Bool +is_ident = is_ctype cIdent +is_symbol = is_ctype cSymbol +is_any = is_ctype cAny +is_space = is_ctype cSpace +is_lower = is_ctype cLower +is_upper = is_ctype cUpper +is_digit = is_ctype cDigit +is_alphanum = is_ctype (cLower+cUpper+cDigit) + +-- Utils + +hexDigit :: Char -> Int +hexDigit c | is_decdigit c = ord c - ord '0' + | otherwise = ord (to_lower c) - ord 'a' + 10 + +octDecDigit :: Char -> Int +octDecDigit c = ord c - ord '0' + +is_decdigit :: Char -> Bool +is_decdigit c + = c >= '0' && c <= '9' + +is_hexdigit :: Char -> Bool +is_hexdigit c + = is_decdigit c + || (c >= 'a' && c <= 'f') + || (c >= 'A' && c <= 'F') + +is_octdigit :: Char -> Bool +is_octdigit c = c >= '0' && c <= '7' + +is_bindigit :: Char -> Bool +is_bindigit c = c == '0' || c == '1' + +to_lower :: Char -> Char +to_lower c + | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a')) + | otherwise = c + +charType :: Char -> Word8 +charType c = case c of + '\0' -> 0 -- \000 + '\1' -> 0 -- \001 + '\2' -> 0 -- \002 + '\3' -> 0 -- \003 + '\4' -> 0 -- \004 + '\5' -> 0 -- \005 + '\6' -> 0 -- \006 + '\7' -> 0 -- \007 + '\8' -> 0 -- \010 + '\9' -> cSpace -- \t (not allowed in strings, so !cAny) + '\10' -> cSpace -- \n (ditto) + '\11' -> cSpace -- \v (ditto) + '\12' -> cSpace -- \f (ditto) + '\13' -> cSpace -- ^M (ditto) + '\14' -> 0 -- \016 + '\15' -> 0 -- \017 + '\16' -> 0 -- \020 + '\17' -> 0 -- \021 + '\18' -> 0 -- \022 + '\19' -> 0 -- \023 + '\20' -> 0 -- \024 + '\21' -> 0 -- \025 + '\22' -> 0 -- \026 + '\23' -> 0 -- \027 + '\24' -> 0 -- \030 + '\25' -> 0 -- \031 + '\26' -> 0 -- \032 + '\27' -> 0 -- \033 + '\28' -> 0 -- \034 + '\29' -> 0 -- \035 + '\30' -> 0 -- \036 + '\31' -> 0 -- \037 + '\32' -> cAny .|. cSpace -- + '\33' -> cAny .|. cSymbol -- ! + '\34' -> cAny -- " + '\35' -> cAny .|. cSymbol -- # + '\36' -> cAny .|. cSymbol -- $ + '\37' -> cAny .|. cSymbol -- % + '\38' -> cAny .|. cSymbol -- & + '\39' -> cAny .|. cIdent -- ' + '\40' -> cAny -- ( + '\41' -> cAny -- ) + '\42' -> cAny .|. cSymbol -- * + '\43' -> cAny .|. cSymbol -- + + '\44' -> cAny -- , + '\45' -> cAny .|. cSymbol -- - + '\46' -> cAny .|. cSymbol -- . + '\47' -> cAny .|. cSymbol -- / + '\48' -> cAny .|. cIdent .|. cDigit -- 0 + '\49' -> cAny .|. cIdent .|. cDigit -- 1 + '\50' -> cAny .|. cIdent .|. cDigit -- 2 + '\51' -> cAny .|. cIdent .|. cDigit -- 3 + '\52' -> cAny .|. cIdent .|. cDigit -- 4 + '\53' -> cAny .|. cIdent .|. cDigit -- 5 + '\54' -> cAny .|. cIdent .|. cDigit -- 6 + '\55' -> cAny .|. cIdent .|. cDigit -- 7 + '\56' -> cAny .|. cIdent .|. cDigit -- 8 + '\57' -> cAny .|. cIdent .|. cDigit -- 9 + '\58' -> cAny .|. cSymbol -- : + '\59' -> cAny -- ; + '\60' -> cAny .|. cSymbol -- < + '\61' -> cAny .|. cSymbol -- = + '\62' -> cAny .|. cSymbol -- > + '\63' -> cAny .|. cSymbol -- ? + '\64' -> cAny .|. cSymbol -- @ + '\65' -> cAny .|. cIdent .|. cUpper -- A + '\66' -> cAny .|. cIdent .|. cUpper -- B + '\67' -> cAny .|. cIdent .|. cUpper -- C + '\68' -> cAny .|. cIdent .|. cUpper -- D + '\69' -> cAny .|. cIdent .|. cUpper -- E + '\70' -> cAny .|. cIdent .|. cUpper -- F + '\71' -> cAny .|. cIdent .|. cUpper -- G + '\72' -> cAny .|. cIdent .|. cUpper -- H + '\73' -> cAny .|. cIdent .|. cUpper -- I + '\74' -> cAny .|. cIdent .|. cUpper -- J + '\75' -> cAny .|. cIdent .|. cUpper -- K + '\76' -> cAny .|. cIdent .|. cUpper -- L + '\77' -> cAny .|. cIdent .|. cUpper -- M + '\78' -> cAny .|. cIdent .|. cUpper -- N + '\79' -> cAny .|. cIdent .|. cUpper -- O + '\80' -> cAny .|. cIdent .|. cUpper -- P + '\81' -> cAny .|. cIdent .|. cUpper -- Q + '\82' -> cAny .|. cIdent .|. cUpper -- R + '\83' -> cAny .|. cIdent .|. cUpper -- S + '\84' -> cAny .|. cIdent .|. cUpper -- T + '\85' -> cAny .|. cIdent .|. cUpper -- U + '\86' -> cAny .|. cIdent .|. cUpper -- V + '\87' -> cAny .|. cIdent .|. cUpper -- W + '\88' -> cAny .|. cIdent .|. cUpper -- X + '\89' -> cAny .|. cIdent .|. cUpper -- Y + '\90' -> cAny .|. cIdent .|. cUpper -- Z + '\91' -> cAny -- [ + '\92' -> cAny .|. cSymbol -- backslash + '\93' -> cAny -- ] + '\94' -> cAny .|. cSymbol -- ^ + '\95' -> cAny .|. cIdent .|. cLower -- _ + '\96' -> cAny -- ` + '\97' -> cAny .|. cIdent .|. cLower -- a + '\98' -> cAny .|. cIdent .|. cLower -- b + '\99' -> cAny .|. cIdent .|. cLower -- c + '\100' -> cAny .|. cIdent .|. cLower -- d + '\101' -> cAny .|. cIdent .|. cLower -- e + '\102' -> cAny .|. cIdent .|. cLower -- f + '\103' -> cAny .|. cIdent .|. cLower -- g + '\104' -> cAny .|. cIdent .|. cLower -- h + '\105' -> cAny .|. cIdent .|. cLower -- i + '\106' -> cAny .|. cIdent .|. cLower -- j + '\107' -> cAny .|. cIdent .|. cLower -- k + '\108' -> cAny .|. cIdent .|. cLower -- l + '\109' -> cAny .|. cIdent .|. cLower -- m + '\110' -> cAny .|. cIdent .|. cLower -- n + '\111' -> cAny .|. cIdent .|. cLower -- o + '\112' -> cAny .|. cIdent .|. cLower -- p + '\113' -> cAny .|. cIdent .|. cLower -- q + '\114' -> cAny .|. cIdent .|. cLower -- r + '\115' -> cAny .|. cIdent .|. cLower -- s + '\116' -> cAny .|. cIdent .|. cLower -- t + '\117' -> cAny .|. cIdent .|. cLower -- u + '\118' -> cAny .|. cIdent .|. cLower -- v + '\119' -> cAny .|. cIdent .|. cLower -- w + '\120' -> cAny .|. cIdent .|. cLower -- x + '\121' -> cAny .|. cIdent .|. cLower -- y + '\122' -> cAny .|. cIdent .|. cLower -- z + '\123' -> cAny -- { + '\124' -> cAny .|. cSymbol -- | + '\125' -> cAny -- } + '\126' -> cAny .|. cSymbol -- ~ + '\127' -> 0 -- \177 + _ -> panic ("charType: " ++ show c) |