summaryrefslogtreecommitdiff
path: root/compiler/GHC/Utils/Lexeme.hs
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/GHC/Utils/Lexeme.hs')
-rw-r--r--compiler/GHC/Utils/Lexeme.hs240
1 files changed, 240 insertions, 0 deletions
diff --git a/compiler/GHC/Utils/Lexeme.hs b/compiler/GHC/Utils/Lexeme.hs
new file mode 100644
index 0000000000..2ea773a2f0
--- /dev/null
+++ b/compiler/GHC/Utils/Lexeme.hs
@@ -0,0 +1,240 @@
+-- (c) The GHC Team
+--
+-- Functions to evaluate whether or not a string is a valid identifier.
+-- There is considerable overlap between the logic here and the logic
+-- in Lexer.x, but sadly there seems to be no way to merge them.
+
+module GHC.Utils.Lexeme (
+ -- * Lexical characteristics of Haskell names
+
+ -- | Use these functions to figure what kind of name a 'FastString'
+ -- represents; these functions do /not/ check that the identifier
+ -- is valid.
+
+ isLexCon, isLexVar, isLexId, isLexSym,
+ isLexConId, isLexConSym, isLexVarId, isLexVarSym,
+ startsVarSym, startsVarId, startsConSym, startsConId,
+
+ -- * Validating identifiers
+
+ -- | These functions (working over plain old 'String's) check
+ -- to make sure that the identifier is valid.
+ okVarOcc, okConOcc, okTcOcc,
+ okVarIdOcc, okVarSymOcc, okConIdOcc, okConSymOcc
+
+ -- Some of the exports above are not used within GHC, but may
+ -- be of value to GHC API users.
+
+ ) where
+
+import GhcPrelude
+
+import FastString
+
+import Data.Char
+import qualified Data.Set as Set
+
+import GHC.Lexeme
+
+{-
+
+************************************************************************
+* *
+ Lexical categories
+* *
+************************************************************************
+
+These functions test strings to see if they fit the lexical categories
+defined in the Haskell report.
+
+Note [Classification of generated names]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some names generated for internal use can show up in debugging output,
+e.g. when using -ddump-simpl. These generated names start with a $
+but should still be pretty-printed using prefix notation. We make sure
+this is the case in isLexVarSym by only classifying a name as a symbol
+if all its characters are symbols, not just its first one.
+-}
+
+isLexCon, isLexVar, isLexId, isLexSym :: FastString -> Bool
+isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FastString -> Bool
+
+isLexCon cs = isLexConId cs || isLexConSym cs
+isLexVar cs = isLexVarId cs || isLexVarSym cs
+
+isLexId cs = isLexConId cs || isLexVarId cs
+isLexSym cs = isLexConSym cs || isLexVarSym cs
+
+-------------
+isLexConId cs -- Prefix type or data constructors
+ | nullFS cs = False -- e.g. "Foo", "[]", "(,)"
+ | cs == (fsLit "[]") = True
+ | otherwise = startsConId (headFS cs)
+
+isLexVarId cs -- Ordinary prefix identifiers
+ | nullFS cs = False -- e.g. "x", "_x"
+ | otherwise = startsVarId (headFS cs)
+
+isLexConSym cs -- Infix type or data constructors
+ | nullFS cs = False -- e.g. ":-:", ":", "->"
+ | cs == (fsLit "->") = True
+ | otherwise = startsConSym (headFS cs)
+
+isLexVarSym fs -- Infix identifiers e.g. "+"
+ | fs == (fsLit "~R#") = True
+ | otherwise
+ = case (if nullFS fs then [] else unpackFS fs) of
+ [] -> False
+ (c:cs) -> startsVarSym c && all isVarSymChar cs
+ -- See Note [Classification of generated names]
+
+{-
+
+************************************************************************
+* *
+ Detecting valid names for Template Haskell
+* *
+************************************************************************
+
+-}
+
+----------------------
+-- External interface
+----------------------
+
+-- | Is this an acceptable variable name?
+okVarOcc :: String -> Bool
+okVarOcc str@(c:_)
+ | startsVarId c
+ = okVarIdOcc str
+ | startsVarSym c
+ = okVarSymOcc str
+okVarOcc _ = False
+
+-- | Is this an acceptable constructor name?
+okConOcc :: String -> Bool
+okConOcc str@(c:_)
+ | startsConId c
+ = okConIdOcc str
+ | startsConSym c
+ = okConSymOcc str
+ | str == "[]"
+ = True
+okConOcc _ = False
+
+-- | Is this an acceptable type name?
+okTcOcc :: String -> Bool
+okTcOcc "[]" = True
+okTcOcc "->" = True
+okTcOcc "~" = True
+okTcOcc str@(c:_)
+ | startsConId c
+ = okConIdOcc str
+ | startsConSym c
+ = okConSymOcc str
+ | startsVarSym c
+ = okVarSymOcc str
+okTcOcc _ = False
+
+-- | Is this an acceptable alphanumeric variable name, assuming it starts
+-- with an acceptable letter?
+okVarIdOcc :: String -> Bool
+okVarIdOcc str = okIdOcc str &&
+ -- admit "_" as a valid identifier. Required to support typed
+ -- holes in Template Haskell. See #10267
+ (str == "_" || not (str `Set.member` reservedIds))
+
+-- | Is this an acceptable symbolic variable name, assuming it starts
+-- with an acceptable character?
+okVarSymOcc :: String -> Bool
+okVarSymOcc str = all okSymChar str &&
+ not (str `Set.member` reservedOps) &&
+ not (isDashes str)
+
+-- | Is this an acceptable alphanumeric constructor name, assuming it
+-- starts with an acceptable letter?
+okConIdOcc :: String -> Bool
+okConIdOcc str = okIdOcc str ||
+ is_tuple_name1 True str ||
+ -- Is it a boxed tuple...
+ is_tuple_name1 False str ||
+ -- ...or an unboxed tuple (#12407)...
+ is_sum_name1 str
+ -- ...or an unboxed sum (#12514)?
+ where
+ -- check for tuple name, starting at the beginning
+ is_tuple_name1 True ('(' : rest) = is_tuple_name2 True rest
+ is_tuple_name1 False ('(' : '#' : rest) = is_tuple_name2 False rest
+ is_tuple_name1 _ _ = False
+
+ -- check for tuple tail
+ is_tuple_name2 True ")" = True
+ is_tuple_name2 False "#)" = True
+ is_tuple_name2 boxed (',' : rest) = is_tuple_name2 boxed rest
+ is_tuple_name2 boxed (ws : rest)
+ | isSpace ws = is_tuple_name2 boxed rest
+ is_tuple_name2 _ _ = False
+
+ -- check for sum name, starting at the beginning
+ is_sum_name1 ('(' : '#' : rest) = is_sum_name2 False rest
+ is_sum_name1 _ = False
+
+ -- check for sum tail, only allowing at most one underscore
+ is_sum_name2 _ "#)" = True
+ is_sum_name2 underscore ('|' : rest) = is_sum_name2 underscore rest
+ is_sum_name2 False ('_' : rest) = is_sum_name2 True rest
+ is_sum_name2 underscore (ws : rest)
+ | isSpace ws = is_sum_name2 underscore rest
+ is_sum_name2 _ _ = False
+
+-- | Is this an acceptable symbolic constructor name, assuming it
+-- starts with an acceptable character?
+okConSymOcc :: String -> Bool
+okConSymOcc ":" = True
+okConSymOcc str = all okSymChar str &&
+ not (str `Set.member` reservedOps)
+
+----------------------
+-- Internal functions
+----------------------
+
+-- | Is this string an acceptable id, possibly with a suffix of hashes,
+-- but not worrying about case or clashing with reserved words?
+okIdOcc :: String -> Bool
+okIdOcc str
+ = let hashes = dropWhile okIdChar str in
+ all (== '#') hashes -- -XMagicHash allows a suffix of hashes
+ -- of course, `all` says "True" to an empty list
+
+-- | Is this character acceptable in an identifier (after the first letter)?
+-- See alexGetByte in Lexer.x
+okIdChar :: Char -> Bool
+okIdChar c = case generalCategory c of
+ UppercaseLetter -> True
+ LowercaseLetter -> True
+ TitlecaseLetter -> True
+ ModifierLetter -> True -- See #10196
+ OtherLetter -> True -- See #1103
+ NonSpacingMark -> True -- See #7650
+ DecimalNumber -> True
+ OtherNumber -> True -- See #4373
+ _ -> c == '\'' || c == '_'
+
+-- | All reserved identifiers. Taken from section 2.4 of the 2010 Report.
+reservedIds :: Set.Set String
+reservedIds = Set.fromList [ "case", "class", "data", "default", "deriving"
+ , "do", "else", "foreign", "if", "import", "in"
+ , "infix", "infixl", "infixr", "instance", "let"
+ , "module", "newtype", "of", "then", "type", "where"
+ , "_" ]
+
+-- | All reserved operators. Taken from section 2.4 of the 2010 Report.
+reservedOps :: Set.Set String
+reservedOps = Set.fromList [ "..", ":", "::", "=", "\\", "|", "<-", "->"
+ , "@", "~", "=>" ]
+
+-- | Does this string contain only dashes and has at least 2 of them?
+isDashes :: String -> Bool
+isDashes ('-' : '-' : rest) = all (== '-') rest
+isDashes _ = False