summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRichard Eisenberg <eir@cis.upenn.edu>2014-11-03 15:34:53 -0500
committerRichard Eisenberg <eir@cis.upenn.edu>2014-11-21 11:15:46 -0500
commitda2fca9e2be8c61c91c034d8c2302d8b1d1e7b41 (patch)
treeccc30f5a5b1d8ff4cc1ef66cfec4499bb28846d8
parentdbf360a5264d5d6597e046dcd9b4f49effa91eee (diff)
downloadhaskell-da2fca9e2be8c61c91c034d8c2302d8b1d1e7b41.tar.gz
Fix #7484, checking for good binder names in Convert.
This commit also refactors a bunch of lexeme-oriented code into a new module Lexeme, and includes a submodule update for haddock.
-rw-r--r--compiler/basicTypes/Lexeme.hs252
-rw-r--r--compiler/basicTypes/OccName.lhs72
-rw-r--r--compiler/ghc.cabal.in1
-rw-r--r--compiler/ghc.mk1
-rw-r--r--compiler/hsSyn/Convert.lhs14
-rw-r--r--compiler/parser/Lexer.x6
-rw-r--r--compiler/typecheck/TcGenDeriv.lhs1
-rw-r--r--compiler/typecheck/TcSplice.lhs1
-rw-r--r--testsuite/tests/th/all.T2
m---------utils/haddock0
10 files changed, 270 insertions, 80 deletions
diff --git a/compiler/basicTypes/Lexeme.hs b/compiler/basicTypes/Lexeme.hs
new file mode 100644
index 0000000000..c5bda4dbdd
--- /dev/null
+++ b/compiler/basicTypes/Lexeme.hs
@@ -0,0 +1,252 @@
+-- (c) The GHC Team
+--
+-- Functions to evaluate whether or not a string is a valid identifier.
+-- There is considerable overlap between the logic here and the logic
+-- in Lexer.x, but sadly there seems to be way to merge them.
+
+module Lexeme (
+ -- * Lexical characteristics of Haskell names
+
+ -- | Use these functions to figure what kind of name a 'FastString'
+ -- represents; these functions do /not/ check that the identifier
+ -- is valid.
+
+ isLexCon, isLexVar, isLexId, isLexSym,
+ isLexConId, isLexConSym, isLexVarId, isLexVarSym,
+ startsVarSym, startsVarId, startsConSym, startsConId,
+
+ -- * Validating identifiers
+
+ -- | These functions (working over plain old 'String's) check
+ -- to make sure that the identifier is valid.
+ okVarOcc, okConOcc, okTcOcc,
+ okVarIdOcc, okVarSymOcc, okConIdOcc, okConSymOcc
+
+ -- Some of the exports above are not used within GHC, but may
+ -- be of value to GHC API users.
+
+ ) where
+
+import FastString
+
+import Data.Char
+import qualified Data.Set as Set
+
+{-
+
+************************************************************************
+* *
+ Lexical categories
+* *
+************************************************************************
+
+These functions test strings to see if they fit the lexical categories
+defined in the Haskell report.
+
+Note [Classification of generated names]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some names generated for internal use can show up in debugging output,
+e.g. when using -ddump-simpl. These generated names start with a $
+but should still be pretty-printed using prefix notation. We make sure
+this is the case in isLexVarSym by only classifying a name as a symbol
+if all its characters are symbols, not just its first one.
+-}
+
+isLexCon, isLexVar, isLexId, isLexSym :: FastString -> Bool
+isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FastString -> Bool
+
+isLexCon cs = isLexConId cs || isLexConSym cs
+isLexVar cs = isLexVarId cs || isLexVarSym cs
+
+isLexId cs = isLexConId cs || isLexVarId cs
+isLexSym cs = isLexConSym cs || isLexVarSym cs
+
+-------------
+isLexConId cs -- Prefix type or data constructors
+ | nullFS cs = False -- e.g. "Foo", "[]", "(,)"
+ | cs == (fsLit "[]") = True
+ | otherwise = startsConId (headFS cs)
+
+isLexVarId cs -- Ordinary prefix identifiers
+ | nullFS cs = False -- e.g. "x", "_x"
+ | otherwise = startsVarId (headFS cs)
+
+isLexConSym cs -- Infix type or data constructors
+ | nullFS cs = False -- e.g. ":-:", ":", "->"
+ | cs == (fsLit "->") = True
+ | otherwise = startsConSym (headFS cs)
+
+isLexVarSym fs -- Infix identifiers e.g. "+"
+ | fs == (fsLit "~R#") = True
+ | otherwise
+ = case (if nullFS fs then [] else unpackFS fs) of
+ [] -> False
+ (c:cs) -> startsVarSym c && all isVarSymChar cs
+ -- See Note [Classification of generated names]
+
+-------------
+startsVarSym, startsVarId, startsConSym, startsConId :: Char -> Bool
+startsVarSym c = startsVarSymASCII c || (ord c > 0x7f && isSymbol c) -- Infix Ids
+startsConSym c = c == ':' -- Infix data constructors
+startsVarId c = c == '_' || case generalCategory c of -- Ordinary Ids
+ LowercaseLetter -> True
+ OtherLetter -> True -- See #1103
+ _ -> False
+startsConId c = isUpper c || c == '(' -- Ordinary type constructors and data constructors
+
+startsVarSymASCII :: Char -> Bool
+startsVarSymASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
+
+isVarSymChar :: Char -> Bool
+isVarSymChar c = c == ':' || startsVarSym c
+
+{-
+
+************************************************************************
+* *
+ Detecting valid names for Template Haskell
+* *
+************************************************************************
+
+-}
+
+----------------------
+-- External interface
+----------------------
+
+-- | Is this an acceptable variable name?
+okVarOcc :: String -> Bool
+okVarOcc str@(c:_)
+ | startsVarId c
+ = okVarIdOcc str
+ | startsVarSym c
+ = okVarSymOcc str
+okVarOcc _ = False
+
+-- | Is this an acceptable constructor name?
+okConOcc :: String -> Bool
+okConOcc str@(c:_)
+ | startsConId c
+ = okConIdOcc str
+ | startsConSym c
+ = okConSymOcc str
+ | str == "[]"
+ = True
+okConOcc _ = False
+
+-- | Is this an acceptable type name?
+okTcOcc :: String -> Bool
+okTcOcc "[]" = True
+okTcOcc "->" = True
+okTcOcc "~" = True
+okTcOcc str@(c:_)
+ | startsConId c
+ = okConIdOcc str
+ | startsConSym c
+ = okConSymOcc str
+ | startsVarSym c
+ = okVarSymOcc str
+okTcOcc _ = False
+
+-- | Is this an acceptable alphanumeric variable name, assuming it starts
+-- with an acceptable letter?
+okVarIdOcc :: String -> Bool
+okVarIdOcc str = okIdOcc str &&
+ not (str `Set.member` reservedIds)
+
+-- | Is this an acceptable symbolic variable name, assuming it starts
+-- with an acceptable character?
+okVarSymOcc :: String -> Bool
+okVarSymOcc str = all okSymChar str &&
+ not (str `Set.member` reservedOps) &&
+ not (isDashes str)
+
+-- | Is this an acceptable alphanumeric constructor name, assuming it
+-- starts with an acceptable letter?
+okConIdOcc :: String -> Bool
+okConIdOcc str = okIdOcc str ||
+ is_tuple_name1 str
+ where
+ -- check for tuple name, starting at the beginning
+ is_tuple_name1 ('(' : rest) = is_tuple_name2 rest
+ is_tuple_name1 _ = False
+
+ -- check for tuple tail
+ is_tuple_name2 ")" = True
+ is_tuple_name2 (',' : rest) = is_tuple_name2 rest
+ is_tuple_name2 (ws : rest)
+ | isSpace ws = is_tuple_name2 rest
+ is_tuple_name2 _ = False
+
+-- | Is this an acceptable symbolic constructor name, assuming it
+-- starts with an acceptable character?
+okConSymOcc :: String -> Bool
+okConSymOcc ":" = True
+okConSymOcc str = all okSymChar str &&
+ not (str `Set.member` reservedOps)
+
+----------------------
+-- Internal functions
+----------------------
+
+-- | Is this string an acceptable id, possibly with a suffix of hashes,
+-- but not worrying about case or clashing with reserved words?
+okIdOcc :: String -> Bool
+okIdOcc str
+ = let hashes = dropWhile okIdChar str in
+ all (== '#') hashes -- -XMagicHash allows a suffix of hashes
+ -- of course, `all` says "True" to an empty list
+
+-- | Is this character acceptable in an identifier (after the first letter)?
+-- See alexGetByte in Lexer.x
+okIdChar :: Char -> Bool
+okIdChar c = case generalCategory c of
+ UppercaseLetter -> True
+ LowercaseLetter -> True
+ OtherLetter -> True
+ TitlecaseLetter -> True
+ DecimalNumber -> True
+ OtherNumber -> True
+ _ -> c == '\'' || c == '_'
+
+-- | Is this character acceptable in a symbol (after the first char)?
+-- See alexGetByte in Lexer.x
+okSymChar :: Char -> Bool
+okSymChar c
+ | c `elem` specialSymbols
+ = False
+ | c `elem` "_\"'"
+ = False
+ | otherwise
+ = case generalCategory c of
+ ConnectorPunctuation -> True
+ DashPunctuation -> True
+ OtherPunctuation -> True
+ MathSymbol -> True
+ CurrencySymbol -> True
+ ModifierSymbol -> True
+ OtherSymbol -> True
+ _ -> False
+
+-- | All reserved identifiers. Taken from section 2.4 of the 2010 Report.
+reservedIds :: Set.Set String
+reservedIds = Set.fromList [ "case", "class", "data", "default", "deriving"
+ , "do", "else", "foreign", "if", "import", "in"
+ , "infix", "infixl", "infixr", "instance", "let"
+ , "module", "newtype", "of", "then", "type", "where"
+ , "_" ]
+
+-- | All punctuation that cannot appear in symbols. See $special in Lexer.x.
+specialSymbols :: [Char]
+specialSymbols = "(),;[]`{}"
+
+-- | All reserved operators. Taken from section 2.4 of the 2010 Report.
+reservedOps :: Set.Set String
+reservedOps = Set.fromList [ "..", ":", "::", "=", "\\", "|", "<-", "->"
+ , "@", "~", "=>" ]
+
+-- | Does this string contain only dashes and has at least 2 of them?
+isDashes :: String -> Bool
+isDashes ('-' : '-' : rest) = all (== '-') rest
+isDashes _ = False
diff --git a/compiler/basicTypes/OccName.lhs b/compiler/basicTypes/OccName.lhs
index 0010ad37dc..fdc7c95918 100644
--- a/compiler/basicTypes/OccName.lhs
+++ b/compiler/basicTypes/OccName.lhs
@@ -94,11 +94,6 @@ module OccName (
-- * Tidying up
TidyOccEnv, emptyTidyOccEnv, tidyOccName, initTidyOccEnv,
- -- * Lexical characteristics of Haskell names
- isLexCon, isLexVar, isLexId, isLexSym,
- isLexConId, isLexConSym, isLexVarId, isLexVarSym,
- startsVarSym, startsVarId, startsConSym, startsConId,
-
-- FsEnv
FastStringEnv, emptyFsEnv, lookupFsEnv, extendFsEnv, mkFsEnv
) where
@@ -110,6 +105,7 @@ import UniqFM
import UniqSet
import FastString
import Outputable
+import Lexeme
import Binary
import Data.Char
import Data.Data
@@ -851,72 +847,6 @@ tidyOccName env occ@(OccName occ_sp fs)
%************************************************************************
%* *
-\subsection{Lexical categories}
-%* *
-%************************************************************************
-
-These functions test strings to see if they fit the lexical categories
-defined in the Haskell report.
-
-Note [Classification of generated names]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some names generated for internal use can show up in debugging output,
-e.g. when using -ddump-simpl. These generated names start with a $
-but should still be pretty-printed using prefix notation. We make sure
-this is the case in isLexVarSym by only classifying a name as a symbol
-if all its characters are symbols, not just its first one.
-
-\begin{code}
-isLexCon, isLexVar, isLexId, isLexSym :: FastString -> Bool
-isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FastString -> Bool
-
-isLexCon cs = isLexConId cs || isLexConSym cs
-isLexVar cs = isLexVarId cs || isLexVarSym cs
-
-isLexId cs = isLexConId cs || isLexVarId cs
-isLexSym cs = isLexConSym cs || isLexVarSym cs
-
--------------
-
-isLexConId cs -- Prefix type or data constructors
- | nullFS cs = False -- e.g. "Foo", "[]", "(,)"
- | cs == (fsLit "[]") = True
- | otherwise = startsConId (headFS cs)
-
-isLexVarId cs -- Ordinary prefix identifiers
- | nullFS cs = False -- e.g. "x", "_x"
- | otherwise = startsVarId (headFS cs)
-
-isLexConSym cs -- Infix type or data constructors
- | nullFS cs = False -- e.g. ":-:", ":", "->"
- | cs == (fsLit "->") = True
- | otherwise = startsConSym (headFS cs)
-
-isLexVarSym fs -- Infix identifiers e.g. "+"
- | fs == (fsLit "~R#") = True
- | otherwise
- = case (if nullFS fs then [] else unpackFS fs) of
- [] -> False
- (c:cs) -> startsVarSym c && all isVarSymChar cs
- -- See Note [Classification of generated names]
-
--------------
-startsVarSym, startsVarId, startsConSym, startsConId :: Char -> Bool
-startsVarSym c = isSymbolASCII c || (ord c > 0x7f && isSymbol c) -- Infix Ids
-startsConSym c = c == ':' -- Infix data constructors
-startsVarId c = isLower c || c == '_' -- Ordinary Ids
-startsConId c = isUpper c || c == '(' -- Ordinary type constructors and data constructors
-
-isSymbolASCII :: Char -> Bool
-isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
-
-isVarSymChar :: Char -> Bool
-isVarSymChar c = c == ':' || startsVarSym c
-\end{code}
-
-%************************************************************************
-%* *
Binary instance
Here rather than BinIface because OccName is abstract
%* *
diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in
index bfc2e9cf7c..4aa2e3a4d5 100644
--- a/compiler/ghc.cabal.in
+++ b/compiler/ghc.cabal.in
@@ -161,6 +161,7 @@ Library
Hooks
Id
IdInfo
+ Lexeme
Literal
Llvm
Llvm.AbsSyn
diff --git a/compiler/ghc.mk b/compiler/ghc.mk
index 9716ef285a..752a607c72 100644
--- a/compiler/ghc.mk
+++ b/compiler/ghc.mk
@@ -536,6 +536,7 @@ compiler_stage2_dll0_MODULES = \
IfaceType \
InstEnv \
Kind \
+ Lexeme \
ListSetOps \
Literal \
LoadIface \
diff --git a/compiler/hsSyn/Convert.lhs b/compiler/hsSyn/Convert.lhs
index 83c286d4dc..141b8b840a 100644
--- a/compiler/hsSyn/Convert.lhs
+++ b/compiler/hsSyn/Convert.lhs
@@ -30,6 +30,7 @@ import ForeignCall
import Unique
import ErrUtils
import Bag
+import Lexeme
import Util
import FastString
import Outputable
@@ -1122,14 +1123,11 @@ cvtName ctxt_ns (TH.Name occ flavour)
occ_str = TH.occString occ
okOcc :: OccName.NameSpace -> String -> Bool
-okOcc _ [] = False
-okOcc ns str@(c:_)
- | OccName.isVarNameSpace ns = startsVarId c || startsVarSym c
- | OccName.isDataConNameSpace ns = startsConId c || startsConSym c || str == "[]"
- | otherwise = startsConId c || startsConSym c ||
- startsVarSym c || str == "[]" || str == "->"
- -- allow type operators like "+"
-
+okOcc ns str
+ | OccName.isVarNameSpace ns = okVarOcc str
+ | OccName.isDataConNameSpace ns = okConOcc str
+ | otherwise = okTcOcc str
+
-- Determine the name space of a name in a type
--
isVarName :: TH.Name -> Bool
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index b4223c82d8..1e8712b2d5 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -115,6 +115,8 @@ import Ctype
-- -----------------------------------------------------------------------------
-- Alex "Character set macros"
+-- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs
+-- Any changes here should likely be reflected there.
$unispace = \x05 -- Trick Alex into handling Unicode. See alexGetByte.
$nl = [\n\r\f]
$whitechar = [$nl\v\ $unispace]
@@ -1802,6 +1804,10 @@ alexGetByte (AI loc s)
-- character is encountered we output these values
-- with the actual character value hidden in the state.
| otherwise =
+ -- NB: The logic behind these definitions is also reflected
+ -- in basicTypes/Lexeme.hs
+ -- Any changes here should likely be reflected there.
+
case generalCategory c of
UppercaseLetter -> upper
LowercaseLetter -> lower
diff --git a/compiler/typecheck/TcGenDeriv.lhs b/compiler/typecheck/TcGenDeriv.lhs
index df45001870..0779e67363 100644
--- a/compiler/typecheck/TcGenDeriv.lhs
+++ b/compiler/typecheck/TcGenDeriv.lhs
@@ -60,6 +60,7 @@ import Util
import Var
import MonadUtils
import Outputable
+import Lexeme
import FastString
import Pair
import Bag
diff --git a/compiler/typecheck/TcSplice.lhs b/compiler/typecheck/TcSplice.lhs
index 3302d028a5..7c8085ebe2 100644
--- a/compiler/typecheck/TcSplice.lhs
+++ b/compiler/typecheck/TcSplice.lhs
@@ -91,6 +91,7 @@ import BasicTypes hiding( SuccessFlag(..) )
import Maybes( MaybeErr(..) )
import DynFlags
import Panic
+import Lexeme
import FastString
import Outputable
import Control.Monad ( when )
diff --git a/testsuite/tests/th/all.T b/testsuite/tests/th/all.T
index 5109473d96..1144156607 100644
--- a/testsuite/tests/th/all.T
+++ b/testsuite/tests/th/all.T
@@ -343,4 +343,4 @@ test('T9066', normal, compile, ['-v0'])
test('T8100', normal, compile, ['-v0'])
test('T9064', normal, compile, ['-v0'])
test('T9209', normal, compile_fail, ['-v0'])
-test('T7484', expect_broken(7484), compile_fail, ['-v0'])
+test('T7484', normal, compile_fail, ['-v0'])
diff --git a/utils/haddock b/utils/haddock
-Subproject 19409126be62383bc64d79698b265ffaf96269a
+Subproject 2b3712d701c1df626abbc60525c35e735272e45