{
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}
module GHC.Parser.HaddockLex (lexHsDoc, lexStringLiteral) where
import GHC.Prelude
import GHC.Data.FastString
import GHC.Hs.Doc
import GHC.Parser.Lexer
import GHC.Parser.Annotation
import GHC.Types.SrcLoc
import GHC.Types.SourceText
import GHC.Data.StringBuffer
import qualified GHC.Data.Strict as Strict
import GHC.Types.Name.Reader
import GHC.Utils.Outputable
import GHC.Utils.Error
import GHC.Utils.Encoding
import GHC.Hs.Extension
import qualified GHC.Data.EnumSet as EnumSet
import Data.Maybe
import Data.Word
import Data.ByteString ( ByteString )
import qualified Data.ByteString as BS
import qualified GHC.LanguageExtensions as LangExt
}
-- -----------------------------------------------------------------------------
-- Alex "Character set macros"
-- Copied from GHC/Parser/Lexer.x
-- NB: The logic behind these definitions is also reflected in "GHC.Utils.Lexeme"
-- Any changes here should likely be reflected there.
$unispace = \x05 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$nl = [\n\r\f]
$whitechar = [$nl\v\ $unispace]
$white_no_nl = $whitechar # \n -- TODO #8424
$tab = \t
$ascdigit = 0-9
$unidigit = \x03 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$decdigit = $ascdigit -- exactly $ascdigit, no more no less.
$digit = [$ascdigit $unidigit]
$special = [\(\)\,\;\[\]\`\{\}]
$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
$unisymbol = \x04 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$symbol = [$ascsymbol $unisymbol] # [$special \_\"\']
$unilarge = \x01 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$asclarge = [A-Z]
$large = [$asclarge $unilarge]
$unismall = \x02 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$ascsmall = [a-z]
$small = [$ascsmall $unismall \_]
$uniidchar = \x07 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$idchar = [$small $large $digit $uniidchar \']
$unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
$alpha = [$small $large]
-- The character sets marked "TODO" are mostly overly inclusive
-- and should be defined more precisely once alex has better
-- support for unicode character sets (see
-- https://github.com/simonmar/alex/issues/126).
@id = $alpha $idchar* \#* | $symbol+
@modname = $large $idchar*
@qualid = (@modname \.)* @id
:-
\' @qualid \' | \` @qualid \` { getIdentifier 1 }
\'\` @qualid \`\' | \'\( @qualid \)\' | \`\( @qualid \)\` { getIdentifier 2 }
[. \n] ;
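-- For orientation (examples only): the first rule above picks up singly
-- adorned identifiers such as 'map', `Data.List.foldr` or '++'; the second
-- picks up doubly adorned forms such as '`elem`' or '(++)'. The argument
-- passed to getIdentifier is the number of adornment characters to strip
-- from each side. Everything else, including newlines, is skipped by the
-- final rule.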
{
data AlexInput = AlexInput
  { alexInput_position :: !RealSrcLoc
  , alexInput_string :: !ByteString
  }

-- NB: As long as we don't use a left-context we don't need to track the
-- previous input character.
alexInputPrevChar :: AlexInput -> Word8
alexInputPrevChar = error "Left-context not supported"

alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
alexGetByte (AlexInput p s) = case utf8UnconsByteString s of
  Nothing -> Nothing
  Just (c, bs) -> Just (adjustChar c, AlexInput (advanceSrcLoc p c) bs)

alexScanTokens :: RealSrcLoc -> ByteString -> [(RealSrcSpan, ByteString)]
alexScanTokens start str0 = go (AlexInput start str0)
  where go inp@(AlexInput pos str) =
          case alexScan inp 0 of
            AlexSkip inp' _ln -> go inp'
            AlexToken inp'@(AlexInput _ str') _ act -> act pos (BS.length str - BS.length str') str : go inp'
            AlexEOF -> []
            AlexError (AlexInput p _) -> error $ "lexical error at " ++ show p
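
-- A rough usage sketch (illustrative only; the input and locations are made
-- up, and the docstring is assumed to be a UTF-8 encoded ByteString):
--
-- > alexScanTokens loc "See 'Data.List.map' and `elem`."
-- >   ==> [(<span of Data.List.map>, "Data.List.map"), (<span of elem>, "elem")]
--
-- Only the adorned identifiers survive; the surrounding prose and the
-- adornment characters themselves are dropped, since getIdentifier builds
-- the span from just past the opening quote to the end of the identifier.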

--------------------------------------------------------------------------------

-- | Extract identifier from Alex state.
getIdentifier :: Int -- ^ adornment length
              -> RealSrcLoc
              -> Int
                 -- ^ Token length
              -> ByteString
                 -- ^ The remaining input beginning with the found token
              -> (RealSrcSpan, ByteString)
getIdentifier !i !loc0 !len0 !s0 =
    (mkRealSrcSpan loc1 loc2, ident)
  where
    (adornment, s1) = BS.splitAt i s0
    ident = BS.take (len0 - 2*i) s1
    loc1 = advanceSrcLocBS loc0 adornment
    loc2 = advanceSrcLocBS loc1 ident
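
-- Worked example (illustrative): for the token 'Data.List.map' the action is
-- getIdentifier 1, so i = 1 and len0 = 15 (13 identifier bytes plus the two
-- quotes); ident = BS.take (15 - 2*1) s1 = "Data.List.map", loc1 sits just
-- past the opening quote and loc2 just past the final character, so the
-- resulting span excludes both adornments.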

advanceSrcLocBS :: RealSrcLoc -> ByteString -> RealSrcLoc
advanceSrcLocBS !loc bs = case utf8UnconsByteString bs of
  Nothing -> loc
  Just (c, bs') -> advanceSrcLocBS (advanceSrcLoc loc c) bs'

-- | Lex 'StringLiteral' for warning messages
lexStringLiteral :: P (LocatedN RdrName) -- ^ A precise identifier parser
                 -> Located StringLiteral
                 -> Located (WithHsDocIdentifiers StringLiteral GhcPs)
lexStringLiteral identParser (L l sl@(StringLiteral _ fs _))
  = L l (WithHsDocIdentifiers sl idents)
  where
    bs = bytesFS fs

    idents = mapMaybe (uncurry (validateIdentWith identParser)) plausibleIdents

    plausibleIdents :: [(SrcSpan,ByteString)]
    plausibleIdents = case l of
      RealSrcSpan span _ -> [(RealSrcSpan span' Strict.Nothing, tok) | (span', tok) <- alexScanTokens (realSrcSpanStart span) bs]
      UnhelpfulSpan reason -> [(UnhelpfulSpan reason, tok) | (_, tok) <- alexScanTokens fakeLoc bs]

    fakeLoc = mkRealSrcLoc (mkFastString "") 0 0
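
-- For instance (illustrative): the message of a pragma such as
-- {-# DEPRECATED foo "Use 'bar' instead" #-} arrives here as a
-- 'StringLiteral'; the scanner proposes "bar" as a candidate identifier and
-- validateIdentWith confirms it, so the mention of 'bar' in the warning text
-- can later be resolved like any other doc identifier.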

-- | Lex identifiers from a docstring.
lexHsDoc :: P (LocatedN RdrName) -- ^ A precise identifier parser
         -> HsDocString
         -> HsDoc GhcPs
lexHsDoc identParser doc =
    WithHsDocIdentifiers doc idents
  where
    docStrings = docStringChunks doc
    idents = concat [mapMaybe maybeDocIdentifier (plausibleIdents doc) | doc <- docStrings]

    maybeDocIdentifier :: (SrcSpan, ByteString) -> Maybe (Located RdrName)
    maybeDocIdentifier = uncurry (validateIdentWith identParser)

    plausibleIdents :: LHsDocStringChunk -> [(SrcSpan,ByteString)]
    plausibleIdents (L (RealSrcSpan span _) (HsDocStringChunk s))
      = [(RealSrcSpan span' Strict.Nothing, tok) | (span', tok) <- alexScanTokens (realSrcSpanStart span) s]
    plausibleIdents (L (UnhelpfulSpan reason) (HsDocStringChunk s))
      = [(UnhelpfulSpan reason, tok) | (_, tok) <- alexScanTokens fakeLoc s] -- preserve the original reason

    fakeLoc = mkRealSrcLoc (mkFastString "") 0 0
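
-- For instance (illustrative): a Haddock comment such as
--
-- > -- | Like 'Data.Map.insert', but ...
--
-- yields a 'WithHsDocIdentifiers' whose identifier list contains a single
-- 'RdrName' for Data.Map.insert, located at the quoted occurrence inside the
-- comment, while the docstring text itself is left untouched.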

validateIdentWith :: P (LocatedN RdrName) -> SrcSpan -> ByteString -> Maybe (Located RdrName)
validateIdentWith identParser mloc str0 =
  let -- These ParserFlags should be as "inclusive" as possible, allowing
      -- identifiers defined with any language extension.
      pflags = mkParserOpts
                 (EnumSet.fromList [LangExt.MagicHash])
                 dopts
                 []
                 False False False False
      dopts = DiagOpts
        { diag_warning_flags = EnumSet.empty
        , diag_fatal_warning_flags = EnumSet.empty
        , diag_warn_is_error = False
        , diag_reverse_errors = False
        , diag_max_errors = Nothing
        , diag_ppr_ctx = defaultSDocContext
        }
      buffer = stringBufferFromByteString str0
      realSrcLc = case mloc of
        RealSrcSpan loc _ -> realSrcSpanStart loc
        UnhelpfulSpan _ -> mkRealSrcLoc (mkFastString "") 0 0
      pstate = initParserState pflags buffer realSrcLc
  in case unP identParser pstate of
    POk _ name -> Just $ case mloc of
      RealSrcSpan _ _ -> reLoc name
      UnhelpfulSpan _ -> L mloc (unLoc name) -- Preserve the original reason
    _ -> Nothing
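
-- Note (illustrative): the Alex rules above are deliberately permissive, so a
-- quoted candidate that is not actually a single identifier (for example
-- reserved symbols like '..' or '::', which the symbol rule still matches)
-- ends up here; the identifier parser rejects it, validateIdentWith returns
-- Nothing, and the quoted text is simply left unlinked.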
}