{
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}
module GHC.Parser.HaddockLex (lexHsDoc, lexStringLiteral) where
import GHC.Prelude
import GHC.Data.FastString
import GHC.Hs.Doc
import GHC.Parser.Lexer
import GHC.Parser.Annotation
import GHC.Types.SrcLoc
import GHC.Types.SourceText
import GHC.Data.StringBuffer
import qualified GHC.Data.Strict as Strict
import GHC.Types.Name.Reader
import GHC.Utils.Outputable
import GHC.Utils.Error
import GHC.Utils.Encoding
import GHC.Hs.Extension
import qualified GHC.Data.EnumSet as EnumSet
import Data.Maybe
import Data.Word
import Data.ByteString ( ByteString )
import qualified Data.ByteString as BS
import qualified GHC.LanguageExtensions as LangExt
}
-- -----------------------------------------------------------------------------
-- Alex "Character set macros"
-- Copied from GHC/Parser/Lexer.x
-- NB: The logic behind these definitions is also reflected in "GHC.Utils.Lexeme"
-- Any changes here should likely be reflected there.
$unispace = \x05 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$nl = [\n\r\f]
$whitechar = [$nl\v\ $unispace]
$white_no_nl = $whitechar # \n -- TODO #8424
$tab = \t
$ascdigit = 0-9
$unidigit = \x03 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$decdigit = $ascdigit -- exactly $ascdigit, no more no less.
$digit = [$ascdigit $unidigit]
$special = [\(\)\,\;\[\]\`\{\}]
$ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
$unisymbol = \x04 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$symbol = [$ascsymbol $unisymbol] # [$special \_\"\']
$unilarge = \x01 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$asclarge = [A-Z]
$large = [$asclarge $unilarge]
$unismall = \x02 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$ascsmall = [a-z]
$small = [$ascsmall $unismall \_]
$uniidchar = \x07 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$idchar = [$small $large $digit $uniidchar \']
$unigraphic = \x06 -- Trick Alex into handling Unicode. See Note [Unicode in Alex].
$graphic = [$small $large $symbol $digit $idchar $special $unigraphic \"\']
$alpha = [$small $large]
-- The character sets marked "TODO" are mostly overly inclusive
-- and should be defined more precisely once alex has better
-- support for unicode character sets (see
-- https://github.com/simonmar/alex/issues/126).
@id = $alpha $idchar* \#* | $symbol+
@modname = $large $idchar*
@qualid = (@modname \.)* @id
:-
\' @qualid \' | \` @qualid \` { getIdentifier 1 }
\'\` @qualid \`\' | \'\( @qualid \)\' | \`\( @qualid \)\` { getIdentifier 2 }
[. \n] ;
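-- For orientation (examples only): the first rule above picks up singly
-- adorned identifiers such as 'map', `Data.List.foldr` or '++'; the second
-- picks up doubly adorned forms such as '`elem`' or '(++)'. The argument
-- passed to getIdentifier is the number of adornment characters to strip
-- from each side. Everything else, including newlines, is skipped by the
-- final rule.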
{
data AlexInput = AlexInput
  { alexInput_position :: !RealSrcLoc
  , alexInput_string :: !ByteString
  }

-- NB: As long as we don't use a left-context we don't need to track the
-- previous input character.
alexInputPrevChar :: AlexInput -> Word8
alexInputPrevChar = error "Left-context not supported"

alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
alexGetByte (AlexInput p s) = case utf8UnconsByteString s of
  Nothing -> Nothing
  Just (c, bs) -> Just (adjustChar c, AlexInput (advanceSrcLoc p c) bs)

alexScanTokens :: RealSrcLoc -> ByteString -> [(RealSrcSpan, ByteString)]
alexScanTokens start str0 = go (AlexInput start str0)
  where go inp@(AlexInput pos str) =
          case alexScan inp 0 of
            AlexSkip inp' _ln -> go inp'
            AlexToken inp'@(AlexInput _ str') _ act -> act pos (BS.length str - BS.length str') str : go inp'
            AlexEOF -> []
            AlexError (AlexInput p _) -> error $ "lexical error at " ++ show p
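
-- A rough usage sketch (illustrative only; the input and locations are made
-- up, and the docstring is assumed to be a UTF-8 encoded ByteString):
--
-- > alexScanTokens loc "See 'Data.List.map' and `elem`."
-- >   ==> [(<span of Data.List.map>, "Data.List.map"), (<span of elem>, "elem")]
--
-- Only the adorned identifiers survive; the surrounding prose and the
-- adornment characters themselves are dropped, since getIdentifier builds
-- the span from just past the opening quote to the end of the identifier.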

--------------------------------------------------------------------------------

-- | Extract identifier from Alex state.
getIdentifier :: Int -- ^ adornment length
              -> RealSrcLoc
              -> Int
                 -- ^ Token length
              -> ByteString
                 -- ^ The remaining input beginning with the found token
              -> (RealSrcSpan, ByteString)
getIdentifier !i !loc0 !len0 !s0 =
    (mkRealSrcSpan loc1 loc2, ident)
  where
    (adornment, s1) = BS.splitAt i s0
    ident = BS.take (len0 - 2*i) s1
    loc1 = advanceSrcLocBS loc0 adornment
    loc2 = advanceSrcLocBS loc1 ident
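
-- Worked example (illustrative): for the token 'Data.List.map' the action is
-- getIdentifier 1, so i = 1 and len0 = 15 (13 identifier bytes plus the two
-- quotes); ident = BS.take (15 - 2*1) s1 = "Data.List.map", loc1 sits just
-- past the opening quote and loc2 just past the final character, so the
-- resulting span excludes both adornments.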

advanceSrcLocBS :: RealSrcLoc -> ByteString -> RealSrcLoc
advanceSrcLocBS !loc bs = case utf8UnconsByteString bs of
  Nothing -> loc
  Just (c, bs') -> advanceSrcLocBS (advanceSrcLoc loc c) bs'

-- | Lex 'StringLiteral' for warning messages
lexStringLiteral :: P (LocatedN RdrName) -- ^ A precise identifier parser
                 -> Located StringLiteral
                 -> Located (WithHsDocIdentifiers StringLiteral GhcPs)
lexStringLiteral identParser (L l sl@(StringLiteral _ fs _))
  = L l (WithHsDocIdentifiers sl idents)
  where
    bs = bytesFS fs

    idents = mapMaybe (uncurry (validateIdentWith identParser)) plausibleIdents

    plausibleIdents :: [(SrcSpan,ByteString)]
    plausibleIdents = case l of
      RealSrcSpan span _ -> [(RealSrcSpan span' Strict.Nothing, tok) | (span', tok) <- alexScanTokens (realSrcSpanStart span) bs]
      UnhelpfulSpan reason -> [(UnhelpfulSpan reason, tok) | (_, tok) <- alexScanTokens fakeLoc bs]

    fakeLoc = mkRealSrcLoc (mkFastString "") 0 0
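
-- For instance (illustrative): the message of a pragma such as
-- {-# DEPRECATED foo "Use 'bar' instead" #-} arrives here as a
-- 'StringLiteral'; the scanner proposes "bar" as a candidate identifier and
-- validateIdentWith confirms it, so the mention of 'bar' in the warning text
-- can later be resolved like any other doc identifier.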

-- | Lex identifiers from a docstring.
lexHsDoc :: P (LocatedN RdrName) -- ^ A precise identifier parser
         -> HsDocString
         -> HsDoc GhcPs
lexHsDoc identParser doc =
    WithHsDocIdentifiers doc idents
  where
    docStrings = docStringChunks doc
    idents = concat [mapMaybe maybeDocIdentifier (plausibleIdents doc) | doc <- docStrings]

    maybeDocIdentifier :: (SrcSpan, ByteString) -> Maybe (Located RdrName)
    maybeDocIdentifier = uncurry (validateIdentWith identParser)

    plausibleIdents :: LHsDocStringChunk -> [(SrcSpan,ByteString)]
    plausibleIdents (L (RealSrcSpan span _) (HsDocStringChunk s))
      = [(RealSrcSpan span' Strict.Nothing, tok) | (span', tok) <- alexScanTokens (realSrcSpanStart span) s]
    plausibleIdents (L (UnhelpfulSpan reason) (HsDocStringChunk s))
      = [(UnhelpfulSpan reason, tok) | (_, tok) <- alexScanTokens fakeLoc s] -- preserve the original reason

    fakeLoc = mkRealSrcLoc (mkFastString "") 0 0
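
-- For instance (illustrative): a Haddock comment such as
--
-- > -- | Like 'Data.Map.insert', but ...
--
-- yields a 'WithHsDocIdentifiers' whose identifier list contains a single
-- 'RdrName' for Data.Map.insert, located at the quoted occurrence inside the
-- comment, while the docstring text itself is left untouched.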

validateIdentWith :: P (LocatedN RdrName) -> SrcSpan -> ByteString -> Maybe (Located RdrName)
validateIdentWith identParser mloc str0 =
  let -- These ParserFlags should be as "inclusive" as possible, allowing
      -- identifiers defined with any language extension.
      pflags = mkParserOpts
                 (EnumSet.fromList [LangExt.MagicHash])
                 dopts
                 []
                 False False False False
      dopts = DiagOpts
        { diag_warning_flags = EnumSet.empty
        , diag_fatal_warning_flags = EnumSet.empty
        , diag_warn_is_error = False
        , diag_reverse_errors = False
        , diag_max_errors = Nothing
        , diag_ppr_ctx = defaultSDocContext
        }
      buffer = stringBufferFromByteString str0
      realSrcLc = case mloc of
        RealSrcSpan loc _ -> realSrcSpanStart loc
        UnhelpfulSpan _ -> mkRealSrcLoc (mkFastString "") 0 0
      pstate = initParserState pflags buffer realSrcLc
  in case unP identParser pstate of
    POk _ name -> Just $ case mloc of
      RealSrcSpan _ _ -> reLoc name
      UnhelpfulSpan _ -> L mloc (unLoc name) -- Preserve the original reason
    _ -> Nothing
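
-- Note (illustrative): the Alex rules above are deliberately permissive, so a
-- quoted candidate that is not actually a single identifier (for example
-- reserved symbols like '..' or '::', which the symbol rule still matches)
-- ends up here; the identifier parser rejects it, validateIdentWith returns
-- Nothing, and the quoted text is simply left unlinked.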
}