From 479376c313208ee764d212846892cb904b8e5b89 Mon Sep 17 00:00:00 2001
From: Adrian Thurston
Date: Sat, 30 Nov 2019 10:52:10 -0800
Subject: rust grammar: a proper implementation of raw strings

---
 grammar/rust.lm | 49 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 8 deletions(-)

(limited to 'grammar')

diff --git a/grammar/rust.lm b/grammar/rust.lm
index b5cc7bb6..f6dd8c15 100644
--- a/grammar/rust.lm
+++ b/grammar/rust.lm
@@ -50,20 +50,53 @@ lex
 		[0-9]+ '.' [0-9]+ float_exponent? |
 		[0-9]+ ( '.' [0-9]+ )? float_exponent? float_suffix
 	/
-
-	token raw_string /
-		'r"' ( any* :>> '"' ) |
-		'r#"' ( any* :>> '"#' ) |
-		'r##"' ( any* :>> '"##' ) |
-		'r###"' ( any* :>> '"###' ) |
-		'r####"' ( any* :>> '"####' ) |
-		'r#####"' ( any* :>> '"#####' ) /
+
+	# Raw open. Rest handled in its own lexical region.
+	token raw_open / 'r' '#' * '"' /
+	{
+		# Stash the length (not including r) for comparison against potential
+		# close strings.
+		RawOpenLength = match_length - 1
+		RawOpen: str = input->pull( match_length )
+		input->push( make_token( typeid<raw_open>, RawOpen ) )
+	}
 
 	ignore / "//" [^\n]* '\n' /
 	ignore / "/*" any* :>> "*/" /
 	ignore / [ \t\n]+ /
 end
 
+# Raw strings.
+def raw_string
+	[raw_open raw_content* raw_close]
+
+global RawOpenLength: int = 0
+
+# Lexical region dedicated to raw strings. Attempts to close by matching
+# candidates and then testing the length.
+lex
+	token raw_close / '"' '#'* /
+	{
+		# Check the length. We use >= to match the close because rust is lazy
+		# in matching it. If it is longer we just chop it. Probably will result
+		# in a parse error.
+		if match_length >= RawOpenLength {
+			# Chop it by using RawOpenLength in the pull from input.
+			Candidate: str = input->pull( RawOpenLength )
+			input->push( make_token( typeid<raw_close>, Candidate ) )
+		}
+		else {
+			# Otherwise just send it as raw content.
+			Candidate: str = input->pull( match_length )
+			input->push( make_token( typeid<raw_content>, Candidate ) )
+		}
+	}
+
+	# Content, send out strings not containing # or ". Or single such chars
+	# that are not part of a sequence that is first matched by close candidate.
+	token raw_content / [^"#]+ | any /
+end
+
 namespace attr
 	lex
 		token id / [A-Za-z_] [A-Za-z_0-9]* /
--
cgit v1.2.1