# mako/lexer.py # Copyright 2006-2020 the Mako authors and contributors # # This module is part of Mako and is released under # the MIT License: http://www.opensource.org/licenses/mit-license.php """provides the Lexer class for parsing template strings into parse trees.""" import codecs import re from mako import exceptions from mako import parsetree from mako.pygen import adjust_whitespace _regexp_cache = {} class Lexer: def __init__( self, text, filename=None, input_encoding=None, preprocessor=None, ): self.text = text self.filename = filename self.template = parsetree.TemplateNode(self.filename) self.matched_lineno = 1 self.matched_charpos = 0 self.lineno = 1 self.match_position = 0 self.tag = [] self.control_line = [] self.ternary_stack = [] self.encoding = input_encoding if preprocessor is None: self.preprocessor = [] elif not hasattr(preprocessor, "__iter__"): self.preprocessor = [preprocessor] else: self.preprocessor = preprocessor @property def exception_kwargs(self): return { "source": self.text, "lineno": self.matched_lineno, "pos": self.matched_charpos, "filename": self.filename, } def match(self, regexp, flags=None): """compile the given regexp, cache the reg, and call match_reg().""" try: reg = _regexp_cache[(regexp, flags)] except KeyError: if flags: reg = re.compile(regexp, flags) else: reg = re.compile(regexp) _regexp_cache[(regexp, flags)] = reg return self.match_reg(reg) def match_reg(self, reg): """match the given regular expression object to the current text position. if a match occurs, update the current text and line position. """ mp = self.match_position match = reg.match(self.text, self.match_position) if match: (start, end) = match.span() if end == start: self.match_position = end + 1 else: self.match_position = end self.matched_lineno = self.lineno lines = re.findall(r"\n", self.text[mp : self.match_position]) cp = mp - 1 while cp >= 0 and cp < self.textlength and self.text[cp] != "\n": cp -= 1 self.matched_charpos = mp - cp self.lineno += len(lines) # print "MATCHED:", match.group(0), "LINE START:", # self.matched_lineno, "LINE END:", self.lineno # print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \ # (match and "TRUE" or "FALSE") return match def parse_until_text(self, watch_nesting, *text): startpos = self.match_position text_re = r"|".join(text) brace_level = 0 paren_level = 0 bracket_level = 0 while True: match = self.match(r"#.*\n") if match: continue match = self.match( r"(\"\"\"|\'\'\'|\"|\')[^\\]*?(\\.[^\\]*?)*\1", re.S ) if match: continue match = self.match(r"(%s)" % text_re) if match and not ( watch_nesting and (brace_level > 0 or paren_level > 0 or bracket_level > 0) ): return ( self.text[ startpos : self.match_position - len(match.group(1)) ], match.group(1), ) elif not match: match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S) if match: brace_level += match.group(1).count("{") brace_level -= match.group(1).count("}") paren_level += match.group(1).count("(") paren_level -= match.group(1).count(")") bracket_level += match.group(1).count("[") bracket_level -= match.group(1).count("]") continue raise exceptions.SyntaxException( "Expected: %s" % ",".join(text), **self.exception_kwargs ) def append_node(self, nodecls, *args, **kwargs): kwargs.setdefault("source", self.text) kwargs.setdefault("lineno", self.matched_lineno) kwargs.setdefault("pos", self.matched_charpos) kwargs["filename"] = self.filename node = nodecls(*args, **kwargs) if len(self.tag): self.tag[-1].nodes.append(node) else: self.template.nodes.append(node) # build a set of child nodes for the control line # (used for loop variable detection) # also build a set of child nodes on ternary control lines # (used for determining if a pass needs to be auto-inserted if self.control_line: control_frame = self.control_line[-1] control_frame.nodes.append(node) if not ( isinstance(node, parsetree.ControlLine) and control_frame.is_ternary(node.keyword) ): if self.ternary_stack and self.ternary_stack[-1]: self.ternary_stack[-1][-1].nodes.append(node) if isinstance(node, parsetree.Tag): if len(self.tag): node.parent = self.tag[-1] self.tag.append(node) elif isinstance(node, parsetree.ControlLine): if node.isend: self.control_line.pop() self.ternary_stack.pop() elif node.is_primary: self.control_line.append(node) self.ternary_stack.append([]) elif self.control_line and self.control_line[-1].is_ternary( node.keyword ): self.ternary_stack[-1].append(node) elif self.control_line and not self.control_line[-1].is_ternary( node.keyword ): raise exceptions.SyntaxException( "Keyword '%s' not a legal ternary for keyword '%s'" % (node.keyword, self.control_line[-1].keyword), **self.exception_kwargs ) _coding_re = re.compile(r"#.*coding[:=]\s*([-\w.]+).*\r?\n") def decode_raw_stream(self, text, decode_raw, known_encoding, filename): """given string/unicode or bytes/string, determine encoding from magic encoding comment, return body as unicode or raw if decode_raw=False """ if isinstance(text, str): m = self._coding_re.match(text) encoding = m and m.group(1) or known_encoding or "utf-8" return encoding, text if text.startswith(codecs.BOM_UTF8): text = text[len(codecs.BOM_UTF8) :] parsed_encoding = "utf-8" m = self._coding_re.match(text.decode("utf-8", "ignore")) if m is not None and m.group(1) != "utf-8": raise exceptions.CompileException( "Found utf-8 BOM in file, with conflicting " "magic encoding comment of '%s'" % m.group(1), text.decode("utf-8", "ignore"), 0, 0, filename, ) else: m = self._coding_re.match(text.decode("utf-8", "ignore")) if m: parsed_encoding = m.group(1) else: parsed_encoding = known_encoding or "utf-8" if decode_raw: try: text = text.decode(parsed_encoding) except UnicodeDecodeError: raise exceptions.CompileException( "Unicode decode operation of encoding '%s' failed" % parsed_encoding, text.decode("utf-8", "ignore"), 0, 0, filename, ) return parsed_encoding, text def parse(self): self.encoding, self.text = self.decode_raw_stream( self.text, True, self.encoding, self.filename ) for preproc in self.preprocessor: self.text = preproc(self.text) # push the match marker past the # encoding comment. self.match_reg(self._coding_re) self.textlength = len(self.text) while True: if self.match_position > self.textlength: break if self.match_end(): break if self.match_expression(): continue if self.match_control_line(): continue if self.match_comment(): continue if self.match_tag_start(): continue if self.match_tag_end(): continue if self.match_python_block(): continue if self.match_text(): continue if self.match_position > self.textlength: break raise exceptions.CompileException("assertion failed") if len(self.tag): raise exceptions.SyntaxException( "Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs ) if len(self.control_line): raise exceptions.SyntaxException( "Unterminated control keyword: '%s'" % self.control_line[-1].keyword, self.text, self.control_line[-1].lineno, self.control_line[-1].pos, self.filename, ) return self.template def match_tag_start(self): match = self.match( r""" \<% # opening tag ([\w\.\:]+) # keyword ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*) # attrname, = \ # sign, string expression \s* # more whitespace (/)?> # closing """, re.I | re.S | re.X, ) if match: keyword, attr, isend = match.groups() self.keyword = keyword attributes = {} if attr: for att in re.findall( r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr ): key, val1, val2 = att text = val1 or val2 text = text.replace("\r\n", "\n") attributes[key] = text self.append_node(parsetree.Tag, keyword, attributes) if isend: self.tag.pop() else: if keyword == "text": match = self.match(r"(.*?)(?=\)", re.S) if not match: raise exceptions.SyntaxException( "Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs ) self.append_node(parsetree.Text, match.group(1)) return self.match_tag_end() return True else: return False def match_tag_end(self): match = self.match(r"\") if match: if not len(self.tag): raise exceptions.SyntaxException( "Closing tag without opening tag: " % match.group(1), **self.exception_kwargs ) elif self.tag[-1].keyword != match.group(1): raise exceptions.SyntaxException( "Closing tag does not match tag: <%%%s>" % (match.group(1), self.tag[-1].keyword), **self.exception_kwargs ) self.tag.pop() return True else: return False def match_end(self): match = self.match(r"\Z", re.S) if match: string = match.group() if string: return string else: return True else: return False def match_text(self): match = self.match( r""" (.*?) # anything, followed by: ( (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based # comment preceded by a # consumed newline and whitespace | (?=\${) # an expression | (?=") # the trailing newline helps # compiler.parse() not complain about indentation text = adjust_whitespace(text) + "\n" self.append_node( parsetree.Code, text, match.group(1) == "!", lineno=line, pos=pos, ) return True else: return False def match_expression(self): match = self.match(r"\${") if match: line, pos = self.matched_lineno, self.matched_charpos text, end = self.parse_until_text(True, r"\|", r"}") if end == "|": escapes, end = self.parse_until_text(True, r"}") else: escapes = "" text = text.replace("\r\n", "\n") self.append_node( parsetree.Expression, text, escapes.strip(), lineno=line, pos=pos, ) return True else: return False def match_control_line(self): match = self.match( r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)" r"(?:\r?\n|\Z)", re.M, ) if match: operator = match.group(1) text = match.group(2) if operator == "%": m2 = re.match(r"(end)?(\w+)\s*(.*)", text) if not m2: raise exceptions.SyntaxException( "Invalid control line: '%s'" % text, **self.exception_kwargs ) isend, keyword = m2.group(1, 2) isend = isend is not None if isend: if not len(self.control_line): raise exceptions.SyntaxException( "No starting keyword '%s' for '%s'" % (keyword, text), **self.exception_kwargs ) elif self.control_line[-1].keyword != keyword: raise exceptions.SyntaxException( "Keyword '%s' doesn't match keyword '%s'" % (text, self.control_line[-1].keyword), **self.exception_kwargs ) self.append_node(parsetree.ControlLine, keyword, isend, text) else: self.append_node(parsetree.Comment, text) return True else: return False def match_comment(self): """matches the multiline version of a comment""" match = self.match(r"<%doc>(.*?)", re.S) if match: self.append_node(parsetree.Comment, match.group(1)) return True else: return False