From 7f8f00988e6c58d9d49d6e4c973ee177345b46b0 Mon Sep 17 00:00:00 2001
From: "German M. Bravo"
Date: Thu, 15 Aug 2013 17:07:27 -0500
Subject: Scanner updated

---
 scss/src/_speedups.c      |   2 +-
 scss/src/scanner.c        | 101 ++++++++++++++++++------------
 scss/src/scanner.py       | 127 +---------------------------------------------
 scss/src/yapps/runtime.py |  84 +++++++++++++-----------------
 4 files changed, 87 insertions(+), 227 deletions(-)

diff --git a/scss/src/_speedups.c b/scss/src/_speedups.c
index b4b990c..59c56e5 100644
--- a/scss/src/_speedups.c
+++ b/scss/src/_speedups.c
@@ -6,7 +6,7 @@
 * https://github.com/Kronuz/pyScss
 *
 * MIT license (http://www.opensource.org/licenses/mit-license.php)
-* Copyright (c) 2011 German M. Bravo (Kronuz), All rights reserved.
+* Copyright (c) 2011, 2013 German M. Bravo (Kronuz), All rights reserved.
 */
 #include <Python.h>
 #include "block_locator.h"
diff --git a/scss/src/scanner.c b/scss/src/scanner.c
index ca74ba7..a196aa5 100644
--- a/scss/src/scanner.c
+++ b/scss/src/scanner.c
@@ -6,7 +6,7 @@
 * https://github.com/Kronuz/pyScss
 *
 * MIT license (http://www.opensource.org/licenses/mit-license.php)
-* Copyright (c) 2011 German M. Bravo (Kronuz), All rights reserved.
+* Copyright (c) 2011, 2013 German M. Bravo (Kronuz), All rights reserved.
 */
 #include <Python.h>
 
@@ -159,10 +159,10 @@ Pattern_finalize(void) {
 
 static long
 _Scanner_scan(Scanner *self, Pattern *restrictions, int restrictions_sz)
 {
-    Token best_token, *p_token;
+    Token best_pat, *last_read_token;
     Restriction *p_restriction;
     Pattern *regex;
-    int j, k, max, skip;
+    int j, k, max, ignore;
     size_t len;
     char *aux;
@@ -172,7 +172,7 @@ _Scanner_scan(Scanner *self, Pattern *restrictions, int restrictions_sz)
 
     while (1) {
         regex = NULL;
-        best_token.regex = NULL;
+        best_pat.regex = NULL;
 
         /* Search the patterns for a match, with earlier tokens in the list having preference */
         for (j = 0; j < Pattern_patterns_sz; j++) {
@@ -181,24 +181,24 @@ _Scanner_scan(Scanner *self, Pattern *restrictions, int restrictions_sz)
             fprintf(stderr, "\tTrying %s: %s at pos %d -> %s\n", repr(regex->tok), repr(regex->expr), self->pos, repr(self->input));
             #endif
             /* First check to see if we're restricting to this token */
-            skip = restrictions_sz;
-            if (skip) {
+            ignore = restrictions_sz;
+            if (ignore) {
                 max = (restrictions_sz > self->ignore_sz) ? restrictions_sz : self->ignore_sz;
                 for (k = 0; k < max; k++) {
                     if (k < restrictions_sz && strcmp(regex->tok, restrictions[k].tok) == 0) {
-                        skip = 0;
+                        ignore = 0;
                         break;
                     }
                     if (k < self->ignore_sz && regex == self->ignore[k]) {
-                        skip = 0;
+                        ignore = 0;
                         break;
                     }
                 }
-                if (skip) {
-                    continue;
+                if (ignore) {
                     #ifdef DEBUG
                     fprintf(stderr, "\tSkipping %s!\n", repr(regex->tok));
                     #endif
+                    continue;
                 }
             }
             if (Pattern_match(
@@ -206,7 +206,7 @@ _Scanner_scan(Scanner *self, Pattern *restrictions, int restrictions_sz)
                 regex,
                 self->input,
                 self->input_sz,
                 self->pos,
-                &best_token
+                &best_pat
             )) {
                 #ifdef DEBUG
                 fprintf(stderr, "Match OK! %s: %s at pos %d\n", repr(regex->tok), repr(regex->expr), self->pos);
                 #endif
                 break;
             }
         }
@@ -215,7 +215,7 @@ _Scanner_scan(Scanner *self, Pattern *restrictions, int restrictions_sz)
         /* If we didn't find anything, raise an error */
-        if (best_token.regex == NULL) {
+        if (best_pat.regex == NULL) {
             if (restrictions_sz) {
                sprintf(self->exc, "SyntaxError[@ char %d: %s found while trying to find one of the restricted tokens: ", self->pos, (regex == NULL) ? "???" : repr(regex->tok));
                aux = self->exc + strlen(self->exc);
@@ -234,55 +234,52 @@ _Scanner_scan(Scanner *self, Pattern *restrictions, int restrictions_sz)
             sprintf(self->exc, "SyntaxError[@ char %d: Bad token: %s]", self->pos, (regex == NULL) ? "???" : repr(regex->tok));
             return SCANNER_EXC_BAD_TOKEN;
         }
-        /* If we found something that isn't to be ignored, return it */
-        skip = 0;
+
+        ignore = 0;  /* Should this token be ignored? */
         for (k = 0; k < self->ignore_sz; k++) {
-            if (best_token.regex == self->ignore[k]) {
-                /* This token should be ignored... */
-                self->pos += best_token.string_sz;
-                skip = 1;
+            if (best_pat.regex == self->ignore[k]) {
+                ignore = 1;
                 break;
             }
         }
-        if (!skip) {
-            break;
-        }
-    }
-    if (best_token.regex) {
-        self->pos = (int)(best_token.string - self->input + best_token.string_sz);
-        /* Only add this token if it's not in the list (to prevent looping) */
-        p_token = &self->tokens[self->tokens_sz - 1];
-        if (self->tokens_sz == 0 ||
-            p_token->regex != best_token.regex ||
-            p_token->string != best_token.string ||
-            p_token->string_sz != best_token.string_sz
-        ) {
-            if (self->tokens_sz >= self->tokens_bsz) {
-                /* Needs to expand block */
-                self->tokens_bsz = self->tokens_bsz + BLOCK_SIZE_PATTERNS;
-                PyMem_Resize(self->tokens, Token, self->tokens_bsz);
-                PyMem_Resize(self->restrictions, Restriction, self->tokens_bsz);
-            }
-            memcpy(&self->tokens[self->tokens_sz], &best_token, sizeof(Token));
-            p_restriction = &self->restrictions[self->tokens_sz];
-            if (restrictions_sz) {
-                p_restriction->patterns = PyMem_New(Pattern *, restrictions_sz);
-                p_restriction->patterns_sz = 0;
-                for (j = 0; j < restrictions_sz; j++) {
-                    regex = Pattern_regex(restrictions[j].tok, restrictions[j].expr);
-                    if (regex) {
-                        p_restriction->patterns[p_restriction->patterns_sz++] = regex;
+        self->pos += best_pat.string_sz;
+
+        /* If we found something that isn't to be ignored, return it */
+        if (!ignore) {
+            /* Only add this token if it's not in the list (to prevent looping) */
+            last_read_token = &self->tokens[self->tokens_sz - 1];
+            if (self->tokens_sz == 0 ||
+                last_read_token->regex != best_pat.regex ||
+                last_read_token->string != best_pat.string ||
+                last_read_token->string_sz != best_pat.string_sz
+            ) {
+                if (self->tokens_sz >= self->tokens_bsz) {
+                    /* Needs to expand blocks */
+                    self->tokens_bsz = self->tokens_bsz + BLOCK_SIZE_PATTERNS;
+                    PyMem_Resize(self->tokens, Token, self->tokens_bsz);
+                    PyMem_Resize(self->restrictions, Restriction, self->tokens_bsz);
+                }
+                memcpy(&self->tokens[self->tokens_sz], &best_pat, sizeof(Token));
+                p_restriction = &self->restrictions[self->tokens_sz];
+                if (restrictions_sz) {
+                    p_restriction->patterns = PyMem_New(Pattern *, restrictions_sz);
+                    p_restriction->patterns_sz = 0;
+                    for (j = 0; j < restrictions_sz; j++) {
+                        regex = Pattern_regex(restrictions[j].tok, restrictions[j].expr);
+                        if (regex) {
+                            p_restriction->patterns[p_restriction->patterns_sz++] = regex;
+                        }
                     }
+                } else {
+                    p_restriction->patterns = NULL;
+                    p_restriction->patterns_sz = 0;
                 }
-            } else {
-                p_restriction->patterns = NULL;
-                p_restriction->patterns_sz = 0;
+                self->tokens_sz++;
+                return 1;
             }
-            self->tokens_sz++;
-            return 1;
+            return 0;
         }
     }
-    return 0;
 }
diff --git a/scss/src/scanner.py b/scss/src/scanner.py
index 97c0b34..d148b30 100755
--- a/scss/src/scanner.py
+++ b/scss/src/scanner.py
@@ -84,132 +84,7 @@ PATTERNS = [
 
 # Parser
 
 DEBUG = False
 
-
-class NoMoreTokens(Exception):
-    """
-    Another exception object, for when we run out of tokens
-    """
-    pass
-
-
-class Scanner(object):
-    def __init__(self, patterns, ignore, input=None):
-        """
-        Patterns is [(terminal,regex)...]
-        Ignore is [terminal,...];
-        Input is a string
-        """
-        self.reset(input)
-        self.ignore = ignore
-        # The stored patterns are a pair (compiled regex,source
-        # regex). If the patterns variable passed in to the
-        # constructor is None, we assume that the class already has a
-        # proper .patterns list constructed
-        if patterns is not None:
-            self.patterns = []
-            for k, r in patterns:
-                self.patterns.append((k, re.compile(r)))
-
-    def reset(self, input):
-        self.tokens = []
-        self.restrictions = []
-        self.input = input
-        self.pos = 0
-
-    def __repr__(self):
-        """
-        Print the last 10 tokens that have been scanned in
-        """
-        output = ''
-        for t in self.tokens[-10:]:
-            output = "%s\n (@%s) %s = %s" % (output, t[0], t[2], repr(t[3]))
-        return output
-
-    def _scan(self, restrict):
-        """
-        Should scan another token and add it to the list, self.tokens,
-        and add the restriction to self.restrictions
-        """
-        # Keep looking for a token, ignoring any in self.ignore
-        token = None
-        while True:
-            tok = None
-            best_pat = None
-            # Search the patterns for a match, with earlier
-            # tokens in the list having preference
-            best_pat_len = 0
-            for tok, regex in self.patterns:
-                if DEBUG:
-                    print("\tTrying %s: %s at pos %d -> %s" % (repr(tok), repr(regex.pattern), self.pos, repr(self.input)))
-                # First check to see if we're restricting to this token
-                if restrict and tok not in restrict and tok not in self.ignore:
-                    if DEBUG:
-                        print "\tSkipping %s!" % repr(tok)
-                    continue
-                m = regex.match(self.input, self.pos)
-                if m:
-                    # We got a match
-                    best_pat = tok
-                    best_pat_len = len(m.group(0))
-                    if DEBUG:
-                        print("Match OK! %s: %s at pos %d" % (repr(tok), repr(regex.pattern), self.pos))
-                    break
-
-            # If we didn't find anything, raise an error
-            if best_pat is None:
-                msg = "Bad token: %s" % ("???" if tok is None else repr(tok),)
-                if restrict:
-                    msg = "%s found while trying to find one of the restricted tokens: %s" % ("???" if tok is None else repr(tok), ", ".join(repr(r) for r in restrict))
-                raise SyntaxError("SyntaxError[@ char %s: %s]" % (repr(self.pos), msg))
-
-            # If we found something that isn't to be ignored, return it
-            if best_pat in self.ignore:
-                # This token should be ignored...
-                self.pos += best_pat_len
-            else:
-                end_pos = self.pos + best_pat_len
-                # Create a token with this data
-                token = (
-                    self.pos,
-                    end_pos,
-                    best_pat,
-                    self.input[self.pos:end_pos]
-                )
-                break
-        if token is not None:
-            self.pos = token[1]
-            # Only add this token if it's not in the list
-            # (to prevent looping)
-            if not self.tokens or token != self.tokens[-1]:
-                self.tokens.append(token)
-                self.restrictions.append(restrict)
-                return 1
-        return 0
-
-    def token(self, i, restrict=None):
-        """
-        Get the i'th token, and if i is one past the end, then scan
-        for another token; restrict is a list of tokens that
-        are allowed, or 0 for any token.
-        """
-        tokens_len = len(self.tokens)
-        if i == tokens_len:  # We are at the end, get the next...
-            tokens_len += self._scan(restrict)
-        elif i >= 0 and i < tokens_len:
-            if restrict and self.restrictions[i] and restrict > self.restrictions[i]:
-                raise NotImplementedError("Unimplemented: restriction set changed")
-        if i >= 0 and i < tokens_len:
-            return self.tokens[i]
-        raise NoMoreTokens()
-
-    def rewind(self, i):
-        tokens_len = len(self.tokens)
-        if i <= tokens_len:
-            token = self.tokens[i]
-            self.tokens = self.tokens[:i]
-            self.restrictions = self.restrictions[:i]
-            self.pos = token[0]
-
+from yapps.runtime import Scanner
 
 class _Scanner_a(Scanner):
     patterns = None
diff --git a/scss/src/yapps/runtime.py b/scss/src/yapps/runtime.py
index e23210a..6920fdb 100644
--- a/scss/src/yapps/runtime.py
+++ b/scss/src/yapps/runtime.py
@@ -21,7 +21,6 @@
 import re
 import sys
 
 DEBUG = False
-MIN_WINDOW = 4096 # File lookup window
 
 
@@ -71,9 +70,6 @@ class Token(object):
         return output
 
 
-in_name = 0
-
-
 class Scanner(object):
     """Yapps scanner.
 
@@ -85,9 +81,11 @@ class Scanner(object):
     restriction (the set is always the full set of tokens).
 
     """
+    MIN_WINDOW = 4096
+    in_name = 0
 
     def __init__(self, patterns, ignore, input="",
-                 file=None, filename=None, stacked=False):
+                 file=None, filename=None):
         """Initialize the scanner.
 
         Parameters:
@@ -106,13 +104,11 @@ class Scanner(object):
 
         """
         if not filename:
-            global in_name
-            filename = "<f.%d>" % in_name
-            in_name += 1
+            filename = "<f.%d>" % self.__class__.in_name
+            self.__class__.in_name += 1
 
         self.reset(input, file, filename)
         self.ignore = ignore
-        self.stacked = stacked
 
         if patterns is not None:
             # Compile the regex strings into regex objects
@@ -216,19 +212,19 @@ class Scanner(object):
         """Get more input if possible."""
         if not self.file:
             return
-        if len(self.input) - self.pos >= MIN_WINDOW:
+        if len(self.input) - self.pos >= self.MIN_WINDOW:
             return
 
-        data = self.file.read(MIN_WINDOW)
+        data = self.file.read(self.MIN_WINDOW)
         if data is None or data == "":
             self.file = None
 
         # Drop bytes from the start, if necessary.
-        if self.pos > 2 * MIN_WINDOW:
-            self.del_pos += MIN_WINDOW
-            self.del_line += self.input[:MIN_WINDOW].count("\n")
-            self.pos -= MIN_WINDOW
-            self.input = self.input[MIN_WINDOW:] + data
+        if self.pos > 2 * self.MIN_WINDOW:
+            self.del_pos += self.MIN_WINDOW
+            self.del_line += self.input[:self.MIN_WINDOW].count("\n")
+            self.pos -= self.MIN_WINDOW
+            self.input = self.input[self.MIN_WINDOW:] + data
         else:
             self.input = self.input + data
@@ -245,21 +241,17 @@ class Scanner(object):
         Should scan another token and add it to the list, self.tokens,
         and add the restriction to self.restrictions
         """
+        token = None
         # Keep looking for a token, ignoring any in self.ignore
         while True:
             tok = None
 
             self.grab_input()
 
-            # special handling for end-of-file
-            if self.stacked and self.pos == len(self.input):
-                raise StopIteration
-
             # Search the patterns for the longest match, with earlier
             # tokens in the list having preference
-            best_match = -1
+            best_pat_len = -1
             best_pat = None
-            best_m = None
 
             for tok, regex in self.patterns:
                 if DEBUG:
                     print("\tTrying %s: %s at pos %d -> %s" % (repr(tok), repr(regex.pattern), self.pos, repr(self.input)))
@@ -269,56 +261,44 @@ class Scanner(object):
                     print "\tSkipping %s!" % repr(tok)
                     continue
                 m = regex.match(self.input, self.pos)
-                if m and m.end() - m.start() > best_match:
+                if m and m.end() - m.start() > best_pat_len:
                     # We got a match that's better than the previous one
                     best_pat = tok
-                    best_match = m.end() - m.start()
-                    best_m = m
+                    best_pat_len = m.end() - m.start()
                     if DEBUG:
%s: %s at pos %d" % (repr(tok), repr(regex.pattern), self.pos)) + break # If we didn't find anything, raise an error - if best_pat is None or best_match < 0: + if best_pat is None or best_pat_len < 0: msg = "Bad token: %s" % ("???" if tok is None else repr(tok),) if restrict: msg = "%s found while trying to find one of the restricted tokens: %s" % ("???" if tok is None else repr(tok), ", ".join(repr(r) for r in restrict)) raise SyntaxError(self.get_pos(), msg, context=context) - ignore = best_pat in self.ignore - end_pos = self.pos + best_match - value = self.input[self.pos:end_pos] + ignore = best_pat in self.ignore # Should this token be ignored? + start_pos = self.pos + end_pos = start_pos + best_pat_len + self.pos = end_pos + + # If we found something that isn't to be ignored, return it if not ignore: + value = self.input[start_pos:end_pos] # token = Token(type=best_pat, value=value, pos=self.get_pos()) token = ( - self.pos, + start_pos, end_pos, best_pat, value, ) - self.pos = end_pos - - npos = value.rfind("\n") - if npos > -1: - self.col = best_match - npos - self.line += value.count('\n') - else: - self.col += best_match - - # If we found something that isn't to be ignored, return it - if not ignore: # print repr(token) if not self.tokens or token != self.last_read_token: - # Only add this token if it's not in the list - # (to prevent looping) + # Only add this token if it's not in the list (to prevent looping) self.last_read_token = token self.tokens.append(token) self.restrictions.append(restrict) return 1 return 0 - else: - ignore = self.ignore[best_pat] - if ignore: - ignore(self, best_m) def token(self, i, restrict=None, **kwargs): """ @@ -335,7 +315,15 @@ class Scanner(object): raise NotImplementedError("Unimplemented: restriction set changed") if i >= 0 and i < tokens_len: return self.tokens[i] - raise NoMoreTokens + raise NoMoreTokens() + + def rewind(self, i): + tokens_len = len(self.tokens) + if i <= tokens_len: + token = self.tokens[i] + self.tokens = self.tokens[:i] + self.restrictions = self.restrictions[:i] + self.pos = token[0] class Parser(object): -- cgit v1.2.1