diff options
author | Tim Hatch <tim@timhatch.com> | 2012-08-27 00:30:35 -0700 |
---|---|---|
committer | Tim Hatch <tim@timhatch.com> | 2012-08-27 00:30:35 -0700 |
commit | 08ad16c5ad5e80dfddd7ae81e368227e6bb6b989 (patch) | |
tree | 241ddb7a63077090db41462691995b459d3bc807 /pygments | |
parent | 33e67db7ac05825a5b873104d900958b8c55de06 (diff) | |
download | pygments-08ad16c5ad5e80dfddd7ae81e368227e6bb6b989.tar.gz |
Centralize regex metachar escaping, since the surrogate support already
breaks the one-parsed-char-per-Unicode-codepoint assumption.
Diffstat (limited to 'pygments')
-rw-r--r-- | pygments/lexers/dotnet.py | 20 | ||||
-rw-r--r-- | pygments/lexers/jvm.py | 11 | ||||
-rw-r--r-- | pygments/unistring.py | 4 |
3 files changed, 12 insertions, 23 deletions
diff --git a/pygments/lexers/dotnet.py b/pygments/lexers/dotnet.py index 0a2770ca..17edddc0 100644 --- a/pygments/lexers/dotnet.py +++ b/pygments/lexers/dotnet.py @@ -23,10 +23,6 @@ __all__ = ['CSharpLexer', 'NemerleLexer', 'BooLexer', 'VbNetLexer', 'CSharpAspxLexer', 'VbNetAspxLexer', 'FSharpLexer'] -def _escape(st): - return st.replace(u'\\', ur'\\').replace(u'-', ur'\-').\ - replace(u'[', ur'\[').replace(u']', ur'\]') - class CSharpLexer(RegexLexer): """ For `C# <http://msdn2.microsoft.com/en-us/vcsharp/default.aspx>`_ @@ -67,10 +63,9 @@ class CSharpLexer(RegexLexer): '[' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl + uni.Nd + uni.Pc + uni.Cf + uni.Mn + uni.Mc + ']*'), 'full': ('@?(?:_|[^' + - _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl')) + '])' - + '[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', - 'Nl', 'Nd', 'Pc', 'Cf', 'Mn', - 'Mc')) + ']*'), + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl') + '])' + + '[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', + 'Nd', 'Pc', 'Cf', 'Mn', 'Mc') + ']*'), } tokens = {} @@ -179,11 +174,10 @@ class NemerleLexer(RegexLexer): basic = ('@?[_' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl + ']' + '[' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl + uni.Nd + uni.Pc + uni.Cf + uni.Mn + uni.Mc + ']*'), - full = ('@?(?:_|[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', - 'Lo', 'Nl')) + '])' - + '[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', - 'Nl', 'Nd', 'Pc', 'Cf', 'Mn', - 'Mc')) + ']*'), + full = ('@?(?:_|[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', + 'Nl') + '])' + + '[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', + 'Nd', 'Pc', 'Cf', 'Mn', 'Mc') + ']*'), ) tokens = {} diff --git a/pygments/lexers/jvm.py b/pygments/lexers/jvm.py index 696bb1a1..c8caa2c0 100644 --- a/pygments/lexers/jvm.py +++ b/pygments/lexers/jvm.py @@ -789,20 +789,15 @@ class KotlinLexer(RegexLexer): # for the range of allowed unicode characters in identifiers, # see 
http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-334.pdf - def _escape(st): - return st.replace(u'\\', ur'\\').replace(u'-', ur'\-').\ - replace(u'[', ur'\[').replace(u']', ur'\]') - levels = { 'none': '@?[_a-zA-Z][a-zA-Z0-9_]*', 'basic': ('@?[_' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl + ']' + '[' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl + uni.Nd + uni.Pc + uni.Cf + uni.Mn + uni.Mc + ']*'), 'full': ('@?(?:_|[^' + - _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl')) + '])' - + '[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', - 'Nl', 'Nd', 'Pc', 'Cf', 'Mn', - 'Mc')) + ']*'), + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl') + '])' + + '[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', + 'Nd', 'Pc', 'Cf', 'Mn', 'Mc') + ']*'), } tokens = {} diff --git a/pygments/unistring.py b/pygments/unistring.py index 2b99f1f0..b6f53e89 100644 --- a/pygments/unistring.py +++ b/pygments/unistring.py @@ -114,8 +114,8 @@ if __name__ == '__main__': # Hack to avoid combining this combining with the preceeding high # surrogate, 0xdbff, when doing a repr. c = u'\\' + c - elif ord(c) in (0x2d, 0x5c): - # Escape backslash itself and dash. + elif ord(c) in (0x2d, 0x5b, 0x5c, 0x5d): + # Escape regex metachars. c = u'\\' + c categories.setdefault(cat, []).append(c) |