Centralize regex metachar escaping, since the surrogate support breaks
one-parsed-char per unicode codepoint already.
author: Tim Hatch <tim@timhatch.com> 2012-08-27 00:30:35 -0700
committer: Tim Hatch <tim@timhatch.com> 2012-08-27 00:30:35 -0700
commit: 08ad16c5ad5e80dfddd7ae81e368227e6bb6b989 (patch)
tree: 241ddb7a63077090db41462691995b459d3bc807 /pygments
parent: 33e67db7ac05825a5b873104d900958b8c55de06 (diff)
download: pygments-08ad16c5ad5e80dfddd7ae81e368227e6bb6b989.tar.gz
3 files changed, 12 insertions, 23 deletions
diff --git a/pygments/lexers/dotnet.py b/pygments/lexers/dotnet.py
index 0a2770ca..17edddc0 100644
--- a/pygments/lexers/dotnet.py
+++ b/pygments/lexers/dotnet.py
@@ -23,10 +23,6 @@ __all__ = ['CSharpLexer', 'NemerleLexer', 'BooLexer', 'VbNetLexer',
            'CSharpAspxLexer', 'VbNetAspxLexer', 'FSharpLexer']
 
 
-def _escape(st):
-    return st.replace(u'\\', ur'\\').replace(u'-', ur'\-').\
-           replace(u'[', ur'\[').replace(u']', ur'\]')
-
 class CSharpLexer(RegexLexer):
     """
     For `C# <http://msdn2.microsoft.com/en-us/vcsharp/default.aspx>`_
@@ -67,10 +63,9 @@ class CSharpLexer(RegexLexer):
                   '[' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl +
                   uni.Nd + uni.Pc + uni.Cf + uni.Mn + uni.Mc + ']*'),
         'full': ('@?(?:_|[^' +
-                 _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl')) + '])'
-                 + '[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo',
-                                                'Nl', 'Nd', 'Pc', 'Cf', 'Mn',
-                                                'Mc')) + ']*'),
+                 uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl') + '])'
+                 + '[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl',
+                                        'Nd', 'Pc', 'Cf', 'Mn', 'Mc') + ']*'),
     }
 
     tokens = {}
@@ -179,11 +174,10 @@ class NemerleLexer(RegexLexer):
         basic = ('@?[_' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl + ']' +
                  '[' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl +
                  uni.Nd + uni.Pc + uni.Cf + uni.Mn + uni.Mc + ']*'),
-        full = ('@?(?:_|[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm',
-                                                    'Lo', 'Nl')) + '])'
-                + '[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo',
-                                               'Nl', 'Nd', 'Pc', 'Cf', 'Mn',
-                                               'Mc')) + ']*'),
+        full = ('@?(?:_|[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo',
+                                            'Nl') + '])'
+                + '[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl',
+                                       'Nd', 'Pc', 'Cf', 'Mn', 'Mc') + ']*'),
     )
 
     tokens = {}
diff --git a/pygments/lexers/jvm.py b/pygments/lexers/jvm.py
index 696bb1a1..c8caa2c0 100644
--- a/pygments/lexers/jvm.py
+++ b/pygments/lexers/jvm.py
@@ -789,20 +789,15 @@ class KotlinLexer(RegexLexer):
     # for the range of allowed unicode characters in identifiers,
     # see http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-334.pdf
 
-    def _escape(st):
-        return st.replace(u'\\', ur'\\').replace(u'-', ur'\-').\
-               replace(u'[', ur'\[').replace(u']', ur'\]')
-
     levels = {
         'none': '@?[_a-zA-Z][a-zA-Z0-9_]*',
         'basic': ('@?[_' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl + ']' +
                   '[' + uni.Lu + uni.Ll + uni.Lt + uni.Lm + uni.Nl +
                   uni.Nd + uni.Pc + uni.Cf + uni.Mn + uni.Mc + ']*'),
         'full': ('@?(?:_|[^' +
-                 _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl')) + '])'
-                 + '[^' + _escape(uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo',
-                                                'Nl', 'Nd', 'Pc', 'Cf', 'Mn',
-                                                'Mc')) + ']*'),
+                 uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl') + '])'
+                 + '[^' + uni.allexcept('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl',
+                                        'Nd', 'Pc', 'Cf', 'Mn', 'Mc') + ']*'),
     }
 
     tokens = {}
diff --git a/pygments/unistring.py b/pygments/unistring.py
index 2b99f1f0..b6f53e89 100644
--- a/pygments/unistring.py
+++ b/pygments/unistring.py
@@ -114,8 +114,8 @@ if __name__ == '__main__':
             # Hack to avoid combining this combining with the preceeding high
             # surrogate, 0xdbff, when doing a repr.
             c = u'\\' + c
-        elif ord(c) in (0x2d, 0x5c):
-            # Escape backslash itself and dash.
+        elif ord(c) in (0x2d, 0x5b, 0x5c, 0x5d):
+            # Escape regex metachars.
             c = u'\\' + c
         categories.setdefault(cat, []).append(c)
author	Tim Hatch <tim@timhatch.com>	2012-08-27 00:30:35 -0700
committer	Tim Hatch <tim@timhatch.com>	2012-08-27 00:30:35 -0700
commit	08ad16c5ad5e80dfddd7ae81e368227e6bb6b989 (patch)
tree	241ddb7a63077090db41462691995b459d3bc807 /pygments
parent	33e67db7ac05825a5b873104d900958b8c55de06 (diff)
download	pygments-08ad16c5ad5e80dfddd7ae81e368227e6bb6b989.tar.gz