utils.py: Improved glob() function

Instead of using PurePath from pathlib, we use a modified version of fnmatch.translate() to generate a regular expression. This has multiple advantages:
* Using regular expressions privately allows the Element's internal splitting rules to adapt more easily to the new globbing syntax for split rules
* With the glob function under our control, we can document it more clearly
* We gain support for bash 'globstar' mode, which allows recursive globs with a double asterisk.
author: Tristan Van Berkom <tristan.vanberkom@codethink.co.uk> 2017-06-30 16:04:44 +0900
committer: Tristan Van Berkom <tristan.vanberkom@codethink.co.uk> 2017-06-30 16:08:01 +0900
commit: f2d890e7494e06fd835b6f63a5c1e3a2dc1018bd (patch)
tree: 8473daf936bffa2e420938ad601fd4e885158c0e
parent: 1204b690ce5a05759947b7db44f1d87e7adb2c1a (diff)
download: buildstream-f2d890e7494e06fd835b6f63a5c1e3a2dc1018bd.tar.gz
1 files changed, 98 insertions, 6 deletions
diff --git a/buildstream/utils.py b/buildstream/utils.py
index 18854e961..5bf3863d2 100644
--- a/buildstream/utils.py
+++ b/buildstream/utils.py
@@ -30,7 +30,7 @@ import calendar
 import psutil
 import subprocess
 import signal
-from pathlib import PurePath
+import re
 from . import ProgramNotFoundError
 from . import _yaml
 from . import _signals
@@ -110,20 +110,45 @@ def glob(paths, pattern):
 
     This generator will iterate over the passed *paths* and
     yield only the filenames which matched the provided *pattern*.
+
+    +--------+------------------------------------------------------------------+
+    | Meta   | Description                                                      |
+    +========+==================================================================+
+    | \*     | Match any sequence of characters, except path separators         |
+    +--------+------------------------------------------------------------------+
+    | \**    | Match any sequence of characters, including path separators      |
+    +--------+------------------------------------------------------------------+
+    | ?      | Match any single character, except a path separator              |
+    +--------+------------------------------------------------------------------+
+    | [abc]  | Match one of the specified characters                            |
+    +--------+------------------------------------------------------------------+
+    | [a-z]  | Match one of the characters in the specified range               |
+    +--------+------------------------------------------------------------------+
+    | [!abc] | Match any single character, except the specified characters      |
+    +--------+------------------------------------------------------------------+
+    | [!a-z] | Match any single character, except those in the specified range  |
+    +--------+------------------------------------------------------------------+
+
+    .. note::
+
+       Escaping of the metacharacters is not possible
+
     """
-    # When using PurePath.match(), it behaves as expected
-    # only when comparing two absolute filenames, so we
-    # force them to be absolute
+    # Ensure leading slash, just because we want patterns
+    # to match file lists regardless of whether the patterns
+    # or file lists had a leading slash or not.
     if not pattern.startswith(os.sep):
         pattern = os.sep + pattern
 
+    expression = _glob2re(pattern)
+    regexer = re.compile(expression)
+
     for filename in paths:
         filename_try = filename
         if not filename_try.startswith(os.sep):
             filename_try = os.sep + filename_try
 
-        path = PurePath(filename_try)
-        if path.match(pattern):
+        if regexer.match(filename_try):
             yield filename
 
 
@@ -694,3 +719,70 @@ def _call(*popenargs, **kwargs):
         output = output.decode('UTF-8')
 
     return (exit_code, output)
+
+
+# _glob2re()
+#
+# Function to translate a glob style pattern into a regex
+#
+# Args:
+#    pat (str): The glob pattern
+#
+# Returns:
+#    (str): The regex source, with inline flags first and a \Z end anchor
+#
+# This is a modified version of the python standard library's
+# fnmatch.translate() function which supports path like globbing
+# a bit more correctly, and additionally supports recursive glob
+# patterns with double asterisk.
+#
+# Only the most basic of the standard glob patterns are supported:
+#
+#   *          Match any pattern except a path separator
+#   **         Match any pattern, including path separators
+#   ?          Match any single character, except a path separator
+#   [abc]      Match one of the specified characters
+#   [A-Z]      Match one of the characters in the specified range
+#   [!abc]     Match any single character, except the specified characters
+#   [!A-Z]     Match any single character, except those in the specified range
+#
+def _glob2re(pat):
+    i, n = 0, len(pat)
+    res = ''
+    while i < n:
+        c = pat[i]
+        i = i + 1
+        if c == '*':
+            # fnmatch.translate() simply emits '.*' here; we only want
+            # that for a double asterisk (bash 'globstar' behavior), a
+            # single asterisk must stop at path separators
+            if i < n and pat[i] == '*':
+                res = res + '.*'
+                i = i + 1
+            else:
+                res = res + '[^/]*'
+        elif c == '?':
+            # fnmatch.translate() simply uses the '.' wildcard here, but
+            # we don't want to match path separators here
+            res = res + '[^/]'
+        elif c == '[':
+            j = i  # scan ahead for the closing ']'
+            if j < n and pat[j] == '!':
+                j = j + 1
+            if j < n and pat[j] == ']':
+                j = j + 1  # a ']' in first position is a literal member
+            while j < n and pat[j] != ']':
+                j = j + 1
+            if j >= n:
+                res = res + '\\['  # unterminated '[' is taken literally
+            else:
+                stuff = pat[i:j].replace('\\', '\\\\')
+                i = j + 1
+                if stuff[0] == '!':
+                    stuff = '^' + stuff[1:]
+                elif stuff[0] == '^':
+                    stuff = '\\' + stuff
+                res = '%s[%s]' % (res, stuff)
+        else:
+            res = res + re.escape(c)
+    return '(?ms)' + res + r'\Z'  # global inline flags must lead the pattern
author	Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>	2017-06-30 16:04:44 +0900
committer	Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>	2017-06-30 16:08:01 +0900
commit	f2d890e7494e06fd835b6f63a5c1e3a2dc1018bd (patch)
tree	8473daf936bffa2e420938ad601fd4e885158c0e
parent	1204b690ce5a05759947b7db44f1d87e7adb2c1a (diff)
download	buildstream-f2d890e7494e06fd835b6f63a5c1e3a2dc1018bd.tar.gz