summaryrefslogtreecommitdiff
path: root/utils/data-generators
diff options
context:
space:
mode:
authorSam Thursfield <sam@afuera.me.uk>2016-05-07 15:42:55 +0100
committerSam Thursfield <sam@afuera.me.uk>2016-06-09 19:55:24 +0100
commit23524e77f620cd53a38ce54f2e90ee6e893cacdf (patch)
tree5dfe4fb7618dbd5dc56c4541909775545db6ec82 /utils/data-generators
parent61fd495fa679b1d894d92978566ab91fa09e5509 (diff)
downloadtracker-23524e77f620cd53a38ce54f2e90ee6e893cacdf.tar.gz
utils: Add create-tree-from-real-data script
This script grabs a mix of files from your system and creates a link farm, with the aim of creating a tree of Tracker test data that exercises each of the extract modules.
Diffstat (limited to 'utils/data-generators')
-rwxr-xr-xutils/data-generators/create-tree-from-real-data179
1 files changed, 179 insertions, 0 deletions
diff --git a/utils/data-generators/create-tree-from-real-data b/utils/data-generators/create-tree-from-real-data
new file mode 100755
index 000000000..53453ad04
--- /dev/null
+++ b/utils/data-generators/create-tree-from-real-data
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+
+'''Create tree from real data!
+
+This script is to generate a directory tree full of media files, which
+you can use for testing Tracker against. It works by using real media
+files from the computer you're running on. So it's more realistic than
+testing with completely faked data, but you can exert *some* control
+over the test environment by specifying how many files of each type
+that you want to have.
+
+The real media files are discovered using the Tracker instance of
+the current running user.
+
+It analyses the existing extract rules to ensure every extractor is
+covered by the test data (warning if no data is available).
+
+Example usage:
+
+ ./create-tree-from-real-data --output-dir=~/tracker-test-data/ \
+ --rules-dir=/opt/tracker-unstable/share/tracker/extract-rules
+
+'''
+
+from gi.repository import Tracker
+
+import argparse
+import os
+import sys
+
+if sys.version_info[0] >= 3:
+ import configparser
+ import urllib.parse as urlparse
+ from urllib.parse import unquote as url_unquote
+else:
+ import ConfigParser as configparser
+ import urlparse
+ from urllib import unquote as url_unquote
+
+
def argument_parser():
    '''Build the command-line argument parser for this script.

    Returns an argparse.ArgumentParser; call .parse_args() on the result.
    '''
    parser = argparse.ArgumentParser(
        description="Assemble test data for the Tracker extractors.")
    parser.add_argument(
        '--rules-dir', default='/usr/share/tracker/extract-rules',
        help="location to find Tracker extract rules (default: %(default)s)")
    parser.add_argument(
        '--output-dir', default=None,
        help="where to create the tree of test data files")
    parser.add_argument(
        # type=int so a value given on the command line arrives as a
        # number; previously it arrived as a string while the default
        # stayed an int.
        '--limit', type=int, default=10,
        help="number of files to include in the tree, per extract module")
    return parser
+
+
def read_tracker_extract_rule(filename):
    '''Parse one extract-rule file and return its ExtractorRule section.

    Returns a dict mapping lower-cased option names (e.g. 'modulepath',
    'mimetypes') to their string values.

    Raises RuntimeError if the file is missing or cannot be parsed.
    '''
    # Check explicitly rather than with `assert`: asserts vanish under -O,
    # and configparser.read() silently ignores unreadable files.
    if not os.path.exists(filename):
        raise RuntimeError("%s: rule file does not exist" % filename)
    parser = configparser.ConfigParser()
    try:
        parser.read(filename)
        return dict(parser.items('ExtractorRule'))
    except configparser.Error as e:
        raise RuntimeError("%s: %s" % (filename, e))
+
+
def read_tracker_extract_rules(rules_dir):
    '''Returns a dict mapping extract module name to the MIME types it reads.

    Rules without a ModulePath entry (fallback-only rules) are skipped, as
    are rules that declare no MIME types.
    '''
    rule_map = {}

    for rule_filename in os.listdir(rules_dir):
        rule = read_tracker_extract_rule(os.path.join(rules_dir, rule_filename))

        if 'modulepath' not in rule:
            # Ignore fallback-only rules
            continue

        module = rule['modulepath']

        # 'mimetypes' may be missing or empty; note str.split(';') never
        # returns [] (an empty string yields ['']), so empty entries must
        # be filtered out explicitly.
        raw = rule.get('mimetypes', '')
        mimetypes = [m for m in raw.rstrip(';').split(';') if m]

        if len(mimetypes) == 0:
            continue

        rule_map.setdefault(module, set()).update(mimetypes)

    return rule_map
+
+
def make_sparql_list(items):
    '''Render *items* as a comma-separated list of quoted SPARQL strings.'''
    quoted = ["'%s'" % Tracker.sparql_escape_string(item) for item in items]
    return ', '.join(quoted)
+
+
def resources_with_mime_types(db, mime_types, limit=10):
    '''Yield the URL of each indexed resource matching one of *mime_types*.

    At most *limit* URLs are produced, queried from the Tracker
    connection *db*.
    '''
    query = '''
    SELECT ?url {
        ?r nie:url ?url ;
            nie:mimeType ?mime .
        FILTER ( ?mime IN (%s) )
    }
    LIMIT %i
    ''' % (make_sparql_list(mime_types), limit)

    cursor = db.query(query)
    while cursor.next():
        # get_string() returns (value, length); we want only the value.
        url, _ = cursor.get_string(0)
        yield url
+
+
def file_url_to_path(url):
    '''Convert file:// URL to a pathname.

    Raises RuntimeError for non-file URLs, or URLs that carry extra
    components (host, params, query or fragment).
    '''
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    if scheme != 'file':
        raise RuntimeError("Only file:// URLs are supported.")
    if any([netloc, params, query, fragment]):
        # Fixed: the URL was previously passed as a second constructor
        # argument instead of being %-formatted into the message.
        raise RuntimeError("URL has info other than a file path: %s" % url)
    return url_unquote(path)
+
+
def unique_output_name(path):
    '''Return *path*, or a variant of it that does not yet exist on disk.

    Collisions are resolved by appending '.2', '.3', ... until a free
    name is found.
    '''
    if not os.path.exists(path):
        return path
    suffix = 2
    while os.path.exists('%s.%s' % (path, suffix)):
        suffix += 1
    return '%s.%s' % (path, suffix)
+
+
def main():
    '''Entry point: symlink real media files into a test tree, or list them.

    With --output-dir, creates one directory per extract module containing
    links to matching files; without it, prints the matches to stdout.
    Raises RuntimeError if Tracker reports a file that no longer exists or
    a symlink cannot be created.
    '''
    args = argument_parser().parse_args()

    rule_map = read_tracker_extract_rules(args.rules_dir)

    db = Tracker.SparqlConnection.get()

    show_only = (args.output_dir is None)

    if show_only:
        print("No output dir specified, writing information to stdout.")

    extractors_with_no_files = []

    for extractor, mime_types in rule_map.items():
        # Honour --limit; it was previously hard-coded to 2, silently
        # ignoring the option.  int() keeps this robust if the value
        # arrives as a string.
        resources = list(resources_with_mime_types(db, mime_types,
                                                   limit=int(args.limit)))

        if len(resources) == 0:
            extractors_with_no_files.append(extractor)
        elif show_only:
            sys.stdout.write(extractor)
            sys.stdout.write('\n - ')
            sys.stdout.write('\n - '.join(resources))
            sys.stdout.write('\n\n')
        else:
            output_dir = os.path.expanduser(
                os.path.join(args.output_dir, extractor))
            sys.stdout.write("Creating %i links in: %s\n" % (
                len(resources), output_dir))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            for uri in resources:
                src_path = file_url_to_path(uri)
                if not os.path.exists(src_path):
                    # Typo fixed in the message ("non-existant").
                    raise RuntimeError("Tracker returned non-existent file %s"
                                       % src_path)
                src_name = os.path.basename(src_path)
                dst = unique_output_name(os.path.join(output_dir, src_name))
                try:
                    os.symlink(src_path, dst)
                except OSError as e:
                    raise RuntimeError("%s: %s" % (dst, e))

    sys.stdout.write("\nExtractors with no data available:\n")
    for extractor in extractors_with_no_files:
        sys.stdout.write(" %s (mime types: %s)\n" % (extractor, ' '.join(rule_map[extractor])))
+
+
# Guard the entry point so the script can be imported (e.g. by tests)
# without side effects.
if __name__ == '__main__':
    main()