diff options
author | Siegfried-Angel Gevatter Pujals <rainct@ubuntu.com> | 2012-04-14 16:48:42 +0200 |
---|---|---|
committer | Siegfried-Angel Gevatter Pujals <rainct@ubuntu.com> | 2012-04-14 16:52:02 +0200 |
commit | be42b99f338aa14181c43923df588e2641e8ab7c (patch) | |
tree | c7fbae371a61fd8a44241f8aa73ee15f559c2af4 /tools | |
parent | 75ca272b8dc4ee42bcf9ea12b17c9325612bdbd5 (diff) | |
download | zeitgeist-be42b99f338aa14181c43923df588e2641e8ab7c.tar.gz |
Add tools/generate_events.py
Added a script to generate and insert random events into the database.
This is useful, for instance, in order to quickly obtain a huge database
for testing purposes.
Diffstat (limited to 'tools')
-rwxr-xr-x | tools/generate_events.py | 262 |
1 files changed, 262 insertions, 0 deletions
diff --git a/tools/generate_events.py b/tools/generate_events.py new file mode 100755 index 00000000..15dedd80 --- /dev/null +++ b/tools/generate_events.py @@ -0,0 +1,262 @@ +#! /usr/bin/env python +# -.- coding: utf-8 -.- + +# Zeitgeist - Insert random events into the database +# +# Copyright © 2012 Canonical Ltd. +# By Siegfried-A. Gevatter <siegfried.gevatter@collabora.co.uk> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 2.1 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# ############################################################################# +# WARNING: make sure you launch Zeitgeist with ZEITGEIST_DATA_PATH set if +# you don't want to fill your real database! +# ############################################################################# + +import os +import sys +import time +import random +from collections import deque +from gi.repository import GLib, GObject + +from zeitgeist import mimetypes +from zeitgeist.datamodel import * +from zeitgeist.client import ZeitgeistDBusInterface + +class EventGenerator: + + NUM_WORDS = 1000 + NUM_SIMULTANEOUS_URIS = 1000 + MAX_EVENT_AGE = 366*24*3600*1000 + + _words = None + _mimetypes = None + _desktop_files = None + _schemas = None + _uri_table = None + + def __init__(self): + # Initialize a pool of random words for use in URIs, etc. + dictionary_words = map(str.strip, + open('/usr/share/dict/words').readlines()) + dictionary_words = filter(lambda x: '\'s' not in x, dictionary_words) + self._words = random.sample(dictionary_words, self.NUM_WORDS) + + # Initialize a pool of MIME-Types + self._mimetypes = mimetypes.MIMES.keys() + + # Initialize a pool of application names + self._desktop_files = filter(lambda actor: actor.endswith('.desktop'), + os.listdir('/usr/share/applications')) + + # Initialize a list of URI schemas + self._schemas = ('application', 'davs', 'http', 'https', 'ftp') + + # Initialize a cache of URIs + self._uri_table = deque(maxlen=self.NUM_SIMULTANEOUS_URIS) + + def get_word(self): + # FIXME: add numbers and stuff? + return random.choice(self._words) + + def get_extension(self): + if random.random() < 0.8: + extensions = [ + 'odt', 'odp', 'doc', + 'oga', 'ogv', 'mp3' + 'png', 'jpg', 'gif', 'tiff' + 'html', 'xml', 'txt' + 'py', 'c', 'cpp', 'js', 'vala' + ] + else: + extensions = self._words + return filter(str.isalpha, random.choice(extensions)) + + def get_path(self, force_directory=False): + path = '' + num_parts = 1 + abs(int(random.gauss(3, 3))) + for i in range(num_parts): + path += '/%s' % self.get_word() + if random.random() < 0.9 and not force_directory: + path += '.%s' % self.get_extension() + return path + + def get_schema(self): + rand = random.random() + if rand < 0.005: + return '%s://' % random.choice(self._words) + elif rand < 0.4: + return '%s://' % random.choice(self._schemas) + else: + return 'file:///' + + def generate_uri(self): + file_uri = GLib.filename_to_uri(self.get_path(), None) + return self.get_schema() + file_uri[8:] + + def get_uri(self): + """ + We keep a cache of NUM_SIMULATENOUS_URIS uris for reuse. Every access + has a 1% chance of replacing a URI in the table with a new one. + """ + index = random.randint(0, self.NUM_SIMULTANEOUS_URIS) + if index >= len(self._uri_table): + # The URI table isn't fully initialized yet... + uri = self.generate_uri() + self._uri_table.append(uri) + return uri + if random.random() < 0.01: + # Generate a new URI + self._uri_table[index] = self.generate_uri() + return self._uri_table[index] + + def get_text(self): + num_words = abs(int(random.gauss(4, 3))) + return ' '.join(self.get_word() for i in range(num_words)) + + def get_subject_origin(self, uri): + scheme = GLib.uri_parse_scheme(uri) + if scheme == 'file': + return GLib.path_get_dirname(uri) + elif scheme in ('http', 'https'): + scheme, domain = uri.split('://', 1) + return '%s://%s' % (scheme, domain.split('/', 1)[0]) + else: + return GLib.filename_to_uri( + self.get_path(force_directory=True), None) + + def get_event_origin(self): + if random.random() < 0.005: + return self.get_uri() + return '' + + def get_actor(self): + return 'application://%s' % random.choice(self._desktop_files) + + def get_timestamp(self): + current_time = int(time.time() * 1000) + return random.randint(current_time - self.MAX_EVENT_AGE, current_time) + + def get_event_interpretation(self): + interpretations = Interpretation.EVENT_INTERPRETATION.get_children() + return random.choice(list(interpretations)) + + def get_subject_interpretation(self): + ev_interp = Interpretation.EVENT_INTERPRETATION.get_children() + subj_interp = set(Interpretation.get_children()) + subj_interp.difference_update(ev_interp) + return random.choice(list(subj_interp)) + + def get_event_manifestation(self): + if random.random() < 0.3: + manifestations = Manifestation.EVENT_MANIFESTATION.get_children() + return random.choice(list(manifestations)) + else: + return Manifestation.USER_ACTIVITY + + def get_subject_manifestation(self): + ev_manif = Manifestation.EVENT_MANIFESTATION.get_children() + subj_manif = set(Interpretation.get_children()) + subj_manif.difference_update(ev_manif) + return random.choice(list(subj_manif)) + + def get_subject(self, event_interpretation): + uri = self.get_uri() + + subject = Subject.new_for_values( + uri = uri, + current_uri = uri, + interpretation = self.get_subject_interpretation(), + manifestation = self.get_subject_manifestation(), + origin = self.get_subject_origin(uri), + mimetype = random.choice(self._mimetypes), + text = self.get_text(), + storage = "") + + if event_interpretation == Interpretation.MOVE_EVENT: + while subject.uri == subject.current_uri: + subject.current_uri = self.get_uri() + + return subject + + def get_event(self): + event_interpretation = self.get_event_interpretation() + event = Event.new_for_values( + timestamp = self.get_timestamp(), + interpretation = event_interpretation, + manifestation = self.get_event_manifestation(), + actor = self.get_actor(), + origin = self.get_event_origin()) + + num_subjects = max(1, abs(int(random.gauss(1, 1)))) + while len(event.subjects) < num_subjects: + subject = self.get_subject(event_interpretation) + if subject.uri not in (x.uri for x in event.get_subjects()): + # events with two subjects having the same URI aren't supported + event.append_subject(subject) + + return event + +class EventInserter(): + + BUFFER_SIZE = 100 + + _log = None + _buffer = None + _events_inserted = None + + def __init__(self): + self._log = ZeitgeistDBusInterface() + self._buffer = [] + self._events_inserted = 0 + + def insert(self, event): + buffer_full = len(self._buffer) >= self.BUFFER_SIZE + if buffer_full: + self.flush() + self._buffer.append(event) + return buffer_full + + def flush(self): + if self._buffer: + self._log.InsertEvents(self._buffer) + self._events_inserted += len(self._buffer) + self._buffer = [] + + def get_insertion_count(self): + return self._events_inserted + +def main(): + limit = '10000000' if len(sys.argv) < 2 else sys.argv[1] + if len(sys.argv) > 2 or not limit.isdigit(): + print "Usage: %s [<num_events>]" % sys.argv[0] + sys.exit(1) + limit = int(limit) + + event_inserter = EventInserter() + try: + generator = EventGenerator() + for i in xrange(limit): + event = generator.get_event() + event.payload = 'generate_events.py' + if event_inserter.insert(event): + print "Inserted %d events." % i + except KeyboardInterrupt: + pass + event_inserter.flush() + print "Inserted %d events. Done." % event_inserter.get_insertion_count() + +if __name__ == '__main__': + main() |