diff options
-rw-r--r-- | README.md | 21 | ||||
-rwxr-xr-x | lorry | 57 | ||||
-rwxr-xr-x | lorry.raw-file-importer | 121 |
3 files changed, 196 insertions, 3 deletions
@@ -67,6 +67,8 @@ Optional: * **Perl**: Needed if you want to import tarballs. +* **Git LFS**: Needed if you want to import raw files. + Lorry file specification ------------------------ @@ -141,7 +143,7 @@ is assumed to be the master branch. ### Subversion To support all the branches and tags a layout needs to be specified as svn is very flexible with the possible layouts, however the most common is to have the -working branch in a directory called trunk, and the branches and tags in +working branch in a directory called trunk, and the branches and tags in respectively named subdirectories. Because this is so common "standard" can be used as the layout @@ -282,6 +284,23 @@ will be tagged as 'bkai00mp.ttf') } } +### Raw File + +Lorry can store raw files in a git LFS repository, which may allow your git +server to serve those files via its repository browser. + +For convenience, raw file lorries can specify multiple sources to store in the same repository. +Each raw file will be stored under a subpath corresponding to the source name. 
def gitify_raw_file(self, project_name, dirname, gitdir, spec):
    """Download the raw files named in a ``raw-file`` spec and import them.

    ``spec['urls']`` maps a source name to a URL.  Each URL is downloaded
    into ``dirname`` (named after the last component of the URL path) and
    every newly downloaded file is handed to the ``raw-file-importer``
    helper, which is run inside ``gitdir`` with the source name as the
    sub-path to store the file under.

    Parameters:
        project_name: name used to build the URL of the existing mirror.
        dirname: working directory that receives the downloads.
        gitdir: git directory the importer is run in.
        spec: the lorry specification; only ``spec['urls']`` is read here.

    Returns None.  Download errors are re-raised after the partial file
    has been removed.
    """
    # Function-scope import so this method does not depend on the rest of
    # the file already importing shutil.
    import shutil

    raw_file_branch = 'master'
    raw_file_refspecs = 'refs/heads/{branch}:refs/heads/{branch}'.format(
        branch=raw_file_branch)

    # Fetch the files
    new_files = {}
    for src, url in spec['urls'].items():
        url_path = urllib.parse.urlparse(url).path
        basename = os.path.basename(url_path)
        # NOTE(review): two sources whose URLs end in the same basename
        # share one file_dest, so only the first of them is downloaded.
        file_dest = os.path.join(dirname, basename)
        self.progress('.. checking if we need to fetch %s' % basename)
        # Presumably true when file_dest does not exist or has zero size;
        # the helper is defined elsewhere in this file.
        if not file_missing_or_empty(file_dest):
            self.progress('nothing to do for %s' % basename)
            continue

        new_files[src] = file_dest
        self.progress('.. attempting to fetch %s' % basename)
        try:
            with open(file_dest, 'wb') as raw_file, \
                    urllib.request.urlopen(url) as urlfile:
                # Stream to disk instead of holding the whole payload in
                # memory; raw files can be arbitrarily large.
                shutil.copyfileobj(urlfile, raw_file)
                try:
                    # HTTP dates use (one of) the email date formats
                    url_date = email.utils.mktime_tz(
                        email.utils.parsedate_tz(
                            urlfile.info()['Last-Modified']))
                except (KeyError, ValueError, TypeError):
                    url_date = None
            # Set the timestamp only once the file is closed, so the final
            # flush cannot overwrite it.
            if url_date is not None:
                os.utime(file_dest, (url_date, url_date))
        except Exception:
            # Do not leave a truncated download behind: it would look like
            # a finished one on the next run.
            if os.path.exists(file_dest):
                os.unlink(file_dest)
            raise

    if not new_files:
        self.progress('.. no need to run, nothing to do')
        return

    self.ensure_gitdir(gitdir)
    # Ensure the repo is up-to-date
    pullurl = "%s/%s.git" % (self.settings['mirror-base-url-push'], project_name)
    try:
        self.run_program(['git', 'fetch', pullurl, raw_file_refspecs], cwd=gitdir)
    except Exception as e:
        # Best effort: the fetch is allowed to fail (for instance when the
        # mirror has nothing to fetch yet).  Unlike a bare ``except`` this
        # lets KeyboardInterrupt and SystemExit propagate.
        # TODO: Be more specific about which exceptions are fine
        self.progress('.. could not update %s from the mirror, continuing: %s'
                      % (project_name, e))

    # Ensure the repo supports git LFS
    self.run_program(['git', 'lfs', 'install', '--local'], cwd=gitdir)

    for src, file_dest in new_files.items():
        self.run_program(["%s.raw-file-importer" % lorry_path, file_dest, src],
                         cwd=gitdir)
#!/usr/bin/env python3

## Copyright 2021 Codethink Limited

# raw file archive using git-lfs frontend for git-fast-import

import hashlib
import io
import os
import shutil
import subprocess
import sys
import time

branch_name = 'master'
branch_ref = 'refs/heads/%s' % branch_name
committer_name = 'Lorry Raw File Importer'
committer_email = 'lorry-raw-file-importer@lorry'

# Read size used when hashing, so large raw files are never held in memory.
_HASH_CHUNK_SIZE = 1024 * 1024


def commit_lfs_gitattributes(fast_import):
    """Write a fast-import commit that adds the LFS ``.gitattributes``.

    ``fast_import`` is any binary writable object (normally the stdin of a
    ``git fast-import`` process).  The commit has no ``from`` line; the
    caller only emits it when the branch has no commits yet.
    """
    commit_time = int(time.time())
    commit = (
        'commit {ref}\n'
        'committer {committer_name} <{committer_email}> {commit_time} +0000\n'
        'data <<EOM\n'
        'Ensure LFS is configured\n'
        'EOM\n'
        'M 100644 inline .gitattributes\n'
        'data <<EOM\n'
        '* filter=lfs diff=lfs merge=lfs -text\n'
        '.gitattributes filter diff merge text=auto\n'
        'EOM\n'
        '\n'
    ).format(
        ref=branch_ref,
        committer_name=committer_name,
        committer_email=committer_email,
        commit_time=commit_time,
    )
    fast_import.write(commit.encode('utf-8'))


def sha256_of_file(path):
    """Return the hex SHA-256 digest of the file at ``path``, read in chunks."""
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(_HASH_CHUNK_SIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()


def commit_lfs_file(raw_file, relative_path, last_commit, fast_import):
    """Store ``raw_file`` as an LFS object and write a commit of its pointer.

    Parameters:
        raw_file: path of the file to import.
        relative_path: directory inside the repository to place the pointer
            under, or a false value to place it at the top level.
        last_commit: commit to build on (emitted as a ``from`` line), or a
            false value to emit no ``from`` line.
        fast_import: binary writable object receiving the fast-import
            commands.

    Raises subprocess.CalledProcessError if ``git lfs pointer`` fails.
    The object is copied to ``lfs/objects/<aa>/<bb>/<sha256>`` relative to
    the current working directory.
    """
    # git-lfs-pointer of the file
    ret = subprocess.run(['git', 'lfs', 'pointer', '--file', raw_file],
                         capture_output=True, check=True)
    pointer = ret.stdout

    # sha256sum of the file
    # slightly wasteful because git-lfs-pointer also generates a sha256sum
    shasum = sha256_of_file(raw_file)

    # Add the file to the repo
    out_dir = os.path.join('lfs', 'objects', shasum[0:2], shasum[2:4])
    os.makedirs(out_dir, exist_ok=True)
    shutil.copyfile(raw_file, os.path.join(out_dir, shasum))

    # Commit the data to master
    commit_time = int(time.time())
    basename = os.path.basename(raw_file)
    if relative_path:
        path = '{}/{}'.format(relative_path, basename)
    else:
        path = basename

    fromline = 'from {}\n'.format(last_commit) if last_commit else ''

    header = (
        'commit {ref}\n'
        'committer {committer_name} <{committer_email}> {commit_time} +0000\n'
        'data <<EOM\n'
        'import {basename}\n'
        'EOM\n'
        '{fromline}'
        'M 100644 inline {path}\n'
        'data {datasize}\n'
    ).format(
        ref=branch_ref,
        committer_name=committer_name,
        committer_email=committer_email,
        commit_time=commit_time,
        basename=basename,
        fromline=fromline,
        path=path,
        # Exact byte count: the pointer is written below as raw bytes.
        datasize=len(pointer),
    )
    fast_import.write(header.encode('utf-8') + pointer + b'\n\n')


def get_last_commit():
    """Return the commit hash ``branch_ref`` points at, or None if unset."""
    # --verify makes git fail, rather than guess, when the ref is missing.
    out = subprocess.run(['git', 'rev-parse', '--verify', '--quiet', branch_ref],
                         capture_output=True, text=True)
    if out.returncode != 0:
        return None

    # Without --verify git echoes the input back when no commits exist yet;
    # keep rejecting that in case an older git behaves differently.
    commit = out.stdout.strip()
    if not commit or commit == branch_ref:
        return None
    return commit


def main(argv=None):
    """Import one raw file into the repository in the current directory.

    ``argv`` defaults to ``sys.argv`` and has the form
    ``[prog, <file>, [<relative path>]]``.  Exits with status 1 on a usage
    error or when ``git fast-import`` fails.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2 or len(argv) > 3:
        prog = argv[0] if argv else 'lorry.raw-file-importer'
        print('usage:', prog, '<file>', '[<relative path>]', file=sys.stderr)
        sys.exit(1)

    raw_file = argv[1]
    relpath = argv[2] if len(argv) == 3 else None

    last_commit = get_last_commit()

    # Build the whole command stream first: if generating the pointer fails,
    # fast-import is never started and cannot apply a half-finished import.
    stream = io.BytesIO()
    if not last_commit:
        commit_lfs_gitattributes(stream)
    commit_lfs_file(raw_file, relpath, last_commit, stream)

    # List argv: no shell is needed to launch git.
    result = subprocess.run(['git', 'fast-import', '--quiet'],
                            input=stream.getvalue())
    if result.returncode != 0:
        sys.exit(1)


if __name__ == '__main__':
    main()