summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Brown <ben@demerara.io>2021-11-16 12:09:26 +0000
committerBen Brown <ben@demerara.io>2021-11-16 12:09:26 +0000
commitbfff9d77b5da74f75a4530387b34ac9164d79029 (patch)
treee057e0ceb2689b362bbc4d69bf9fc4927364322b
parent64182a57a49203cf87c9d6e99d9a29d47e229bf0 (diff)
parent7d85bd957e72e2669e2b20364275dc248df26328 (diff)
downloadlorry-bfff9d77b5da74f75a4530387b34ac9164d79029.tar.gz
Merge branch 'jonathanmaw/lfs-raw-file-repo' into 'master'
Introduce support for raw file lorries. See merge request CodethinkLabs/lorry/lorry!21
-rw-r--r--README.md21
-rwxr-xr-xlorry57
-rwxr-xr-xlorry.raw-file-importer121
3 files changed, 196 insertions, 3 deletions
diff --git a/README.md b/README.md
index a3e4905..0067010 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,8 @@ Optional:
* **Perl**: Needed if you want to import tarballs.
+* **Git LFS**: Needed if you want to import raw files.
+
Lorry file specification
------------------------
@@ -141,7 +143,7 @@ is assumed to be the master branch.
### Subversion
To support all the branches and tags a layout needs to be specified as svn is
very flexible with the possible layouts, however the most common is to have the
-working branch in a directory called trunk, and the branches and tags in
+working branch in a directory called trunk, and the branches and tags in
respectively named subdirectories.
Because this is so common "standard" can be used as the layout
@@ -282,6 +284,23 @@ will be tagged as 'bkai00mp.ttf')
}
}
+### Raw File
+
+Lorry can store raw files in a Git LFS repository, which may allow your Git
+server to serve those files via its repository browser.
+
+For convenience, raw file lorries can specify multiple sources to store in the same repository.
+Each raw file will be stored under a subpath corresponding to the source name.
+
+    {
+        "raw-file-repo": {
+            "type": "raw-file",
+            "urls": {
+                "authorities/england-and-wales": "http://geoportal1-ons.opendata.arcgis.com/datasets/0b09996863af4b5db78058225bac5d1b_0.kml",
+                "radiological-monitoring-data": "https://fsadata.github.io/radiological-monitoring-data/data/provisional-2016-bi-annual-aquatic-and-terrestrial-monitoring-results-26062017-raw-results.csv"
+            }
+        }
+    }
Tips
----
diff --git a/lorry b/lorry
index 0e047db..cadc0f2 100755
--- a/lorry
+++ b/lorry
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
-# Copyright (C) 2011-2020 Codethink Limited
+# Copyright (C) 2011-2021 Codethink Limited
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -343,6 +343,7 @@ class Lorry(cliapp.Application):
'git': self.mirror_git,
'hg': self.gitify_hg,
'svn': self.gitify_svn,
+ 'raw-file': self.gitify_raw_file,
'tarball': functools.partial(self.gitify_archive, 'tar'),
'zip': functools.partial(self.gitify_archive, 'zip'),
'gzip': functools.partial(self.gitify_archive, 'gzip')
@@ -792,6 +793,59 @@ class Lorry(cliapp.Application):
*plugin_options],
cwd=gitdir)
def gitify_raw_file(self, project_name, dirname, gitdir, spec):
    """Import the raw files named by a 'raw-file' lorry into a Git LFS repo.

    ``spec['urls']`` maps a source name to the URL of a file.  Each file
    that is missing or empty under *dirname* is downloaded there and, when
    the server sends a usable ``Last-Modified`` header, given that
    modification time.  If nothing had to be downloaded the method
    returns early.  Otherwise *gitdir* is prepared, the existing master
    branch is fetched from the push mirror on a best-effort basis, LFS is
    installed locally, and every newly downloaded file is passed to the
    ``lorry.raw-file-importer`` helper together with its source name.

    Any exception raised while downloading is re-raised after the
    partially written destination file has been removed.
    """
    # Imported locally so the block does not depend on the file's
    # top-level import list.
    import shutil

    raw_file_branch = 'master'
    raw_file_refspecs = 'refs/heads/{branch}:refs/heads/{branch}'.format(
        branch=raw_file_branch)

    # Fetch the files
    new_files = {}
    for src, url in spec['urls'].items():
        url_path = urllib.parse.urlparse(url)[2]
        basename = os.path.basename(url_path)
        file_dest = os.path.join(dirname, basename)
        self.progress('.. checking if we need to fetch %s' % basename)
        if not file_missing_or_empty(file_dest):
            self.progress('nothing to do for %s' % basename)
            continue

        new_files[src] = file_dest
        self.progress('.. attempting to fetch %s' % basename)
        try:
            with open(file_dest, 'wb') as raw_file, \
                    urllib.request.urlopen(url) as urlfile:
                # Stream in chunks: raw files may be far larger than is
                # sensible to hold in memory at once.
                shutil.copyfileobj(urlfile, raw_file)
            try:
                # HTTP dates use (one of) the email date formats
                url_date = email.utils.mktime_tz(
                    email.utils.parsedate_tz(
                        urlfile.info()['Last-Modified']))
            except (KeyError, ValueError, TypeError):
                url_date = None
            if url_date:
                os.utime(file_dest, (url_date, url_date))
        except Exception:
            # Never leave a truncated download behind: it would look like
            # a complete file on the next run.
            if os.path.exists(file_dest):
                os.unlink(file_dest)
            raise

    if not new_files:
        self.progress('.. no need to run, nothing to do')
        return

    self.ensure_gitdir(gitdir)
    # Ensure the repo is up-to-date
    pullurl = "%s/%s.git" % (self.settings['mirror-base-url-push'], project_name)
    try:
        self.run_program(['git', 'fetch', pullurl, raw_file_refspecs], cwd=gitdir)
    except Exception:
        # Best effort: the fetch is allowed to fail (for instance when the
        # mirror does not have the branch yet).  Unlike a bare ``except``
        # this still lets KeyboardInterrupt and SystemExit propagate.
        # TODO: Be more specific about which exceptions are fine
        self.progress('.. unable to fetch existing branch from mirror, continuing')

    # Ensure the repo supports git LFS
    self.run_program(['git', 'lfs', 'install', '--local'], cwd=gitdir)

    importer = "%s.raw-file-importer" % lorry_path
    for src, file_dest in new_files.items():
        self.run_program([importer, file_dest, src], cwd=gitdir)
+
def gitify_archive(self, archive_type, project_name, dirname, gitdir, spec):
assert archive_type in ['zip', 'gzip', 'tar']
@@ -933,4 +987,3 @@ class Lorry(cliapp.Application):
if __name__ == '__main__':
Lorry(version=__version__).run()
-
diff --git a/lorry.raw-file-importer b/lorry.raw-file-importer
new file mode 100755
index 0000000..bca46b5
--- /dev/null
+++ b/lorry.raw-file-importer
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+
+## Copyright 2021 Codethink Limited
+
+# raw file archive using git-lfs frontend for git-fast-import
+
+import hashlib
+import os
+import shutil
+import subprocess
+import sys
+import time
+
# Branch that receives every import, and its fully qualified ref name.
branch_name = 'master'
branch_ref = 'refs/heads/{}'.format(branch_name)

# Identity recorded as the committer of every generated commit.
committer_name = 'Lorry Raw File Importer'
committer_email = 'lorry-raw-file-importer@lorry'
+
def commit_lfs_gitattributes(fast_import):
    """Write a commit adding the LFS ``.gitattributes`` file to *fast_import*.

    *fast_import* is a writable binary stream; the commit is emitted as
    UTF-8 encoded git-fast-import commands targeting ``branch_ref``,
    stamped with the current time and the module's committer identity.
    No ``from`` line is written.
    """
    now = int(time.time())
    stream_lines = [
        'commit %s' % branch_ref,
        'committer %s <%s> %s +0000' % (committer_name, committer_email, now),
        # Commit message, in fast-import's delimited data format.
        'data <<EOM',
        'Ensure LFS is configured',
        'EOM',
        # Inline contents of .gitattributes: route everything through
        # LFS except .gitattributes itself.
        'M 100644 inline .gitattributes',
        'data <<EOM',
        '* filter=lfs diff=lfs merge=lfs -text',
        '.gitattributes filter diff merge text=auto',
        'EOM',
        '',
    ]
    payload = '\n'.join(stream_lines) + '\n'
    fast_import.write(payload.encode('utf-8'))
+
+
def commit_lfs_file(raw_file, relative_path, last_commit, fast_import):
    """Store *raw_file* as an LFS object and commit its pointer file.

    The file is copied to ``lfs/objects/<aa>/<bb>/<sha256>`` relative to
    the current working directory, and a git-fast-import commit on
    ``branch_ref`` is written to *fast_import* that adds the LFS pointer
    (as produced by ``git lfs pointer``) at ``<relative_path>/<basename>``,
    or at ``<basename>`` when *relative_path* is empty or None.

    *last_commit* is the commit to build on; when it is falsy no ``from``
    line is emitted.  *fast_import* is a writable binary stream.

    Raises ``subprocess.CalledProcessError`` if ``git lfs pointer`` fails.
    """
    # git-lfs-pointer of the file
    ret = subprocess.run(['git', 'lfs', 'pointer', '--file', raw_file],
                         capture_output=True, check=True)
    pointer_digest = ret.stdout
    # fast-import's exact-byte-count data format needs the size in bytes.
    datasize = len(pointer_digest)

    # sha256sum of the file
    # slightly wasteful because git-lfs-pointer also generates a sha256sum.
    # Hash in fixed-size chunks so that arbitrarily large files never have
    # to fit in memory.
    digest = hashlib.sha256()
    with open(raw_file, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)
    shasum = digest.hexdigest()

    # Add the file to the repo
    out_dir = os.path.join('lfs', 'objects', shasum[0:2], shasum[2:4])
    os.makedirs(out_dir, exist_ok=True)
    shutil.copyfile(raw_file, os.path.join(out_dir, shasum))

    # Commit the data to master
    commit_time = int(time.time())
    basename = os.path.basename(raw_file)
    if relative_path:
        path = '{}/{}'.format(relative_path, basename)
    else:
        path = basename

    fromline = 'from {}\n'.format(last_commit) if last_commit else ''

    commit = (
        'commit {ref}\n'
        'committer {committer_name} <{committer_email}> {commit_time} +0000\n'
        'data <<EOM\n'
        'import {basename}\n'
        'EOM\n'
        '{fromline}'
        'M 100644 inline {path}\n'
        'data {datasize}\n'
        '{data}\n'
        '\n'
    ).format(
        ref=branch_ref,
        committer_name=committer_name, committer_email=committer_email,
        commit_time=commit_time,
        basename=basename,
        fromline=fromline,
        path=path,
        datasize=datasize, data=pointer_digest.decode('utf-8'))
    fast_import.write(commit.encode('utf-8'))
+
+
def get_last_commit():
    """Return the commit id ``branch_ref`` resolves to, or None.

    None is returned when ``git rev-parse`` exits non-zero, or when its
    output is just the ref name itself.
    """
    # show the full hash of the latest commit
    result = subprocess.run(['git', 'rev-parse', branch_ref],
                            capture_output=True, text=True)
    if result.returncode:
        return None

    resolved = result.stdout.strip()
    # The ref name is echoed back unchanged when no commits exist yet.
    if resolved == branch_ref:
        return None
    return resolved
+
+
def main():
    """Import one raw file into the current repository via git fast-import.

    Usage: ``<program> <file> [<relative path>]``.  With a wrong number
    of arguments a usage line is printed and the process exits with
    status 1.  Otherwise ``git fast-import --quiet`` is started; if the
    branch has no commit yet, the ``.gitattributes`` commit is written
    first, followed by the commit for the file itself.  Exits with
    status 1 when fast-import reports failure.
    """
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print('usage:', sys.argv[0], '<file>', '[<relative path>]')
        sys.exit(1)

    raw_file = sys.argv[1]
    relpath = sys.argv[2] if len(sys.argv) == 3 else None

    last_commit = get_last_commit()
    # Run git directly with an argument list; no shell is needed for a
    # fixed command and avoiding one removes a layer of interpretation.
    with subprocess.Popen(['git', 'fast-import', '--quiet'],
                          stdin=subprocess.PIPE) as import_proc:
        if not last_commit:
            commit_lfs_gitattributes(import_proc.stdin)

        commit_lfs_file(raw_file, relpath, last_commit, import_proc.stdin)
        # Closing stdin signals end of stream so fast-import can finish.
        import_proc.stdin.close()
        if import_proc.wait() != 0:
            sys.exit(1)
+
+
# Only run when executed as a script, so that importing this file (for
# example from a test) has no side effects.
if __name__ == '__main__':
    main()