author     Ben Hutchings <ben.hutchings@codethink.co.uk>  2020-07-15 12:41:47 +0000
committer  Ben Hutchings <ben.hutchings@codethink.co.uk>  2020-07-15 12:41:47 +0000
commit     4600f8f0beba655f72b4bb0098f94d47a0152a29 (patch)
tree       9bde04bbeb64d39a045c8e20f8641404bd69b5fc
parent     e756b7ba186917327109780248a9d57820eb55a6 (diff)
parent     8948a44ffb03ba5467daceebdefc47c8900f51b2 (diff)
download   lorry-4600f8f0beba655f72b4bb0098f94d47a0152a29.tar.gz
Merge branch 'bwh/reproducibility' into 'master'
Fix reproducibility of file imports

Closes #1 and #4

See merge request CodethinkLabs/lorry/lorry!8
-rw-r--r--  README             |   4
-rwxr-xr-x  lorry              | 121
-rwxr-xr-x  lorry.tar-importer |   1
-rwxr-xr-x  lorry.zip-importer |  49
4 files changed, 165 insertions(+), 10 deletions(-)
diff --git a/README b/README
index 357e77c..e46408a 100644
--- a/README
+++ b/README
@@ -244,10 +244,6 @@ These are obsolete now and are ignored by Lorry.
}
}
-NOTE: tarball imports are unlikely to give the same commit SHA1 but the tree
-SHA1 inside (which is what is used for artifact cache IDs) should remain
-stable.
-
### Zip
Lorry can import a zip file fetched from a URL. The contents will be
diff --git a/lorry b/lorry
index 0318ef8..eba2ef0 100755
--- a/lorry
+++ b/lorry
@@ -19,7 +19,7 @@ import cliapp
import json
import logging
import os
-import urllib.request, urllib.parse
+import urllib.request, urllib.parse, urllib.response
import string
import sys
from datetime import datetime
@@ -27,6 +27,10 @@ import shutil
import traceback
import functools
import stat
+import email.message
+import email.utils
+import ftplib
+import re
import yaml
@@ -69,6 +73,109 @@ def find_bazaar_command():
return find_exec_in_path('bzr') or find_exec_in_path('brz')
+# This is a simplified replacement for urllib.request.FTPHandler, with
+# one additional feature: it uses the MDTM extension specified in RFC
+# 3659, and sets the Last-Modified header based on the result.
+class SimpleFTPHandler(urllib.request.BaseHandler):
+ # Priority needs to be higher (numerically lower) than the
+ # standard FTPHandler
+ handler_order = urllib.request.FTPHandler.handler_order - 1
+
+ # Format is YYYYMMDDhhmmss with optional fractional seconds (which
+ # we ignore). The implicit time zone is UTC.
+ _mdtm_response_re = re.compile(r'^213 (\d{14})(?:\.\d+)?$')
+
+ def ftp_open(self, req):
+ from urllib.request import URLError
+
+ if getattr(req, 'method', None) not in [None, 'GET']:
+ raise URLError('SimpleFTPHandler: only supports GET method')
+ url_parts = urllib.parse.urlparse(req.full_url)
+ if url_parts.username or url_parts.password:
+ raise URLError('SimpleFTPHandler: only supports anonymous FTP')
+ if ';' in url_parts.path or url_parts.params or url_parts.query:
+ raise URLError('SimpleFTPHandler: does not support parameters')
+
+ path_parts = []
+ for part in url_parts.path.split('/'):
+ if part == '':
+ continue
+ part = urllib.parse.unquote(part)
+ if '\r\n' in part:
+ raise URLError('SimpleFTPHandler: illegal characters in path')
+ path_parts.append(part)
+
+ ftp = ftplib.FTP()
+ try:
+ ftp.connect(url_parts.hostname, url_parts.port or 21)
+ ftp.login()
+ for part in path_parts[:-1]:
+ ftp.cwd(part)
+
+ # Try to get the mtime from the server, ignoring error
+ # or invalid responses
+ mtime = None
+ try:
+ mdtm_response = ftp.sendcmd('MDTM ' + path_parts[-1])
+ except ftplib.error_reply:
+ pass
+ else:
+ match = self._mdtm_response_re.match(mdtm_response)
+ if match:
+ mtime_s = match.group(1)
+ try:
+ mtime = datetime(
+ int(mtime_s[0:4]), int(mtime_s[4:6]),
+ int(mtime_s[6:8]), int(mtime_s[8:10]),
+ int(mtime_s[10:12]), int(mtime_s[12:14]))
+ except ValueError:
+ pass
+
+ # Start binary mode transfer
+ ftp.voidcmd('TYPE I')
+ data_sock, size = ftp.ntransfercmd('RETR ' + path_parts[-1])
+ data_file = data_sock.makefile('rb')
+
+ try:
+ # Synthesise an HTTP-like response header
+ header = email.message.EmailMessage()
+ if size is not None:
+ header['Content-Length'] = str(size)
+ if mtime is not None:
+ header['Last-Modified'] = \
+ mtime.strftime('%a, %d %b %Y %H:%M:%S GMT')
+
+ # Wrap up the file with a close hook to close the
+ # control socket as well, and the extra metadata
+ # expected in a response object
+ response = urllib.response.addinfourl(
+ urllib.response.addclosehook(data_file, self._ftp_close),
+ header, req.full_url)
+ self.ftp = ftp
+ ftp = None
+ data_file = None
+ return response
+
+ finally:
+ # Close data socket on error
+ if data_file:
+ data_file.close()
+
+ except ftplib.all_errors as e:
+ # Re-raise as URLError
+ raise URLError('SimpleFTPHandler: %r' % e) \
+ .with_traceback(sys.exc_info()[2])
+
+ finally:
+ # Close control socket on error
+ if ftp:
+ ftp.close()
+
+ def _ftp_close(self):
+ self.ftp.close()
+ del self.ftp
+
+
class Lorry(cliapp.Application):
def add_settings(self):
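
As a reading aid, here is a minimal sketch of how an MDTM reply is turned into the
Last-Modified value that the handler above synthesises. The response string is an
invented example, not taken from a real server:

    import re
    from datetime import datetime

    # RFC 3659 MDTM reply: "213 " followed by YYYYMMDDhhmmss in UTC,
    # optionally with fractional seconds (ignored here, as in the handler).
    mdtm_response = '213 20200715124147'

    mdtm_response_re = re.compile(r'^213 (\d{14})(?:\.\d+)?$')
    match = mdtm_response_re.match(mdtm_response)
    if match:
        s = match.group(1)
        mtime = datetime(int(s[0:4]), int(s[4:6]), int(s[6:8]),
                         int(s[8:10]), int(s[10:12]), int(s[12:14]))
        # Same formatting the handler uses for the synthesised header
        print(mtime.strftime('%a, %d %b %Y %H:%M:%S GMT'))
        # -> Wed, 15 Jul 2020 12:41:47 GMT
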
@@ -130,6 +237,9 @@ class Lorry(cliapp.Application):
if not os.path.exists(self.settings['working-area']):
os.makedirs(self.settings['working-area'])
+ urllib.request.install_opener(
+ urllib.request.build_opener(SimpleFTPHandler))
+
for arg in args:
self.progress('Processing spec file %s' % arg)
with open(arg) as f:
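
For context, roughly what installing this opener buys: later urlopen() calls on
ftp:// URLs are routed through SimpleFTPHandler and expose HTTP-style headers.
This is only a sketch; the host name is a placeholder, and SimpleFTPHandler is
the class added earlier in this patch:

    import urllib.request

    urllib.request.install_opener(
        urllib.request.build_opener(SimpleFTPHandler))

    # Placeholder URL; any anonymous FTP server would do.
    with urllib.request.urlopen('ftp://ftp.example.org/pub/file.tar.gz') as f:
        # Headers synthesised by the handler from the FTP session
        print(f.info().get('Content-Length'))
        print(f.info().get('Last-Modified'))
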
@@ -530,7 +640,16 @@ class Lorry(cliapp.Application):
with open(archive_dest, 'wb') as archive_file:
urlfile = urllib.request.urlopen(spec['url'])
archive_file.write(urlfile.read())
+ try:
+ # HTTP dates use (one of) the email date formats
+ url_date = email.utils.mktime_tz(
+ email.utils.parsedate_tz(
+ urlfile.info()['Last-Modified']))
+ except (KeyError, ValueError, TypeError):
+ url_date = None
urlfile.close()
+ if url_date:
+ os.utime(archive_dest, (url_date, url_date))
except Exception:
if os.path.exists(archive_dest):
os.unlink(archive_dest)
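
A small, self-contained sketch of the date handling used in the hunk above:
parsing an RFC 2822 style Last-Modified value with email.utils and stamping the
downloaded file with it. The file name and header value here are invented:

    import email.utils
    import os

    # Stand-in for the real download
    open('example-archive.tar.gz', 'wb').close()

    # Hypothetical Last-Modified header, in the email date format HTTP uses
    last_modified = 'Wed, 15 Jul 2020 12:41:47 GMT'

    # parsedate_tz() yields a 10-tuple including the numeric UTC offset;
    # mktime_tz() converts that to seconds since the epoch.
    url_date = email.utils.mktime_tz(
        email.utils.parsedate_tz(last_modified))

    # Give the archive a stable mtime so repeated imports agree
    os.utime('example-archive.tar.gz', (url_date, url_date))
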
diff --git a/lorry.tar-importer b/lorry.tar-importer
index 6cdad1d..2a7c746 100755
--- a/lorry.tar-importer
+++ b/lorry.tar-importer
@@ -43,7 +43,6 @@ open(FI, '|-', 'git', 'fast-import', '--quiet')
or die "Unable to start git fast-import: $!\n";
foreach my $tar_file (@ARGV)
{
- my $commit_time = time;
$tar_file =~ m,([^/]+)$,;
my $tar_name = $1;
diff --git a/lorry.zip-importer b/lorry.zip-importer
index cab5e26..dfb73eb 100755
--- a/lorry.zip-importer
+++ b/lorry.zip-importer
@@ -10,7 +10,9 @@
## python import-zips.py *.zip
## git log --stat import-zips
+import calendar
import os.path
+import struct
import subprocess
import sys
import time
@@ -23,12 +25,52 @@ committer_name = 'Lorry Zip Importer'
committer_email = 'lorry-zip-importer@lorry'
+# File header 'extra' field tags
+EXT_TAG_UNIX0 = 0x000d # PKWARE Unix, aka Unix type 0
+EXT_TAG_TIME = 0x5455 # Extended Timestamp
+EXT_TIME_FLAG_MTIME = 1 # mtime present (and first)
+EXT_TAG_UNIX1 = 0x5855 # Info-ZIP Unix type 1
+
+
+# Iterate over fields within a file header 'extra' block
+def zip_extra_fields(extra):
+ pos = 0
+ while len(extra) >= pos + 4:
+ tag, size = struct.unpack('<HH', extra[pos : pos + 4])
+ pos += 4
+ if len(extra) < pos + size:
+ return
+ yield tag, extra[pos : pos + size]
+ pos += size
+
+
+# Make our best guess at the mtime of a zip file entry
+def zip_info_mtime(info):
+ # Look for Unix-format mtime in the 'extra' block
+ for tag, data in zip_extra_fields(info.extra):
+ format = None
+ if tag in [EXT_TAG_UNIX0, EXT_TAG_UNIX1]:
+ format = '<4xL' # AcTime, ModTime
+ elif tag == EXT_TAG_TIME:
+ # First byte indicates which timestamps follow
+ if len(data) >= 1 and data[0] & EXT_TIME_FLAG_MTIME:
+ format = '<xL' # Flags, ModTime
+ if format:
+ min_len = struct.calcsize(format)
+ if len(data) >= min_len:
+ return struct.unpack(format, data[:min_len])[0]
+
+ # Timestamps in the main header are in local time, but the time
+ # zone offset is unspecified. We choose to interpret them as UTC.
+ return calendar.timegm(info.date_time + (0, 0, 0))
+
+
def export(zipfile, fast_import):
def printlines(list):
for str in list:
fast_import.write(str.encode('utf-8') + b"\n")
- commit_time = (1970, 1, 1, 0, 0, 0)
+ commit_time = 0
next_mark = 1
common_prefix = None
mark = dict()
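
To illustrate the layout being parsed by zip_extra_fields() and zip_info_mtime(),
here is a self-contained sketch that builds an Extended Timestamp (0x5455) 'extra'
field and decodes it the same way; the timestamp value is invented:

    import struct

    EXT_TAG_TIME = 0x5455        # Info-ZIP Extended Timestamp
    EXT_TIME_FLAG_MTIME = 1

    # Hand-built 'extra' block: tag, data size, then a flags byte and a
    # 32-bit mtime (1594816907 is 2020-07-15 12:41:47 UTC, chosen arbitrarily).
    extra = struct.pack('<HHBL', EXT_TAG_TIME, 5, EXT_TIME_FLAG_MTIME, 1594816907)

    pos = 0
    while len(extra) >= pos + 4:
        tag, size = struct.unpack('<HH', extra[pos:pos + 4])
        pos += 4
        data = extra[pos:pos + size]
        pos += size
        if tag == EXT_TAG_TIME and data[0] & EXT_TIME_FLAG_MTIME:
            # Skip the flags byte, read the little-endian mtime
            print(struct.unpack('<xL', data[:5])[0])   # -> 1594816907
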
@@ -39,8 +81,7 @@ def export(zipfile, fast_import):
continue
info = zip.getinfo(name)
- if commit_time < info.date_time:
- commit_time = info.date_time
+ commit_time = max(commit_time, zip_info_mtime(info))
if common_prefix == None:
common_prefix = name[:name.rfind('/') + 1]
else:
@@ -56,7 +97,7 @@ def export(zipfile, fast_import):
fast_import.write(zip.read(name) + b"\n")
committer = committer_name + ' <' + committer_email + '> %d +0000' % \
- time.mktime(commit_time + (0, 0, 0))
+ commit_time
zipfile_basename = os.path.basename(zipfile)
printlines(('commit ' + branch_ref, 'committer ' + committer, \
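
Why the switch from time.mktime() to calendar.timegm() matters for reproducibility,
in a short sketch (the date tuple is invented):

    import calendar
    import time

    # date_time tuple as stored in a zip entry's main header
    date_time = (2020, 7, 15, 12, 41, 47)

    # time.mktime() interprets the tuple in the *local* time zone, so the
    # committer timestamp (and hence the commit SHA1) varied between machines.
    # calendar.timegm() always interprets it as UTC.
    print(time.mktime(date_time + (0, 0, 0)))      # depends on the local TZ
    print(calendar.timegm(date_time + (0, 0, 0)))  # always 1594816907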