diff options
author | Ben Hutchings <ben.hutchings@codethink.co.uk> | 2020-07-15 12:41:47 +0000 |
---|---|---|
committer | Ben Hutchings <ben.hutchings@codethink.co.uk> | 2020-07-15 12:41:47 +0000 |
commit | 4600f8f0beba655f72b4bb0098f94d47a0152a29 (patch) | |
tree | 9bde04bbeb64d39a045c8e20f8641404bd69b5fc | |
parent | e756b7ba186917327109780248a9d57820eb55a6 (diff) | |
parent | 8948a44ffb03ba5467daceebdefc47c8900f51b2 (diff) | |
download | lorry-4600f8f0beba655f72b4bb0098f94d47a0152a29.tar.gz |
Merge branch 'bwh/reproducibility' into 'master'
Fix reproducibility of file imports
Closes #1 and #4
See merge request CodethinkLabs/lorry/lorry!8
-rw-r--r-- | README | 4 | ||||
-rwxr-xr-x | lorry | 121 | ||||
-rwxr-xr-x | lorry.tar-importer | 1 | ||||
-rwxr-xr-x | lorry.zip-importer | 49 |
4 files changed, 165 insertions, 10 deletions
# This is a simplified replacement for urllib.request.FTPHandler, with
# one additional feature: it uses the MDTM extension specified in RFC
# 3659, and sets the Last-Modified header based on the result.
class SimpleFTPHandler(urllib.request.BaseHandler):
    """urllib handler for anonymous, binary-mode FTP downloads.

    The response returned by ftp_open() carries HTTP-like headers:
    Content-Length when the server reports a size for the transfer,
    and Last-Modified when the server answers the MDTM command with a
    valid timestamp.
    """

    # Priority needs to be higher (numerically lower) than the
    # standard FTPHandler
    handler_order = urllib.request.FTPHandler.handler_order - 1

    # Format is YYYYMMDDhhmmss with optional fractional seconds (which
    # we ignore).  The implicit time zone is UTC.
    _mdtm_response_re = re.compile(r'^213 (\d{14})(?:\.\d+)?$')

    @classmethod
    def _parse_mdtm(cls, response):
        """Convert an MDTM reply line to an aware UTC datetime.

        Returns None when the reply is not a well-formed "213" response
        or when its digits do not form a valid calendar date/time.
        """
        from datetime import timezone

        match = cls._mdtm_response_re.match(response)
        if not match:
            return None
        stamp = match.group(1)
        try:
            return datetime(
                int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8]),
                int(stamp[8:10]), int(stamp[10:12]), int(stamp[12:14]),
                tzinfo=timezone.utc)
        except ValueError:
            return None

    def ftp_open(self, req):
        """Open the ftp:// URL of *req* and return a response object.

        Only anonymous GET-style requests without URL parameters or a
        query string are supported.  All validation happens before any
        network connection is made.

        Raises urllib.error.URLError for unsupported requests, for
        paths that are empty or contain CR/LF, and for any ftplib or
        socket failure while talking to the server.
        """
        from urllib.error import URLError

        if getattr(req, 'method', None) not in [None, 'GET']:
            raise URLError('SimpleFTPHandler: only supports GET method')
        url_parts = urllib.parse.urlparse(req.full_url)
        if url_parts.username or url_parts.password:
            raise URLError('SimpleFTPHandler: only supports anonymous FTP')
        if ';' in url_parts.path or url_parts.params or url_parts.query:
            raise URLError('SimpleFTPHandler: does not support parameters')

        path_parts = []
        for part in url_parts.path.split('/'):
            if part == '':
                continue
            part = urllib.parse.unquote(part)
            # Either character on its own would terminate the FTP
            # command line, so reject each of them, not just the pair.
            if '\r' in part or '\n' in part:
                raise URLError('SimpleFTPHandler: illegal characters in path')
            path_parts.append(part)
        if not path_parts:
            raise URLError('SimpleFTPHandler: no file name in path')
        *dir_parts, file_name = path_parts

        ftp = ftplib.FTP()
        data_file = None
        try:
            ftp.connect(url_parts.hostname, url_parts.port or 21)
            ftp.login()
            for part in dir_parts:
                ftp.cwd(part)

            # Try to get the mtime from the server, ignoring error
            # or invalid responses.  sendcmd() reports 4xx/5xx replies
            # as error_temp/error_perm, so catch the common base class.
            mtime = None
            try:
                mtime = self._parse_mdtm(ftp.sendcmd('MDTM ' + file_name))
            except ftplib.Error:
                pass

            # Start binary mode transfer
            ftp.voidcmd('TYPE I')
            data_sock, size = ftp.ntransfercmd('RETR ' + file_name)
            try:
                data_file = data_sock.makefile('rb')
            finally:
                # The file object keeps the connection open; the socket
                # object itself is no longer needed.
                data_sock.close()

            # Synthesise an HTTP-like response header
            header = email.message.EmailMessage()
            if size is not None:
                header['Content-Length'] = str(size)
            if mtime is not None:
                # format_datetime() always emits English day and month
                # names, unlike strftime() which follows the locale.
                header['Last-Modified'] = \
                    email.utils.format_datetime(mtime, usegmt=True)

            # Wrap up the file with a close hook to close the control
            # socket as well, and the extra metadata expected in a
            # response object.  The hook is bound to this connection
            # so that several responses can be open at the same time.
            response = urllib.response.addinfourl(
                urllib.response.addclosehook(data_file, ftp.close),
                header, req.full_url)
            # Ownership has passed to the response; skip the cleanup
            # below.
            ftp = None
            data_file = None
            return response

        except ftplib.all_errors as e:
            # Re-raise as URLError
            raise URLError('SimpleFTPHandler: %r' % e) from e

        finally:
            # Close data and control sockets on error
            if data_file is not None:
                data_file.close()
            if ftp is not None:
                ftp.close()
# File header 'extra' field tags
EXT_TAG_UNIX0 = 0x000d  # PKWARE Unix, aka Unix type 0
EXT_TAG_TIME = 0x5455  # Extended Timestamp
EXT_TIME_FLAG_MTIME = 1  # mtime present (and first)
EXT_TAG_UNIX1 = 0x5855  # Info-ZIP Unix type 1


def zip_extra_fields(extra):
    """Iterate over fields within a file header 'extra' block.

    Yields (tag, data) pairs, where tag is the 16-bit field identifier
    and data is that field's payload as bytes.  Iteration stops
    silently at the first field whose declared size runs past the end
    of the block.
    """
    pos = 0
    while len(extra) >= pos + 4:
        # Each field starts with a little-endian (tag, size) header
        tag, size = struct.unpack_from('<HH', extra, pos)
        pos += 4
        if len(extra) < pos + size:
            return
        yield tag, extra[pos:pos + size]
        pos += size


def zip_info_mtime(info):
    """Make our best guess at the mtime of a zip file entry.

    info needs 'extra' (bytes) and 'date_time' (6-item tuple)
    attributes.  Returns the modification time in seconds since the
    Unix epoch, taken from the first usable Unix-format timestamp in
    the 'extra' block, or else from the main header's date_time.
    """
    # Look for Unix-format mtime in the 'extra' block
    for tag, data in zip_extra_fields(info.extra):
        mtime_format = None
        if tag in (EXT_TAG_UNIX0, EXT_TAG_UNIX1):
            mtime_format = '<4xL'  # AcTime, ModTime
        elif tag == EXT_TAG_TIME:
            # First byte indicates which timestamps follow
            if len(data) >= 1 and data[0] & EXT_TIME_FLAG_MTIME:
                mtime_format = '<xL'  # Flags, ModTime
        # A field too short for its format is skipped, not an error
        if mtime_format and len(data) >= struct.calcsize(mtime_format):
            return struct.unpack_from(mtime_format, data)[0]

    # Timestamps in the main header are in local time, but the time
    # zone offset is unspecified.  We choose to interpret them as UTC.
    return calendar.timegm(info.date_time + (0, 0, 0))