summaryrefslogtreecommitdiff
path: root/lorry
diff options
context:
space:
mode:
author: Ben Hutchings <ben.hutchings@codethink.co.uk> 2020-07-11 00:44:21 +0100
committer: Ben Hutchings <ben.hutchings@codethink.co.uk> 2020-07-15 00:35:34 +0100
commit: 9662091853cbb3e7c9d9962afce53e94aa238924 (patch)
tree: 7a1223ad7f184933152acee72e6bbdfc59faf35a /lorry
parent: 47ac5b7452f5b6d112ac0bcda6250697ee2f6fa9 (diff)
download: lorry-9662091853cbb3e7c9d9962afce53e94aa238924.tar.gz
lorry: Get modification time for FTP downloads
lorry.gzip-importer uses the given file's modification time (mtime) as the commit time, but this is currently unrelated to the mtime on the server and so is unreproducible. For FTP downloads, urllib's default handler does not provide an mtime and there is no reasonable way to add that in a subclass. Define and use a new handler class that attempts to get the mtime from the server using the MDTM extension. Related to #4.
Diffstat (limited to 'lorry')
-rwxr-xr-x lorry 111
1 files changed, 110 insertions, 1 deletions
diff --git a/lorry b/lorry
index 0318ef8..4f7f574 100755
--- a/lorry
+++ b/lorry
@@ -19,7 +19,7 @@ import cliapp
import json
import logging
import os
-import urllib.request, urllib.parse
+import urllib.request, urllib.parse, urllib.response
import string
import sys
from datetime import datetime
@@ -27,6 +27,9 @@ import shutil
import traceback
import functools
import stat
+import email.message
+import ftplib
+import re
import yaml
@@ -69,6 +72,109 @@ def find_bazaar_command():
return find_exec_in_path('bzr') or find_exec_in_path('brz')
# This is a simplified replacement for urllib.request.FTPHandler, with
# one additional feature: it uses the MDTM extension specified in RFC
# 3659, and sets the Last-Modified header based on the result.
class SimpleFTPHandler(urllib.request.BaseHandler):
    """urllib handler for anonymous, GET-only FTP downloads.

    On top of fetching the file, the handler asks the server for the
    file's modification time with MDTM and, when the server gives a
    usable answer, exposes it as a ``Last-Modified`` response header.
    The lookup is best-effort: a server that rejects MDTM still serves
    the file, just without that header.
    """

    # Priority needs to be higher (numerically lower) than the
    # standard FTPHandler
    handler_order = urllib.request.FTPHandler.handler_order - 1

    # Format is YYYYMMDDhhmmss with optional fractional seconds (which
    # we ignore). The implicit time zone is UTC.
    _mdtm_response_re = re.compile(r'^213 (\d{14})(?:\.\d+)?$')

    @classmethod
    def _parse_mdtm_response(cls, response):
        """Convert an MDTM reply line into an aware UTC datetime.

        Returns None when the reply does not have the expected
        ``213 YYYYMMDDhhmmss[.fff]`` form or names an impossible date.
        """
        from datetime import timezone

        match = cls._mdtm_response_re.match(response)
        if match is None:
            return None
        try:
            mtime = datetime.strptime(match.group(1), '%Y%m%d%H%M%S')
        except ValueError:
            # Fourteen digits, but not a real date/time (e.g. month 13)
            return None
        return mtime.replace(tzinfo=timezone.utc)

    def ftp_open(self, req):
        """Open an ``ftp://`` URL and return a file-like response.

        Args:
            req: the urllib request; only its ``method`` (if any) and
                ``full_url`` are used.

        Returns:
            A ``urllib.response.addinfourl`` wrapping the data
            connection.  Its headers carry ``Content-Length`` when the
            server announced a size and ``Last-Modified`` when MDTM
            succeeded.  Closing it also closes the control connection.

        Raises:
            URLError: for unsupported URL features (non-GET method,
                credentials, parameters, query), a path without a file
                name or containing CR/LF, and any FTP or socket error.
        """
        import email.utils
        from urllib.error import URLError

        if getattr(req, 'method', None) not in [None, 'GET']:
            raise URLError('SimpleFTPHandler: only supports GET method')
        url_parts = urllib.parse.urlparse(req.full_url)
        if url_parts.username or url_parts.password:
            raise URLError('SimpleFTPHandler: only supports anonymous FTP')
        if ';' in url_parts.path or url_parts.params or url_parts.query:
            raise URLError('SimpleFTPHandler: does not support parameters')

        path_parts = []
        for part in url_parts.path.split('/'):
            if part == '':
                continue
            part = urllib.parse.unquote(part)
            # Either character on its own would terminate the FTP
            # command line, so reject both, not just the CRLF pair.
            if '\r' in part or '\n' in part:
                raise URLError('SimpleFTPHandler: illegal characters in path')
            path_parts.append(part)
        if not path_parts:
            raise URLError('SimpleFTPHandler: no file name in URL')
        dir_parts, file_name = path_parts[:-1], path_parts[-1]

        ftp = ftplib.FTP()
        try:
            ftp.connect(url_parts.hostname, url_parts.port or 21)
            ftp.login()
            for part in dir_parts:
                ftp.cwd(part)

            # Try to get the mtime from the server, ignoring error
            # or invalid responses.  ftplib reports 4xx/5xx replies
            # (e.g. "500 unknown command") as error_temp/error_perm.
            try:
                mdtm_response = ftp.sendcmd('MDTM ' + file_name)
            except (ftplib.error_reply, ftplib.error_temp,
                    ftplib.error_perm):
                mtime = None
            else:
                mtime = self._parse_mdtm_response(mdtm_response)

            # Start binary mode transfer
            ftp.voidcmd('TYPE I')
            data_sock, size = ftp.ntransfercmd('RETR ' + file_name)
            try:
                data_file = data_sock.makefile('rb')
            finally:
                # The file object keeps the connection open; dropping
                # the socket's own handle means closing the file is
                # enough to release the descriptor.
                data_sock.close()

            try:
                # Synthesise an HTTP-like response header
                header = email.message.EmailMessage()
                if size is not None:
                    header['Content-Length'] = str(size)
                if mtime is not None:
                    # Locale-independent RFC 7231 date, always in GMT
                    header['Last-Modified'] = \
                        email.utils.format_datetime(mtime, usegmt=True)

                # Wrap up the file with a close hook to close the
                # control socket as well, and the extra metadata
                # expected in a response object.  The hook is bound to
                # this connection, so concurrent downloads through the
                # same handler cannot interfere with each other.
                response = urllib.response.addinfourl(
                    urllib.response.addclosehook(data_file, ftp.close),
                    header, req.full_url)
            except BaseException:
                # Close data socket on error
                data_file.close()
                raise

            # The response now owns the control connection
            ftp = None
            return response

        except ftplib.all_errors as e:
            # Re-raise as URLError
            raise URLError('SimpleFTPHandler: %r' % e) from e

        finally:
            # Close control socket on error
            if ftp is not None:
                ftp.close()
+
class Lorry(cliapp.Application):
def add_settings(self):
@@ -130,6 +236,9 @@ class Lorry(cliapp.Application):
if not os.path.exists(self.settings['working-area']):
os.makedirs(self.settings['working-area'])
+ urllib.request.install_opener(
+ urllib.request.build_opener(SimpleFTPHandler))
+
for arg in args:
self.progress('Processing spec file %s' % arg)
with open(arg) as f: