From 14987b0be21689eeac1371e6489d59f297c93b6e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 10 Sep 2020 21:51:55 +0100 Subject: scripts/check-lorry-urls: Check for archive URLs pointing to non-archives Some sites serve generic HTML pages with status 200 for missing files, instead of status 404. Check that the Content-Type is not text/html. (We can't check for a "correct" Content-Type, as archives might be served with almost any other type.) Also check that the file is not empty (Content-Length: 0), as this can't be a valid archive. --- scripts/check-lorry-urls | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/check-lorry-urls b/scripts/check-lorry-urls index 0829684..92037ab 100755 --- a/scripts/check-lorry-urls +++ b/scripts/check-lorry-urls @@ -115,11 +115,14 @@ def check_file(filename): with urllib.request.urlopen( urllib.request.Request(upstream_url, method='HEAD')) as result: - pass + if result.info().get('Content-Type') == 'text/html': + error('Content-Type is HTML') + elif result.info().get('Content-Length') == '0': + error('file is empty') + else: + info('OK') except (urllib.error.URLError, http.client.HTTPException) as e: error(str(e)) - else: - info('OK') # Throttle requests time.sleep(0.1) -- cgit v1.2.1