diff options
author | Ben Hutchings <ben.hutchings@codethink.co.uk> | 2020-09-10 21:51:55 +0100 |
---|---|---|
committer | Ben Hutchings <ben.hutchings@codethink.co.uk> | 2020-09-10 21:57:15 +0100 |
commit | 14987b0be21689eeac1371e6489d59f297c93b6e (patch) | |
tree | 0509123cec48c2999b3a61b7b3faf9688d93012f | |
parent | 1fa0571d0658f6196d31402e63457735e5fcf291 (diff) | |
download | lorries-14987b0be21689eeac1371e6489d59f297c93b6e.tar.gz |
scripts/check-lorry-urls: Check for archive URLs pointing to non-archives (branch: bwh/url-check-fixes)
Some sites serve generic HTML pages with status 200 for missing files,
instead of status 404. Check that the Content-Type is not text/html.
(We can't check for a "correct" Content-Type, as archives might be
served with almost any other type.)
Also check that the file is not empty (Content-Length: 0), as this
can't be a valid archive.
-rwxr-xr-x | scripts/check-lorry-urls | 9 |
1 file changed, 6 insertions, 3 deletions
diff --git a/scripts/check-lorry-urls b/scripts/check-lorry-urls
index 0829684..92037ab 100755
--- a/scripts/check-lorry-urls
+++ b/scripts/check-lorry-urls
@@ -115,11 +115,14 @@ def check_file(filename):
 
             with urllib.request.urlopen(
                     urllib.request.Request(upstream_url, method='HEAD')) as result:
-                pass
+                if result.info().get('Content-Type') == 'text/html':
+                    error('Content-Type is HTML')
+                elif result.info().get('Content-Length') == '0':
+                    error('file is empty')
+                else:
+                    info('OK')
         except (urllib.error.URLError, http.client.HTTPException) as e:
             error(str(e))
-        else:
-            info('OK')
 
         # Throttle requests
         time.sleep(0.1)