summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Ben Hutchings <ben.hutchings@codethink.co.uk> 2020-09-10 21:51:55 +0100
committer: Ben Hutchings <ben.hutchings@codethink.co.uk> 2020-09-10 21:57:15 +0100
commit: 14987b0be21689eeac1371e6489d59f297c93b6e (patch)
tree: 0509123cec48c2999b3a61b7b3faf9688d93012f
parent: 1fa0571d0658f6196d31402e63457735e5fcf291 (diff)
download: lorries-bwh/url-check-fixes.tar.gz
scripts/check-lorry-urls: Check for archive URLs pointing to non-archives (bwh/url-check-fixes)
Some sites serve generic HTML pages with status 200 for missing files, instead of status 404. Check that the Content-Type is not text/html. (We can't check for a "correct" Content-Type, as archives might be served with almost any other type.) Also check that the file is not empty (Content-Length: 0), as this can't be a valid archive.
-rwxr-xr-x scripts/check-lorry-urls 9
1 files changed, 6 insertions, 3 deletions
diff --git a/scripts/check-lorry-urls b/scripts/check-lorry-urls
index 0829684..92037ab 100755
--- a/scripts/check-lorry-urls
+++ b/scripts/check-lorry-urls
@@ -115,11 +115,14 @@ def check_file(filename):
with urllib.request.urlopen(
urllib.request.Request(upstream_url,
method='HEAD')) as result:
- pass
+ if result.info().get('Content-Type') == 'text/html':
+ error('Content-Type is HTML')
+ elif result.info().get('Content-Length') == '0':
+ error('file is empty')
+ else:
+ info('OK')
except (urllib.error.URLError, http.client.HTTPException) as e:
error(str(e))
- else:
- info('OK')
# Throttle requests
time.sleep(0.1)