From 14987b0be21689eeac1371e6489d59f297c93b6e Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben.hutchings@codethink.co.uk>
Date: Thu, 10 Sep 2020 21:51:55 +0100
Subject: scripts/check-lorry-urls: Check for archive URLs pointing to
 non-archives

Some sites respond to requests for missing files with a generic HTML
page and status 200, instead of status 404.  To catch this, report an
error when the response's Content-Type is text/html.  (We can't check
for a "correct" Content-Type, as archives might be served with almost
any other type.)

Also check that the file is not empty (Content-Length: 0), as this
can't be a valid archive.
---
 scripts/check-lorry-urls | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scripts/check-lorry-urls b/scripts/check-lorry-urls
index 0829684..92037ab 100755
--- a/scripts/check-lorry-urls
+++ b/scripts/check-lorry-urls
@@ -115,11 +115,14 @@ def check_file(filename):
                     with urllib.request.urlopen(
                             urllib.request.Request(upstream_url,
                                                    method='HEAD')) as result:
-                        pass
+                        if result.info().get('Content-Type') == 'text/html':
+                            error('Content-Type is HTML')
+                        elif result.info().get('Content-Length') == '0':
+                            error('file is empty')
+                        else:
+                            info('OK')
                 except (urllib.error.URLError, http.client.HTTPException) as e:
                     error(str(e))
-                else:
-                    info('OK')
 
             # Throttle requests
             time.sleep(0.1)
-- 
cgit v1.2.1