summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorGiampaolo Rodola <g.rodola@gmail.com>2017-05-01 18:16:51 +0200
committerGiampaolo Rodola <g.rodola@gmail.com>2017-05-01 18:16:51 +0200
commit789773dfeae12b0b089887eb0e0b5360adf442d9 (patch)
tree7f23ba62aea5160753a01537bea598524957aa9d /scripts
parent9c7974c1929330c1e79ca024320366db918c205c (diff)
downloadpsutil-789773dfeae12b0b089887eb0e0b5360adf442d9.tar.gz
#1036: add exception handling + some minor coding style adjustments
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/internal/check_broken_links.py78
1 files changed, 38 insertions, 40 deletions
diff --git a/scripts/internal/check_broken_links.py b/scripts/internal/check_broken_links.py
index b14e9f59..ec492f61 100755
--- a/scripts/internal/check_broken_links.py
+++ b/scripts/internal/check_broken_links.py
@@ -1,24 +1,24 @@
#!/usr/bin/env python
-# Author : Himanshu Shekhar < https://github.com/himanshub16 > (2017)
-
-# Copyright (c) 2009, Giampaolo Rodola'. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Copyright (c) 2009, Giampaolo Rodola', Himanshu Shekhar.
+# All rights reserved. Use of this source code is governed by a
+# BSD-style license that can be found in the LICENSE file.
"""
-Checks for broken links in file names specified as command line parameters.
-
-There are a ton of a solutions available for validating URLs in string using
-regex, but less for searching, of which very few are accurate.
-This snippet is intended to just do the required work, and avoid complexities.
-Django Validator has pretty good regex for validation, but we have to find
-urls instead of validating them. (REFERENCES [7])
+Checks for broken links in file names specified as command line
+parameters.
+
+There are a ton of solutions available for validating URLs in strings
+using regex, but less for searching, of which very few are accurate.
+This snippet is intended to just do the required work, and avoid
+complexities. Django Validator has pretty good regex for validation,
+but we have to find urls instead of validating them (REFERENCES [7]).
There's always room for improvement.
Method:
* Match URLs using regex (REFERENCES [1]])
-* Some URLs need to be fixed, as they have < (or) > due to inefficient regex.
+* Some URLs need to be fixed, as they have < (or) > due to inefficient
+ regex.
* Remove duplicates (because regex is not 100% efficient as of now).
* Check validity of URL, using HEAD request. (HEAD to save bandwidth)
Uses requests module for others are painful to use. REFERENCES[9]
@@ -36,6 +36,7 @@ Using [1] with some modificatons for including ftp
[8] https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/HEAD
[9] http://docs.python-requests.org/
+Author: Himanshu Shekhar <https://github.com/himanshub16> (2017)
"""
from __future__ import print_function
@@ -43,18 +44,16 @@ from __future__ import print_function
import os
import re
import sys
-
+import traceback
import concurrent.futures
+
import requests
HERE = os.path.abspath(os.path.dirname(__file__))
-
REGEX = r'(?:http|ftp|https)?://' \
r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
-
REQUEST_TIMEOUT = 30
-
# There are some status codes sent by websites on HEAD request.
# Like 503 by Microsoft, and 401 by Apple
# They need to be sent GET request
@@ -62,27 +61,21 @@ RETRY_STATUSES = [503, 401, 403]
def get_urls(filename):
- """Extracts all URLs available in specified filename
- """
- # fname = os.path.abspath(os.path.join(HERE, filename))
- # expecting absolute path
+ """Extracts all URLs available in specified filename."""
with open(filename) as fs:
text = fs.read()
-
urls = re.findall(REGEX, text)
# remove duplicates, list for sets are not iterable
urls = list(set(urls))
# correct urls which are between < and/or >
for i, url in enumerate(urls):
urls[i] = re.sub("[\*<>\(\)\)]", '', url)
-
return urls
def validate_url(url):
"""Validate the URL by attempting an HTTP connection.
Makes an HTTP-HEAD request for each URL.
- Uses requests module.
"""
try:
res = requests.head(url, timeout=REQUEST_TIMEOUT)
@@ -100,32 +93,36 @@ def parallel_validator(urls):
urls: tuple(filename, url)
"""
fails = [] # list of tuples (filename, url)
- completed = 0
+ current = 0
total = len(urls)
-
with concurrent.futures.ThreadPoolExecutor() as executor:
fut_to_url = {executor.submit(validate_url, url[1]): url
for url in urls}
-
for fut in concurrent.futures.as_completed(fut_to_url):
- if not fut.result():
- url = fut_to_url[fut]
- fails.append(url) # actually a tuple of url and filename
- completed += 1
- sys.stdout.write("\r" + str(completed)+' / '+str(total))
+ current += 1
+ sys.stdout.write("\r%s / %s" % (current, total))
sys.stdout.flush()
-
- print()
+ fname, url = fut_to_url[fut]
+ try:
+ ok = fut.result()
+ except Exception:
+ fails.append((fname, url))
+ print()
+ print("warn: error while validating %s" % url, file=sys.stderr)
+ traceback.print_exc()
+ else:
+ if not ok:
+ fails.append((fname, url))
+ if fails:
+ print()
return fails
def main():
- """Main function
- """
files = sys.argv[1:]
-
if not files:
return sys.exit("usage: %s <FILES...>" % __name__)
+
all_urls = []
for fname in files:
urls = get_urls(fname)
@@ -134,12 +131,13 @@ def main():
fails = parallel_validator(all_urls)
if not fails:
- print("all links are valid. cheers!")
+ print("all links are valid; cheers!")
else:
for fail in fails:
- print(fail[1] + ' : ' + fail[0] + os.linesep)
+ fname, url = fail
+ print("%s : %s " % (url, fname))
print('-' * 20)
- print("total :", len(fails), "fails!")
+ print("total: %s fails!" % len(fails))
sys.exit(1)