#1036: add exception handling + some minor coding style adjustments

author: Giampaolo Rodola <g.rodola@gmail.com> 2017-05-01 18:16:51 +0200
committer: Giampaolo Rodola <g.rodola@gmail.com> 2017-05-01 18:16:51 +0200
commit: 789773dfeae12b0b089887eb0e0b5360adf442d9 (patch)
tree: 7f23ba62aea5160753a01537bea598524957aa9d /scripts
parent: 9c7974c1929330c1e79ca024320366db918c205c (diff)
download: psutil-789773dfeae12b0b089887eb0e0b5360adf442d9.tar.gz
1 files changed, 38 insertions, 40 deletions
diff --git a/scripts/internal/check_broken_links.py b/scripts/internal/check_broken_links.py
index b14e9f59..ec492f61 100755
--- a/scripts/internal/check_broken_links.py
+++ b/scripts/internal/check_broken_links.py
@@ -1,24 +1,24 @@
 #!/usr/bin/env python
 
-# Author : Himanshu Shekhar < https://github.com/himanshub16 > (2017)
-
-# Copyright (c) 2009, Giampaolo Rodola'. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
+# Copyright (c) 2009, Giampaolo Rodola', Himanshu Shekhar.
+# All rights reserved. Use of this source code is governed by a
+# BSD-style license that can be found in the LICENSE file.
 
 """
-Checks for broken links in file names specified as command line parameters.
-
-There are a ton of a solutions available for validating URLs in string using
-regex, but less for searching, of which very few are accurate.
-This snippet is intended to just do the required work, and avoid complexities.
-Django Validator has pretty good regex for validation, but we have to find
-urls instead of validating them. (REFERENCES [7])
+Checks for broken links in file names specified as command line
+parameters.
+
+There are a ton of solutions available for validating URLs in string
+using regex, but less for searching, of which very few are accurate.
+This snippet is intended to just do the required work, and avoid
+complexities. Django Validator has pretty good regex for validation,
+but we have to find urls instead of validating them (REFERENCES [7]).
 There's always room for improvement.
 
 Method:
 * Match URLs using regex (REFERENCES [1]])
-* Some URLs need to be fixed, as they have < (or) > due to inefficient regex.
+* Some URLs need to be fixed, as they have < (or) > due to inefficient
+  regex.
 * Remove duplicates (because regex is not 100% efficient as of now).
 * Check validity of URL, using HEAD request. (HEAD to save bandwidth)
   Uses requests module for others are painful to use. REFERENCES[9]
@@ -36,6 +36,7 @@ Using [1] with some modificatons for including ftp
 [8] https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/HEAD
 [9] http://docs.python-requests.org/
 
+Author: Himanshu Shekhar <https://github.com/himanshub16> (2017)
 """
 
 from __future__ import print_function
@@ -43,18 +44,16 @@ from __future__ import print_function
 import os
 import re
 import sys
-
+import traceback
 import concurrent.futures
+
 import requests
 
 
 HERE = os.path.abspath(os.path.dirname(__file__))
-
 REGEX = r'(?:http|ftp|https)?://' \
         r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
-
 REQUEST_TIMEOUT = 30
-
 # There are some status codes sent by websites on HEAD request.
 # Like 503 by Microsoft, and 401 by Apple
 # They need to be sent GET request
@@ -62,27 +61,21 @@ RETRY_STATUSES = [503, 401, 403]
 
 
 def get_urls(filename):
-    """Extracts all URLs available in specified filename
-    """
-    # fname = os.path.abspath(os.path.join(HERE, filename))
-    # expecting absolute path
+    """Extracts all URLs available in specified filename."""
     with open(filename) as fs:
         text = fs.read()
-
     urls = re.findall(REGEX, text)
     # remove duplicates, list for sets are not iterable
     urls = list(set(urls))
     # correct urls which are between < and/or >
     for i, url in enumerate(urls):
         urls[i] = re.sub("[\*<>\(\)\)]", '', url)
-
     return urls
 
 
 def validate_url(url):
     """Validate the URL by attempting an HTTP connection.
     Makes an HTTP-HEAD request for each URL.
-    Uses requests module.
     """
     try:
         res = requests.head(url, timeout=REQUEST_TIMEOUT)
@@ -100,32 +93,36 @@ def parallel_validator(urls):
     urls: tuple(filename, url)
     """
     fails = []  # list of tuples (filename, url)
-    completed = 0
+    current = 0
     total = len(urls)
-
     with concurrent.futures.ThreadPoolExecutor() as executor:
         fut_to_url = {executor.submit(validate_url, url[1]): url
                       for url in urls}
-
         for fut in concurrent.futures.as_completed(fut_to_url):
-            if not fut.result():
-                url = fut_to_url[fut]
-                fails.append(url)  # actually a tuple of url and filename
-            completed += 1
-            sys.stdout.write("\r" + str(completed)+' / '+str(total))
+            current += 1
+            sys.stdout.write("\r%s / %s" % (current, total))
             sys.stdout.flush()
-
-    print()
+            fname, url = fut_to_url[fut]
+            try:
+                ok = fut.result()
+            except Exception:
+                fails.append((fname, url))
+                print()
+                print("warn: error while validating %s" % url, file=sys.stderr)
+                traceback.print_exc()
+            else:
+                if not ok:
+                    fails.append((fname, url))
+    if fails:
+        print()
     return fails
 
 
 def main():
-    """Main function
-    """
     files = sys.argv[1:]
-
     if not files:
         return sys.exit("usage: %s <FILES...>" % __name__)
+
     all_urls = []
     for fname in files:
         urls = get_urls(fname)
@@ -134,12 +131,13 @@ def main():
 
     fails = parallel_validator(all_urls)
     if not fails:
-        print("all links are valid. cheers!")
+        print("all links are valid; cheers!")
     else:
         for fail in fails:
-            print(fail[1] + ' : ' + fail[0] + os.linesep)
+            fname, url = fail
+            print("%s : %s " % (url, fname))
         print('-' * 20)
-        print("total :", len(fails), "fails!")
+        print("total: %s fails!" % len(fails))
         sys.exit(1)
author	Giampaolo Rodola <g.rodola@gmail.com>	2017-05-01 18:16:51 +0200
committer	Giampaolo Rodola <g.rodola@gmail.com>	2017-05-01 18:16:51 +0200
commit	789773dfeae12b0b089887eb0e0b5360adf442d9 (patch)
tree	7f23ba62aea5160753a01537bea598524957aa9d /scripts
parent	9c7974c1929330c1e79ca024320366db918c205c (diff)
download	psutil-789773dfeae12b0b089887eb0e0b5360adf442d9.tar.gz