diff options
| author | Giampaolo Rodola <g.rodola@gmail.com> | 2017-05-01 17:20:28 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2017-05-01 17:20:28 +0200 |
| commit | 2a7ba3d6e270effdd69975517be4c84790757e96 (patch) | |
| tree | 86f23b10331ba8405e0224b8aba7accfb5f55fcc | |
| parent | 209980937428fc53cf26a3d22df138cc291d50c4 (diff) | |
| parent | 552ee29eb1151ddec0b652db8d974abfa38440bd (diff) | |
| download | psutil-2a7ba3d6e270effdd69975517be4c84790757e96.tar.gz | |
Merge pull request #1036 from himanshub16/check-broken-links
Check broken links
| -rw-r--r-- | Makefile | 7 | ||||
| -rwxr-xr-x | scripts/internal/check_broken_links.py | 147 | ||||
| -rwxr-xr-x | scripts/internal/winmake.py | 1 |
3 files changed, 154 insertions, 1 deletions
@@ -22,7 +22,8 @@ DEPS = \ setuptools \ sphinx \ twine \ - unittest2 + unittest2 \ + requests # In not in a virtualenv, add --user options for install commands. INSTALL_OPTS = `$(PYTHON) -c "import sys; print('' if hasattr(sys, 'real_prefix') else '--user')"` @@ -278,3 +279,7 @@ doc: cd docs && make html && cd _build/html/ && zip doc.zip -r . mv docs/_build/html/doc.zip . @echo "done; now manually upload doc.zip from here: https://pypi.python.org/pypi?:action=pkg_edit&name=psutil" + +# check whether the links mentioned in some files are valid. +check-broken-links: + git ls-files | grep \\.rst$ | xargs $(PYTHON) scripts/internal/check_broken_links.py diff --git a/scripts/internal/check_broken_links.py b/scripts/internal/check_broken_links.py new file mode 100755 index 00000000..b14e9f59 --- /dev/null +++ b/scripts/internal/check_broken_links.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python + +# Author : Himanshu Shekhar < https://github.com/himanshub16 > (2017) + +# Copyright (c) 2009, Giampaolo Rodola'. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +""" +Checks for broken links in file names specified as command line parameters. + +There are a ton of a solutions available for validating URLs in string using +regex, but less for searching, of which very few are accurate. +This snippet is intended to just do the required work, and avoid complexities. +Django Validator has pretty good regex for validation, but we have to find +urls instead of validating them. (REFERENCES [7]) +There's always room for improvement. + +Method: +* Match URLs using regex (REFERENCES [1]]) +* Some URLs need to be fixed, as they have < (or) > due to inefficient regex. +* Remove duplicates (because regex is not 100% efficient as of now). +* Check validity of URL, using HEAD request. (HEAD to save bandwidth) + Uses requests module for others are painful to use. REFERENCES[9] + Handles redirects, http, https, ftp as well. + +REFERENCES: +Using [1] with some modificatons for including ftp +[1] http://stackoverflow.com/a/6883094/5163807 +[2] http://stackoverflow.com/a/31952097/5163807 +[3] http://daringfireball.net/2010/07/improved_regex_for_matching_urls +[4] https://mathiasbynens.be/demo/url-regex +[5] https://github.com/django/django/blob/master/django/core/validators.py +[6] https://data.iana.org/TLD/tlds-alpha-by-domain.txt +[7] https://codereview.stackexchange.com/questions/19663/http-url-validating +[8] https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/HEAD +[9] http://docs.python-requests.org/ + +""" + +from __future__ import print_function + +import os +import re +import sys + +import concurrent.futures +import requests + + +HERE = os.path.abspath(os.path.dirname(__file__)) + +REGEX = r'(?:http|ftp|https)?://' \ + r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' + +REQUEST_TIMEOUT = 30 + +# There are some status codes sent by websites on HEAD request. +# Like 503 by Microsoft, and 401 by Apple +# They need to be sent GET request +RETRY_STATUSES = [503, 401, 403] + + +def get_urls(filename): + """Extracts all URLs available in specified filename + """ + # fname = os.path.abspath(os.path.join(HERE, filename)) + # expecting absolute path + with open(filename) as fs: + text = fs.read() + + urls = re.findall(REGEX, text) + # remove duplicates, list for sets are not iterable + urls = list(set(urls)) + # correct urls which are between < and/or > + for i, url in enumerate(urls): + urls[i] = re.sub("[\*<>\(\)\)]", '', url) + + return urls + + +def validate_url(url): + """Validate the URL by attempting an HTTP connection. + Makes an HTTP-HEAD request for each URL. + Uses requests module. + """ + try: + res = requests.head(url, timeout=REQUEST_TIMEOUT) + # some websites deny 503, like Microsoft + # and some send 401, like Apple, observations + if (not res.ok) and (res.status_code in RETRY_STATUSES): + res = requests.get(url, timeout=REQUEST_TIMEOUT) + return res.ok + except requests.exceptions.RequestException: + return False + + +def parallel_validator(urls): + """validates all urls in parallel + urls: tuple(filename, url) + """ + fails = [] # list of tuples (filename, url) + completed = 0 + total = len(urls) + + with concurrent.futures.ThreadPoolExecutor() as executor: + fut_to_url = {executor.submit(validate_url, url[1]): url + for url in urls} + + for fut in concurrent.futures.as_completed(fut_to_url): + if not fut.result(): + url = fut_to_url[fut] + fails.append(url) # actually a tuple of url and filename + completed += 1 + sys.stdout.write("\r" + str(completed)+' / '+str(total)) + sys.stdout.flush() + + print() + return fails + + +def main(): + """Main function + """ + files = sys.argv[1:] + + if not files: + return sys.exit("usage: %s <FILES...>" % __name__) + all_urls = [] + for fname in files: + urls = get_urls(fname) + for url in urls: + all_urls.append((fname, url)) + + fails = parallel_validator(all_urls) + if not fails: + print("all links are valid. cheers!") + else: + for fail in fails: + print(fail[1] + ' : ' + fail[0] + os.linesep) + print('-' * 20) + print("total :", len(fails), "fails!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/scripts/internal/winmake.py b/scripts/internal/winmake.py index 0c8f8fea..c9139977 100755 --- a/scripts/internal/winmake.py +++ b/scripts/internal/winmake.py @@ -41,6 +41,7 @@ DEPS = [ "unittest2", "wheel", "wmi", + "requests" ] _cmds = {} |
