summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiampaolo Rodola <g.rodola@gmail.com>2017-05-01 17:20:28 +0200
committerGitHub <noreply@github.com>2017-05-01 17:20:28 +0200
commit2a7ba3d6e270effdd69975517be4c84790757e96 (patch)
tree86f23b10331ba8405e0224b8aba7accfb5f55fcc
parent209980937428fc53cf26a3d22df138cc291d50c4 (diff)
parent552ee29eb1151ddec0b652db8d974abfa38440bd (diff)
downloadpsutil-2a7ba3d6e270effdd69975517be4c84790757e96.tar.gz
Merge pull request #1036 from himanshub16/check-broken-links
Check broken links
-rw-r--r--Makefile7
-rwxr-xr-xscripts/internal/check_broken_links.py147
-rwxr-xr-xscripts/internal/winmake.py1
3 files changed, 154 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index fba1f93d..953225e3 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,8 @@ DEPS = \
setuptools \
sphinx \
twine \
- unittest2
+ unittest2 \
+ requests
# In not in a virtualenv, add --user options for install commands.
INSTALL_OPTS = `$(PYTHON) -c "import sys; print('' if hasattr(sys, 'real_prefix') else '--user')"`
@@ -278,3 +279,7 @@ doc:
cd docs && make html && cd _build/html/ && zip doc.zip -r .
mv docs/_build/html/doc.zip .
@echo "done; now manually upload doc.zip from here: https://pypi.python.org/pypi?:action=pkg_edit&name=psutil"
+
+# check whether the links mentioned in some files are valid.
+check-broken-links:
+ git ls-files | grep \\.rst$ | xargs $(PYTHON) scripts/internal/check_broken_links.py
diff --git a/scripts/internal/check_broken_links.py b/scripts/internal/check_broken_links.py
new file mode 100755
index 00000000..b14e9f59
--- /dev/null
+++ b/scripts/internal/check_broken_links.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+
+# Author : Himanshu Shekhar < https://github.com/himanshub16 > (2017)
+
+# Copyright (c) 2009, Giampaolo Rodola'. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+Checks for broken links in file names specified as command line parameters.
+
+There are a ton of solutions available for validating URLs in strings using
+regex, but less for searching, of which very few are accurate.
+This snippet is intended to just do the required work, and avoid complexities.
+Django Validator has pretty good regex for validation, but we have to find
+urls instead of validating them. (REFERENCES [7])
+There's always room for improvement.
+
+Method:
+* Match URLs using regex (REFERENCES [1])
+* Some URLs need to be fixed, as they have < (or) > due to inefficient regex.
+* Remove duplicates (because regex is not 100% efficient as of now).
+* Check validity of URL, using HEAD request. (HEAD to save bandwidth)
+ Uses the requests module, because the others are painful to use. REFERENCES[9]
+ Handles redirects, http, https, ftp as well.
+
+REFERENCES:
+Using [1] with some modifications for including ftp
+[1] http://stackoverflow.com/a/6883094/5163807
+[2] http://stackoverflow.com/a/31952097/5163807
+[3] http://daringfireball.net/2010/07/improved_regex_for_matching_urls
+[4] https://mathiasbynens.be/demo/url-regex
+[5] https://github.com/django/django/blob/master/django/core/validators.py
+[6] https://data.iana.org/TLD/tlds-alpha-by-domain.txt
+[7] https://codereview.stackexchange.com/questions/19663/http-url-validating
+[8] https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/HEAD
+[9] http://docs.python-requests.org/
+
+"""
+
+from __future__ import print_function
+
+import os
+import re
+import sys
+
+import concurrent.futures
+import requests
+
+
+# Directory containing this script. Not referenced by any live code
+# visible in this file.
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+# Pattern used to *find* candidate URLs in text (not to validate them).
+# NOTE: the scheme group is optional, so a bare "://..." also matches.
+# NOTE: "$-_" inside the character class is an ASCII range (0x24-0x5F);
+# it is what lets "/", ":", "?" and "=" match, but it also lets "<" and
+# ">" through, which is why get_urls() strips them afterwards.
+REGEX = r'(?:http|ftp|https)?://' \
+ r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+
+# Value passed as `timeout=` to every requests call made by this script.
+REQUEST_TIMEOUT = 30
+
+# Some websites answer a HEAD request with an error status even though
+# the page exists (e.g. 503 from Microsoft, 401 from Apple).
+# URLs answering with one of these statuses are retried with a GET request.
+RETRY_STATUSES = [503, 401, 403]
+
+
+def get_urls(filename):
+    """Return a list of the unique URLs found in the file `filename`.
+
+    `filename` is opened as given (absolute, or relative to the current
+    working directory). Markup characters (*, <, >, parentheses) which
+    REGEX captures along with a URL are stripped *before* duplicates are
+    removed, so that e.g. "<http://x>" and "http://x" are returned (and
+    later checked) only once. The order of the result is unspecified.
+    """
+    with open(filename) as fs:
+        text = fs.read()
+
+    # Raw string so the character class needs no backslash escapes.
+    strip_markup = re.compile(r"[*<>()]")
+    urls = set(strip_markup.sub('', url) for url in re.findall(REGEX, text))
+
+    return list(urls)
+
+
+def validate_url(url):
+    """Return True if `url` answers with a non-error HTTP status.
+
+    A HEAD request is tried first (to save bandwidth), following
+    redirects so that a link redirecting to a dead page is reported as
+    broken. If the server answers with one of RETRY_STATUSES the URL is
+    requested again with GET. Any requests exception (timeout, DNS
+    failure, too many redirects, ...) makes the function return False.
+    """
+    try:
+        # requests.head() does not follow redirects unless told to.
+        res = requests.head(url, timeout=REQUEST_TIMEOUT,
+                            allow_redirects=True)
+        # Every status in RETRY_STATUSES is an error status, so there is
+        # no need to test `res.ok` separately before retrying.
+        if res.status_code in RETRY_STATUSES:
+            res = requests.get(url, timeout=REQUEST_TIMEOUT)
+        return res.ok
+    except requests.exceptions.RequestException:
+        return False
+
+
+def parallel_validator(urls):
+    """Check every URL concurrently and return the ones that failed.
+
+    `urls` is a sequence of (filename, url) tuples. Each url is handed
+    to validate_url() through a thread pool, and a "done / total" counter
+    is rewritten on stdout as the checks complete. Returns the list of
+    (filename, url) tuples for which validate_url() returned a falsy
+    value.
+    """
+    total = len(urls)
+    broken = []  # (filename, url) tuples which did not validate
+
+    with concurrent.futures.ThreadPoolExecutor() as pool:
+        # Remember which (filename, url) pair each future belongs to.
+        pending = {}
+        for pair in urls:
+            pending[pool.submit(validate_url, pair[1])] = pair
+
+        finished = concurrent.futures.as_completed(pending)
+        for done, future in enumerate(finished, 1):
+            if not future.result():
+                broken.append(pending[future])
+            # "\r" moves back to the start of the line so the counter
+            # overwrites itself instead of scrolling.
+            sys.stdout.write("\r" + str(done) + ' / ' + str(total))
+            sys.stdout.flush()
+
+    print()
+    return broken
+
+
+def main():
+    """Check the URLs found in the files given on the command line.
+
+    Exits with a usage message when no file is given. Otherwise collects
+    the URLs of every file, validates them in parallel and, if at least
+    one failed, prints each broken URL with the file it came from and
+    exits with status 1.
+    """
+    files = sys.argv[1:]
+    if not files:
+        # __name__ is always "__main__" when run as a script, so show
+        # the script path in the usage message instead.
+        return sys.exit("usage: %s <FILES...>" % sys.argv[0])
+
+    all_urls = []
+    for fname in files:
+        for url in get_urls(fname):
+            all_urls.append((fname, url))
+
+    fails = parallel_validator(all_urls)
+    if not fails:
+        print("all links are valid. cheers!")
+    else:
+        for fail in fails:
+            print(fail[1] + ' : ' + fail[0] + os.linesep)
+        print('-' * 20)
+        print("total :", len(fails), "fails!")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/internal/winmake.py b/scripts/internal/winmake.py
index 0c8f8fea..c9139977 100755
--- a/scripts/internal/winmake.py
+++ b/scripts/internal/winmake.py
@@ -41,6 +41,7 @@ DEPS = [
"unittest2",
"wheel",
"wmi",
+ "requests"
]
_cmds = {}