summaryrefslogtreecommitdiff
path: root/pyparsing/common.py
diff options
context:
space:
mode:
author ptmcg <ptmcg@austin.rr.com> 2021-09-08 22:51:46 -0500
committer ptmcg <ptmcg@austin.rr.com> 2021-09-08 22:51:46 -0500
commit e2fb9f25431544b3c783e13e7fffc0e17bcf9fc8 (patch)
tree 0a31fc24763af89c78441ed58242cb4625023924 /pyparsing/common.py
parent 6bdee2fec058493b4e1809aa25de3494f41e3627 (diff)
download pyparsing-git-e2fb9f25431544b3c783e13e7fffc0e17bcf9fc8.tar.gz
Add url expression to pyparsing_common (#249)
Diffstat (limited to 'pyparsing/common.py')
-rw-r--r-- pyparsing/common.py 48
1 files changed, 48 insertions, 0 deletions
diff --git a/pyparsing/common.py b/pyparsing/common.py
index c6d91f6..0eb286e 100644
--- a/pyparsing/common.py
+++ b/pyparsing/common.py
@@ -358,6 +358,54 @@ class pyparsing_common:
downcase_tokens = staticmethod(token_map(lambda t: t.lower()))
"""Parse action to convert tokens to lower case."""
+ url = Regex(
+ # http/https/ftp or scheme-relative ("//host") URL. Named groups: scheme, auth,
+ # host, port (with ":"), path, query (with "?"), fragment (with "#").
+ # Adapted from https://mathiasbynens.be/demo/url-regex and
+ # https://gist.github.com/dperini/729294
+ # Unanchored (a "^" would only match at the real start of the input, so url could
+ # never follow other text; use parse_all instead); (?i): scheme/host ignore case.
+ r"(?i)" +
+ # protocol identifier (optional); short syntax // still required
+ r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" +
+ # user:pass BasicAuth (optional)
+ r"(?:(?P<auth>\S+(?::\S*)?)@)?" +
+ r"(?P<host>" +
+ # IP address exclusion: private & local networks
+ r"(?!(?:10|127)(?:\.\d{1,3}){3})" +
+ r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" +
+ r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" +
+ # IP address dotted notation octets; excludes 0.x.x.x,
+ # reserved space >= 224.0.0.0, and network & broadcast addresses
+ # (first & last IP address of each class)
+ r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" +
+ r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" +
+ r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" +
+ # the address must not run on into more digits or further host labels
+ r"(?!\.?[\w-])" +
+ r"|" +
+ # host & domain names, may end with dot; can be replaced by a shorter
+ # alternative: (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
+ r"(?:" +
+ r"(?:" +
+ r"[a-z0-9\u00a1-\uffff]" +
+ r"[a-z0-9\u00a1-\uffff_-]{0,62}" +
+ r")?" +
+ r"[a-z0-9\u00a1-\uffff]\." +
+ r")+" +
+ # TLD identifier name, may end with dot
+ r"(?:[a-z\u00a1-\uffff]{2,}\.?)" +
+ r")" +
+ # port number (optional)
+ r"(?P<port>:\d{2,5})?" +
+ # resource path (optional); stops at any whitespace
+ r"(?P<path>\/[^?#\s]*)?" +
+ # query string (optional); stops at any whitespace
+ r"(?P<query>\?[^#\s]*)?" +
+ # fragment (optional)
+ r"(?P<fragment>#\S*)?"
+ ).set_name("url")
+
# pre-PEP8 compatibility names
convertToInteger = convert_to_integer
convertToFloat = convert_to_float