diff options
author | ptmcg <ptmcg@austin.rr.com> | 2021-09-08 22:51:46 -0500 |
---|---|---|
committer | ptmcg <ptmcg@austin.rr.com> | 2021-09-08 22:51:46 -0500 |
commit | e2fb9f25431544b3c783e13e7fffc0e17bcf9fc8 (patch) | |
tree | 0a31fc24763af89c78441ed58242cb4625023924 /pyparsing/common.py | |
parent | 6bdee2fec058493b4e1809aa25de3494f41e3627 (diff) | |
download | pyparsing-git-e2fb9f25431544b3c783e13e7fffc0e17bcf9fc8.tar.gz |
Add url expression to pyparsing_common (#249)
Diffstat (limited to 'pyparsing/common.py')
-rw-r--r-- | pyparsing/common.py | 48 |
1 file changed, 48 insertions, 0 deletions
diff --git a/pyparsing/common.py b/pyparsing/common.py index c6d91f6..0eb286e 100644 --- a/pyparsing/common.py +++ b/pyparsing/common.py @@ -358,6 +358,54 @@ class pyparsing_common: downcase_tokens = staticmethod(token_map(lambda t: t.lower())) """Parse action to convert tokens to lower case.""" + url = Regex( + # https://mathiasbynens.be/demo/url-regex + # https://gist.github.com/dperini/729294 + r"^" + + # protocol identifier (optional) + # short syntax // still required + r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" + + # user:pass BasicAuth (optional) + r"(?:(?P<auth>\S+(?::\S*)?)@)?" + + r"(?P<host>" + + # IP address exclusion + # private & local networks + r"(?!(?:10|127)(?:\.\d{1,3}){3})" + + r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" + + r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" + + # IP address dotted notation octets + # excludes loopback network 0.0.0.0 + # excludes reserved space >= 224.0.0.0 + # excludes network & broadcast addresses + # (first & last IP address of each class) + r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" + + r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" + + r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" + + r"|" + + # host & domain names, may end with dot + # can be replaced by a shortest alternative + # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+ + r"(?:" + + r"(?:" + + r"[a-z0-9\u00a1-\uffff]" + + r"[a-z0-9\u00a1-\uffff_-]{0,62}" + + r")?" + + r"[a-z0-9\u00a1-\uffff]\." + + r")+" + + # TLD identifier name, may end with dot + r"(?:[a-z\u00a1-\uffff]{2,}\.?)" + + r")" + + # port number (optional) + r"(?P<port>:\d{2,5})?" + + # resource path (optional) + r"(?P<path>\/[^?# ]*)?" + + # query string (optional) + r"(?P<query>\?[^#]*)?" + + # fragment (optional) + r"(?P<fragment>#\S*)?" + + r"$" + ).set_name("url") + # pre-PEP8 compatibility names convertToInteger = convert_to_integer convertToFloat = convert_to_float |