From d66cd522c20810bf24f73af0adece6c86d307699 Mon Sep 17 00:00:00 2001 From: ptmcg Date: Thu, 2 Jun 2016 12:37:27 +0000 Subject: Added pyparsing_common.stripHTMLTags; added links to pyparsing_common docstring git-svn-id: svn://svn.code.sf.net/p/pyparsing/code/trunk@360 9bf210a0-9d2d-494c-87cf-cfb32e7dff7b --- src/CHANGES | 3 ++- src/pyparsing.py | 22 +++++++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/CHANGES b/src/CHANGES index a24709f..3b33ed9 100644 --- a/src/CHANGES +++ b/src/CHANGES @@ -2,7 +2,7 @@ Change Log ========== -Verison 2.1.5 - +Version 2.1.5 - June, 2016 ------------------------------ - Added a new parse action construction helper tokenMap, which will apply a function and optional arguments to each element in a @@ -36,6 +36,7 @@ Verison 2.1.5 - . ISO8601 date and date time strings . UUID (xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx) . hex integer (returned as int) + . stripHTMLTags (parse action to remove tags from HTML source) - runTests now returns a two-tuple: success if all tests succeed, and an output list of each test and its output lines. 
diff --git a/src/pyparsing.py b/src/pyparsing.py index a9edfcb..e68b78c 100644 --- a/src/pyparsing.py +++ b/src/pyparsing.py @@ -58,7 +58,7 @@ The pyparsing module handles some of the problems that are typically vexing when """ __version__ = "2.1.5" -__versionTime__ = "24 May 2016 04:18 UTC" +__versionTime__ = "02 Jun 2016 12:25 UTC" __author__ = "Paul McGuire " import string @@ -3940,12 +3940,15 @@ commaSeparatedList = delimitedList( Optional( quotedString.copy() | _commasepite class pyparsing_common: """ Here are some common low-level expressions that may be useful in jump-starting parser development: - - numeric forms (integers, reals, scientific notation) - - parse actions for converting numeric strings to Python int and/or float types - - common programming identifiers - - network addresses (MAC, IPv4, IPv6) - - ISO8601 dates and datetimes - - UUID + - numeric forms (L{integers}, L{reals}, L{scientific notation}) + - common L{programming identifiers} + - network addresses (L{MAC}, L{IPv4}, L{IPv6}) + - ISO8601 L{dates} and L{datetime} + - L{UUID} + Parse actions: + - C{L{convertToInteger}} + - C{L{convertToFloat}} + - C{L{stripHTMLTags}} """ convertToInteger = tokenMap(int) @@ -4005,6 +4008,11 @@ class pyparsing_common: uuid = Regex(r'[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}').setName("UUID") "UUID (C{xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx})" + _html_stripper = anyOpenTag.suppress() | anyCloseTag.suppress() + @staticmethod + def stripHTMLTags(s,l,tokens): + """Parse action to remove HTML tags from web page HTML source; returns tokens[0] with all opening and closing tags stripped""" + return pyparsing_common._html_stripper.transformString(tokens[0]) if __name__ == "__main__": -- cgit v1.2.1