From 1c491217bff7dc2c55d9ec9a0bcd2c9402119b8d Mon Sep 17 00:00:00 2001 From: Christopher Corley Date: Mon, 24 Dec 2012 12:57:01 -0600 Subject: Add fuzzy parsing that additionally returns the skipped substrings. --- dateutil/parser.py | 40 ++++++++++++++++++++++++++++++++++------ test.py | 16 +++++++++++++--- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/dateutil/parser.py b/dateutil/parser.py index a2604a3..aef8362 100644 --- a/dateutil/parser.py +++ b/dateutil/parser.py @@ -174,7 +174,7 @@ class parserinfo(object): # m from a.m/p.m, t from ISO T separator JUMP = [" ", ".", ",", ";", "-", "/", "'", "at", "on", "and", "ad", "m", "t", "of", - "st", "nd", "rd", "th"] + "st", "nd", "rd", "th"] WEEKDAYS = [("Mon", "Monday"), ("Tue", "Tuesday"), @@ -305,7 +305,10 @@ class parser(object): if not default: default = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - res = self._parse(timestr, **kwargs) + + + res, skipped_tokens = self._parse(timestr, **kwargs) + if res is None: raise ValueError("unknown string format") repl = {} @@ -339,6 +342,10 @@ class parser(object): ret = ret.replace(tzinfo=tz.tzutc()) elif res.tzoffset: ret = ret.replace(tzinfo=tz.tzoffset(res.tzname, res.tzoffset)) + + if skipped_tokens: + return ret, skipped_tokens + return ret class _result(_resultbase): @@ -346,7 +353,10 @@ class parser(object): "hour", "minute", "second", "microsecond", "tzname", "tzoffset"] - def _parse(self, timestr, dayfirst=None, yearfirst=None, fuzzy=False): + def _parse(self, timestr, dayfirst=None, yearfirst=None, fuzzy=False, fuzzy_with_tokens=False): + if fuzzy_with_tokens: + fuzzy = True + info = self.info if dayfirst is None: dayfirst = info.dayfirst @@ -354,6 +364,13 @@ class parser(object): yearfirst = info.yearfirst res = self._result() l = _timelex.split(timestr) + + + # keep up with the last token skipped so we can recombine + # consecutively skipped tokens (-2 for when i begins at 0). + last_skipped_token_i = -2 + skipped_tokens = list() + try: # year/month/day list @@ -387,7 +404,7 @@ class parser(object): res.minute = int(s[2:]) elif len_li == 6 or (len_li > 6 and l[i-1].find('.') == 6): # YYMMDD or HHMMSS[.ss] - s = l[i-1] + s = l[i-1] if not ymd and l[i-1].find('.') == -1: ymd.append(info.convertyear(int(s[:2]))) ymd.append(int(s[2:4])) @@ -636,6 +653,13 @@ class parser(object): if not (info.jump(l[i]) or fuzzy): return None + if last_skipped_token_i == i - 1: + # recombine the tokens + skipped_tokens[-1] += l[i] + else: + # just append + skipped_tokens.append(l[i]) + last_skipped_token_i = i i += 1 # Process year/month/day @@ -705,7 +729,11 @@ class parser(object): if not info.validate(res): return None - return res + + if fuzzy_with_tokens: + return res, tuple(skipped_tokens) + + return res, None DEFAULTPARSER = parser() def parse(timestr, parserinfo=None, **kwargs): @@ -888,7 +916,7 @@ class _tzparser(object): except (IndexError, ValueError, AssertionError): return None - + return res diff --git a/test.py b/test.py index 80a7776..4ec2c1f 100755 --- a/test.py +++ b/test.py @@ -3447,7 +3447,7 @@ class ParserTest(unittest.TestCase): def testLongMonth(self): self.assertEqual(parse("October", default=self.default), datetime(2003, 10, 25)) - + def testZeroYear(self): self.assertEqual(parse("31-Dec-00", default=self.default), datetime(2000, 12, 31)) @@ -3458,6 +3458,16 @@ class ParserTest(unittest.TestCase): self.assertEqual(parse(s, fuzzy=True), datetime(2003, 9, 25, 10, 49, 41, tzinfo=self.brsttz)) + def testFuzzyWithTokens(self): + s = "Today is 25 of September of 2003, exactly " \ + "at 10:49:41 with timezone -03:00." + self.assertEqual(parse(s, fuzzy_with_tokens=True), + (datetime(2003, 9, 25, 10, 49, 41, + tzinfo=self.brsttz), + ('Today is ', 'of ', ', exactly at ', + ' with timezone ', '.') + ) + ) def testExtraSpace(self): self.assertEqual(parse(" July 4 , 1976 12:01:02 am "), @@ -3907,7 +3917,7 @@ END:VTIMEZONE def testStrCmp1(self): self.assertEqual(tzstr("EST5EDT"), tzstr("EST5EDT4,M4.1.0/02:00:00,M10-5-0/02:00")) - + def testStrCmp2(self): self.assertEqual(tzstr("EST5EDT"), tzstr("EST5EDT,4,1,0,7200,10,-1,0,7200,3600")) @@ -3930,7 +3940,7 @@ END:VTIMEZONE tz = tzfile(BytesIO(base64.decodestring(self.TZFILE_EST5EDT))) self.assertEqual(datetime(2003, 4, 6, 1, 59, tzinfo=tz).tzname(), "EST") self.assertEqual(datetime(2003, 4, 6, 2, 00, tzinfo=tz).tzname(), "EDT") - + def testFileEnd1(self): tz = tzfile(BytesIO(base64.decodestring(self.TZFILE_EST5EDT))) self.assertEqual(datetime(2003, 10, 26, 0, 59, tzinfo=tz).tzname(), "EDT") -- cgit v1.2.1