summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Christopher Corley <cscorley@ua.edu> 2012-12-24 12:57:01 -0600
committer Christopher Corley <cscorley@ua.edu> 2012-12-24 12:57:01 -0600
commit 1c491217bff7dc2c55d9ec9a0bcd2c9402119b8d (patch)
tree eae3779f0d933a337a57a7cb6d08ae299c8d0d74
parent 11a2deb52fda4fa8c42030093a693124b52091ae (diff)
download dateutil-1c491217bff7dc2c55d9ec9a0bcd2c9402119b8d.tar.gz
Add fuzzy parsing that additionally returns the skipped substrings.
-rw-r--r-- dateutil/parser.py 40
-rwxr-xr-x test.py 16
2 files changed, 47 insertions, 9 deletions
diff --git a/dateutil/parser.py b/dateutil/parser.py
index a2604a3..aef8362 100644
--- a/dateutil/parser.py
+++ b/dateutil/parser.py
@@ -174,7 +174,7 @@ class parserinfo(object):
# m from a.m/p.m, t from ISO T separator
JUMP = [" ", ".", ",", ";", "-", "/", "'",
"at", "on", "and", "ad", "m", "t", "of",
- "st", "nd", "rd", "th"]
+ "st", "nd", "rd", "th"]
WEEKDAYS = [("Mon", "Monday"),
("Tue", "Tuesday"),
@@ -305,7 +305,10 @@ class parser(object):
if not default:
default = datetime.datetime.now().replace(hour=0, minute=0,
second=0, microsecond=0)
- res = self._parse(timestr, **kwargs)
+
+
+ res, skipped_tokens = self._parse(timestr, **kwargs)
+
if res is None:
raise ValueError("unknown string format")
repl = {}
@@ -339,6 +342,10 @@ class parser(object):
ret = ret.replace(tzinfo=tz.tzutc())
elif res.tzoffset:
ret = ret.replace(tzinfo=tz.tzoffset(res.tzname, res.tzoffset))
+
+ if skipped_tokens:
+ return ret, skipped_tokens
+
return ret
class _result(_resultbase):
@@ -346,7 +353,10 @@ class parser(object):
"hour", "minute", "second", "microsecond",
"tzname", "tzoffset"]
- def _parse(self, timestr, dayfirst=None, yearfirst=None, fuzzy=False):
+ def _parse(self, timestr, dayfirst=None, yearfirst=None, fuzzy=False, fuzzy_with_tokens=False):
+ if fuzzy_with_tokens:
+ fuzzy = True
+
info = self.info
if dayfirst is None:
dayfirst = info.dayfirst
@@ -354,6 +364,13 @@ class parser(object):
yearfirst = info.yearfirst
res = self._result()
l = _timelex.split(timestr)
+
+
+ # keep up with the last token skipped so we can recombine
+ # consecutively skipped tokens (-2 for when i begins at 0).
+ last_skipped_token_i = -2
+ skipped_tokens = list()
+
try:
# year/month/day list
@@ -387,7 +404,7 @@ class parser(object):
res.minute = int(s[2:])
elif len_li == 6 or (len_li > 6 and l[i-1].find('.') == 6):
# YYMMDD or HHMMSS[.ss]
- s = l[i-1]
+ s = l[i-1]
if not ymd and l[i-1].find('.') == -1:
ymd.append(info.convertyear(int(s[:2])))
ymd.append(int(s[2:4]))
@@ -636,6 +653,13 @@ class parser(object):
if not (info.jump(l[i]) or fuzzy):
return None
+ if last_skipped_token_i == i - 1:
+ # recombine the tokens
+ skipped_tokens[-1] += l[i]
+ else:
+ # just append
+ skipped_tokens.append(l[i])
+ last_skipped_token_i = i
i += 1
# Process year/month/day
@@ -705,7 +729,11 @@ class parser(object):
if not info.validate(res):
return None
- return res
+
+ if fuzzy_with_tokens:
+ return res, tuple(skipped_tokens)
+
+ return res, None
DEFAULTPARSER = parser()
def parse(timestr, parserinfo=None, **kwargs):
@@ -888,7 +916,7 @@ class _tzparser(object):
except (IndexError, ValueError, AssertionError):
return None
-
+
return res
diff --git a/test.py b/test.py
index 80a7776..4ec2c1f 100755
--- a/test.py
+++ b/test.py
@@ -3447,7 +3447,7 @@ class ParserTest(unittest.TestCase):
def testLongMonth(self):
self.assertEqual(parse("October", default=self.default),
datetime(2003, 10, 25))
-
+
def testZeroYear(self):
self.assertEqual(parse("31-Dec-00", default=self.default),
datetime(2000, 12, 31))
@@ -3458,6 +3458,16 @@ class ParserTest(unittest.TestCase):
self.assertEqual(parse(s, fuzzy=True),
datetime(2003, 9, 25, 10, 49, 41,
tzinfo=self.brsttz))
+ def testFuzzyWithTokens(self):
+ s = "Today is 25 of September of 2003, exactly " \
+ "at 10:49:41 with timezone -03:00."
+ self.assertEqual(parse(s, fuzzy_with_tokens=True),
+ (datetime(2003, 9, 25, 10, 49, 41,
+ tzinfo=self.brsttz),
+ ('Today is ', 'of ', ', exactly at ',
+ ' with timezone ', '.')
+ )
+ )
def testExtraSpace(self):
self.assertEqual(parse(" July 4 , 1976 12:01:02 am "),
@@ -3907,7 +3917,7 @@ END:VTIMEZONE
def testStrCmp1(self):
self.assertEqual(tzstr("EST5EDT"),
tzstr("EST5EDT4,M4.1.0/02:00:00,M10-5-0/02:00"))
-
+
def testStrCmp2(self):
self.assertEqual(tzstr("EST5EDT"),
tzstr("EST5EDT,4,1,0,7200,10,-1,0,7200,3600"))
@@ -3930,7 +3940,7 @@ END:VTIMEZONE
tz = tzfile(BytesIO(base64.decodestring(self.TZFILE_EST5EDT)))
self.assertEqual(datetime(2003, 4, 6, 1, 59, tzinfo=tz).tzname(), "EST")
self.assertEqual(datetime(2003, 4, 6, 2, 00, tzinfo=tz).tzname(), "EDT")
-
+
def testFileEnd1(self):
tz = tzfile(BytesIO(base64.decodestring(self.TZFILE_EST5EDT)))
self.assertEqual(datetime(2003, 10, 26, 0, 59, tzinfo=tz).tzname(), "EDT")