summaryrefslogtreecommitdiff
path: root/deps/v8/src/dateparser-inl.h
blob: 7e5c4e355e1a9d6122241ddf7158048cea61c0cb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_DATEPARSER_INL_H_
#define V8_DATEPARSER_INL_H_

#include "src/char-predicates-inl.h"
#include "src/dateparser.h"
#include "src/unicode-cache-inl.h"

namespace v8 {
namespace internal {

// Parses the date string |str| into day, time and time-zone components and
// writes them to |out|.
//
// The ES5 date-time-string parser (ParseES5DateTime) runs first. Whatever
// token it hands back as unhandled becomes the first token of the legacy
// parser loop below, which then consumes the rest of the input.
//
// |out| must hold at least OUTPUT_SIZE entries (DCHECKed).
//
// Returns false if the ES5 parser reports an invalid token, if the legacy
// loop rejects the input, or if any of the day, time or time-zone composers
// fails to write its result to |out|; returns true otherwise.
template <typename Char>
bool DateParser::Parse(Vector<Char> str,
                       FixedArray* out,
                       UnicodeCache* unicode_cache) {
  DCHECK(out->length() >= OUTPUT_SIZE);
  InputReader<Char> in(unicode_cache, str);
  DateStringTokenizer<Char> scanner(&in);
  TimeZoneComposer tz;
  TimeComposer time;
  DayComposer day;

  // Specification:
  // Accept ES5 ISO 8601 date-time-strings or legacy dates compatible
  // with Safari.
  // ES5 ISO 8601 dates:
  //   [('-'|'+')yy]yyyy[-MM[-DD]][THH:mm[:ss[.sss]][Z|(+|-)hh:mm]]
  //   where yyyy is in the range 0000..9999 and
  //         +/-yyyyyy is in the range -999999..+999999 -
  //           but -000000 is invalid (year zero must be positive),
  //         MM is in the range 01..12,
  //         DD is in the range 01..31,
  //         MM and DD default to 01 if missing,
  //         HH is generally in the range 00..23, but can be 24 if mm, ss
  //           and sss are zero (or missing), representing midnight at the
  //           end of a day,
  //         mm and ss are in the range 00..59,
  //         sss is in the range 000..999,
  //         hh is in the range 00..23,
  //         mm, ss, and sss default to 00 if missing, and
  //         timezone defaults to Z if missing
  //           (following Safari, ISO actually demands local time).
  //  Extensions:
  //   We also allow sss to have more or less than three digits (but at
  //   least one).
  //   We allow hh:mm to be specified as hhmm.
  // Legacy dates:
  //  Any unrecognized word before the first number is ignored.
  //  Parenthesized text is ignored.
  //  An unsigned number followed by ':' is a time value, and is
  //  added to the TimeComposer. A number followed by '::' is a time value
  //  that is followed by an additional zero component (only accepted if no
  //  time value has been read yet). A number followed by '.' is also a time
  //  and must be followed by milliseconds.
  //  Any other number is a date component and is added to DayComposer.
  //  A month name (or really: any word having the same first three letters
  //  as a month name) is recorded as a named month in the Day composer.
  //  A word recognizable as a time-zone is recorded as such, as is
  //  '(+|-)(hhmm|hh:)'.
  //  Legacy dates don't allow extra signs ('+' or '-') or unmatched ')'
  //  after a number has been read (before the first number, any garbage
  //  is allowed).
  // Intersection of the two:
  //  A string that matches both formats (e.g. 1970-01-01) will be
  //  parsed as an ES5 date-time string - which means it will default
  //  to UTC time-zone. That's unavoidable if following the ES5
  //  specification.
  //  After a valid "T" has been read while scanning an ES5 datetime string,
  //  the input can no longer be a valid legacy date, since the "T" is a
  //  garbage string after a number has been read.

  // First try getting as far as possible with an ES5 Date Time String.
  DateToken next_unhandled_token = ParseES5DateTime(&scanner, &day, &time, &tz);
  if (next_unhandled_token.IsInvalid()) return false;
  // The ES5 parser may already have added date components to |day| before
  // handing over; in that case a number counts as read for the checks below.
  bool has_read_number = !day.IsEmpty();
  // If there's anything left, continue with the legacy parser.
  for (DateToken token = next_unhandled_token;
       !token.IsEndOfInput();
       token = scanner.Next()) {
    if (token.IsNumber()) {
      has_read_number = true;
      int n = token.number();
      if (scanner.SkipSymbol(':')) {
        if (scanner.SkipSymbol(':')) {
          // n + "::"
          // Only accepted as the very first time component.
          if (!time.IsEmpty()) return false;
          time.Add(n);
          time.Add(0);
        } else {
          // n + ":"
          if (!time.Add(n)) return false;
          // A '.' directly after the ':' is consumed and ignored.
          if (scanner.Peek().IsSymbol('.')) scanner.Next();
        }
      // NOTE(review): SkipSymbol presumably consumes the '.' when it
      // matches, so the '.' is dropped even if time.IsExpecting(n) is false
      // and one of the later branches handles n - confirm this is intended.
      } else if (scanner.SkipSymbol('.') && time.IsExpecting(n)) {
        time.Add(n);
        if (!scanner.Peek().IsNumber()) return false;
        // NOTE: this |n| shadows the outer |n|; it holds the milliseconds.
        int n = ReadMilliseconds(scanner.Next());
        if (n < 0) return false;
        time.AddFinal(n);
      } else if (tz.IsExpecting(n)) {
        // The number is taken as the minutes of a UTC offset (presumably
        // the "mm" following a "(+|-)hh:" handled in the sign branch below).
        tz.SetAbsoluteMinute(n);
      } else if (time.IsExpecting(n)) {
        time.AddFinal(n);
        // Require end, white space, "Z", "+" or "-" immediately after
        // finalizing time.
        DateToken peek = scanner.Peek();
        if (!peek.IsEndOfInput() &&
            !peek.IsWhiteSpace() &&
            !peek.IsKeywordZ() &&
            !peek.IsAsciiSign()) return false;
      } else {
        // Any other number is a date component; a '-' directly after it is
        // treated as a separator and skipped.
        if (!day.Add(n)) return false;
        scanner.SkipSymbol('-');
      }
    } else if (token.IsKeyword()) {
      // Parse a "word" (sequence of chars. >= 'A').
      KeywordType type = token.keyword_type();
      int value = token.keyword_value();
      if (type == AM_PM && !time.IsEmpty()) {
        // AM/PM is only applied once some time component has been read.
        time.SetHourOffset(value);
      } else if (type == MONTH_NAME) {
        day.SetNamedMonth(value);
        scanner.SkipSymbol('-');
      } else if (type == TIME_ZONE_NAME && has_read_number) {
        tz.Set(value);
      } else {
        // Garbage words are illegal if a number has been read.
        if (has_read_number) return false;
        // The first number has to be separated from garbage words by
        // whitespace or other separators.
        if (scanner.Peek().IsNumber()) return false;
      }
    } else if (token.IsAsciiSign() && (tz.IsUTC() || !time.IsEmpty())) {
      // Parse UTC offset (only after UTC or time).
      tz.SetSign(token.ascii_sign());
      // The following number may be empty.
      int n = 0;
      int length = 0;
      if (scanner.Peek().IsNumber()) {
        // NOTE: this |token| shadows the loop variable of the same name.
        DateToken token = scanner.Next();
        length = token.length();
        n = token.number();
      }
      // Set unconditionally, even when no digits followed the sign.
      has_read_number = true;

      if (scanner.Peek().IsSymbol(':')) {
        // "(+|-)hh:" form: record the hour now and mark the minutes as
        // kNone; the ':' itself is not consumed here.
        tz.SetAbsoluteHour(n);
        // TODO(littledan): Use minutes as part of timezone?
        tz.SetAbsoluteMinute(kNone);
      } else if (length == 2 || length == 1) {
        // Handle time zones like GMT-8
        tz.SetAbsoluteHour(n);
        tz.SetAbsoluteMinute(0);
      } else if (length == 4 || length == 3) {
        // Looks like the hhmm format
        tz.SetAbsoluteHour(n / 100);
        tz.SetAbsoluteMinute(n % 100);
      } else {
        // No need to accept time zones like GMT-12345
        // (this also rejects a sign that is not followed by any digits or
        // by a ':', since length is 0 in that case).
        return false;
      }
    } else if ((token.IsAsciiSign() || token.IsSymbol(')')) &&
               has_read_number) {
      // Extra sign or ')' is illegal if a number has been read.
      return false;
    } else {
      // Ignore other characters and whitespace.
    }
  }

  // All three composers must succeed; evaluation stops at the first failure.
  return day.Write(out) && time.Write(out) && tz.Write(out);
}


// Reads the next token from the input reader.
//
// Tokens are tried in this order: end of input, unsigned number, one of the
// single-character symbols ':', '-', '+', '.', ')', keyword (word), white
// space, parenthesized text. Parenthesized text and any other single
// character both produce an Unknown token.
template<typename CharType>
DateParser::DateToken DateParser::DateStringTokenizer<CharType>::Scan() {
  // Remember where this token starts so its length can be derived later.
  int start = in_->position();
  if (in_->IsEnd()) return DateToken::EndOfInput();

  if (in_->IsAsciiDigit()) {
    int value = in_->ReadUnsignedNumeral();
    return DateToken::Number(value, in_->position() - start);
  }

  // Single-character symbols, tested in the listed order.
  const char kSymbols[] = {':', '-', '+', '.', ')'};
  for (char symbol : kSymbols) {
    if (in_->Skip(symbol)) return DateToken::Symbol(symbol);
  }

  if (in_->IsAsciiAlphaOrAbove()) {
    // Only the first three characters of a word are kept for the lookup.
    DCHECK(KeywordTable::kPrefixLength == 3);
    uint32_t prefix[3] = {0, 0, 0};
    int word_length = in_->ReadWord(prefix, 3);
    int entry = KeywordTable::Lookup(prefix, word_length);
    return DateToken::Keyword(KeywordTable::GetType(entry),
                              KeywordTable::GetValue(entry),
                              word_length);
  }

  if (in_->SkipWhiteSpace()) {
    return DateToken::WhiteSpace(in_->position() - start);
  }

  // Either a parenthesized run is skipped as a whole, or exactly one
  // unrecognized character is consumed; both yield an Unknown token.
  if (!in_->SkipParentheses()) {
    in_->Next();
  }
  return DateToken::Unknown();
}


// Advances past the current character if the unicode cache classifies it as
// white space or a line terminator. Returns whether a character was skipped.
template <typename Char>
bool DateParser::InputReader<Char>::SkipWhiteSpace() {
  const bool at_white_space =
      unicode_cache_->IsWhiteSpaceOrLineTerminator(ch_);
  if (at_white_space) Next();
  return at_white_space;
}


// Skips a parenthesized run starting at the current character, honoring
// nested parentheses.
//
// Returns false without consuming anything if the current character is not
// '('. Otherwise consumes characters until the opening parenthesis has been
// matched or the current character becomes zero, and returns true (also when
// the run was left unbalanced).
template <typename Char>
bool DateParser::InputReader<Char>::SkipParentheses() {
  if (ch_ != '(') return false;

  // Consume the opening parenthesis that was just tested.
  int depth = 1;
  Next();

  while (depth > 0 && ch_) {
    if (ch_ == '(') {
      ++depth;
    } else if (ch_ == ')') {
      --depth;
    }
    Next();
  }
  return true;
}


// Tries to parse the input as an ES5 ISO 8601 date-time string, reading
// tokens from |scanner| and adding the recognized components to |day|,
// |time| and |tz|, all of which must be empty on entry (DCHECKed).
//
// Returns:
//  - DateToken::EndOfInput() if the whole input was consumed as an ES5
//    date-time string. In that case |tz| is set to 0 if no time zone was
//    given and set_iso_date() is called on |day|.
//  - DateToken::Invalid() if the input turns out to be malformed after the
//    time separator ('T') has been read.
//  - Otherwise the first token this function did not handle (the leading
//    sign token, or the token returned by scanner->Next()), so that the
//    caller (see DateParser::Parse) can continue parsing from there.
//    Components already added to |day| stay in place.
template <typename Char>
DateParser::DateToken DateParser::ParseES5DateTime(
    DateStringTokenizer<Char>* scanner, DayComposer* day, TimeComposer* time,
    TimeZoneComposer* tz) {
  DCHECK(day->IsEmpty());
  DCHECK(time->IsEmpty());
  DCHECK(tz->IsEmpty());

  // Parse mandatory date string: [('-'|'+')yy]yyyy['-'MM['-'DD]]
  if (scanner->Peek().IsAsciiSign()) {
    // Keep the sign token, so we can pass it back to the legacy
    // parser if we don't use it.
    DateToken sign_token = scanner->Next();
    if (!scanner->Peek().IsFixedLengthNumber(6)) return sign_token;
    int sign = sign_token.ascii_sign();
    int year = scanner->Next().number();
    // "-000000" is rejected (year zero must be positive). Note that the
    // year token has already been read from the scanner at this point.
    if (sign < 0 && year == 0) return sign_token;
    day->Add(sign * year);
  } else if (scanner->Peek().IsFixedLengthNumber(4)) {
    day->Add(scanner->Next().number());
  } else {
    // Not an ES5 year: hand the first token back to the caller.
    return scanner->Next();
  }
  if (scanner->SkipSymbol('-')) {
    // Optional month: exactly two digits accepted by DayComposer::IsMonth.
    if (!scanner->Peek().IsFixedLengthNumber(2) ||
        !DayComposer::IsMonth(scanner->Peek().number())) return scanner->Next();
    day->Add(scanner->Next().number());
    if (scanner->SkipSymbol('-')) {
      // Optional day: exactly two digits accepted by DayComposer::IsDay.
      if (!scanner->Peek().IsFixedLengthNumber(2) ||
          !DayComposer::IsDay(scanner->Peek().number())) return scanner->Next();
      day->Add(scanner->Next().number());
    }
  }
  // Check for optional time string: 'T'HH':'mm[':'ss['.'sss]], optionally
  // followed by a timezone designation.
  if (!scanner->Peek().IsKeywordType(TIME_SEPARATOR)) {
    // Date-only input: anything other than end of input is handed back to
    // the caller; end of input falls through to the success path below.
    if (!scanner->Peek().IsEndOfInput()) return scanner->Next();
  } else {
    // ES5 Date Time String time part is present.
    // From here on every mismatch yields Invalid() instead of handing a
    // token back to the caller.
    scanner->Next();
    if (!scanner->Peek().IsFixedLengthNumber(2) ||
        !Between(scanner->Peek().number(), 0, 24)) {
      return DateToken::Invalid();
    }
    // Allow 24:00[:00[.000]], but no other time starting with 24.
    bool hour_is_24 = (scanner->Peek().number() == 24);
    time->Add(scanner->Next().number());
    if (!scanner->SkipSymbol(':')) return DateToken::Invalid();
    if (!scanner->Peek().IsFixedLengthNumber(2) ||
        !TimeComposer::IsMinute(scanner->Peek().number()) ||
        (hour_is_24 && scanner->Peek().number() > 0)) {
      return DateToken::Invalid();
    }
    time->Add(scanner->Next().number());
    if (scanner->SkipSymbol(':')) {
      if (!scanner->Peek().IsFixedLengthNumber(2) ||
          !TimeComposer::IsSecond(scanner->Peek().number()) ||
          (hour_is_24 && scanner->Peek().number() > 0)) {
        return DateToken::Invalid();
      }
      time->Add(scanner->Next().number());
      if (scanner->SkipSymbol('.')) {
        if (!scanner->Peek().IsNumber() ||
            (hour_is_24 && scanner->Peek().number() > 0)) {
          return DateToken::Invalid();
        }
        // Allow more or less than the mandated three digits.
        time->Add(ReadMilliseconds(scanner->Next()));
      }
    }
    // Check for optional timezone designation: 'Z' | ('+'|'-')hh':'mm
    // (or the ('+'|'-')hhmm extension).
    if (scanner->Peek().IsKeywordZ()) {
      scanner->Next();
      tz->Set(0);
    } else if (scanner->Peek().IsSymbol('+') ||
               scanner->Peek().IsSymbol('-')) {
      tz->SetSign(scanner->Next().symbol() == '+' ? 1 : -1);
      if (scanner->Peek().IsFixedLengthNumber(4)) {
        // hhmm extension syntax.
        int hourmin = scanner->Next().number();
        int hour = hourmin / 100;
        int min = hourmin % 100;
        if (!TimeComposer::IsHour(hour) || !TimeComposer::IsMinute(min)) {
          return DateToken::Invalid();
        }
        tz->SetAbsoluteHour(hour);
        tz->SetAbsoluteMinute(min);
      } else {
        // hh:mm standard syntax.
        if (!scanner->Peek().IsFixedLengthNumber(2) ||
            !TimeComposer::IsHour(scanner->Peek().number())) {
          return DateToken::Invalid();
        }
        tz->SetAbsoluteHour(scanner->Next().number());
        if (!scanner->SkipSymbol(':')) return DateToken::Invalid();
        if (!scanner->Peek().IsFixedLengthNumber(2) ||
            !TimeComposer::IsMinute(scanner->Peek().number())) {
          return DateToken::Invalid();
        }
        tz->SetAbsoluteMinute(scanner->Next().number());
      }
    }
    // Nothing may follow the time (and optional timezone) part.
    if (!scanner->Peek().IsEndOfInput()) return DateToken::Invalid();
  }
  // Successfully parsed ES5 Date Time String. Default to UTC if no TZ given.
  if (tz->IsEmpty()) tz->Set(0);
  day->set_iso_date();
  return DateToken::EndOfInput();
}


}  // namespace internal
}  // namespace v8

#endif  // V8_DATEPARSER_INL_H_