summary refs log tree commit diff
path: root/deps/v8/src/unicode.cc
diff options
context:
space:
mode:
Diffstat (limited to 'deps/v8/src/unicode.cc')
-rw-r--r-- deps/v8/src/unicode.cc 325
1 files changed, 55 insertions, 270 deletions
diff --git a/deps/v8/src/unicode.cc b/deps/v8/src/unicode.cc
index 22e5ca606e..4d7896ec37 100644
--- a/deps/v8/src/unicode.cc
+++ b/deps/v8/src/unicode.cc
@@ -21,7 +21,7 @@ static const uchar kSentinel = static_cast<uchar>(-1);
/**
* \file
- * Implementations of functions for working with unicode.
+ * Implementations of functions for working with Unicode.
*/
typedef signed short int16_t; // NOLINT
@@ -46,7 +46,7 @@ static inline bool IsStart(int32_t entry) {
#ifndef V8_INTL_SUPPORT
/**
- * Look up a character in the unicode table using a mix of binary and
+ * Look up a character in the Unicode table using a mix of binary and
* interpolation search. For a uniformly distributed array
* interpolation search beats binary search by a wide margin. However,
* in this case interpolation search degenerates because of some very
@@ -193,306 +193,91 @@ static int LookupMapping(const int32_t* table,
}
}
-static inline uint8_t NonASCIISequenceLength(byte first) {
- // clang-format off
- static const uint8_t lengths[256] = {
- // The first 128 entries correspond to ASCII characters.
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* OO - Of */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10 - 1f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20 - 2f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30 - 3f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40 - 4f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 50 - 5f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60 - 6f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70 - 7f */
- // The following 64 entries correspond to continuation bytes.
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 - 8f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90 - 9f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0 - af */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0 - bf */
- // The next are two invalid overlong encodings and 30 two-byte sequences.
- 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* c0-c1 + c2-cf */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* d0-df */
- // 16 three-byte sequences.
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* e0-ef */
- // 5 four-byte sequences, followed by sequences that could only encode
- // code points outside of the unicode range.
- 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; /* f0-f4 + f5-ff */
- // clang-format on
- return lengths[first];
-}
-
-
-static inline bool IsContinuationCharacter(byte chr) {
- return chr >= 0x80 && chr <= 0xBF;
-}
-
// This method decodes an UTF-8 value according to RFC 3629 and
// https://encoding.spec.whatwg.org/#utf-8-decoder .
uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
+ DCHECK_GT(max_length, 0);
DCHECK_GT(str[0], kMaxOneByteChar);
- size_t length = NonASCIISequenceLength(str[0]);
-
- // Check continuation characters.
- size_t max_count = std::min(length, max_length);
- size_t count = 1;
- while (count < max_count && IsContinuationCharacter(str[count])) {
- count++;
- }
+ State state = State::kAccept;
+ Utf8IncrementalBuffer buffer = 0;
+ uchar t;
- if (length >= 3 && count < 2) {
- // Not enough continuation bytes to check overlong sequences.
- *cursor += 1;
- return kBadChar;
- }
+ size_t i = 0;
+ do {
+ t = ValueOfIncremental(str[i], &i, &state, &buffer);
+ } while (i < max_length && t == kIncomplete);
- // Check overly long sequences & other conditions.
- if (length == 3) {
- if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
- // Overlong three-byte sequence? The first byte generates a kBadChar.
- *cursor += 1;
- return kBadChar;
- } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
- // High and low surrogate halves? The first byte generates a kBadChar.
- *cursor += 1;
- return kBadChar;
- }
- } else if (length == 4) {
- if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
- // Overlong four-byte sequence. The first byte generates a kBadChar.
- *cursor += 1;
- return kBadChar;
- } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
- // Code points outside of the unicode range. The first byte generates a
- // kBadChar.
- *cursor += 1;
- return kBadChar;
- }
- }
-
- *cursor += count;
-
- if (count != length) {
- // Not enough continuation characters.
- return kBadChar;
- }
-
- // All errors have been handled, so we only have to assemble the result.
- switch (length) {
- case 2:
- return ((str[0] << 6) + str[1]) - 0x00003080;
- case 3:
- return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
- case 4:
- return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
- 0x03C82080;
- }
-
- UNREACHABLE();
+ *cursor += i;
+ return (state == State::kAccept) ? t : kBadChar;
}
-/*
-Overlong sequence detection: Since Blink's TextCodecUTF8 rejects multi-byte
-characters which could be expressed with less bytes, we must too.
-
-Each continuation byte (10xxxxxx) carries 6 bits of payload. The lead bytes of
-1, 2, 3 and 4-byte characters are 0xxxxxxx, 110xxxxx, 1110xxxx and 11110xxx, and
-carry 7, 5, 4, and 3 bits of payload, respectively.
-
-Thus, a two-byte character can contain 11 bits of payload, a three-byte
-character 16, and a four-byte character 21.
-
-If we encounter a two-byte character which contains 7 bits or less, a three-byte
-character which contains 11 bits or less, or a four-byte character which
-contains 16 bits or less, we reject the character and generate a kBadChar for
-each of the bytes. This is because Blink handles overlong sequences by rejecting
-the first byte of the character (returning kBadChar); thus the rest are lonely
-continuation bytes and generate a kBadChar each.
-*/
-
-uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
+// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
+// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
+// when the stream is complete, to ensure incomplete sequences are handled.
+uchar Utf8::ValueOfIncremental(byte next, size_t* cursor, State* state,
+ Utf8IncrementalBuffer* buffer) {
DCHECK_NOT_NULL(buffer);
+ State old_state = *state;
+ *cursor += 1;
- // The common case: 1-byte Utf8 (and no incomplete char in the buffer)
- if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) {
+ if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
+ DCHECK_EQ(0u, *buffer);
return static_cast<uchar>(next);
}
- if (*buffer == 0) {
- // We're at the start of a new character.
- uint32_t kind = NonASCIISequenceLength(next);
- CHECK_LE(kind, 4);
- if (kind >= 2) {
- // Start of 2..4 byte character, and no buffer.
-
- // The mask for the lower bits depends on the kind, and is
- // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that
- // with one shift.
- uint8_t mask = 0x7f >> kind;
+ // Otherwise we're at the lead byte of a 2/3/4-byte sequence, at a later byte
+ // of such a sequence, or at an invalid byte; let the DFA decoder decide.
+ Utf8DfaDecoder::Decode(next, state, buffer);
- // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
- // in 2nd nibble, and the value in the bottom three. The 2nd nibble is
- // intended as a counter about how many bytes are still needed.
- uint32_t character_info = kind << 28 | (kind - 1) << 24;
- DCHECK_EQ(character_info & mask, 0);
- *buffer = character_info | (next & mask);
- return kIncomplete;
- } else {
- // No buffer, and not the start of a 1-byte char (handled at the
- // beginning), and not the start of a 2..4 byte char (or the start of an
- // overlong / invalid sequence)? Bad char.
+ switch (*state) {
+ case State::kAccept: {
+ uchar t = *buffer;
*buffer = 0;
- return kBadChar;
- }
- } else if (*buffer <= 0xff) {
- // We have one unprocessed byte left (from the last else case in this if
- // statement).
- uchar previous = *buffer;
- *buffer = 0;
- uchar t = ValueOfIncremental(previous, buffer);
- if (t == kIncomplete) {
- // If we have an incomplete character, process both the previous and the
- // next byte at once.
- return ValueOfIncremental(next, buffer);
- } else {
- // Otherwise, process the previous byte and save the next byte for next
- // time.
- DCHECK_EQ(0u, *buffer);
- *buffer = next;
return t;
}
- } else if (IsContinuationCharacter(next)) {
- // We're inside of a character, as described by buffer.
-
- // How many bytes (excluding this one) do we still expect?
- uint8_t bytes_expected = *buffer >> 28;
- uint8_t bytes_left = (*buffer >> 24) & 0x0f;
- // Two-byte overlong sequence detection is handled by
- // NonASCIISequenceLength, so we don't need to check anything here.
- if (bytes_expected == 3 && bytes_left == 2) {
- // Check that there are at least 12 bytes of payload.
- uint8_t lead_payload = *buffer & (0x7f >> bytes_expected);
- DCHECK_LE(lead_payload, 0xf);
- if (lead_payload == 0 && next < 0xa0) {
- // 0xa0 = 0b10100000 (payload: 100000). Overlong sequence: 0 bits from
- // the first byte, at most 5 from the second byte, and at most 6 from
- // the third -> in total at most 11.
-
- *buffer = next;
- return kBadChar;
- } else if (lead_payload == 0xd && next > 0x9f) {
- // The resulting code point would be on a range which is reserved for
- // UTF-16 surrogate halves.
- *buffer = next;
- return kBadChar;
- }
- } else if (bytes_expected == 4 && bytes_left == 3) {
- // Check that there are at least 17 bytes of payload.
- uint8_t lead_payload = *buffer & (0x7f >> bytes_expected);
+ case State::kReject:
+ *state = State::kAccept;
+ *buffer = 0;
- // If the lead byte was bigger than 0xf4 (payload: 4), it's not a start of
- // any valid character, and this is detected by NonASCIISequenceLength.
- DCHECK_LE(lead_payload, 0x4);
- if (lead_payload == 0 && next < 0x90) {
- // 0x90 = 10010000 (payload 10000). Overlong sequence: 0 bits from the
- // first byte, at most 4 from the second byte, at most 12 from the third
- // and fourth bytes -> in total at most 16.
- *buffer = next;
- return kBadChar;
- } else if (lead_payload == 4 && next > 0x8f) {
- // Invalid code point; value greater than 0b100001111000000000000
- // (0x10ffff).
- *buffer = next;
- return kBadChar;
+ // If we hit a bad byte, we need to determine if we were trying to start
+ // a sequence or continue one. If we were trying to start a sequence,
+ // that means it's just an invalid lead byte and we need to continue to
+ // the next byte (which we already did above). If we were already in a
+ // sequence, we need to reprocess this same byte after resetting to the
+ // initial state.
+ if (old_state != State::kAccept) {
+ // We were trying to continue a sequence, so let's reprocess this byte
+ // next time.
+ *cursor -= 1;
}
- }
+ return kBadChar;
- bytes_left--;
- // Update the value.
- uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
- if (bytes_left) {
- *buffer = (bytes_expected << 28 | bytes_left << 24 | value);
+ default:
return kIncomplete;
- } else {
-#ifdef DEBUG
- // Check that overlong sequences were already detected.
- bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
- (bytes_expected == 3 && value < 0x800) ||
- (bytes_expected == 4 && value < 0x8000);
- DCHECK(!sequence_was_too_long);
-#endif
- *buffer = 0;
- return value;
- }
- } else {
- // Within a character, but not a continuation character? Then the
- // previous char was a bad char. But we need to save the current
- // one.
- *buffer = next;
- return kBadChar;
}
}
-uchar Utf8::ValueOfIncrementalFinish(Utf8IncrementalBuffer* buffer) {
- DCHECK_NOT_NULL(buffer);
- if (*buffer == 0) {
+// Finishes the incremental decoding, ensuring that an unfinished sequence, if
+// one is left, is replaced by a replacement char.
+uchar Utf8::ValueOfIncrementalFinish(State* state) {
+ if (*state == State::kAccept) {
return kBufferEmpty;
} else {
- // Process left-over chars. An incomplete char at the end maps to kBadChar.
- uchar t = ValueOfIncremental(0, buffer);
- return (t == kIncomplete) ? kBadChar : t;
+ DCHECK_GT(*state, State::kAccept);
+ *state = State::kAccept;
+ return kBadChar;
}
}
bool Utf8::ValidateEncoding(const byte* bytes, size_t length) {
- const byte* cursor = bytes;
- const byte* end = bytes + length;
-
- while (cursor < end) {
- // Skip over single-byte values.
- if (*cursor <= kMaxOneByteChar) {
- ++cursor;
- continue;
- }
-
- // Get the length the the character.
- size_t seq_length = NonASCIISequenceLength(*cursor);
- // For some invalid characters NonASCIISequenceLength returns 0.
- if (seq_length == 0) return false;
-
- const byte* char_end = cursor + seq_length;
-
- // Return false if we do not have enough bytes for the character.
- if (char_end > end) return false;
-
- // Check if the bytes of the character are continuation bytes.
- for (const byte* i = cursor + 1; i < char_end; ++i) {
- if (!IsContinuationCharacter(*i)) return false;
- }
-
- // Check overly long sequences & other conditions.
- if (seq_length == 3) {
- if (cursor[0] == 0xE0 && (cursor[1] < 0xA0 || cursor[1] > 0xBF)) {
- // Overlong three-byte sequence?
- return false;
- } else if (cursor[0] == 0xED && (cursor[1] < 0x80 || cursor[1] > 0x9F)) {
- // High and low surrogate halves?
- return false;
- }
- } else if (seq_length == 4) {
- if (cursor[0] == 0xF0 && (cursor[1] < 0x90 || cursor[1] > 0xBF)) {
- // Overlong four-byte sequence.
- return false;
- } else if (cursor[0] == 0xF4 && (cursor[1] < 0x80 || cursor[1] > 0x8F)) {
- // Code points outside of the unicode range.
- return false;
- }
- }
- cursor = char_end;
+ State state = State::kAccept;
+ Utf8IncrementalBuffer throw_away = 0;
+ for (size_t i = 0; i < length && state != State::kReject; i++) {
+ Utf8DfaDecoder::Decode(bytes[i], &state, &throw_away);
}
- return true;
+ return state == State::kAccept;
}
// Uppercase: point.category == 'Lu'
@@ -3333,7 +3118,7 @@ int CanonicalizationRange::Convert(uchar c,
}
-const uchar UnicodeData::kMaxCodePoint = 65533;
+const uchar UnicodeData::kMaxCodePoint = 0xFFFD;
int UnicodeData::GetByteCount() {
#ifndef V8_INTL_SUPPORT // NOLINT