diff options
Diffstat (limited to 'deps/v8/src/unicode-decoder.cc')
-rw-r--r-- | deps/v8/src/unicode-decoder.cc | 78 |
1 files changed, 78 insertions, 0 deletions
diff --git a/deps/v8/src/unicode-decoder.cc b/deps/v8/src/unicode-decoder.cc new file mode 100644 index 0000000000..88eff3ad26 --- /dev/null +++ b/deps/v8/src/unicode-decoder.cc @@ -0,0 +1,78 @@ +// Copyright 2014 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + + +#include "src/unicode-inl.h" +#include "src/unicode-decoder.h" +#include <stdio.h> +#include <stdlib.h> + +namespace unibrow { + +void Utf8DecoderBase::Reset(uint16_t* buffer, unsigned buffer_length, + const uint8_t* stream, unsigned stream_length) { + // Assume everything will fit in the buffer and stream won't be needed. + last_byte_of_buffer_unused_ = false; + unbuffered_start_ = NULL; + bool writing_to_buffer = true; + // Loop until stream is read, writing to buffer as long as buffer has space. + unsigned utf16_length = 0; + while (stream_length != 0) { + unsigned cursor = 0; + uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor); + DCHECK(cursor > 0 && cursor <= stream_length); + stream += cursor; + stream_length -= cursor; + bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode; + utf16_length += is_two_characters ? 2 : 1; + // Don't need to write to the buffer, but still need utf16_length. + if (!writing_to_buffer) continue; + // Write out the characters to the buffer. + // Must check for equality with buffer_length as we've already updated it. + if (utf16_length <= buffer_length) { + if (is_two_characters) { + *buffer++ = Utf16::LeadSurrogate(character); + *buffer++ = Utf16::TrailSurrogate(character); + } else { + *buffer++ = character; + } + if (utf16_length == buffer_length) { + // Just wrote last character of buffer + writing_to_buffer = false; + unbuffered_start_ = stream; + } + continue; + } + // Have gone over buffer. + // Last char of buffer is unused, set cursor back. + DCHECK(is_two_characters); + writing_to_buffer = false; + last_byte_of_buffer_unused_ = true; + unbuffered_start_ = stream - cursor; + } + utf16_length_ = utf16_length; +} + + +void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream, uint16_t* data, + unsigned data_length) { + while (data_length != 0) { + unsigned cursor = 0; + uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor); + // There's a total lack of bounds checking for stream + // as it was already done in Reset. + stream += cursor; + if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) { + *data++ = Utf16::LeadSurrogate(character); + *data++ = Utf16::TrailSurrogate(character); + DCHECK(data_length > 1); + data_length -= 2; + } else { + *data++ = character; + data_length -= 1; + } + } +} + +} // namespace unibrow |