diff options
author | isaacs <i@izs.me> | 2012-03-28 19:51:38 -0700 |
---|---|---|
committer | isaacs <i@izs.me> | 2012-03-28 19:51:38 -0700 |
commit | 4b64542fe09477fc5c70e974eb1a78cdce755eb7 (patch) | |
tree | b4d4cdfd5b07efbdae51098b422fde7844ff4715 /deps/v8/src/unicode.h | |
parent | 8a15147bc53849417f8737dd873877d497867c9f (diff) | |
download | node-new-4b64542fe09477fc5c70e974eb1a78cdce755eb7.tar.gz |
Upgrade V8 to 3.9.24.6
Diffstat (limited to 'deps/v8/src/unicode.h')
-rw-r--r-- | deps/v8/src/unicode.h | 49 |
1 files changed, 46 insertions, 3 deletions
diff --git a/deps/v8/src/unicode.h b/deps/v8/src/unicode.h index fb9e6339e1..94ab1b4c1e 100644 --- a/deps/v8/src/unicode.h +++ b/deps/v8/src/unicode.h @@ -100,7 +100,7 @@ class UnicodeData { static const uchar kMaxCodePoint; }; -// --- U t f 8 --- +// --- U t f 8 a n d 16 --- template <typename Data> class Buffer { @@ -114,10 +114,46 @@ class Buffer { unsigned length_; }; + +class Utf16 { + public: + static inline bool IsLeadSurrogate(int code) { + if (code == kNoPreviousCharacter) return false; + return (code & 0xfc00) == 0xd800; + } + static inline bool IsTrailSurrogate(int code) { + if (code == kNoPreviousCharacter) return false; + return (code & 0xfc00) == 0xdc00; + } + + static inline int CombineSurrogatePair(uchar lead, uchar trail) { + return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff); + } + static const int kNoPreviousCharacter = -1; + static const uchar kMaxNonSurrogateCharCode = 0xffff; + // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes + // of UTF-8 data. The special case where the unit is a surrogate + // trail produces 1 byte net, because the encoding of the pair is + // 4 bytes and the 3 bytes that were used to encode the lead surrogate + // can be reclaimed. + static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3; + // One UTF-16 surrogate is encoded (illegally) as 3 UTF-8 bytes. + // The illegality stems from the surrogate not being part of a pair. 
+ static const int kUtf8BytesToCodeASurrogate = 3; + static inline uchar LeadSurrogate(int char_code) { + return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff); + } + static inline uchar TrailSurrogate(int char_code) { + return 0xdc00 + (char_code & 0x3ff); + } +}; + + class Utf8 { public: - static inline uchar Length(uchar chr); - static inline unsigned Encode(char* out, uchar c); + static inline uchar Length(uchar chr, int previous); + static inline unsigned Encode( + char* out, uchar c, int previous); static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, unsigned capacity, unsigned* chars_read, unsigned* offset); static uchar CalculateValue(const byte* str, @@ -130,6 +166,11 @@ class Utf8 { static const unsigned kMaxThreeByteChar = 0xffff; static const unsigned kMaxFourByteChar = 0x1fffff; + // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together + // that match are coded as a 4 byte UTF-8 sequence. + static const unsigned kBytesSavedByCombiningSurrogates = 2; + static const unsigned kSizeOfUnmatchedSurrogate = 3; + private: template <unsigned s> friend class Utf8InputBuffer; friend class Test; @@ -147,6 +188,7 @@ class CharacterStream { // Note that default implementation is not efficient. virtual void Seek(unsigned); unsigned Length(); + unsigned Utf16Length(); virtual ~CharacterStream() { } static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity, unsigned& offset); @@ -156,6 +198,7 @@ class CharacterStream { unsigned capacity, unsigned& offset); static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset); virtual void Rewind() = 0; + protected: virtual void FillBuffer() = 0; // The number of characters left in the current buffer |