diff options
author | Evan Welsh <contact@evanwelsh.com> | 2021-04-22 00:26:51 -0700 |
---|---|---|
committer | Philip Chimento <philip.chimento@gmail.com> | 2021-04-25 21:57:40 -0700 |
commit | 48d325a0fad60467b5e2cf9e49b9ffd19cd356a8 (patch) | |
tree | 8b64c2ab02c05f195a658bb4542eae49fc43a34b | |
parent | 260c74786a39f9194b0c6c72d6cbf0571d0d0f86 (diff) | |
download | gjs-ewlsh/text-encoding.tar.gz |
modules: Implement WHATWG Encoding specification (branch: ewlsh/text-encoding)
-rw-r--r-- | .eslintignore | 2 | ||||
-rw-r--r-- | .eslintrc.yml | 2 | ||||
-rw-r--r-- | gjs/byteArray.cpp | 8 | ||||
-rw-r--r-- | gjs/jsapi-util-string.cpp | 35 | ||||
-rw-r--r-- | gjs/jsapi-util.h | 7 | ||||
-rw-r--r-- | gjs/text-encoding.cpp | 352 | ||||
-rw-r--r-- | gjs/text-encoding.h | 11 | ||||
-rw-r--r-- | installed-tests/js/meson.build | 1 | ||||
-rw-r--r-- | installed-tests/js/testEncoding.js | 1051 | ||||
-rw-r--r-- | js.gresource.xml | 2 | ||||
-rw-r--r-- | modules/core/_encodings.js | 280 | ||||
-rw-r--r-- | modules/core/_text.js | 127 | ||||
-rw-r--r-- | modules/core/overrides/GLib.js | 9 | ||||
-rw-r--r-- | modules/script/_bootstrap/default.js | 13 | ||||
-rw-r--r-- | modules/script/byteArray.js | 4 |
15 files changed, 1826 insertions, 78 deletions
diff --git a/.eslintignore b/.eslintignore index 9ee950d3..8f8f93ff 100644 --- a/.eslintignore +++ b/.eslintignore @@ -3,4 +3,6 @@ installed-tests/js/jasmine.js installed-tests/js/modules/badOverrides/WarnLib.js +# Until ESLint merges class fields. +modules/core/_text.js modules/script/jsUnit.js diff --git a/.eslintrc.yml b/.eslintrc.yml index 733db371..6887f1cb 100644 --- a/.eslintrc.yml +++ b/.eslintrc.yml @@ -253,5 +253,7 @@ globals: print: readonly printerr: readonly window: readonly + TextEncoder: readonly + TextDecoder: readonly parserOptions: ecmaVersion: 2020 diff --git a/gjs/byteArray.cpp b/gjs/byteArray.cpp index ecf97776..25a1fc74 100644 --- a/gjs/byteArray.cpp +++ b/gjs/byteArray.cpp @@ -53,7 +53,13 @@ static bool instance_to_string_func(JSContext* cx, unsigned argc, if (!gjs_parse_call_args(cx, "toString", args, "|s", "encoding", &encoding)) return false; - return to_string_impl(cx, this_obj, encoding.get(), args.rval()); + if (!JS_IsUint8Array(this_obj)) { + gjs_throw(cx, "Argument to ByteArray.toString() must be a Uint8Array"); + return false; + } + + return gjs_decode_from_uint8array(cx, this_obj, encoding.get(), true, + args.rval()); } GJS_JSAPI_RETURN_CONVENTION diff --git a/gjs/jsapi-util-string.cpp b/gjs/jsapi-util-string.cpp index 5fc1164a..9d4f7cfb 100644 --- a/gjs/jsapi-util-string.cpp +++ b/gjs/jsapi-util-string.cpp @@ -98,6 +98,41 @@ JS::UniqueChars gjs_string_to_utf8(JSContext* cx, const JS::Value value) { return JS_EncodeStringToUTF8(cx, str); } +bool gjs_lossy_string_from_utf8(JSContext* cx, const char* utf8_string, + JS::MutableHandleValue value_p) { + JS::ConstUTF8CharsZ chars(utf8_string, strlen(utf8_string)); + size_t len; + JS::UniqueTwoByteChars twobyte_chars( + JS::LossyUTF8CharsToNewTwoByteCharsZ(cx, chars, &len, js::MallocArena) + .get()); + if (!twobyte_chars) + return false; + + JS::RootedString str(cx, JS_NewUCStringCopyN(cx, twobyte_chars.get(), len)); + if (str) + value_p.setString(str); + + return str != nullptr; +} +bool 
gjs_lossy_string_from_utf8_n(JSContext* cx, const char* utf8_string, + size_t len, JS::MutableHandleValue value_p) { + JS::UTF8Chars chars(utf8_string, len); + size_t outlen; + JS::UniqueTwoByteChars twobyte_chars( + JS::LossyUTF8CharsToNewTwoByteCharsZ(cx, chars, &outlen, + js::MallocArena) + .get()); + if (!twobyte_chars) + return false; + + JS::RootedString str(cx, + JS_NewUCStringCopyN(cx, twobyte_chars.get(), outlen)); + if (str) + value_p.setString(str); + + return str != nullptr; +} + bool gjs_string_from_utf8(JSContext *context, const char *utf8_string, diff --git a/gjs/jsapi-util.h b/gjs/jsapi-util.h index 4e399f25..697fc76b 100644 --- a/gjs/jsapi-util.h +++ b/gjs/jsapi-util.h @@ -428,6 +428,13 @@ void gjs_warning_reporter(JSContext*, JSErrorReport* report); GJS_JSAPI_RETURN_CONVENTION JS::UniqueChars gjs_string_to_utf8(JSContext* cx, const JS::Value string_val); +[[nodiscard]] bool gjs_lossy_string_from_utf8(JSContext* context, + const char* utf8_string, + JS::MutableHandleValue value_p); +[[nodiscard]] bool gjs_lossy_string_from_utf8_n(JSContext* context, + const char* utf8_string, + size_t len, + JS::MutableHandleValue value_p); GJS_JSAPI_RETURN_CONVENTION bool gjs_string_from_utf8(JSContext *context, const char *utf8_string, diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp index 395f0812..7fa74c84 100644 --- a/gjs/text-encoding.cpp +++ b/gjs/text-encoding.cpp @@ -40,27 +40,66 @@ static void gfree_arraybuffer_contents(void* contents, void*) { g_free(contents); } +static const char* FALLBACK = "\ufffd"; +static size_t FALLBACK_LEN = strlen(FALLBACK); + +[[nodiscard]] static bool gjs_convert_invalid_input(JSContext* cx, + uint8_t* data, size_t len, + const char* to_codeset, + const char* from_codeset, + char** converted); + GJS_JSAPI_RETURN_CONVENTION -bool to_string_impl_slow(JSContext* cx, uint8_t* data, uint32_t len, - const char* encoding, JS::MutableHandleValue rval) { - size_t bytes_written; +bool gjs_decode_from_uint8array_slow(JSContext* 
cx, uint8_t* data, uint32_t len, + const char* encoding, bool fatal, + JS::MutableHandleValue rval) { + size_t bytes_written, bytes_read; GError* error = nullptr; - GjsAutoChar u16_str = - g_convert(reinterpret_cast<char*>(data), len, - // Make sure the bytes of the UTF-16 string are laid out in memory - // such that we can simply reinterpret_cast<char16_t> them. + GjsAutoChar u16_str; + +// Make sure the bytes of the UTF-16 string are laid out in memory +// such that we can simply reinterpret_cast<char16_t> them. #if G_BYTE_ORDER == G_LITTLE_ENDIAN - "UTF-16LE", + const char* to_codeset = "UTF-16LE"; #else - "UTF-16BE", + const char* to_codeset = "UTF-16BE"; #endif - encoding, /* bytes read = */ nullptr, &bytes_written, &error); - if (!u16_str) - return gjs_throw_gerror_message(cx, error); // frees GError - // bytes_written should be bytes in a UTF-16 string so should be a multiple - // of 2 - g_assert((bytes_written % 2) == 0); + if (fatal) { + u16_str = + g_convert(reinterpret_cast<char*>(data), len, to_codeset, encoding, + /* bytes read = */ nullptr, &bytes_written, &error); + + // bytes_written should be bytes in a UTF-16 string so should be a + // multiple of 2 + g_assert((bytes_written % 2) == 0); + } else { + // This will fail if the input contains invalid codepoints in the + // from_codeset. It inserts a replacement character if the input is + // valid but can't be represented in the output. + u16_str = g_convert_with_fallback(reinterpret_cast<char*>(data), len, + to_codeset, encoding, FALLBACK, + &bytes_read, &bytes_written, &error); + if (u16_str) + g_assert((bytes_written % 2) == 0); + + // If the input is invalid we need to do the conversion ourselves. + if (error && g_error_matches(error, G_CONVERT_ERROR, + G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) { + // Clear the illegal sequence error. 
+ g_clear_error(&error); + + char* str; + if (!gjs_convert_invalid_input(cx, data, len, to_codeset, encoding, + &str)) + return false; + + u16_str = str; + } + } + + if (error) + return gjs_throw_gerror_message(cx, error); // g_convert 0-terminates the string, although the 0 isn't included in // bytes_written @@ -73,26 +112,34 @@ bool to_string_impl_slow(JSContext* cx, uint8_t* data, uint32_t len, return true; } -// implement toString() with an optional encoding arg +inline bool is_utf8_label(const char* encoding) { + if (encoding) { + // Maybe we should be smarter about utf8 synonyms here. Doesn't matter + // much though. encoding_is_utf8 is just an optimization anyway. + if (strcasecmp(encoding, "utf-8") == 0) { + return true; + } else { + GjsAutoChar stripped(g_strdup(encoding)); + return (strcasecmp(g_strstrip(stripped), "utf-8") == 0); + } + } else { + return true; + } +} + GJS_JSAPI_RETURN_CONVENTION -bool to_string_impl(JSContext* cx, JS::HandleObject byte_array, - const char* encoding, JS::MutableHandleValue rval) { +bool gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array, + const char* encoding, bool fatal, + JS::MutableHandleValue rval) { if (!JS_IsUint8Array(byte_array)) { - gjs_throw(cx, "Argument to ByteArray.toString() must be a Uint8Array"); + gjs_throw( + cx, "Argument to gjs_decode_from_uint8array must be a Uint8Array"); return false; } - bool encoding_is_utf8; + bool encoding_is_utf8 = is_utf8_label(encoding); uint8_t* data; - if (encoding) { - // Maybe we should be smarter about utf8 synonyms here. Doesn't matter - // much though. encoding_is_utf8 is just an optimization anyway. 
- encoding_is_utf8 = (strcmp(encoding, "UTF-8") == 0); - } else { - encoding_is_utf8 = true; - } - uint32_t len; bool is_shared_memory; js::GetUint8ArrayLengthAndData(byte_array, &len, &is_shared_memory, &data); @@ -103,7 +150,8 @@ bool to_string_impl(JSContext* cx, JS::HandleObject byte_array, } if (!encoding_is_utf8) - return to_string_impl_slow(cx, data, len, encoding, rval); + return gjs_decode_from_uint8array_slow(cx, data, len, encoding, fatal, + rval); // optimization, avoids iconv overhead and runs libmozjs hardwired // utf8-to-utf16 @@ -111,12 +159,24 @@ bool to_string_impl(JSContext* cx, JS::HandleObject byte_array, // If there are any 0 bytes, including the terminating byte, stop at the // first one if (data[len - 1] == 0 || memchr(data, 0, len)) { - if (!gjs_string_from_utf8(cx, reinterpret_cast<char*>(data), rval)) - return false; + if (fatal) { + if (!gjs_string_from_utf8(cx, reinterpret_cast<char*>(data), rval)) + return false; + } else { + if (!gjs_lossy_string_from_utf8(cx, reinterpret_cast<char*>(data), + rval)) + return false; + } } else { - if (!gjs_string_from_utf8_n(cx, reinterpret_cast<char*>(data), len, - rval)) - return false; + if (fatal) { + if (!gjs_string_from_utf8_n(cx, reinterpret_cast<char*>(data), len, + rval)) + return false; + } else { + if (!gjs_lossy_string_from_utf8_n(cx, reinterpret_cast<char*>(data), + len, rval)) + return false; + } } uint8_t* current_data; @@ -139,50 +199,40 @@ bool to_string_impl(JSContext* cx, JS::HandleObject byte_array, return true; // This was the UTF-8 optimized path, so we explicitly pass the encoding - return to_string_impl_slow(cx, current_data, current_len, "UTF-8", rval); + return gjs_decode_from_uint8array_slow(cx, current_data, current_len, + "UTF-8", fatal, rval); } GJS_JSAPI_RETURN_CONVENTION -static bool to_string_func(JSContext* cx, unsigned argc, JS::Value* vp) { +static bool gjs_decode(JSContext* cx, unsigned argc, JS::Value* vp) { JS::CallArgs args = JS::CallArgsFromVp(argc, vp); 
JS::RootedObject byte_array(cx); + bool fatal = false; JS::UniqueChars encoding; - if (!gjs_parse_call_args(cx, "toString", args, "o|s", "byteArray", - &byte_array, "encoding", &encoding)) + if (!gjs_parse_call_args(cx, "toString", args, "o|bs", "byteArray", + &byte_array, "fatal", &fatal, "encoding", + &encoding)) return false; - return to_string_impl(cx, byte_array, encoding.get(), args.rval()); + return gjs_decode_from_uint8array(cx, byte_array, encoding.get(), fatal, + args.rval()); } // fromString() function implementation -GJS_JSAPI_RETURN_CONVENTION -static bool from_string_func(JSContext* cx, unsigned argc, JS::Value* vp) { - JS::CallArgs args = JS::CallArgsFromVp(argc, vp); - - JS::UniqueChars utf8; - JS::UniqueChars encoding; - if (!gjs_parse_call_args(cx, "fromString", args, "s|s", "string", &utf8, - "encoding", &encoding)) - return false; - - bool encoding_is_utf8; - if (argc > 1) { - // Maybe we should be smarter about utf8 synonyms here. Doesn't matter - // much though. encoding_is_utf8 is just an optimization anyway. - encoding_is_utf8 = (strcmp(encoding.get(), "UTF-8") == 0); - } else { - encoding_is_utf8 = true; - } - +[[nodiscard]] bool gjs_encode_to_uint8array(JSContext* cx, JS::HandleString str, + const char* encoding, + JS::MutableHandleValue rval) { JS::RootedObject array_buffer(cx); + + bool encoding_is_utf8 = is_utf8_label(encoding); if (encoding_is_utf8) { // optimization? avoids iconv overhead and runs libmozjs hardwired // utf16-to-utf8. 
+ JS::UniqueChars utf8 = JS_EncodeStringToUTF8(cx, str); size_t len = strlen(utf8.get()); array_buffer = JS::NewArrayBufferWithContents(cx, len, utf8.release()); } else { - JSString* str = args[0].toString(); // Rooted by args GError* error = nullptr; char* encoded = nullptr; size_t bytes_written; @@ -200,7 +250,7 @@ static bool from_string_func(JSContext* cx, unsigned argc, JS::Value* vp) { return false; encoded = g_convert(reinterpret_cast<const char*>(chars), len, - /* to_encoding = */ encoding.get(), + /* to_encoding = */ encoding, /* from_encoding = */ "LATIN1", /* bytes_read = */ nullptr, &bytes_written, &error); @@ -212,7 +262,7 @@ static bool from_string_func(JSContext* cx, unsigned argc, JS::Value* vp) { encoded = g_convert( reinterpret_cast<const char*>(chars), len * 2, - /* to_encoding = */ encoding.get(), + /* to_encoding = */ encoding, /* from_encoding = */ "UTF-16", /* bytes_read = */ nullptr, &bytes_written, &error); } @@ -231,14 +281,188 @@ static bool from_string_func(JSContext* cx, unsigned argc, JS::Value* vp) { JS::RootedObject obj(cx, JS_NewUint8ArrayWithBuffer(cx, array_buffer, 0, -1)); - args.rval().setObject(*obj); + rval.setObject(*obj); + return true; +} + +static bool gjs_convert_invalid_input(JSContext* cx, uint8_t* data, size_t len, + const char* to_codeset, + const char* from_codeset, + char** converted) { + GError* error = nullptr; + GjsAutoUnref<GCharsetConverter> converter( + g_charset_converter_new(to_codeset, from_codeset, &error)); + + // This should only throw if an encoding is not available. + if (error) + return gjs_throw_gerror_message(cx, error); + + size_t bytes_written, bytes_read; + char buffer[1024]; + + // Cast data to convert input type, calculate length. + const char* input = reinterpret_cast<const char*>(data); + size_t input_len = len * sizeof(char); + + // Use a vector for the output for easy resizing. 
+ std::vector<char> output; + size_t size = 0; + + do { + g_converter_convert(G_CONVERTER(converter.get()), input, input_len, + buffer, sizeof(buffer), G_CONVERTER_INPUT_AT_END, + &bytes_read, &bytes_written, &error); + + input += bytes_read; + input_len -= bytes_read; + + if (bytes_written > 0) { + output.resize(size + bytes_written); + std::copy(buffer, buffer + bytes_written, output.data() + size); + size += bytes_written; + } + + if (error) { + if (g_error_matches(error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA)) { + // Skip the invalid character + input += sizeof(char); + input_len -= sizeof(char); + + // Append fallback character to the output + output.resize(size + FALLBACK_LEN); + std::copy(FALLBACK, FALLBACK + FALLBACK_LEN, + output.data() + size); + size += FALLBACK_LEN; + + g_clear_error(&error); + } else if (bytes_written > 0 && + g_error_matches(error, G_IO_ERROR, + G_IO_ERROR_PARTIAL_INPUT)) { + // Only clear a partial input error if there are no bytes + // written. This occurs on the second loop, otherwise we could + // error mid-input. + g_clear_error(&error); + } else if (g_error_matches(error, G_IO_ERROR, + G_IO_ERROR_NO_SPACE)) { + // If the buffer was full, clear the error and continue + // converting. 
+ g_clear_error(&error); + } + } + } while (input_len && !error); + + if (!error) { + char* arr = reinterpret_cast<char*>(g_malloc0(output.size())); + + std::copy(output.begin(), output.end(), arr); + + *converted = arr; + + // bytes_written should be bytes in a UTF-16 string so should be a + // multiple of 2 + g_assert((bytes_written % 2) == 0); + + return true; + } + + return gjs_throw_gerror_message(cx, error); +} + +GJS_JSAPI_RETURN_CONVENTION +bool gjs_encode_into_uint8array(JSContext* cx, JS::HandleString str, + JS::HandleObject uint8array, + JS::MutableHandleValue rval) { + if (!JS_IsUint8Array(uint8array)) { + gjs_throw( + cx, "Argument to gjs_encode_into_uint8array must be a Uint8Array"); + return false; + } + + auto len = JS_GetTypedArrayByteLength(uint8array); + bool shared; + + // TODO(ewlsh): Garbage collection cannot occur from here... + auto data = + JS_GetUint8ArrayData(uint8array, &shared, JS::AutoCheckCannotGC(cx)); + + if (shared) { + gjs_throw(cx, "Cannot encode data into shared memory."); + return false; + } + + auto maybe = JS_EncodeStringToUTF8BufferPartial( + cx, str, mozilla::AsWritableChars(mozilla::Span(data, len))); + // ... 
to here + + if (!maybe) { + JS_ReportOutOfMemory(cx); + return false; + } + + size_t read, written; + + mozilla::Tie(read, written) = *maybe; + + g_assert(written <= len); + + JS::RootedObject result(cx, JS_NewPlainObject(cx)); + JS::RootedValue readv(cx, JS::NumberValue(read)), + writtenv(cx, JS::NumberValue(written)); + + if (!JS_SetProperty(cx, result, "read", readv) || + !JS_SetProperty(cx, result, "written", writtenv)) { + return false; + } + + rval.setObject(*result); return true; } +[[nodiscard]] static bool gjs_encode(JSContext* cx, unsigned argc, + JS::Value* vp) { + JS::CallArgs args = JS::CallArgsFromVp(argc, vp); + + JS::UniqueChars encoding; + JS::UniqueChars utf8; + if (!gjs_parse_call_args(cx, "encode", args, "s|s", "string", &utf8, + "encoding", &encoding)) + return false; + + if (!args[0].isString()) { + gjs_throw(cx, "First argument to encode() must be a string."); + return false; + } + + JS::RootedString str(cx, args[0].toString()); + + return gjs_encode_to_uint8array(cx, str, encoding.get(), args.rval()); +} + +[[nodiscard]] static bool gjs_encode_into(JSContext* cx, unsigned argc, + JS::Value* vp) { + JS::CallArgs args = JS::CallArgsFromVp(argc, vp); + + JS::UniqueChars utf8; + JS::RootedObject uint8array(cx); + if (!gjs_parse_call_args(cx, "encodeInto", args, "so", "string", &utf8, + "uint8array", &uint8array)) + return false; + + if (!args[0].isString()) { + gjs_throw(cx, "First argument to encode() must be a string."); + return false; + } + + JS::RootedString str(cx, args[0].toString()); + + return gjs_encode_into_uint8array(cx, str, uint8array, args.rval()); +} + // clang-format off static JSFunctionSpec gjs_text_encoding_module_funcs[] = { - JS_FN("fromString", from_string_func, 2, 0), - JS_FN("toString", to_string_func, 2, 0), + JS_FN("decode", gjs_decode, 3, 0), + JS_FN("encodeInto", gjs_encode_into, 2, 0), + JS_FN("encode", gjs_encode, 2, 0), JS_FS_END}; // clang-format on diff --git a/gjs/text-encoding.h b/gjs/text-encoding.h index 
7524a723..096df8ac 100644 --- a/gjs/text-encoding.h +++ b/gjs/text-encoding.h @@ -15,9 +15,14 @@ #include "gjs/macros.h" -GJS_JSAPI_RETURN_CONVENTION -bool to_string_impl(JSContext* cx, JS::HandleObject uint8array, - const char* encoding, JS::MutableHandleValue rval); +[[nodiscard]] bool gjs_decode_from_uint8array(JSContext* cx, + JS::HandleObject uint8array, + const char* encoding, bool fatal, + JS::MutableHandleValue rval); + +[[nodiscard]] bool gjs_encode_to_uint8array(JSContext* cx, JS::HandleString str, + const char* encoding, + JS::MutableHandleValue rval); GJS_JSAPI_RETURN_CONVENTION bool gjs_define_text_encoding_stuff(JSContext* cx, diff --git a/installed-tests/js/meson.build b/installed-tests/js/meson.build index 97f9cd07..85371e3a 100644 --- a/installed-tests/js/meson.build +++ b/installed-tests/js/meson.build @@ -94,6 +94,7 @@ subdir('libgjstesttools') jasmine_tests = [ 'self', 'ByteArray', + 'Encoding', 'Exceptions', 'Format', 'Fundamental', diff --git a/installed-tests/js/testEncoding.js b/installed-tests/js/testEncoding.js new file mode 100644 index 00000000..3bd510f4 --- /dev/null +++ b/installed-tests/js/testEncoding.js @@ -0,0 +1,1051 @@ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: Copyright 2018-2020 the Deno authors. All rights reserved. 
+ +// Modified from https://github.com/denoland/deno/blob/923214c53725651792f6d55c5401bf6b475622ea/op_crates/web/08_text_encoding.js +// Data originally from https://encoding.spec.whatwg.org/encodings.json + +describe('Text Encoding', function () { + it('textDecoder', function () { + const fixture = new Uint8Array([ + 0xf0, 0x9d, 0x93, 0xbd, 0xf0, 0x9d, 0x93, 0xae, 0xf0, 0x9d, 0x94, 0x81, 0xf0, 0x9d, 0x93, 0xbd, + ]); + const decoder = new TextDecoder(); + expect(decoder.decode(fixture)).toBe('𝓽𝓮𝔁𝓽'); + }); + + it('textDecoderIgnoreBOM', function () { + const fixture = new Uint8Array([ + 0xef, 0xbb, 0xbf, 0xf0, 0x9d, 0x93, 0xbd, 0xf0, 0x9d, 0x93, 0xae, 0xf0, 0x9d, 0x94, 0x81, 0xf0, 0x9d, 0x93, 0xbd, + ]); + const decoder = new TextDecoder('utf-8', {ignoreBOM: true}); + expect(decoder.decode(fixture)).toBe('𝓽𝓮𝔁𝓽'); + }); + + it('textDecoderNotBOM', function () { + const fixture = new Uint8Array([ + 0xef, 0xbb, 0x89, 0xf0, 0x9d, 0x93, 0xbd, 0xf0, 0x9d, 0x93, 0xae, 0xf0, 0x9d, 0x94, 0x81, 0xf0, 0x9d, 0x93, 0xbd, + ]); + const decoder = new TextDecoder('utf-8', {ignoreBOM: true}); + expect(decoder.decode(fixture)).toBe('ﻉ𝓽𝓮𝔁𝓽'); + }); + + it('textDecoderASCII', function () { + const fixture = new Uint8Array([0x89, 0x95, 0x9f, 0xbf]); + const decoder = new TextDecoder('ascii'); + expect(decoder.decode(fixture)).toBe('‰•Ÿ¿'); + }); + + it('textDecoderErrorEncoding', function () { + expect(() => new TextDecoder('Foo')).toThrowError("Invalid encoding label: 'Foo'"); + }); + + it('textDecoderHandlesUndefined', function () { + const fixture = undefined; + const decoder = new TextDecoder(); + expect(decoder.decode(fixture)).toBe(''); + }); + + it('textDecoderThrowsOnEmpty', function () { + const fixture = ''; + const decoder = new TextDecoder(); + + expect(() => decoder.decode(fixture)) + .toThrowError('Provided input cannot be converted to ArrayBufferView or ArrayBuffer'); + }); + + it('textDecoderThrowsOnNull', function () { + const fixture = null; + const decoder = new 
TextDecoder(); + + expect(() => decoder.decode(fixture)) + .toThrowError('Provided input cannot be converted to ArrayBufferView or ArrayBuffer'); + }); + + it('textEncoder', function () { + const fixture = '𝓽𝓮𝔁𝓽'; + const encoder = new TextEncoder(); + + expect(Array.from(encoder.encode(fixture))).toEqual([ + 0xf0, 0x9d, 0x93, 0xbd, 0xf0, 0x9d, 0x93, 0xae, 0xf0, 0x9d, 0x94, 0x81, 0xf0, 0x9d, 0x93, 0xbd, + ]); + }); + + it('textEncodeInto', function () { + const fixture = 'text'; + const encoder = new TextEncoder(); + const bytes = new Uint8Array(5); + const result = encoder.encodeInto(fixture, bytes); + expect(result.read).toBe(4); + expect(result.written).toBe(4); + + expect(Array.from(bytes)).toEqual([0x74, 0x65, 0x78, 0x74, 0x00]); + }); + + it('textEncodeInto2', function () { + const fixture = '𝓽𝓮𝔁𝓽'; + const encoder = new TextEncoder(); + const bytes = new Uint8Array(17); + const result = encoder.encodeInto(fixture, bytes); + expect(result.read).toBe(8); + expect(result.written).toBe(16); + + expect(Array.from(bytes)).toEqual([ + 0xf0, 0x9d, 0x93, 0xbd, 0xf0, 0x9d, 0x93, 0xae, 0xf0, 0x9d, 0x94, 0x81, 0xf0, 0x9d, 0x93, 0xbd, 0x00, + ]); + }); + + it('textEncodeInto3', function () { + const fixture = '𝓽𝓮𝔁𝓽'; + const encoder = new TextEncoder(); + const bytes = new Uint8Array(5); + const result = encoder.encodeInto(fixture, bytes); + expect(result.read).toBe(2); + expect(result.written).toBe(4); + + expect(Array.from(bytes)).toEqual([0xf0, 0x9d, 0x93, 0xbd, 0x00]); + }); + + xit('textDecoderSharedUint8Array', function () { + const ab = new SharedArrayBuffer(6); + const dataView = new DataView(ab); + const charCodeA = 'A'.charCodeAt(0); + for (let i = 0; i < ab.byteLength; i++) + dataView.setUint8(i, charCodeA + i); + + const ui8 = new Uint8Array(ab); + const decoder = new TextDecoder(); + const actual = decoder.decode(ui8); + expect(actual).toBe('ABCDEF'); + }); + + xit('textDecoderSharedInt32Array', function () { + const ab = new SharedArrayBuffer(8); + const 
dataView = new DataView(ab); + const charCodeA = 'A'.charCodeAt(0); + for (let i = 0; i < ab.byteLength; i++) + dataView.setUint8(i, charCodeA + i); + + const i32 = new Int32Array(ab); + const decoder = new TextDecoder(); + const actual = decoder.decode(i32); + expect(actual).toBe('ABCDEFGH'); + }); + + it('toStringShouldBeWebCompatibility', function () { + const encoder = new TextEncoder(); + + expect(encoder.toString()).toBe('[object TextEncoder]'); + + const decoder = new TextDecoder(); + expect(decoder.toString()).toBe('[object TextDecoder]'); + }); + + it('singleByteEncodings', function () { + // Straight from https://encoding.spec.whatwg.org/encodings.json + const encodingsTable = [ + { + encodings: [ + { + labels: [ + 'unicode-1-1-utf-8', + 'unicode11utf8', + 'unicode20utf8', + 'utf-8', + 'utf8', + 'x-unicode20utf8', + ], + name: 'UTF-8', + }, + ], + heading: 'The Encoding', + }, + { + encodings: [ + { + labels: ['866', 'cp866', 'csibm866', 'ibm866'], + name: 'IBM866', + }, + { + labels: [ + 'csisolatin2', + 'iso-8859-2', + 'iso-ir-101', + 'iso8859-2', + 'iso88592', + 'iso_8859-2', + 'iso_8859-2:1987', + 'l2', + 'latin2', + ], + name: 'ISO-8859-2', + }, + { + labels: [ + 'csisolatin3', + 'iso-8859-3', + 'iso-ir-109', + 'iso8859-3', + 'iso88593', + 'iso_8859-3', + 'iso_8859-3:1988', + 'l3', + 'latin3', + ], + name: 'ISO-8859-3', + }, + { + labels: [ + 'csisolatin4', + 'iso-8859-4', + 'iso-ir-110', + 'iso8859-4', + 'iso88594', + 'iso_8859-4', + 'iso_8859-4:1988', + 'l4', + 'latin4', + ], + name: 'ISO-8859-4', + }, + { + labels: [ + 'csisolatincyrillic', + 'cyrillic', + 'iso-8859-5', + 'iso-ir-144', + 'iso8859-5', + 'iso88595', + 'iso_8859-5', + 'iso_8859-5:1988', + ], + name: 'ISO-8859-5', + }, + { + labels: [ + 'arabic', + 'asmo-708', + 'csiso88596e', + 'csiso88596i', + 'csisolatinarabic', + 'ecma-114', + 'iso-8859-6', + 'iso-8859-6-e', + 'iso-8859-6-i', + 'iso-ir-127', + 'iso8859-6', + 'iso88596', + 'iso_8859-6', + 'iso_8859-6:1987', + ], + name: 
'ISO-8859-6', + }, + { + labels: [ + 'csisolatingreek', + 'ecma-118', + 'elot_928', + 'greek', + 'greek8', + 'iso-8859-7', + 'iso-ir-126', + 'iso8859-7', + 'iso88597', + 'iso_8859-7', + 'iso_8859-7:1987', + 'sun_eu_greek', + ], + name: 'ISO-8859-7', + }, + { + labels: [ + 'csiso88598e', + 'csisolatinhebrew', + 'hebrew', + 'iso-8859-8', + 'iso-8859-8-e', + 'iso-ir-138', + 'iso8859-8', + 'iso88598', + 'iso_8859-8', + 'iso_8859-8:1988', + 'visual', + ], + name: 'ISO-8859-8', + }, + { + labels: ['csiso88598i', 'iso-8859-8-i', 'logical'], + name: 'ISO-8859-8-I', + }, + { + labels: [ + 'csisolatin6', + 'iso-8859-10', + 'iso-ir-157', + 'iso8859-10', + 'iso885910', + 'l6', + 'latin6', + ], + name: 'ISO-8859-10', + }, + { + labels: ['iso-8859-13', 'iso8859-13', 'iso885913'], + name: 'ISO-8859-13', + }, + { + labels: ['iso-8859-14', 'iso8859-14', 'iso885914'], + name: 'ISO-8859-14', + }, + { + labels: [ + 'csisolatin9', + 'iso-8859-15', + 'iso8859-15', + 'iso885915', + 'iso_8859-15', + 'l9', + ], + name: 'ISO-8859-15', + }, + { + labels: ['iso-8859-16'], + name: 'ISO-8859-16', + }, + { + labels: ['cskoi8r', 'koi', 'koi8', 'koi8-r', 'koi8_r'], + name: 'KOI8-R', + }, + { + labels: ['koi8-ru', 'koi8-u'], + name: 'KOI8-U', + }, + { + labels: ['csmacintosh', 'mac', 'macintosh', 'x-mac-roman'], + name: 'macintosh', + }, + { + labels: [ + 'dos-874', + 'iso-8859-11', + 'iso8859-11', + 'iso885911', + 'tis-620', + 'windows-874', + ], + name: 'windows-874', + }, + { + labels: ['cp1250', 'windows-1250', 'x-cp1250'], + name: 'windows-1250', + }, + { + labels: ['cp1251', 'windows-1251', 'x-cp1251'], + name: 'windows-1251', + }, + { + labels: [ + 'ansi_x3.4-1968', + 'ascii', + 'cp1252', + 'cp819', + 'csisolatin1', + 'ibm819', + 'iso-8859-1', + 'iso-ir-100', + 'iso8859-1', + 'iso88591', + 'iso_8859-1', + 'iso_8859-1:1987', + 'l1', + 'latin1', + 'us-ascii', + 'windows-1252', + 'x-cp1252', + ], + name: 'windows-1252', + }, + { + labels: ['cp1253', 'windows-1253', 'x-cp1253'], + name: 
'windows-1253', + }, + { + labels: [ + 'cp1254', + 'csisolatin5', + 'iso-8859-9', + 'iso-ir-148', + 'iso8859-9', + 'iso88599', + 'iso_8859-9', + 'iso_8859-9:1989', + 'l5', + 'latin5', + 'windows-1254', + 'x-cp1254', + ], + name: 'windows-1254', + }, + { + labels: ['cp1255', 'windows-1255', 'x-cp1255'], + name: 'windows-1255', + }, + { + labels: ['cp1256', 'windows-1256', 'x-cp1256'], + name: 'windows-1256', + }, + { + labels: ['cp1257', 'windows-1257', 'x-cp1257'], + name: 'windows-1257', + }, + { + labels: ['cp1258', 'windows-1258', 'x-cp1258'], + name: 'windows-1258', + }, + { + labels: ['x-mac-cyrillic', 'x-mac-ukrainian'], + name: 'x-mac-cyrillic', + }, + ], + heading: 'Legacy single-byte encodings', + }, + { + encodings: [ + { + labels: [ + 'chinese', + 'csgb2312', + 'csiso58gb231280', + 'gb2312', + 'gb_2312', + 'gb_2312-80', + 'gbk', + 'iso-ir-58', + 'x-gbk', + ], + name: 'GBK', + }, + { + labels: ['gb18030'], + name: 'gb18030', + }, + ], + heading: 'Legacy multi-byte Chinese (simplified) encodings', + }, + { + encodings: [ + { + labels: ['big5', 'big5-hkscs', 'cn-big5', 'csbig5', 'x-x-big5'], + name: 'Big5', + }, + ], + heading: 'Legacy multi-byte Chinese (traditional) encodings', + }, + { + encodings: [ + { + labels: ['cseucpkdfmtjapanese', 'euc-jp', 'x-euc-jp'], + name: 'EUC-JP', + }, + { + labels: ['csiso2022jp', 'iso-2022-jp'], + name: 'ISO-2022-JP', + }, + { + labels: [ + 'csshiftjis', + 'ms932', + 'ms_kanji', + 'shift-jis', + 'shift_jis', + 'sjis', + 'windows-31j', + 'x-sjis', + ], + name: 'Shift_JIS', + }, + ], + heading: 'Legacy multi-byte Japanese encodings', + }, + { + encodings: [ + { + labels: [ + 'cseuckr', + 'csksc56011987', + 'euc-kr', + 'iso-ir-149', + 'korean', + 'ks_c_5601-1987', + 'ks_c_5601-1989', + 'ksc5601', + 'ksc_5601', + 'windows-949', + ], + name: 'EUC-KR', + }, + ], + heading: 'Legacy multi-byte Korean encodings', + }, + { + encodings: [ + { + labels: [ + 'csiso2022kr', + 'hz-gb-2312', + 'iso-2022-cn', + 'iso-2022-cn-ext', + 
'iso-2022-kr', + 'replacement', + ], + name: 'replacement', + }, + { + labels: ['unicodefffe', 'utf-16be'], + name: 'UTF-16BE', + }, + { + labels: [ + 'csunicode', + 'iso-10646-ucs-2', + 'ucs-2', + 'unicode', + 'unicodefeff', + 'utf-16', + 'utf-16le', + ], + name: 'UTF-16LE', + }, + { + labels: ['x-user-defined'], + name: 'x-user-defined', + }, + ], + heading: 'Legacy miscellaneous encodings', + }, + ]; + + const singleByteEncodings = encodingsTable.filter(group => { + return group.heading === 'Legacy single-byte encodings'; + })[0].encodings; + + // https://encoding.spec.whatwg.org/indexes.json + const singleByteIndexes = { + 'IBM866': [ + 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, + 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, + 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, + 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, + 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, + 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, + 9617, 9618, 9619, 9474, 9508, 9569, 9570, 9558, + 9557, 9571, 9553, 9559, 9565, 9564, 9563, 9488, + 9492, 9524, 9516, 9500, 9472, 9532, 9566, 9567, + 9562, 9556, 9577, 9574, 9568, 9552, 9580, 9575, + 9576, 9572, 9573, 9561, 9560, 9554, 9555, 9579, + 9578, 9496, 9484, 9608, 9604, 9612, 9616, 9600, + 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, + 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, + 1025, 1105, 1028, 1108, 1031, 1111, 1038, 1118, + 176, 8729, 183, 8730, 8470, 164, 9632, 160, + ], + 'ISO-8859-2': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 260, 728, 321, 164, 317, 346, 167, + 168, 352, 350, 356, 377, 173, 381, 379, + 176, 261, 731, 322, 180, 318, 347, 711, + 184, 353, 351, 357, 378, 733, 382, 380, + 340, 193, 194, 258, 196, 313, 262, 199, + 268, 201, 280, 203, 282, 205, 206, 270, + 272, 323, 327, 211, 212, 336, 214, 215, + 344, 366, 218, 368, 220, 221, 354, 223, + 341, 225, 226, 259, 228, 
314, 263, 231, + 269, 233, 281, 235, 283, 237, 238, 271, + 273, 324, 328, 243, 244, 337, 246, 247, + 345, 367, 250, 369, 252, 253, 355, 729, + ], + 'ISO-8859-3': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 294, 728, 163, 164, null, 292, 167, + 168, 304, 350, 286, 308, 173, null, 379, + 176, 295, 178, 179, 180, 181, 293, 183, + 184, 305, 351, 287, 309, 189, null, 380, + 192, 193, 194, null, 196, 266, 264, 199, + 200, 201, 202, 203, 204, 205, 206, 207, + null, 209, 210, 211, 212, 288, 214, 215, + 284, 217, 218, 219, 220, 364, 348, 223, + 224, 225, 226, null, 228, 267, 265, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + null, 241, 242, 243, 244, 289, 246, 247, + 285, 249, 250, 251, 252, 365, 349, 729, + ], + 'ISO-8859-4': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 260, 312, 342, 164, 296, 315, 167, + 168, 352, 274, 290, 358, 173, 381, 175, + 176, 261, 731, 343, 180, 297, 316, 711, + 184, 353, 275, 291, 359, 330, 382, 331, + 256, 193, 194, 195, 196, 197, 198, 302, + 268, 201, 280, 203, 278, 205, 206, 298, + 272, 325, 332, 310, 212, 213, 214, 215, + 216, 370, 218, 219, 220, 360, 362, 223, + 257, 225, 226, 227, 228, 229, 230, 303, + 269, 233, 281, 235, 279, 237, 238, 299, + 273, 326, 333, 311, 244, 245, 246, 247, + 248, 371, 250, 251, 252, 361, 363, 729, + ], + 'ISO-8859-5': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 1025, 1026, 1027, 1028, 1029, 1030, 1031, + 1032, 1033, 1034, 1035, 1036, 173, 1038, 1039, + 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, + 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, + 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, + 1064, 1065, 1066, 
1067, 1068, 1069, 1070, 1071, + 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, + 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, + 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, + 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, + 8470, 1105, 1106, 1107, 1108, 1109, 1110, 1111, + 1112, 1113, 1114, 1115, 1116, 167, 1118, 1119, + ], + 'ISO-8859-6': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, null, null, null, 164, null, null, null, + null, null, null, null, 1548, 173, null, null, + null, null, null, null, null, null, null, null, + null, null, null, 1563, null, null, null, 1567, + null, 1569, 1570, 1571, 1572, 1573, 1574, 1575, + 1576, 1577, 1578, 1579, 1580, 1581, 1582, 1583, + 1584, 1585, 1586, 1587, 1588, 1589, 1590, 1591, + 1592, 1593, 1594, null, null, null, null, null, + 1600, 1601, 1602, 1603, 1604, 1605, 1606, 1607, + 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, + 1616, 1617, 1618, null, null, null, null, null, + null, null, null, null, null, null, null, null, + ], + 'ISO-8859-7': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 8216, 8217, 163, 8364, 8367, 166, 167, + 168, 169, 890, 171, 172, 173, null, 8213, + 176, 177, 178, 179, 900, 901, 902, 183, + 904, 905, 906, 187, 908, 189, 910, 911, + 912, 913, 914, 915, 916, 917, 918, 919, + 920, 921, 922, 923, 924, 925, 926, 927, + 928, 929, null, 931, 932, 933, 934, 935, + 936, 937, 938, 939, 940, 941, 942, 943, + 944, 945, 946, 947, 948, 949, 950, 951, + 952, 953, 954, 955, 956, 957, 958, 959, + 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, null, + ], + 'ISO-8859-8': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 
157, 158, 159, + 160, null, 162, 163, 164, 165, 166, 167, + 168, 169, 215, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 247, 187, 188, 189, 190, null, + null, null, null, null, null, null, null, null, + null, null, null, null, null, null, null, null, + null, null, null, null, null, null, null, null, + null, null, null, null, null, null, null, 8215, + 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, + 1496, 1497, 1498, 1499, 1500, 1501, 1502, 1503, + 1504, 1505, 1506, 1507, 1508, 1509, 1510, 1511, + 1512, 1513, 1514, null, null, 8206, 8207, null, + ], + 'ISO-8859-10': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 260, 274, 290, 298, 296, 310, 167, + 315, 272, 352, 358, 381, 173, 362, 330, + 176, 261, 275, 291, 299, 297, 311, 183, + 316, 273, 353, 359, 382, 8213, 363, 331, + 256, 193, 194, 195, 196, 197, 198, 302, + 268, 201, 280, 203, 278, 205, 206, 207, + 208, 325, 332, 211, 212, 213, 214, 360, + 216, 370, 218, 219, 220, 221, 222, 223, + 257, 225, 226, 227, 228, 229, 230, 303, + 269, 233, 281, 235, 279, 237, 238, 239, + 240, 326, 333, 243, 244, 245, 246, 361, + 248, 371, 250, 251, 252, 253, 254, 312, + ], + 'ISO-8859-13': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 8221, 162, 163, 164, 8222, 166, 167, + 216, 169, 342, 171, 172, 173, 174, 198, + 176, 177, 178, 179, 8220, 181, 182, 183, + 248, 185, 343, 187, 188, 189, 190, 230, + 260, 302, 256, 262, 196, 197, 280, 274, + 268, 201, 377, 278, 290, 310, 298, 315, + 352, 323, 325, 211, 332, 213, 214, 215, + 370, 321, 346, 362, 220, 379, 381, 223, + 261, 303, 257, 263, 228, 229, 281, 275, + 269, 233, 378, 279, 291, 311, 299, 316, + 353, 324, 326, 243, 333, 245, 246, 247, + 371, 322, 347, 363, 252, 380, 382, 8217, + ], + 
'ISO-8859-14': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 7682, 7683, 163, 266, 267, 7690, 167, + 7808, 169, 7810, 7691, 7922, 173, 174, 376, + 7710, 7711, 288, 289, 7744, 7745, 182, 7766, + 7809, 7767, 7811, 7776, 7923, 7812, 7813, 7777, + 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, + 372, 209, 210, 211, 212, 213, 214, 7786, + 216, 217, 218, 219, 220, 221, 374, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 373, 241, 242, 243, 244, 245, 246, 7787, + 248, 249, 250, 251, 252, 253, 375, 255, + ], + 'ISO-8859-15': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 8364, 165, 352, 167, + 353, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 381, 181, 182, 183, + 382, 185, 186, 187, 338, 339, 376, 191, + 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + ], + 'ISO-8859-16': [ + 128, 129, 130, 131, 132, 133, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 260, 261, 321, 8364, 8222, 352, 167, + 353, 169, 536, 171, 377, 173, 378, 379, + 176, 177, 268, 322, 381, 8221, 182, 183, + 382, 269, 537, 187, 338, 339, 376, 380, + 192, 193, 194, 258, 196, 262, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, + 272, 323, 210, 211, 212, 336, 214, 346, + 368, 217, 218, 219, 220, 280, 538, 223, + 224, 225, 226, 259, 228, 263, 230, 231, + 232, 233, 234, 
235, 236, 237, 238, 239, + 273, 324, 242, 243, 244, 337, 246, 347, + 369, 249, 250, 251, 252, 281, 539, 255, + ], + 'KOI8-R': [ + 9472, 9474, 9484, 9488, 9492, 9496, 9500, 9508, + 9516, 9524, 9532, 9600, 9604, 9608, 9612, 9616, + 9617, 9618, 9619, 8992, 9632, 8729, 8730, 8776, + 8804, 8805, 160, 8993, 176, 178, 183, 247, + 9552, 9553, 9554, 1105, 9555, 9556, 9557, 9558, + 9559, 9560, 9561, 9562, 9563, 9564, 9565, 9566, + 9567, 9568, 9569, 1025, 9570, 9571, 9572, 9573, + 9574, 9575, 9576, 9577, 9578, 9579, 9580, 169, + 1102, 1072, 1073, 1094, 1076, 1077, 1092, 1075, + 1093, 1080, 1081, 1082, 1083, 1084, 1085, 1086, + 1087, 1103, 1088, 1089, 1090, 1091, 1078, 1074, + 1100, 1099, 1079, 1096, 1101, 1097, 1095, 1098, + 1070, 1040, 1041, 1062, 1044, 1045, 1060, 1043, + 1061, 1048, 1049, 1050, 1051, 1052, 1053, 1054, + 1055, 1071, 1056, 1057, 1058, 1059, 1046, 1042, + 1068, 1067, 1047, 1064, 1069, 1065, 1063, 1066, + ], + 'KOI8-U': [ + 9472, 9474, 9484, 9488, 9492, 9496, 9500, 9508, + 9516, 9524, 9532, 9600, 9604, 9608, 9612, 9616, + 9617, 9618, 9619, 8992, 9632, 8729, 8730, 8776, + 8804, 8805, 160, 8993, 176, 178, 183, 247, + 9552, 9553, 9554, 1105, 1108, 9556, 1110, 1111, + 9559, 9560, 9561, 9562, 9563, 1169, 1118, 9566, + 9567, 9568, 9569, 1025, 1028, 9571, 1030, 1031, + 9574, 9575, 9576, 9577, 9578, 1168, 1038, 169, + 1102, 1072, 1073, 1094, 1076, 1077, 1092, 1075, + 1093, 1080, 1081, 1082, 1083, 1084, 1085, 1086, + 1087, 1103, 1088, 1089, 1090, 1091, 1078, 1074, + 1100, 1099, 1079, 1096, 1101, 1097, 1095, 1098, + 1070, 1040, 1041, 1062, 1044, 1045, 1060, 1043, + 1061, 1048, 1049, 1050, 1051, 1052, 1053, 1054, + 1055, 1071, 1056, 1057, 1058, 1059, 1046, 1042, + 1068, 1067, 1047, 1064, 1069, 1065, 1063, 1066, + ], + 'macintosh': [ + 196, 197, 199, 201, 209, 214, 220, 225, + 224, 226, 228, 227, 229, 231, 233, 232, + 234, 235, 237, 236, 238, 239, 241, 243, + 242, 244, 246, 245, 250, 249, 251, 252, + 8224, 176, 162, 163, 167, 8226, 182, 223, + 174, 169, 8482, 180, 168, 
8800, 198, 216, + 8734, 177, 8804, 8805, 165, 181, 8706, 8721, + 8719, 960, 8747, 170, 186, 937, 230, 248, + 191, 161, 172, 8730, 402, 8776, 8710, 171, + 187, 8230, 160, 192, 195, 213, 338, 339, + 8211, 8212, 8220, 8221, 8216, 8217, 247, 9674, + 255, 376, 8260, 8364, 8249, 8250, 64257, 64258, + 8225, 183, 8218, 8222, 8240, 194, 202, 193, + 203, 200, 205, 206, 207, 204, 211, 212, + 63743, 210, 218, 219, 217, 305, 710, 732, + 175, 728, 729, 730, 184, 733, 731, 711, + ], + 'windows-874': [ + 8364, 129, 130, 131, 132, 8230, 134, 135, + 136, 137, 138, 139, 140, 141, 142, 143, + 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 152, 153, 154, 155, 156, 157, 158, 159, + 160, 3585, 3586, 3587, 3588, 3589, 3590, 3591, + 3592, 3593, 3594, 3595, 3596, 3597, 3598, 3599, + 3600, 3601, 3602, 3603, 3604, 3605, 3606, 3607, + 3608, 3609, 3610, 3611, 3612, 3613, 3614, 3615, + 3616, 3617, 3618, 3619, 3620, 3621, 3622, 3623, + 3624, 3625, 3626, 3627, 3628, 3629, 3630, 3631, + 3632, 3633, 3634, 3635, 3636, 3637, 3638, 3639, + 3640, 3641, 3642, null, null, null, null, 3647, + 3648, 3649, 3650, 3651, 3652, 3653, 3654, 3655, + 3656, 3657, 3658, 3659, 3660, 3661, 3662, 3663, + 3664, 3665, 3666, 3667, 3668, 3669, 3670, 3671, + 3672, 3673, 3674, 3675, null, null, null, null, + ], + 'windows-1250': [ + 8364, 129, 8218, 131, 8222, 8230, 8224, 8225, + 136, 8240, 352, 8249, 346, 356, 381, 377, + 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 152, 8482, 353, 8250, 347, 357, 382, 378, + 160, 711, 728, 321, 164, 260, 166, 167, + 168, 169, 350, 171, 172, 173, 174, 379, + 176, 177, 731, 322, 180, 181, 182, 183, + 184, 261, 351, 187, 317, 733, 318, 380, + 340, 193, 194, 258, 196, 313, 262, 199, + 268, 201, 280, 203, 282, 205, 206, 270, + 272, 323, 327, 211, 212, 336, 214, 215, + 344, 366, 218, 368, 220, 221, 354, 223, + 341, 225, 226, 259, 228, 314, 263, 231, + 269, 233, 281, 235, 283, 237, 238, 271, + 273, 324, 328, 243, 244, 337, 246, 247, + 345, 367, 250, 369, 252, 253, 355, 729, + ], + 
'windows-1251': [ + 1026, 1027, 8218, 1107, 8222, 8230, 8224, 8225, + 8364, 8240, 1033, 8249, 1034, 1036, 1035, 1039, + 1106, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 152, 8482, 1113, 8250, 1114, 1116, 1115, 1119, + 160, 1038, 1118, 1032, 164, 1168, 166, 167, + 1025, 169, 1028, 171, 172, 173, 174, 1031, + 176, 177, 1030, 1110, 1169, 181, 182, 183, + 1105, 8470, 1108, 187, 1112, 1029, 1109, 1111, + 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, + 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, + 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, + 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, + 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, + 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, + 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, + 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, + ], + 'windows-1252': [ + 8364, 129, 8218, 402, 8222, 8230, 8224, 8225, + 710, 8240, 352, 8249, 338, 141, 381, 143, + 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 732, 8482, 353, 8250, 339, 157, 382, 376, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + ], + 'windows-1253': [ + 8364, 129, 8218, 402, 8222, 8230, 8224, 8225, + 136, 8240, 138, 8249, 140, 141, 142, 143, + 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 152, 8482, 154, 8250, 156, 157, 158, 159, + 160, 901, 902, 163, 164, 165, 166, 167, + 168, 169, null, 171, 172, 173, 174, 8213, + 176, 177, 178, 179, 900, 181, 182, 183, + 904, 905, 906, 187, 908, 189, 910, 911, + 912, 913, 914, 915, 916, 917, 918, 919, + 920, 921, 922, 923, 924, 925, 926, 927, + 928, 929, null, 
931, 932, 933, 934, 935, + 936, 937, 938, 939, 940, 941, 942, 943, + 944, 945, 946, 947, 948, 949, 950, 951, + 952, 953, 954, 955, 956, 957, 958, 959, + 960, 961, 962, 963, 964, 965, 966, 967, + 968, 969, 970, 971, 972, 973, 974, null, + ], + 'windows-1254': [ + 8364, 129, 8218, 402, 8222, 8230, 8224, 8225, + 710, 8240, 352, 8249, 338, 141, 142, 143, + 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 732, 8482, 353, 8250, 339, 157, 158, 376, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, + 286, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 304, 350, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 287, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 305, 351, 255, + ], + 'windows-1255': [ + 8364, 129, 8218, 402, 8222, 8230, 8224, 8225, + 710, 8240, 138, 8249, 140, 141, 142, 143, + 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 732, 8482, 154, 8250, 156, 157, 158, 159, + 160, 161, 162, 163, 8362, 165, 166, 167, + 168, 169, 215, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 247, 187, 188, 189, 190, 191, + 1456, 1457, 1458, 1459, 1460, 1461, 1462, 1463, + 1464, 1465, 1466, 1467, 1468, 1469, 1470, 1471, + 1472, 1473, 1474, 1475, 1520, 1521, 1522, 1523, + 1524, null, null, null, null, null, null, null, + 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, + 1496, 1497, 1498, 1499, 1500, 1501, 1502, 1503, + 1504, 1505, 1506, 1507, 1508, 1509, 1510, 1511, + 1512, 1513, 1514, null, null, 8206, 8207, null, + ], + 'windows-1256': [ + 8364, 1662, 8218, 402, 8222, 8230, 8224, 8225, + 710, 8240, 1657, 8249, 338, 1670, 1688, 1672, + 1711, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 1705, 8482, 1681, 8250, 339, 8204, 8205, 1722, + 160, 1548, 162, 163, 164, 165, 166, 167, + 168, 
169, 1726, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 1563, 187, 188, 189, 190, 1567, + 1729, 1569, 1570, 1571, 1572, 1573, 1574, 1575, + 1576, 1577, 1578, 1579, 1580, 1581, 1582, 1583, + 1584, 1585, 1586, 1587, 1588, 1589, 1590, 215, + 1591, 1592, 1593, 1594, 1600, 1601, 1602, 1603, + 224, 1604, 226, 1605, 1606, 1607, 1608, 231, + 232, 233, 234, 235, 1609, 1610, 238, 239, + 1611, 1612, 1613, 1614, 244, 1615, 1616, 247, + 1617, 249, 1618, 251, 252, 8206, 8207, 1746, + ], + 'windows-1257': [ + 8364, 129, 8218, 131, 8222, 8230, 8224, 8225, + 136, 8240, 138, 8249, 140, 168, 711, 184, 144, + 8216, 8217, 8220, 8221, 8226, 8211, 8212, 152, + 8482, 154, 8250, 156, 175, 731, 159, 160, + null, 162, 163, 164, null, 166, 167, 216, + 169, 342, 171, 172, 173, 174, 198, 176, + 177, 178, 179, 180, 181, 182, 183, 248, + 185, 343, 187, 188, 189, 190, 230, 260, + 302, 256, 262, 196, 197, 280, 274, 268, + 201, 377, 278, 290, 310, 298, 315, 352, + 323, 325, 211, 332, 213, 214, 215, 370, + 321, 346, 362, 220, 379, 381, 223, 261, + 303, 257, 263, 228, 229, 281, 275, 269, + 233, 378, 279, 291, 311, 299, 316, 353, + 324, 326, 243, 333, 245, 246, 247, 371, + 322, 347, 363, 252, 380, 382, 729, + ], + 'windows-1258': [ + 8364, 129, 8218, 402, 8222, 8230, 8224, 8225, + 710, 8240, 138, 8249, 338, 141, 142, 143, + 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 732, 8482, 154, 8250, 339, 157, 158, 376, + 160, 161, 162, 163, 164, 165, 166, 167, + 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, + 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 258, 196, 197, 198, 199, + 200, 201, 202, 203, 768, 205, 206, 207, + 272, 209, 777, 211, 212, 416, 214, 215, + 216, 217, 218, 219, 220, 431, 771, 223, + 224, 225, 226, 259, 228, 229, 230, 231, + 232, 233, 234, 235, 769, 237, 238, 239, + 273, 241, 803, 243, 244, 417, 246, 247, + 248, 249, 250, 251, 252, 432, 8363, 255, + ], + 'x-mac-cyrillic': [ + 1040, 1041, 1042, 1043, 1044, 
1045, 1046, 1047, + 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, + 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, + 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, + 8224, 176, 1168, 163, 167, 8226, 182, 1030, + 174, 169, 8482, 1026, 1106, 8800, 1027, 1107, + 8734, 177, 8804, 8805, 1110, 181, 1169, 1032, + 1028, 1108, 1031, 1111, 1033, 1113, 1034, 1114, + 1112, 1029, 172, 8730, 402, 8776, 8710, 171, + 187, 8230, 160, 1035, 1115, 1036, 1116, 1109, + 8211, 8212, 8220, 8221, 8216, 8217, 247, 8222, + 1038, 1118, 1039, 1119, 8470, 1025, 1105, 1103, + 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, + 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, + 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, + 1096, 1097, 1098, 1099, 1100, 1101, 1102, 8364, + ], + }; + + function assertDecode(data, encoding) { + for (let i = 0, l = data.length; i < l; i++) { + const cp = data.charCodeAt(i); + let expectedCp = i < 0x80 ? i : singleByteIndexes[encoding][i - 0x80]; + if (expectedCp === null) + expectedCp = 0xfffd; + + expect(cp).toBe(expectedCp); + } + } + const buffer = new ArrayBuffer(255); + const view = new Uint8Array(buffer); + + for (let i = 0, l = view.byteLength; i < l; i++) + view[i] = i; + + + for (let i = 0, l = singleByteEncodings.length; i < l; i++) { + const encoding = singleByteEncodings[i]; + for (let i2 = 0, l2 = encoding.labels.length; i2 < l2; i2++) { + const label = encoding.labels[i2]; + const decoder = new TextDecoder(label); + + const data = decoder.decode(view); + + expect(decoder.encoding).toBe(encoding.name.toLowerCase()); + assertDecode(data, encoding.name); + } + } + }); +}); diff --git a/js.gresource.xml b/js.gresource.xml index fc55e597..a0b37730 100644 --- a/js.gresource.xml +++ b/js.gresource.xml @@ -42,8 +42,10 @@ <file>modules/core/_cairo.js</file> <file>modules/core/_common.js</file> + <file>modules/core/_encodings.js</file> <file>modules/core/_format.js</file> <file>modules/core/_gettext.js</file> <file>modules/core/_signals.js</file> + 
<file>modules/core/_text.js</file> </gresource> </gresources> diff --git a/modules/core/_encodings.js b/modules/core/_encodings.js new file mode 100644 index 00000000..dbeeb6d6 --- /dev/null +++ b/modules/core/_encodings.js @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later +// SPDX-FileCopyrightText: Node.js contributors. All rights reserved. + +// Modified from https://github.com/nodejs/node/blob/78680c1cbc8b0c435963bc512e826b2a6227c315/lib/internal/encoding.js +// Data originally from https://encoding.spec.whatwg.org/encodings.json + +/* exported getEncodingFromLabel */ + +const encodings = new Map([ + ['unicode-1-1-utf-8', 'utf-8'], + ['utf8', 'utf-8'], + ['utf-8', 'utf-8'], + ['866', 'ibm866'], + ['cp866', 'ibm866'], + ['csibm866', 'ibm866'], + ['ibm866', 'ibm866'], + ['csisolatin2', 'iso-8859-2'], + ['iso-8859-2', 'iso-8859-2'], + ['iso-ir-101', 'iso-8859-2'], + ['iso8859-2', 'iso-8859-2'], + ['iso88592', 'iso-8859-2'], + ['iso_8859-2', 'iso-8859-2'], + ['iso_8859-2:1987', 'iso-8859-2'], + ['l2', 'iso-8859-2'], + ['latin2', 'iso-8859-2'], + ['csisolatin3', 'iso-8859-3'], + ['iso-8859-3', 'iso-8859-3'], + ['iso-ir-109', 'iso-8859-3'], + ['iso8859-3', 'iso-8859-3'], + ['iso88593', 'iso-8859-3'], + ['iso_8859-3', 'iso-8859-3'], + ['iso_8859-3:1988', 'iso-8859-3'], + ['l3', 'iso-8859-3'], + ['latin3', 'iso-8859-3'], + ['csisolatin4', 'iso-8859-4'], + ['iso-8859-4', 'iso-8859-4'], + ['iso-ir-110', 'iso-8859-4'], + ['iso8859-4', 'iso-8859-4'], + ['iso88594', 'iso-8859-4'], + ['iso_8859-4', 'iso-8859-4'], + ['iso_8859-4:1988', 'iso-8859-4'], + ['l4', 'iso-8859-4'], + ['latin4', 'iso-8859-4'], + ['csisolatincyrillic', 'iso-8859-5'], + ['cyrillic', 'iso-8859-5'], + ['iso-8859-5', 'iso-8859-5'], + ['iso-ir-144', 'iso-8859-5'], + ['iso8859-5', 'iso-8859-5'], + ['iso88595', 'iso-8859-5'], + ['iso_8859-5', 'iso-8859-5'], + ['iso_8859-5:1988', 'iso-8859-5'], + ['arabic', 'iso-8859-6'], + ['asmo-708', 'iso-8859-6'], + ['csiso88596e', 'iso-8859-6'], + 
['csiso88596i', 'iso-8859-6'], + ['csisolatinarabic', 'iso-8859-6'], + ['ecma-114', 'iso-8859-6'], + ['iso-8859-6', 'iso-8859-6'], + ['iso-8859-6-e', 'iso-8859-6'], + ['iso-8859-6-i', 'iso-8859-6'], + ['iso-ir-127', 'iso-8859-6'], + ['iso8859-6', 'iso-8859-6'], + ['iso88596', 'iso-8859-6'], + ['iso_8859-6', 'iso-8859-6'], + ['iso_8859-6:1987', 'iso-8859-6'], + ['csisolatingreek', 'iso-8859-7'], + ['ecma-118', 'iso-8859-7'], + ['elot_928', 'iso-8859-7'], + ['greek', 'iso-8859-7'], + ['greek8', 'iso-8859-7'], + ['iso-8859-7', 'iso-8859-7'], + ['iso-ir-126', 'iso-8859-7'], + ['iso8859-7', 'iso-8859-7'], + ['iso88597', 'iso-8859-7'], + ['iso_8859-7', 'iso-8859-7'], + ['iso_8859-7:1987', 'iso-8859-7'], + ['sun_eu_greek', 'iso-8859-7'], + ['csiso88598e', 'iso-8859-8'], + ['csisolatinhebrew', 'iso-8859-8'], + ['hebrew', 'iso-8859-8'], + ['iso-8859-8', 'iso-8859-8'], + ['iso-8859-8-e', 'iso-8859-8'], + ['iso-ir-138', 'iso-8859-8'], + ['iso8859-8', 'iso-8859-8'], + ['iso88598', 'iso-8859-8'], + ['iso_8859-8', 'iso-8859-8'], + ['iso_8859-8:1988', 'iso-8859-8'], + ['visual', 'iso-8859-8'], + ['csiso88598i', 'iso-8859-8-i'], + ['iso-8859-8-i', 'iso-8859-8-i'], + ['logical', 'iso-8859-8-i'], + ['csisolatin6', 'iso-8859-10'], + ['iso-8859-10', 'iso-8859-10'], + ['iso-ir-157', 'iso-8859-10'], + ['iso8859-10', 'iso-8859-10'], + ['iso885910', 'iso-8859-10'], + ['l6', 'iso-8859-10'], + ['latin6', 'iso-8859-10'], + ['iso-8859-13', 'iso-8859-13'], + ['iso8859-13', 'iso-8859-13'], + ['iso885913', 'iso-8859-13'], + ['iso-8859-14', 'iso-8859-14'], + ['iso8859-14', 'iso-8859-14'], + ['iso885914', 'iso-8859-14'], + ['csisolatin9', 'iso-8859-15'], + ['iso-8859-15', 'iso-8859-15'], + ['iso8859-15', 'iso-8859-15'], + ['iso885915', 'iso-8859-15'], + ['iso_8859-15', 'iso-8859-15'], + ['iso-8859-16', 'iso-8859-16'], + ['ISO-8859-16', 'iso-8859-16'], + ['l9', 'iso-8859-15'], + ['cskoi8r', 'koi8-r'], + ['koi', 'koi8-r'], + ['koi8', 'koi8-r'], + ['koi8-r', 'koi8-r'], + ['koi8_r', 'koi8-r'], + 
['koi8-ru', 'koi8-u'], + ['koi8-u', 'koi8-u'], + ['csmacintosh', 'macintosh'], + ['mac', 'macintosh'], + ['macintosh', 'macintosh'], + ['x-mac-roman', 'macintosh'], + ['dos-874', 'windows-874'], + ['iso-8859-11', 'windows-874'], + ['iso8859-11', 'windows-874'], + ['iso885911', 'windows-874'], + ['tis-620', 'windows-874'], + ['windows-874', 'windows-874'], + ['cp1250', 'windows-1250'], + ['windows-1250', 'windows-1250'], + ['x-cp1250', 'windows-1250'], + ['cp1251', 'windows-1251'], + ['windows-1251', 'windows-1251'], + ['x-cp1251', 'windows-1251'], + ['ansi_x3.4-1968', 'windows-1252'], + ['ascii', 'windows-1252'], + ['cp1252', 'windows-1252'], + ['cp819', 'windows-1252'], + ['csisolatin1', 'windows-1252'], + ['ibm819', 'windows-1252'], + ['iso-8859-1', 'windows-1252'], + ['iso-ir-100', 'windows-1252'], + ['iso8859-1', 'windows-1252'], + ['iso88591', 'windows-1252'], + ['iso_8859-1', 'windows-1252'], + ['iso_8859-1:1987', 'windows-1252'], + ['l1', 'windows-1252'], + ['latin1', 'windows-1252'], + ['us-ascii', 'windows-1252'], + ['windows-1252', 'windows-1252'], + ['x-cp1252', 'windows-1252'], + ['cp1253', 'windows-1253'], + ['windows-1253', 'windows-1253'], + ['x-cp1253', 'windows-1253'], + ['cp1254', 'windows-1254'], + ['csisolatin5', 'windows-1254'], + ['iso-8859-9', 'windows-1254'], + ['iso-ir-148', 'windows-1254'], + ['iso8859-9', 'windows-1254'], + ['iso88599', 'windows-1254'], + ['iso_8859-9', 'windows-1254'], + ['iso_8859-9:1989', 'windows-1254'], + ['l5', 'windows-1254'], + ['latin5', 'windows-1254'], + ['windows-1254', 'windows-1254'], + ['x-cp1254', 'windows-1254'], + ['cp1255', 'windows-1255'], + ['windows-1255', 'windows-1255'], + ['x-cp1255', 'windows-1255'], + ['cp1256', 'windows-1256'], + ['windows-1256', 'windows-1256'], + ['x-cp1256', 'windows-1256'], + ['cp1257', 'windows-1257'], + ['windows-1257', 'windows-1257'], + ['x-cp1257', 'windows-1257'], + ['cp1258', 'windows-1258'], + ['windows-1258', 'windows-1258'], + ['x-cp1258', 'windows-1258'], + 
['x-mac-cyrillic', 'x-mac-cyrillic'],
+    ['x-mac-ukrainian', 'x-mac-cyrillic'],
+    ['chinese', 'gbk'],
+    ['csgb2312', 'gbk'],
+    ['csiso58gb231280', 'gbk'],
+    ['gb2312', 'gbk'],
+    ['gb_2312', 'gbk'],
+    ['gb_2312-80', 'gbk'],
+    ['gbk', 'gbk'],
+    ['iso-ir-58', 'gbk'],
+    ['x-gbk', 'gbk'],
+    ['gb18030', 'gb18030'],
+    ['big5', 'big5'],
+    ['big5-hkscs', 'big5'],
+    ['cn-big5', 'big5'],
+    ['csbig5', 'big5'],
+    ['x-x-big5', 'big5'],
+    ['cseucpkdfmtjapanese', 'euc-jp'],
+    ['euc-jp', 'euc-jp'],
+    ['x-euc-jp', 'euc-jp'],
+    ['csiso2022jp', 'iso-2022-jp'],
+    ['iso-2022-jp', 'iso-2022-jp'],
+    ['csshiftjis', 'shift_jis'],
+    ['ms932', 'shift_jis'],
+    ['ms_kanji', 'shift_jis'],
+    ['shift-jis', 'shift_jis'],
+    ['shift_jis', 'shift_jis'],
+    ['sjis', 'shift_jis'],
+    ['windows-31j', 'shift_jis'],
+    ['x-sjis', 'shift_jis'],
+    ['cseuckr', 'euc-kr'],
+    ['csksc56011987', 'euc-kr'],
+    ['euc-kr', 'euc-kr'],
+    ['iso-ir-149', 'euc-kr'],
+    ['korean', 'euc-kr'],
+    ['ks_c_5601-1987', 'euc-kr'],
+    ['ks_c_5601-1989', 'euc-kr'],
+    ['ksc5601', 'euc-kr'],
+    ['ksc_5601', 'euc-kr'],
+    ['windows-949', 'euc-kr'],
+    ['utf-16be', 'utf-16be'],
+    ['utf-16le', 'utf-16le'],
+    ['utf-16', 'utf-16le'],
+]);
+
+// Canonical encoding names that are not supported under that name in iconv,
+// mapped to the name that is used internally in their place.
+const internalEncodings = new Map([
+    ['x-mac-cyrillic', 'MacCyrillic'],
+    // For our purposes we can treat 8-i as 8
+    ['iso-8859-8-i', 'iso-8859-8'],
+]);
+
+/**
+ * Trims ASCII whitespace from a string.
+ * `String.prototype.trim` is unsuitable: it also removes non-ASCII whitespace.
+ * + * @param {string} label the label to trim + * @returns {string} + */ +const trimAsciiWhitespace = label => { + let s = 0; + let e = label.length; + while (s < e && ( + label[s] === '\u0009' || + label[s] === '\u000a' || + label[s] === '\u000c' || + label[s] === '\u000d' || + label[s] === '\u0020')) + s++; + + while (e > s && ( + label[e - 1] === '\u0009' || + label[e - 1] === '\u000a' || + label[e - 1] === '\u000c' || + label[e - 1] === '\u000d' || + label[e - 1] === '\u0020')) + e--; + + return label.slice(s, e); +}; + +/** + * @param {string} label the encoding label + * @returns {string | undefined} + */ +function getEncodingFromLabel(label) { + const enc = encodings.get(label); + + if (enc !== undefined) { + return { + internal: internalEncodings.get(enc), + external: enc, + }; + } + + + const trimmed = encodings.get(trimAsciiWhitespace(label.toLowerCase())); + + return {internal: internalEncodings.get(trimmed), external: trimmed}; +} diff --git a/modules/core/_text.js b/modules/core/_text.js new file mode 100644 index 00000000..9bdc7ef2 --- /dev/null +++ b/modules/core/_text.js @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: MIT OR LGPL-2.0-or-later +// SPDX-FileCopyrightText: Evan Welsh + +const Encoding = imports._encodingNative; + +const { getEncodingFromLabel } = imports._encodings; + +var TextDecoder = class TextDecoder { + /** + * @type {string} + */ + encoding; + + /** + * @type {boolean} + */ + ignoreBOM; + + /** + * @type {boolean} + */ + fatal; + + get [Symbol.toStringTag]() { + return 'TextDecoder'; + } + + /** + * @param {string} encoding + * @param {object} [options] + * @param {boolean=} options.fatal + * @param {boolean=} options.ignoreBOM + */ + constructor(encoding = 'utf-8', options = {}) { + const { fatal = false, ignoreBOM = false } = options; + + const encodings = getEncodingFromLabel(encoding); + const enc = encodings.internal ?? 
encodings.external; + + if (enc === undefined) { + throw new Error(`Invalid encoding label: '${encoding}'`); + } + + Object.defineProperty(this, '_internalEncoding', { + value: enc, + enumerable: true, + writable: false, + configurable: false, + }); + + Object.defineProperty(this, 'encoding', { + value: encodings.external, + enumerable: true, + writable: false, + configurable: false, + }); + + Object.defineProperty(this, 'ignoreBOM', { + value: ignoreBOM, + enumerable: true, + writable: false, + configurable: false, + }); + + Object.defineProperty(this, 'fatal', { + value: fatal, + enumerable: true, + writable: false, + configurable: false, + }); + } + + decode(bytes, options = {}) { + const { stream = false } = options; + + if (stream) { + throw new Error(`TextDecoder does not implement the 'stream' option.`); + } + + /** @type {Uint8Array} */ + let input; + + if (bytes instanceof ArrayBuffer) { + input = new Uint8Array(bytes); + } else if (bytes instanceof Uint8Array) { + input = bytes; + } else if (bytes instanceof Object.getPrototypeOf(Uint8Array)) { + let { buffer, byteLength, byteOffset } = /** @type {Uint32Array} */ (bytes); + input = new Uint8Array(buffer, byteOffset, byteLength); + } else if (bytes === undefined) { + input = new Uint8Array(0); + } else { + throw new Error(`Provided input cannot be converted to ArrayBufferView or ArrayBuffer`); + } + + if (this.ignoreBOM && input.length > 2 && input[0] === 0xEF && input[1] === 0xBB && input[2] === 0xBF) { + if (this.encoding !== 'utf-8') { + throw new Error(`Cannot ignore BOM for non-UTF8 encoding.`); + } + + let { buffer, byteLength, byteOffset } = input; + input = new Uint8Array(buffer, byteOffset + 3, byteLength - 3); + } + + return Encoding.decode(input, this.fatal, this._internalEncoding); + } +} + +var TextEncoder = class TextEncoder { + get [Symbol.toStringTag]() { + return 'TextEncoder'; + } + + get encoding() { + return 'utf-8'; + } + + encode(input = '') { + // The TextEncoder specification only 
allows for UTF-8 encoding. + return Encoding.encode(`${input}`, 'UTF-8'); + } + + encodeInto(input = '', output = new Uint8Array()) { + // The TextEncoder specification only allows for UTF-8 encoding. + return Encoding.encodeInto(`${input}`, output); + } +}
\ No newline at end of file diff --git a/modules/core/overrides/GLib.js b/modules/core/overrides/GLib.js index 5e3800a9..e4dca1a1 100644 --- a/modules/core/overrides/GLib.js +++ b/modules/core/overrides/GLib.js @@ -50,13 +50,6 @@ function _readSingleType(signature, forceSimple) { return [char]; } -function _makeBytes(byteArray) { - if (byteArray instanceof Uint8Array || byteArray instanceof ByteArray.ByteArray) - return ByteArray.toGBytes(byteArray); - else - return new GLib.Bytes(byteArray); -} - function _packVariant(signature, value) { if (signature.length === 0) throw new TypeError('GVariant signature cannot be empty'); @@ -113,7 +106,7 @@ function _packVariant(signature, value) { byteArray = Uint8Array.of(...byteArray, 0); bytes = ByteArray.toGBytes(byteArray); } else { - bytes = _makeBytes(value); + bytes = new GLib.Bytes(value); } return GLib.Variant.new_from_bytes(new GLib.VariantType('ay'), bytes, true); diff --git a/modules/script/_bootstrap/default.js b/modules/script/_bootstrap/default.js index 952d7fe3..fe354a02 100644 --- a/modules/script/_bootstrap/default.js +++ b/modules/script/_bootstrap/default.js @@ -6,6 +6,7 @@ 'use strict'; const {print, printerr, log, logError} = imports._print; + const {TextEncoder, TextDecoder} = imports._text; Object.defineProperties(exports, { ARGV: { @@ -16,6 +17,18 @@ return imports.system.programArgs; }, }, + TextEncoder: { + configurable: false, + enumerable: true, + writable: false, + value: TextEncoder, + }, + TextDecoder: { + configurable: false, + enumerable: true, + writable: false, + value: TextDecoder, + }, print: { configurable: false, enumerable: true, diff --git a/modules/script/byteArray.js b/modules/script/byteArray.js index e0b650ac..e127a9c6 100644 --- a/modules/script/byteArray.js +++ b/modules/script/byteArray.js @@ -41,7 +41,7 @@ function toString(array, encoding = 'utf-8') { if (!(array instanceof Uint8Array)) throw new Error('Argument to ByteArray.toString() must be a Uint8Array'); - return 
Encoding.toString(array, encoding); + return Encoding.decode(array, true, encoding); } /** @@ -50,7 +50,7 @@ function toString(array, encoding = 'utf-8') { * @returns {Uint8Array} */ function fromString(str, encoding = 'utf-8') { - const array = Encoding.fromString(str, encoding); + const array = Encoding.encode(str, encoding); defineToString(array); |