diff options
author | James M Snell <jasnell@gmail.com> | 2016-06-21 14:03:05 -0700 |
---|---|---|
committer | Jeremiah Senkpiel <fishrock123@rocketmail.com> | 2016-07-05 22:36:51 +0200 |
commit | d0e24923a69326b23ac14cc631724fc537521f9e (patch) | |
tree | d1d251811a1c2aaec20446acff679a65eb2fe2d3 | |
parent | 12b199369d08cd4c09120411a173dbfba48521f8 (diff) | |
download | node-new-d0e24923a69326b23ac14cc631724fc537521f9e.tar.gz |
net: use icu's punycode implementation
ICU has a punycode implementation built in. Use it instead of the
JavaScript implementation because it's much faster.
PR-URL: https://github.com/nodejs/node/pull/7355
Reviewed-By: Trevor Norris <trev.norris@gmail.com>
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
-rw-r--r-- | benchmark/net/punycode.js | 75 | ||||
-rw-r--r-- | lib/url.js | 12 | ||||
-rw-r--r-- | src/node_i18n.cc | 132 | ||||
-rw-r--r-- | test/parallel/test-icu-punycode.js | 72 | ||||
-rw-r--r-- | tools/icu/icu-generic.gyp | 9 | ||||
-rw-r--r-- | tools/icu/icu_small.json | 3 |
6 files changed, 291 insertions, 12 deletions
diff --git a/benchmark/net/punycode.js b/benchmark/net/punycode.js new file mode 100644 index 0000000000..f4d22557ac --- /dev/null +++ b/benchmark/net/punycode.js @@ -0,0 +1,75 @@ +'use strict'; + +const common = require('../common.js'); +const icu = process.binding('icu'); +const punycode = require('punycode'); + +const bench = common.createBenchmark(main, { + method: ['punycode', 'icu'], + n: [1024], + val: [ + 'افغانستا.icom.museum', + 'الجزائر.icom.museum', + 'österreich.icom.museum', + 'বাংলাদেশ.icom.museum', + 'беларусь.icom.museum', + 'belgië.icom.museum', + 'българия.icom.museum', + 'تشادر.icom.museum', + '中国.icom.museum', + 'القمر.icom.museum', + 'κυπρος.icom.museum', + 'českárepublika.icom.museum', + 'مصر.icom.museum', + 'ελλάδα.icom.museum', + 'magyarország.icom.museum', + 'ísland.icom.museum', + 'भारत.icom.museum', + 'ايران.icom.museum', + 'éire.icom.museum', + 'איקו״ם.ישראל.museum', + '日本.icom.museum', + 'الأردن.icom.museum' + ] +}); + +function usingPunycode(val) { + punycode.toUnicode(punycode.toASCII(val)); +} + +function usingICU(val) { + icu.toUnicode(icu.toASCII(val)); +} + +function runPunycode(n, val) { + common.v8ForceOptimization(usingPunycode, val); + var i = 0; + bench.start(); + for (; i < n; i++) + usingPunycode(val); + bench.end(n); +} + +function runICU(n, val) { + common.v8ForceOptimization(usingICU, val); + var i = 0; + bench.start(); + for (; i < n; i++) + usingICU(val); + bench.end(n); +} + +function main(conf) { + const n = +conf.n; + const val = conf.val; + switch (conf.method) { + case 'punycode': + runPunycode(n, val); + break; + case 'icu': + runICU(n, val); + break; + default: + throw new Error('Unexpected method'); + } +} diff --git a/lib/url.js b/lib/url.js index c4d6ed2e33..4a2a879bf3 100644 --- a/lib/url.js +++ b/lib/url.js @@ -1,6 +1,14 @@ 'use strict'; -const punycode = require('punycode'); +function importPunycode() { + try { + return process.binding('icu'); + } catch (e) { + return require('punycode'); + } +} + +const 
{ toASCII } = importPunycode(); exports.parse = urlParse; exports.resolve = urlResolve; @@ -309,7 +317,7 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { // It only converts parts of the domain name that // have non-ASCII characters, i.e. it doesn't matter if // you call it with a domain that already is ASCII-only. - this.hostname = punycode.toASCII(this.hostname); + this.hostname = toASCII(this.hostname); } var p = this.port ? ':' + this.port : ''; diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 3e5b3a9129..0f3b9b76e6 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -23,8 +23,16 @@ #if defined(NODE_HAVE_I18N_SUPPORT) +#include "node.h" +#include "env.h" +#include "env-inl.h" +#include "util.h" +#include "util-inl.h" +#include "v8.h" + #include <unicode/putil.h> #include <unicode/udata.h> +#include <unicode/uidna.h> #ifdef NODE_HAVE_SMALL_ICU /* if this is defined, we have a 'secondary' entry point. @@ -43,6 +51,13 @@ extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[]; namespace node { +using v8::Context; +using v8::FunctionCallbackInfo; +using v8::Local; +using v8::Object; +using v8::String; +using v8::Value; + bool flag_icu_data_dir = false; namespace i18n { @@ -64,7 +79,124 @@ bool InitializeICUDirectory(const char* icu_data_path) { } } +static int32_t ToUnicode(MaybeStackBuffer<char>* buf, + const char* input, + size_t length) { + UErrorCode status = U_ZERO_ERROR; + uint32_t options = UIDNA_DEFAULT; + options |= UIDNA_NONTRANSITIONAL_TO_UNICODE; + UIDNA* uidna = uidna_openUTS46(options, &status); + if (U_FAILURE(status)) + return -1; + UIDNAInfo info = UIDNA_INFO_INITIALIZER; + + int32_t len = uidna_nameToUnicodeUTF8(uidna, + input, length, + **buf, buf->length(), + &info, + &status); + + if (status == U_BUFFER_OVERFLOW_ERROR) { + status = U_ZERO_ERROR; + buf->AllocateSufficientStorage(len); + len = uidna_nameToUnicodeUTF8(uidna, + input, length, + **buf, buf->length(), + &info, + &status); + } + + if 
(U_FAILURE(status)) + len = -1; + + uidna_close(uidna); + return len; +} + +static int32_t ToASCII(MaybeStackBuffer<char>* buf, + const char* input, + size_t length) { + UErrorCode status = U_ZERO_ERROR; + uint32_t options = UIDNA_DEFAULT; + options |= UIDNA_NONTRANSITIONAL_TO_ASCII; + UIDNA* uidna = uidna_openUTS46(options, &status); + if (U_FAILURE(status)) + return -1; + UIDNAInfo info = UIDNA_INFO_INITIALIZER; + + int32_t len = uidna_nameToASCII_UTF8(uidna, + input, length, + **buf, buf->length(), + &info, + &status); + + if (status == U_BUFFER_OVERFLOW_ERROR) { + status = U_ZERO_ERROR; + buf->AllocateSufficientStorage(len); + len = uidna_nameToASCII_UTF8(uidna, + input, length, + **buf, buf->length(), + &info, + &status); + } + + if (U_FAILURE(status)) + len = -1; + + uidna_close(uidna); + return len; +} + +static void ToUnicode(const FunctionCallbackInfo<Value>& args) { + Environment* env = Environment::GetCurrent(args); + CHECK_GE(args.Length(), 1); + CHECK(args[0]->IsString()); + Utf8Value val(env->isolate(), args[0]); + MaybeStackBuffer<char> buf; + int32_t len = ToUnicode(&buf, *val, val.length()); + + if (len < 0) { + return env->ThrowError("Cannot convert name to Unicode"); + } + + args.GetReturnValue().Set( + String::NewFromUtf8(env->isolate(), + *buf, + v8::NewStringType::kNormal, + len).ToLocalChecked()); +} + +static void ToASCII(const FunctionCallbackInfo<Value>& args) { + Environment* env = Environment::GetCurrent(args); + CHECK_GE(args.Length(), 1); + CHECK(args[0]->IsString()); + Utf8Value val(env->isolate(), args[0]); + MaybeStackBuffer<char> buf; + int32_t len = ToASCII(&buf, *val, val.length()); + + if (len < 0) { + return env->ThrowError("Cannot convert name to ASCII"); + } + + args.GetReturnValue().Set( + String::NewFromUtf8(env->isolate(), + *buf, + v8::NewStringType::kNormal, + len).ToLocalChecked()); +} + +void Init(Local<Object> target, + Local<Value> unused, + Local<Context> context, + void* priv) { + Environment* env = 
Environment::GetCurrent(context); + env->SetMethod(target, "toUnicode", ToUnicode); + env->SetMethod(target, "toASCII", ToASCII); +} + } // namespace i18n } // namespace node +NODE_MODULE_CONTEXT_AWARE_BUILTIN(icu, node::i18n::Init) + #endif // NODE_HAVE_I18N_SUPPORT diff --git a/test/parallel/test-icu-punycode.js b/test/parallel/test-icu-punycode.js new file mode 100644 index 0000000000..d9b36e7df7 --- /dev/null +++ b/test/parallel/test-icu-punycode.js @@ -0,0 +1,72 @@ +'use strict'; + +const common = require('../common'); +const icu = getPunycode(); +const assert = require('assert'); + +function getPunycode() { + try { + return process.binding('icu'); + } catch (err) { + return undefined; + } +} + +if (!icu) { + common.skip('icu punycode tests because ICU is not present.'); + return; +} + +// Credit for list: http://www.i18nguy.com/markup/idna-examples.html +const tests = [ + 'افغانستا.icom.museum', + 'الجزائر.icom.museum', + 'österreich.icom.museum', + 'বাংলাদেশ.icom.museum', + 'беларусь.icom.museum', + 'belgië.icom.museum', + 'българия.icom.museum', + 'تشادر.icom.museum', + '中国.icom.museum', + 'القمر.icom.museum', + 'κυπρος.icom.museum', + 'českárepublika.icom.museum', + 'مصر.icom.museum', + 'ελλάδα.icom.museum', + 'magyarország.icom.museum', + 'ísland.icom.museum', + 'भारत.icom.museum', + 'ايران.icom.museum', + 'éire.icom.museum', + 'איקו״ם.ישראל.museum', + '日本.icom.museum', + 'الأردن.icom.museum', + 'қазақстан.icom.museum', + '한국.icom.museum', + 'кыргызстан.icom.museum', + 'ລາວ.icom.museum', + 'لبنان.icom.museum', + 'македонија.icom.museum', + 'موريتانيا.icom.museum', + 'méxico.icom.museum', + 'монголулс.icom.museum', + 'المغرب.icom.museum', + 'नेपाल.icom.museum', + 'عمان.icom.museum', + 'قطر.icom.museum', + 'românia.icom.museum', + 'россия.иком.museum', + 'србијаицрнагора.иком.museum', + 'இலங்கை.icom.museum', + 'españa.icom.museum', + 'ไทย.icom.museum', + 'تونس.icom.museum', + 'türkiye.icom.museum', + 'украина.icom.museum', + 'việtnam.icom.museum' +]; + +// 
Testing the roundtrip +tests.forEach((i) => { + assert.strictEqual(i, icu.toUnicode(icu.toASCII(i))); +}); diff --git a/tools/icu/icu-generic.gyp b/tools/icu/icu-generic.gyp index a61b294141..9d466ac392 100644 --- a/tools/icu/icu-generic.gyp +++ b/tools/icu/icu-generic.gyp @@ -37,8 +37,7 @@ 'defines': [ # ICU cannot swap the initial data without this. # http://bugs.icu-project.org/trac/ticket/11046 - 'UCONFIG_NO_LEGACY_CONVERSION=1', - 'UCONFIG_NO_IDNA=1', + 'UCONFIG_NO_LEGACY_CONVERSION=1' ], }], ], @@ -428,9 +427,6 @@ #'<(icu_path)/source/common/ubidi_props_data.h', # and the callers '<(icu_path)/source/common/ushape.cpp', - '<(icu_path)/source/common/usprep.cpp', - '<(icu_path)/source/common/uts46.cpp', - '<(icu_path)/source/common/uidna.cpp', ]}], [ 'icu_ver_major == 57', { 'sources!': [ # work around http://bugs.icu-project.org/trac/ticket/12451 @@ -447,9 +443,6 @@ #'<(icu_path)/source/common/ubidi_props_data.h', # and the callers '<(icu_path)/source/common/ushape.cpp', - '<(icu_path)/source/common/usprep.cpp', - '<(icu_path)/source/common/uts46.cpp', - '<(icu_path)/source/common/uidna.cpp', ]}], [ 'OS == "solaris"', { 'defines': [ '_XOPEN_SOURCE_EXTENDED=0', diff --git a/tools/icu/icu_small.json b/tools/icu/icu_small.json index e434794e91..de26e2cbb1 100644 --- a/tools/icu/icu_small.json +++ b/tools/icu/icu_small.json @@ -24,7 +24,7 @@ "region": "none", "zone": "locales", "converters": "none", - "stringprep": "none", + "stringprep": "locales", "translit": "none", "brkfiles": "none", "brkdict": "none", @@ -34,7 +34,6 @@ "remove": [ "cnvalias.icu", "postalCodeData.res", - "uts46.nrm", "genderList.res", "brkitr/root.res", "unames.icu" |