summaryrefslogtreecommitdiff
path: root/deps/icu-small/source/i18n/uspoof_impl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'deps/icu-small/source/i18n/uspoof_impl.cpp')
-rw-r--r--deps/icu-small/source/i18n/uspoof_impl.cpp624
1 files changed, 348 insertions, 276 deletions
diff --git a/deps/icu-small/source/i18n/uspoof_impl.cpp b/deps/icu-small/source/i18n/uspoof_impl.cpp
index 111a1489ff..fba742c3ff 100644
--- a/deps/icu-small/source/i18n/uspoof_impl.cpp
+++ b/deps/icu-small/source/i18n/uspoof_impl.cpp
@@ -1,3 +1,5 @@
+// Copyright (C) 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2008-2016, International Business Machines
@@ -13,11 +15,11 @@
#include "utrie2.h"
#include "cmemory.h"
#include "cstring.h"
-#include "identifier_info.h"
#include "scriptset.h"
#include "umutex.h"
#include "udataswp.h"
#include "uassert.h"
+#include "ucln_in.h"
#include "uspoof_impl.h"
#if !UCONFIG_NO_NORMALIZATION
@@ -27,41 +29,53 @@ U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
-SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
- fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(data), fAllowedCharsSet(NULL) ,
- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
- if (U_FAILURE(status)) {
- return;
- }
+SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
+ construct(status);
+ fSpoofData = data;
+}
+
+SpoofImpl::SpoofImpl(UErrorCode& status) {
+ construct(status);
+
+ // TODO: Call this method where it is actually needed, instead of in the
+ // constructor, to allow for lazy data loading. See #12696.
+ fSpoofData = SpoofData::getDefault(status);
+}
+
+SpoofImpl::SpoofImpl() {
+ UErrorCode status = U_ZERO_ERROR;
+ construct(status);
+
+ // TODO: Call this method where it is actually needed, instead of in the
+ // constructor, to allow for lazy data loading. See #12696.
+ fSpoofData = SpoofData::getDefault(status);
+}
+
+void SpoofImpl::construct(UErrorCode& status) {
+ fMagic = USPOOF_MAGIC;
+ fChecks = USPOOF_ALL_CHECKS;
+ fSpoofData = NULL;
+ fAllowedCharsSet = NULL;
+ fAllowedLocales = NULL;
fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
+ if (U_FAILURE(status)) { return; }
+
UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
- allowedCharsSet->freeze();
fAllowedCharsSet = allowedCharsSet;
fAllowedLocales = uprv_strdup("");
if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
- fMagic = USPOOF_MAGIC;
-}
-
-
-SpoofImpl::SpoofImpl() :
- fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
- UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
allowedCharsSet->freeze();
- fAllowedCharsSet = allowedCharsSet;
- fAllowedLocales = uprv_strdup("");
- fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
}
// Copy Constructor, used by the user level clone() function.
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
- fAllowedLocales(NULL), fCachedIdentifierInfo(NULL) {
+ fAllowedLocales(NULL) {
if (U_FAILURE(status)) {
return;
}
@@ -71,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
fSpoofData = src.fSpoofData->addReference();
}
fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
- if (fAllowedCharsSet == NULL) {
+ fAllowedLocales = uprv_strdup(src.fAllowedLocales);
+ if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
- fAllowedLocales = uprv_strdup(src.fAllowedLocales);
fRestrictionLevel = src.fRestrictionLevel;
}
@@ -86,7 +100,11 @@ SpoofImpl::~SpoofImpl() {
}
delete fAllowedCharsSet;
uprv_free((void *)fAllowedLocales);
- delete fCachedIdentifierInfo;
+}
+
+// Cast this instance as a USpoofChecker for the C API.
+USpoofChecker *SpoofImpl::asUSpoofChecker() {
+ return reinterpret_cast<USpoofChecker*>(this);
}
//
@@ -102,12 +120,11 @@ const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &st
return NULL;
}
SpoofImpl *This = (SpoofImpl *)sc;
- if (This->fMagic != USPOOF_MAGIC ||
- This->fSpoofData == NULL) {
+ if (This->fMagic != USPOOF_MAGIC) {
status = U_INVALID_FORMAT_ERROR;
return NULL;
}
- if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
+ if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
return NULL;
}
return This;
@@ -119,148 +136,6 @@ SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
}
-
-//--------------------------------------------------------------------------------------
-//
-// confusableLookup() This is the heart of the confusable skeleton generation
-// implementation.
-//
-// Given a source character, produce the corresponding
-// replacement character(s), appending them to the dest string.
-//
-//---------------------------------------------------------------------------------------
-int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UnicodeString &dest) const {
-
- // Binary search the spoof data key table for the inChar
- int32_t *low = fSpoofData->fCFUKeys;
- int32_t *mid = NULL;
- int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
- UChar32 midc;
- do {
- int32_t delta = ((int32_t)(limit-low))/2;
- mid = low + delta;
- midc = *mid & 0x1fffff;
- if (inChar == midc) {
- goto foundChar;
- } else if (inChar < midc) {
- limit = mid;
- } else {
- low = mid;
- }
- } while (low < limit-1);
- mid = low;
- midc = *mid & 0x1fffff;
- if (inChar != midc) {
- // Char not found. It maps to itself.
- int i = 0;
- dest.append(inChar);
- return i;
- }
- foundChar:
- int32_t keyFlags = *mid & 0xff000000;
- if ((keyFlags & tableMask) == 0) {
- // We found the right key char, but the entry doesn't pertain to the
- // table we need. See if there is an adjacent key that does
- if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
- int32_t *altMid;
- for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
- keyFlags = *altMid & 0xff000000;
- if (keyFlags & tableMask) {
- mid = altMid;
- goto foundKey;
- }
- }
- for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
- keyFlags = *altMid & 0xff000000;
- if (keyFlags & tableMask) {
- mid = altMid;
- goto foundKey;
- }
- }
- }
- // No key entry for this char & table.
- // The input char maps to itself.
- int i = 0;
- dest.append(inChar);
- return i;
- }
-
- foundKey:
- int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
- int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
-
- // Value is either a UChar (for strings of length 1) or
- // an index into the string table (for longer strings)
- uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
- if (stringLen == 1) {
- dest.append((UChar)value);
- return 1;
- }
-
- // String length of 4 from the above lookup is used for all strings of length >= 4.
- // For these, get the real length from the string lengths table,
- // which maps string table indexes to lengths.
- // All strings of the same length are stored contiguously in the string table.
- // 'value' from the lookup above is the starting index for the desired string.
-
- int32_t ix;
- if (stringLen == 4) {
- int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
- for (ix = 0; ix < stringLengthsLimit; ix++) {
- if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
- stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
- break;
- }
- }
- U_ASSERT(ix < stringLengthsLimit);
- }
-
- U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
- UChar *src = &fSpoofData->fCFUStrings[value];
- dest.append(src, stringLen);
- return stringLen;
-}
-
-
-//---------------------------------------------------------------------------------------
-//
-// wholeScriptCheck()
-//
-// Input text is already normalized to NFD
-// Return the set of scripts, each of which can represent something that is
-// confusable with the input text. The script of the input text
-// is included; input consisting of characters from a single script will
-// always produce a result consisting of a set containing that script.
-//
-//---------------------------------------------------------------------------------------
-void SpoofImpl::wholeScriptCheck(
- const UnicodeString &text, ScriptSet *result, UErrorCode &status) const {
-
- UTrie2 *table =
- (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
- result->setAll();
- int32_t length = text.length();
- for (int32_t inputIdx=0; inputIdx < length;) {
- UChar32 c = text.char32At(inputIdx);
- inputIdx += U16_LENGTH(c);
- uint32_t index = utrie2_get32(table, c);
- if (index == 0) {
- // No confusables in another script for this char.
- // TODO: we should change the data to have sets with just the single script
- // bit for the script of this char. Gets rid of this special case.
- // Until then, grab the script from the char and intersect it with the set.
- UScriptCode cpScript = uscript_getScript(c, &status);
- U_ASSERT(cpScript > USCRIPT_INHERITED);
- result->intersect(cpScript, status);
- } else if (index == 1) {
- // Script == Common or Inherited. Nothing to do.
- } else {
- result->intersect(fSpoofData->fScriptSets[index]);
- }
- }
-}
-
-
void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
UnicodeSet allowedChars;
UnicodeSet *tmpSet = NULL;
@@ -372,6 +247,137 @@ void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UEr
}
}
+// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
+void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
+ result.resetAll();
+ result.setScriptExtensions(codePoint, status);
+ if (U_FAILURE(status)) { return; }
+
+ // Section 5.1 step 1
+ if (result.test(USCRIPT_HAN, status)) {
+ result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
+ result.set(USCRIPT_JAPANESE, status);
+ result.set(USCRIPT_KOREAN, status);
+ }
+ if (result.test(USCRIPT_HIRAGANA, status)) {
+ result.set(USCRIPT_JAPANESE, status);
+ }
+ if (result.test(USCRIPT_KATAKANA, status)) {
+ result.set(USCRIPT_JAPANESE, status);
+ }
+ if (result.test(USCRIPT_HANGUL, status)) {
+ result.set(USCRIPT_KOREAN, status);
+ }
+ if (result.test(USCRIPT_BOPOMOFO, status)) {
+ result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
+ }
+
+ // Section 5.1 step 2
+ if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
+ result.setAll();
+ }
+}
+
+// Computes the resolved script set for a string, according to UTS 39 section 5.1.
+void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
+ getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
+}
+
+// Computes the resolved script set for a string, omitting characters having the specified script.
+// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
+void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
+ result.setAll();
+
+ ScriptSet temp;
+ UChar32 codePoint;
+ for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
+ codePoint = input.char32At(i);
+
+ // Compute the augmented script set for the character
+ getAugmentedScriptSet(codePoint, temp, status);
+ if (U_FAILURE(status)) { return; }
+
+ // Intersect the augmented script set with the resolved script set, but only if the character doesn't
+ // have the script specified in the function call
+ if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
+ result.intersect(temp);
+ }
+ }
+}
+
+// Computes the set of numerics for a string, according to UTS 39 section 5.3.
+void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
+ result.clear();
+
+ UChar32 codePoint;
+ for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
+ codePoint = input.char32At(i);
+
+ // Store a representative character for each kind of decimal digit
+ if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
+ // Store the zero character as a representative for comparison.
+ // Unicode guarantees it is codePoint - value
+ result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
+ }
+ }
+}
+
+// Computes the restriction level of a string, according to UTS 39 section 5.2.
+URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
+ // Section 5.2 step 1:
+ if (!fAllowedCharsSet->containsAll(input)) {
+ return USPOOF_UNRESTRICTIVE;
+ }
+
+ // Section 5.2 step 2
+ // Java use a static UnicodeSet for this test. In C++, avoid the static variable
+ // and just do a simple for loop.
+ UBool allASCII = TRUE;
+ for (int32_t i=0, length=input.length(); i<length; i++) {
+ if (input.charAt(i) > 0x7f) {
+ allASCII = FALSE;
+ break;
+ }
+ }
+ if (allASCII) {
+ return USPOOF_ASCII;
+ }
+
+ // Section 5.2 steps 3:
+ ScriptSet resolvedScriptSet;
+ getResolvedScriptSet(input, resolvedScriptSet, status);
+ if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
+
+ // Section 5.2 step 4:
+ if (!resolvedScriptSet.isEmpty()) {
+ return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
+ }
+
+ // Section 5.2 step 5:
+ ScriptSet resolvedNoLatn;
+ getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
+ if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
+
+ // Section 5.2 step 6:
+ if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
+ || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
+ || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
+ return USPOOF_HIGHLY_RESTRICTIVE;
+ }
+
+ // Section 5.2 step 7:
+ if (!resolvedNoLatn.isEmpty()
+ && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
+ && !resolvedNoLatn.test(USCRIPT_GREEK, status)
+ && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
+ return USPOOF_MODERATELY_RESTRICTIVE;
+ }
+
+ // Section 5.2 step 8:
+ return USPOOF_MINIMALLY_RESTRICTIVE;
+}
+
+
// Convert a text format hex number. Utility function used by builder code. Static.
// Input: UChar *string text. Output: a UChar32
@@ -404,55 +410,60 @@ UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorC
return (UChar32)val;
}
-// IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
-// Maintain a one-element cache, which is sufficient to avoid repeatedly
-// creating new ones unless we get multi-thread concurrency in spoof
-// check operations, which should be statistically uncommon.
-// These functions are used in place of new & delete of an IdentifierInfo.
-// They will recycle the IdentifierInfo when possible.
-// They are logically const, and used within const functions that must be thread safe.
-IdentifierInfo *SpoofImpl::getIdentifierInfo(UErrorCode &status) const {
- IdentifierInfo *returnIdInfo = NULL;
- if (U_FAILURE(status)) {
- return returnIdInfo;
- }
- SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
- {
- Mutex m;
- returnIdInfo = nonConstThis->fCachedIdentifierInfo;
- nonConstThis->fCachedIdentifierInfo = NULL;
- }
- if (returnIdInfo == NULL) {
- returnIdInfo = new IdentifierInfo(status);
- if (U_SUCCESS(status) && returnIdInfo == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
- if (U_FAILURE(status) && returnIdInfo != NULL) {
- delete returnIdInfo;
- returnIdInfo = NULL;
- }
- }
- return returnIdInfo;
+//-----------------------------------------
+//
+// class CheckResult Implementation
+//
+//-----------------------------------------
+
+CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
+ clear();
}
+USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
+ return reinterpret_cast<USpoofCheckResult*>(this);
+}
-void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
- if (idInfo != NULL) {
- SpoofImpl *nonConstThis = const_cast<SpoofImpl *>(this);
- {
- Mutex m;
- if (nonConstThis->fCachedIdentifierInfo == NULL) {
- nonConstThis->fCachedIdentifierInfo = idInfo;
- idInfo = NULL;
- }
- }
- delete idInfo;
+//
+// Incoming parameter check on Status and the CheckResult object
+// received from the C API.
+//
+const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
+ if (U_FAILURE(status)) { return NULL; }
+ if (ptr == NULL) {
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ return NULL;
}
+ CheckResult *This = (CheckResult*) ptr;
+ if (This->fMagic != USPOOF_CHECK_MAGIC) {
+ status = U_INVALID_FORMAT_ERROR;
+ return NULL;
+ }
+ return This;
+}
+
+CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
+ return const_cast<CheckResult *>
+ (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
}
+void CheckResult::clear() {
+ fChecks = 0;
+ fNumerics.clear();
+ fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
+}
+int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
+ if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
+ return fChecks | fRestrictionLevel;
+ } else {
+ return fChecks;
+ }
+}
+CheckResult::~CheckResult() {
+}
//----------------------------------------------------------------------------------------------
//
@@ -461,12 +472,14 @@ void SpoofImpl::releaseIdentifierInfo(IdentifierInfo *idInfo) const {
//----------------------------------------------------------------------------------------------
-UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
+UBool SpoofData::validateDataVersion(UErrorCode &status) const {
if (U_FAILURE(status) ||
- rawData == NULL ||
- rawData->fMagic != USPOOF_MAGIC ||
- rawData->fFormatVersion[0] > 1 ||
- rawData->fFormatVersion[1] > 0) {
+ fRawData == NULL ||
+ fRawData->fMagic != USPOOF_MAGIC ||
+ fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
+ fRawData->fFormatVersion[1] != 0 ||
+ fRawData->fFormatVersion[2] != 0 ||
+ fRawData->fFormatVersion[3] != 0) {
status = U_INVALID_FORMAT_ERROR;
return FALSE;
}
@@ -485,7 +498,7 @@ spoofDataIsAcceptable(void *context,
pInfo->dataFormat[1] == 0x66 &&
pInfo->dataFormat[2] == 0x75 &&
pInfo->dataFormat[3] == 0x20 &&
- pInfo->formatVersion[0] == 1
+ pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
) {
UVersionInfo *version = static_cast<UVersionInfo *>(context);
if(version != NULL) {
@@ -497,32 +510,61 @@ spoofDataIsAcceptable(void *context,
}
}
+// Methods for the loading of the default confusables data file. The confusable
+// data is loaded only when it is needed.
+//
+// SpoofData::getDefault() - Return the default confusables data, and call the
+// initOnce() if it is not available. Adds a reference
+// to the SpoofData that the caller is responsible for
+// decrementing when they are done with the data.
//
-// SpoofData::getDefault() - return a wrapper around the spoof data that is
-// baked into the default ICU data.
+// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
+// is shared by all spoof checkers using the default data.
//
-// Called once, from the initOnce() function in uspoof_impl.cpp; the resulting
-// SpoofData is shared by all spoof checkers using the default data.
+// uspoof_cleanupDefaultData - Called during cleanup.
//
-SpoofData *SpoofData::getDefault(UErrorCode &status) {
+
+static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
+static SpoofData* gDefaultSpoofData;
+
+static UBool U_CALLCONV
+uspoof_cleanupDefaultData(void) {
+ if (gDefaultSpoofData) {
+ // Will delete, assuming all user-level spoof checkers were closed.
+ gDefaultSpoofData->removeReference();
+ gDefaultSpoofData = NULL;
+ gSpoofInitDefaultOnce.reset();
+ }
+ return TRUE;
+}
+
+static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
spoofDataIsAcceptable,
NULL, // context, would receive dataVersion if supplied.
&status);
+ if (U_FAILURE(status)) { return; }
+ gDefaultSpoofData = new SpoofData(udm, status);
if (U_FAILURE(status)) {
- return NULL;
- }
- SpoofData *This = new SpoofData(udm, status);
- if (U_FAILURE(status)) {
- delete This;
- return NULL;
+ delete gDefaultSpoofData;
+ return;
}
- if (This == NULL) {
+ if (gDefaultSpoofData == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
+ return;
}
- return This;
+ ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
+}
+
+SpoofData* SpoofData::getDefault(UErrorCode& status) {
+ umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
+ if (U_FAILURE(status)) { return NULL; }
+ gDefaultSpoofData->addReference();
+ return gDefaultSpoofData;
}
+
+
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
reset();
@@ -533,7 +575,7 @@ SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
// fRawData is non-const because it may be constructed by the data builder.
fRawData = reinterpret_cast<SpoofDataHeader *>(
const_cast<void *>(udata_getMemory(udm)));
- validateDataVersion(fRawData, status);
+ validateDataVersion(status);
initPtrs(status);
}
@@ -554,7 +596,7 @@ SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
status = U_INVALID_FORMAT_ERROR;
return;
}
- validateDataVersion(fRawData, status);
+ validateDataVersion(status);
initPtrs(status);
}
@@ -582,7 +624,7 @@ SpoofData::SpoofData(UErrorCode &status) {
uprv_memset(fRawData, 0, initialSize);
fRawData->fMagic = USPOOF_MAGIC;
- fRawData->fFormatVersion[0] = 1;
+ fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
fRawData->fFormatVersion[1] = 0;
fRawData->fFormatVersion[2] = 0;
fRawData->fFormatVersion[3] = 0;
@@ -600,11 +642,7 @@ void SpoofData::reset() {
fRefCount = 1;
fCFUKeys = NULL;
fCFUValues = NULL;
- fCFUStringLengths = NULL;
fCFUStrings = NULL;
- fAnyCaseTrie = NULL;
- fLowerCaseTrie = NULL;
- fScriptSets = NULL;
}
@@ -626,7 +664,6 @@ void SpoofData::reset() {
void SpoofData::initPtrs(UErrorCode &status) {
fCFUKeys = NULL;
fCFUValues = NULL;
- fCFUStringLengths = NULL;
fCFUStrings = NULL;
if (U_FAILURE(status)) {
return;
@@ -637,33 +674,13 @@ void SpoofData::initPtrs(UErrorCode &status) {
if (fRawData->fCFUStringIndex != 0) {
fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
}
- if (fRawData->fCFUStringLengths != 0) {
- fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
- }
if (fRawData->fCFUStringTable != 0) {
fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
}
-
- if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
- fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
- (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
- }
- if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
- fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
- (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
- }
-
- if (fRawData->fScriptSets != 0) {
- fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
- }
}
SpoofData::~SpoofData() {
- utrie2_close(fAnyCaseTrie);
- fAnyCaseTrie = NULL;
- utrie2_close(fLowerCaseTrie);
- fLowerCaseTrie = NULL;
if (fDataOwned) {
uprv_free(fRawData);
}
@@ -708,6 +725,78 @@ void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
return (char *)fRawData + returnOffset;
}
+int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
+ int32_t dataSize = fRawData->fLength;
+ if (capacity < dataSize) {
+ status = U_BUFFER_OVERFLOW_ERROR;
+ return dataSize;
+ }
+ uprv_memcpy(buf, fRawData, dataSize);
+ return dataSize;
+}
+
+int32_t SpoofData::size() const {
+ return fRawData->fLength;
+}
+
+//-------------------------------
+//
+// Front-end APIs for SpoofData
+//
+//-------------------------------
+
+int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
+ // Perform a binary search.
+ // [lo, hi), i.e lo is inclusive, hi is exclusive.
+ // The result after the loop will be in lo.
+ int32_t lo = 0;
+ int32_t hi = length();
+ do {
+ int32_t mid = (lo + hi) / 2;
+ if (codePointAt(mid) > inChar) {
+ hi = mid;
+ } else if (codePointAt(mid) < inChar) {
+ lo = mid;
+ } else {
+ // Found result. Break early.
+ lo = mid;
+ break;
+ }
+ } while (hi - lo > 1);
+
+ // Did we find an entry? If not, the char maps to itself.
+ if (codePointAt(lo) != inChar) {
+ dest.append(inChar);
+ return 1;
+ }
+
+ // Add the element to the string builder and return.
+ return appendValueTo(lo, dest);
+}
+
+int32_t SpoofData::length() const {
+ return fRawData->fCFUKeysSize;
+}
+
+UChar32 SpoofData::codePointAt(int32_t index) const {
+ return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
+}
+
+int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
+ int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
+
+ // Value is either a char (for strings of length 1) or
+ // an index into the string table (for longer strings)
+ uint16_t value = fCFUValues[index];
+ if (stringLength == 1) {
+ dest.append((UChar)value);
+ } else {
+ dest.append(fCFUStrings + value, stringLength);
+ }
+
+ return stringLength;
+}
+
U_NAMESPACE_END
@@ -739,7 +828,10 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
pInfo->dataFormat[1]==0x66 &&
pInfo->dataFormat[2]==0x75 &&
pInfo->dataFormat[3]==0x20 &&
- pInfo->formatVersion[0]==1 )) {
+ pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
+ pInfo->formatVersion[1]==0 &&
+ pInfo->formatVersion[2]==0 &&
+ pInfo->formatVersion[3]==0 )) {
udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
"(format version %02x %02x %02x %02x) is not recognized\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
@@ -828,26 +920,6 @@ uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *ou
sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
- // String Lengths Section
- sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
- sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
- ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
- // Any Case Trie
- sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
- sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
- utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
- // Lower Case Trie
- sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
- sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
- utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
- // Script Sets. The data is an array of int32_t
- sectionStart = ds->readUInt32(spoofDH->fScriptSets);
- sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
- ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
-
// And, last, swap the header itself.
// int32_t fMagic // swap this
// uint8_t fFormatVersion[4] // Do not swap this, just copy