uspoof_impl.cpp 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2008-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. */
  9. #include "unicode/utypes.h"
  10. #include "unicode/uspoof.h"
  11. #include "unicode/uchar.h"
  12. #include "unicode/uniset.h"
  13. #include "unicode/utf16.h"
  14. #include "utrie2.h"
  15. #include "cmemory.h"
  16. #include "cstring.h"
  17. #include "scriptset.h"
  18. #include "umutex.h"
  19. #include "udataswp.h"
  20. #include "uassert.h"
  21. #include "ucln_in.h"
  22. #include "uspoof_impl.h"
  23. #if !UCONFIG_NO_NORMALIZATION
  24. U_NAMESPACE_BEGIN
  25. UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
  26. SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
  27. construct(status);
  28. fSpoofData = data;
  29. }
  30. SpoofImpl::SpoofImpl(UErrorCode& status) {
  31. construct(status);
  32. // TODO: Call this method where it is actually needed, instead of in the
  33. // constructor, to allow for lazy data loading. See #12696.
  34. fSpoofData = SpoofData::getDefault(status);
  35. }
  36. SpoofImpl::SpoofImpl() {
  37. UErrorCode status = U_ZERO_ERROR;
  38. construct(status);
  39. // TODO: Call this method where it is actually needed, instead of in the
  40. // constructor, to allow for lazy data loading. See #12696.
  41. fSpoofData = SpoofData::getDefault(status);
  42. }
  43. void SpoofImpl::construct(UErrorCode& status) {
  44. fChecks = USPOOF_ALL_CHECKS;
  45. fSpoofData = nullptr;
  46. fAllowedCharsSet = nullptr;
  47. fAllowedLocales = nullptr;
  48. fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
  49. if (U_FAILURE(status)) { return; }
  50. UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
  51. fAllowedCharsSet = allowedCharsSet;
  52. fAllowedLocales = uprv_strdup("");
  53. if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {
  54. status = U_MEMORY_ALLOCATION_ERROR;
  55. return;
  56. }
  57. allowedCharsSet->freeze();
  58. }
  59. // Copy Constructor, used by the user level clone() function.
  60. SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
  61. fChecks(USPOOF_ALL_CHECKS), fSpoofData(nullptr), fAllowedCharsSet(nullptr) ,
  62. fAllowedLocales(nullptr) {
  63. if (U_FAILURE(status)) {
  64. return;
  65. }
  66. fChecks = src.fChecks;
  67. if (src.fSpoofData != nullptr) {
  68. fSpoofData = src.fSpoofData->addReference();
  69. }
  70. fAllowedCharsSet = src.fAllowedCharsSet->clone();
  71. fAllowedLocales = uprv_strdup(src.fAllowedLocales);
  72. if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {
  73. status = U_MEMORY_ALLOCATION_ERROR;
  74. }
  75. fRestrictionLevel = src.fRestrictionLevel;
  76. }
  77. SpoofImpl::~SpoofImpl() {
  78. if (fSpoofData != nullptr) {
  79. fSpoofData->removeReference(); // Will delete if refCount goes to zero.
  80. }
  81. delete fAllowedCharsSet;
  82. uprv_free((void *)fAllowedLocales);
  83. }
  84. // Cast this instance as a USpoofChecker for the C API.
  85. USpoofChecker *SpoofImpl::asUSpoofChecker() {
  86. return exportForC();
  87. }
  88. //
  89. // Incoming parameter check on Status and the SpoofChecker object
  90. // received from the C API.
  91. //
  92. const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
  93. auto* This = validate(sc, status);
  94. if (U_FAILURE(status)) {
  95. return nullptr;
  96. }
  97. if (This->fSpoofData != nullptr && !This->fSpoofData->validateDataVersion(status)) {
  98. return nullptr;
  99. }
  100. return This;
  101. }
  102. SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
  103. return const_cast<SpoofImpl *>
  104. (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
  105. }
  106. void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
  107. UnicodeSet allowedChars;
  108. UnicodeSet *tmpSet = nullptr;
  109. const char *locStart = localesList;
  110. const char *locEnd = nullptr;
  111. const char *localesListEnd = localesList + uprv_strlen(localesList);
  112. int32_t localeListCount = 0; // Number of locales provided by caller.
  113. // Loop runs once per locale from the localesList, a comma separated list of locales.
  114. do {
  115. locEnd = uprv_strchr(locStart, ',');
  116. if (locEnd == nullptr) {
  117. locEnd = localesListEnd;
  118. }
  119. while (*locStart == ' ') {
  120. locStart++;
  121. }
  122. const char *trimmedEnd = locEnd-1;
  123. while (trimmedEnd > locStart && *trimmedEnd == ' ') {
  124. trimmedEnd--;
  125. }
  126. if (trimmedEnd <= locStart) {
  127. break;
  128. }
  129. const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
  130. localeListCount++;
  131. // We have one locale from the locales list.
  132. // Add the script chars for this locale to the accumulating set of allowed chars.
  133. // If the locale is no good, we will be notified back via status.
  134. addScriptChars(locale, &allowedChars, status);
  135. uprv_free((void *)locale);
  136. if (U_FAILURE(status)) {
  137. break;
  138. }
  139. locStart = locEnd + 1;
  140. } while (locStart < localesListEnd);
  141. // If our caller provided an empty list of locales, we disable the allowed characters checking
  142. if (localeListCount == 0) {
  143. uprv_free((void *)fAllowedLocales);
  144. fAllowedLocales = uprv_strdup("");
  145. tmpSet = new UnicodeSet(0, 0x10ffff);
  146. if (fAllowedLocales == nullptr || tmpSet == nullptr) {
  147. status = U_MEMORY_ALLOCATION_ERROR;
  148. return;
  149. }
  150. tmpSet->freeze();
  151. delete fAllowedCharsSet;
  152. fAllowedCharsSet = tmpSet;
  153. fChecks &= ~USPOOF_CHAR_LIMIT;
  154. return;
  155. }
  156. // Add all common and inherited characters to the set of allowed chars.
  157. UnicodeSet tempSet;
  158. tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
  159. allowedChars.addAll(tempSet);
  160. tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
  161. allowedChars.addAll(tempSet);
  162. // If anything went wrong, we bail out without changing
  163. // the state of the spoof checker.
  164. if (U_FAILURE(status)) {
  165. return;
  166. }
  167. // Store the updated spoof checker state.
  168. tmpSet = allowedChars.clone();
  169. const char *tmpLocalesList = uprv_strdup(localesList);
  170. if (tmpSet == nullptr || tmpLocalesList == nullptr) {
  171. status = U_MEMORY_ALLOCATION_ERROR;
  172. return;
  173. }
  174. uprv_free((void *)fAllowedLocales);
  175. fAllowedLocales = tmpLocalesList;
  176. tmpSet->freeze();
  177. delete fAllowedCharsSet;
  178. fAllowedCharsSet = tmpSet;
  179. fChecks |= USPOOF_CHAR_LIMIT;
  180. }
  181. const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
  182. return fAllowedLocales;
  183. }
  184. // Given a locale (a language), add all the characters from all of the scripts used with that language
  185. // to the allowedChars UnicodeSet
  186. void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
  187. UScriptCode scripts[30];
  188. int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
  189. if (U_FAILURE(status)) {
  190. return;
  191. }
  192. if (status == U_USING_DEFAULT_WARNING) {
  193. status = U_ILLEGAL_ARGUMENT_ERROR;
  194. return;
  195. }
  196. UnicodeSet tmpSet;
  197. int32_t i;
  198. for (i=0; i<numScripts; i++) {
  199. tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
  200. allowedChars->addAll(tmpSet);
  201. }
  202. }
  203. // Computes the augmented script set for a code point, according to UTS 39 section 5.1.
  204. void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
  205. result.resetAll();
  206. result.setScriptExtensions(codePoint, status);
  207. if (U_FAILURE(status)) { return; }
  208. // Section 5.1 step 1
  209. if (result.test(USCRIPT_HAN, status)) {
  210. result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
  211. result.set(USCRIPT_JAPANESE, status);
  212. result.set(USCRIPT_KOREAN, status);
  213. }
  214. if (result.test(USCRIPT_HIRAGANA, status)) {
  215. result.set(USCRIPT_JAPANESE, status);
  216. }
  217. if (result.test(USCRIPT_KATAKANA, status)) {
  218. result.set(USCRIPT_JAPANESE, status);
  219. }
  220. if (result.test(USCRIPT_HANGUL, status)) {
  221. result.set(USCRIPT_KOREAN, status);
  222. }
  223. if (result.test(USCRIPT_BOPOMOFO, status)) {
  224. result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
  225. }
  226. // Section 5.1 step 2
  227. if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
  228. result.setAll();
  229. }
  230. }
  231. // Computes the resolved script set for a string, according to UTS 39 section 5.1.
  232. void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
  233. getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
  234. }
  235. // Computes the resolved script set for a string, omitting characters having the specified script.
  236. // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
  237. void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
  238. result.setAll();
  239. ScriptSet temp;
  240. UChar32 codePoint;
  241. for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
  242. codePoint = input.char32At(i);
  243. // Compute the augmented script set for the character
  244. getAugmentedScriptSet(codePoint, temp, status);
  245. if (U_FAILURE(status)) { return; }
  246. // Intersect the augmented script set with the resolved script set, but only if the character doesn't
  247. // have the script specified in the function call
  248. if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
  249. result.intersect(temp);
  250. }
  251. }
  252. }
  253. // Computes the set of numerics for a string, according to UTS 39 section 5.3.
  254. void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
  255. result.clear();
  256. UChar32 codePoint;
  257. for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
  258. codePoint = input.char32At(i);
  259. // Store a representative character for each kind of decimal digit
  260. if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
  261. // Store the zero character as a representative for comparison.
  262. // Unicode guarantees it is codePoint - value
  263. result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
  264. }
  265. }
  266. }
  267. // Computes the restriction level of a string, according to UTS 39 section 5.2.
  268. URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
  269. // Section 5.2 step 1:
  270. if (!fAllowedCharsSet->containsAll(input)) {
  271. return USPOOF_UNRESTRICTIVE;
  272. }
  273. // Section 5.2 step 2
  274. // Java use a static UnicodeSet for this test. In C++, avoid the static variable
  275. // and just do a simple for loop.
  276. UBool allASCII = true;
  277. for (int32_t i=0, length=input.length(); i<length; i++) {
  278. if (input.charAt(i) > 0x7f) {
  279. allASCII = false;
  280. break;
  281. }
  282. }
  283. if (allASCII) {
  284. return USPOOF_ASCII;
  285. }
  286. // Section 5.2 steps 3:
  287. ScriptSet resolvedScriptSet;
  288. getResolvedScriptSet(input, resolvedScriptSet, status);
  289. if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
  290. // Section 5.2 step 4:
  291. if (!resolvedScriptSet.isEmpty()) {
  292. return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
  293. }
  294. // Section 5.2 step 5:
  295. ScriptSet resolvedNoLatn;
  296. getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
  297. if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
  298. // Section 5.2 step 6:
  299. if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
  300. || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
  301. || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
  302. return USPOOF_HIGHLY_RESTRICTIVE;
  303. }
  304. // Section 5.2 step 7:
  305. if (!resolvedNoLatn.isEmpty()
  306. && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
  307. && !resolvedNoLatn.test(USCRIPT_GREEK, status)
  308. && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
  309. return USPOOF_MODERATELY_RESTRICTIVE;
  310. }
  311. // Section 5.2 step 8:
  312. return USPOOF_MINIMALLY_RESTRICTIVE;
  313. }
  314. int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
  315. bool sawLeadCharacter = false;
  316. for (int32_t i=0; i<input.length();) {
  317. UChar32 cp = input.char32At(i);
  318. if (sawLeadCharacter && cp == 0x0307) {
  319. return i;
  320. }
  321. uint8_t combiningClass = u_getCombiningClass(cp);
  322. // Skip over characters except for those with combining class 0 (non-combining characters) or with
  323. // combining class 230 (same class as U+0307)
  324. U_ASSERT(u_getCombiningClass(0x0307) == 230);
  325. if (combiningClass == 0 || combiningClass == 230) {
  326. sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
  327. }
  328. i += U16_LENGTH(cp);
  329. }
  330. return -1;
  331. }
  332. static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
  333. return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' ||
  334. u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
  335. }
  336. bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
  337. if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
  338. return true;
  339. }
  340. UnicodeString skelStr;
  341. fSpoofData->confusableLookup(cp, skelStr);
  342. UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));
  343. if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
  344. return true;
  345. }
  346. return false;
  347. }
  348. // Convert a text format hex number. Utility function used by builder code. Static.
  349. // Input: char16_t *string text. Output: a UChar32
  350. // Input has been pre-checked, and will have no non-hex chars.
  351. // The number must fall in the code point range of 0..0x10ffff
  352. // Static Function.
  353. UChar32 SpoofImpl::ScanHex(const char16_t *s, int32_t start, int32_t limit, UErrorCode &status) {
  354. if (U_FAILURE(status)) {
  355. return 0;
  356. }
  357. U_ASSERT(limit-start > 0);
  358. uint32_t val = 0;
  359. int i;
  360. for (i=start; i<limit; i++) {
  361. int digitVal = s[i] - 0x30;
  362. if (digitVal>9) {
  363. digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
  364. }
  365. if (digitVal>15) {
  366. digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
  367. }
  368. U_ASSERT(digitVal <= 0xf);
  369. val <<= 4;
  370. val += digitVal;
  371. }
  372. if (val > 0x10ffff) {
  373. status = U_PARSE_ERROR;
  374. val = 0;
  375. }
  376. return (UChar32)val;
  377. }
  378. //-----------------------------------------
  379. //
  380. // class CheckResult Implementation
  381. //
  382. //-----------------------------------------
  383. CheckResult::CheckResult() {
  384. clear();
  385. }
  386. USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
  387. return exportForC();
  388. }
  389. //
  390. // Incoming parameter check on Status and the CheckResult object
  391. // received from the C API.
  392. //
  393. const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
  394. return validate(ptr, status);
  395. }
  396. CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
  397. return validate(ptr, status);
  398. }
  399. void CheckResult::clear() {
  400. fChecks = 0;
  401. fNumerics.clear();
  402. fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
  403. }
  404. int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
  405. if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
  406. return fChecks | fRestrictionLevel;
  407. } else {
  408. return fChecks;
  409. }
  410. }
  411. CheckResult::~CheckResult() {
  412. }
  413. //----------------------------------------------------------------------------------------------
  414. //
  415. // class SpoofData Implementation
  416. //
  417. //----------------------------------------------------------------------------------------------
  418. UBool SpoofData::validateDataVersion(UErrorCode &status) const {
  419. if (U_FAILURE(status) ||
  420. fRawData == nullptr ||
  421. fRawData->fMagic != USPOOF_MAGIC ||
  422. fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
  423. fRawData->fFormatVersion[1] != 0 ||
  424. fRawData->fFormatVersion[2] != 0 ||
  425. fRawData->fFormatVersion[3] != 0) {
  426. status = U_INVALID_FORMAT_ERROR;
  427. return false;
  428. }
  429. return true;
  430. }
  431. static UBool U_CALLCONV
  432. spoofDataIsAcceptable(void *context,
  433. const char * /* type */, const char * /*name*/,
  434. const UDataInfo *pInfo) {
  435. if(
  436. pInfo->size >= 20 &&
  437. pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
  438. pInfo->charsetFamily == U_CHARSET_FAMILY &&
  439. pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu "
  440. pInfo->dataFormat[1] == 0x66 &&
  441. pInfo->dataFormat[2] == 0x75 &&
  442. pInfo->dataFormat[3] == 0x20 &&
  443. pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
  444. ) {
  445. UVersionInfo *version = static_cast<UVersionInfo *>(context);
  446. if(version != nullptr) {
  447. uprv_memcpy(version, pInfo->dataVersion, 4);
  448. }
  449. return true;
  450. } else {
  451. return false;
  452. }
  453. }
  454. // Methods for the loading of the default confusables data file. The confusable
  455. // data is loaded only when it is needed.
  456. //
  457. // SpoofData::getDefault() - Return the default confusables data, and call the
  458. // initOnce() if it is not available. Adds a reference
  459. // to the SpoofData that the caller is responsible for
  460. // decrementing when they are done with the data.
  461. //
  462. // uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
  463. // is shared by all spoof checkers using the default data.
  464. //
  465. // uspoof_cleanupDefaultData - Called during cleanup.
  466. //
  467. static UInitOnce gSpoofInitDefaultOnce {};
  468. static SpoofData* gDefaultSpoofData;
  469. static UBool U_CALLCONV
  470. uspoof_cleanupDefaultData() {
  471. if (gDefaultSpoofData) {
  472. // Will delete, assuming all user-level spoof checkers were closed.
  473. gDefaultSpoofData->removeReference();
  474. gDefaultSpoofData = nullptr;
  475. gSpoofInitDefaultOnce.reset();
  476. }
  477. return true;
  478. }
  479. static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
  480. UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
  481. spoofDataIsAcceptable,
  482. nullptr, // context, would receive dataVersion if supplied.
  483. &status);
  484. if (U_FAILURE(status)) { return; }
  485. gDefaultSpoofData = new SpoofData(udm, status);
  486. if (U_FAILURE(status)) {
  487. delete gDefaultSpoofData;
  488. gDefaultSpoofData = nullptr;
  489. return;
  490. }
  491. if (gDefaultSpoofData == nullptr) {
  492. status = U_MEMORY_ALLOCATION_ERROR;
  493. return;
  494. }
  495. ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
  496. }
  497. SpoofData* SpoofData::getDefault(UErrorCode& status) {
  498. umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
  499. if (U_FAILURE(status)) { return nullptr; }
  500. gDefaultSpoofData->addReference();
  501. return gDefaultSpoofData;
  502. }
  503. SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
  504. {
  505. reset();
  506. if (U_FAILURE(status)) {
  507. return;
  508. }
  509. fUDM = udm;
  510. // fRawData is non-const because it may be constructed by the data builder.
  511. fRawData = reinterpret_cast<SpoofDataHeader *>(
  512. const_cast<void *>(udata_getMemory(udm)));
  513. validateDataVersion(status);
  514. initPtrs(status);
  515. }
  516. SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
  517. {
  518. reset();
  519. if (U_FAILURE(status)) {
  520. return;
  521. }
  522. if ((size_t)length < sizeof(SpoofDataHeader)) {
  523. status = U_INVALID_FORMAT_ERROR;
  524. return;
  525. }
  526. if (data == nullptr) {
  527. status = U_ILLEGAL_ARGUMENT_ERROR;
  528. return;
  529. }
  530. void *ncData = const_cast<void *>(data);
  531. fRawData = static_cast<SpoofDataHeader *>(ncData);
  532. if (length < fRawData->fLength) {
  533. status = U_INVALID_FORMAT_ERROR;
  534. return;
  535. }
  536. validateDataVersion(status);
  537. initPtrs(status);
  538. }
  539. // Spoof Data constructor for use from data builder.
  540. // Initializes a new, empty data area that will be populated later.
  541. SpoofData::SpoofData(UErrorCode &status) {
  542. reset();
  543. if (U_FAILURE(status)) {
  544. return;
  545. }
  546. fDataOwned = true;
  547. // The spoof header should already be sized to be a multiple of 16 bytes.
  548. // Just in case it's not, round it up.
  549. uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
  550. U_ASSERT(initialSize == sizeof(SpoofDataHeader));
  551. fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
  552. fMemLimit = initialSize;
  553. if (fRawData == nullptr) {
  554. status = U_MEMORY_ALLOCATION_ERROR;
  555. return;
  556. }
  557. uprv_memset(fRawData, 0, initialSize);
  558. fRawData->fMagic = USPOOF_MAGIC;
  559. fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
  560. fRawData->fFormatVersion[1] = 0;
  561. fRawData->fFormatVersion[2] = 0;
  562. fRawData->fFormatVersion[3] = 0;
  563. initPtrs(status);
  564. }
  565. // reset() - initialize all fields.
  566. // Should be updated if any new fields are added.
  567. // Called by constructors to put things in a known initial state.
  568. void SpoofData::reset() {
  569. fRawData = nullptr;
  570. fDataOwned = false;
  571. fUDM = nullptr;
  572. fMemLimit = 0;
  573. fRefCount = 1;
  574. fCFUKeys = nullptr;
  575. fCFUValues = nullptr;
  576. fCFUStrings = nullptr;
  577. }
  578. // SpoofData::initPtrs()
  579. // Initialize the pointers to the various sections of the raw data.
  580. //
  581. // This function is used both during the Trie building process (multiple
  582. // times, as the individual data sections are added), and
  583. // during the opening of a Spoof Checker from prebuilt data.
  584. //
  585. // The pointers for non-existent data sections (identified by an offset of 0)
  586. // are set to nullptr.
  587. //
  588. // Note: During building the data, adding each new data section
  589. // reallocs the raw data area, which likely relocates it, which
  590. // in turn requires reinitializing all of the pointers into it, hence
  591. // multiple calls to this function during building.
  592. //
  593. void SpoofData::initPtrs(UErrorCode &status) {
  594. fCFUKeys = nullptr;
  595. fCFUValues = nullptr;
  596. fCFUStrings = nullptr;
  597. if (U_FAILURE(status)) {
  598. return;
  599. }
  600. if (fRawData->fCFUKeys != 0) {
  601. fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
  602. }
  603. if (fRawData->fCFUStringIndex != 0) {
  604. fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
  605. }
  606. if (fRawData->fCFUStringTable != 0) {
  607. fCFUStrings = (char16_t *)((char *)fRawData + fRawData->fCFUStringTable);
  608. }
  609. }
  610. SpoofData::~SpoofData() {
  611. if (fDataOwned) {
  612. uprv_free(fRawData);
  613. }
  614. fRawData = nullptr;
  615. if (fUDM != nullptr) {
  616. udata_close(fUDM);
  617. }
  618. fUDM = nullptr;
  619. }
  620. void SpoofData::removeReference() {
  621. if (umtx_atomic_dec(&fRefCount) == 0) {
  622. delete this;
  623. }
  624. }
  625. SpoofData *SpoofData::addReference() {
  626. umtx_atomic_inc(&fRefCount);
  627. return this;
  628. }
  629. void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
  630. if (U_FAILURE(status)) {
  631. return nullptr;
  632. }
  633. if (!fDataOwned) {
  634. UPRV_UNREACHABLE_EXIT;
  635. }
  636. numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
  637. uint32_t returnOffset = fMemLimit;
  638. fMemLimit += numBytes;
  639. fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
  640. fRawData->fLength = fMemLimit;
  641. uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
  642. initPtrs(status);
  643. return (char *)fRawData + returnOffset;
  644. }
  645. int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
  646. int32_t dataSize = fRawData->fLength;
  647. if (capacity < dataSize) {
  648. status = U_BUFFER_OVERFLOW_ERROR;
  649. return dataSize;
  650. }
  651. uprv_memcpy(buf, fRawData, dataSize);
  652. return dataSize;
  653. }
  654. int32_t SpoofData::size() const {
  655. return fRawData->fLength;
  656. }
  657. //-------------------------------
  658. //
  659. // Front-end APIs for SpoofData
  660. //
  661. //-------------------------------
  662. int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
  663. // Perform a binary search.
  664. // [lo, hi), i.e lo is inclusive, hi is exclusive.
  665. // The result after the loop will be in lo.
  666. int32_t lo = 0;
  667. int32_t hi = length();
  668. do {
  669. int32_t mid = (lo + hi) / 2;
  670. if (codePointAt(mid) > inChar) {
  671. hi = mid;
  672. } else if (codePointAt(mid) < inChar) {
  673. lo = mid;
  674. } else {
  675. // Found result. Break early.
  676. lo = mid;
  677. break;
  678. }
  679. } while (hi - lo > 1);
  680. // Did we find an entry? If not, the char maps to itself.
  681. if (codePointAt(lo) != inChar) {
  682. dest.append(inChar);
  683. return 1;
  684. }
  685. // Add the element to the string builder and return.
  686. return appendValueTo(lo, dest);
  687. }
  688. int32_t SpoofData::length() const {
  689. return fRawData->fCFUKeysSize;
  690. }
  691. UChar32 SpoofData::codePointAt(int32_t index) const {
  692. return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
  693. }
  694. int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
  695. int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
  696. // Value is either a char (for strings of length 1) or
  697. // an index into the string table (for longer strings)
  698. uint16_t value = fCFUValues[index];
  699. if (stringLength == 1) {
  700. dest.append((char16_t)value);
  701. } else {
  702. dest.append(fCFUStrings + value, stringLength);
  703. }
  704. return stringLength;
  705. }
  706. U_NAMESPACE_END
  707. U_NAMESPACE_USE
  708. //-----------------------------------------------------------------------------
  709. //
  710. // uspoof_swap - byte swap and char encoding swap of spoof data
  711. //
  712. //-----------------------------------------------------------------------------
  713. U_CAPI int32_t U_EXPORT2
  714. uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
  715. UErrorCode *status) {
  716. if (status == nullptr || U_FAILURE(*status)) {
  717. return 0;
  718. }
  719. if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) {
  720. *status=U_ILLEGAL_ARGUMENT_ERROR;
  721. return 0;
  722. }
  723. //
  724. // Check that the data header is for spoof data.
  725. // (Header contents are defined in gencfu.cpp)
  726. //
  727. const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
  728. if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
  729. pInfo->dataFormat[1]==0x66 &&
  730. pInfo->dataFormat[2]==0x75 &&
  731. pInfo->dataFormat[3]==0x20 &&
  732. pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
  733. pInfo->formatVersion[1]==0 &&
  734. pInfo->formatVersion[2]==0 &&
  735. pInfo->formatVersion[3]==0 )) {
  736. udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
  737. "(format version %02x %02x %02x %02x) is not recognized\n",
  738. pInfo->dataFormat[0], pInfo->dataFormat[1],
  739. pInfo->dataFormat[2], pInfo->dataFormat[3],
  740. pInfo->formatVersion[0], pInfo->formatVersion[1],
  741. pInfo->formatVersion[2], pInfo->formatVersion[3]);
  742. *status=U_UNSUPPORTED_ERROR;
  743. return 0;
  744. }
  745. //
  746. // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
  747. // header). This swap also conveniently gets us
  748. // the size of the ICU d.h., which lets us locate the start
  749. // of the uspoof specific data.
  750. //
  751. int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
  752. //
  753. // Get the Spoof Data Header, and check that it appears to be OK.
  754. //
  755. //
  756. const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
  757. SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
  758. if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
  759. ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
  760. {
  761. udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
  762. *status=U_UNSUPPORTED_ERROR;
  763. return 0;
  764. }
  765. //
  766. // Prefight operation? Just return the size
  767. //
  768. int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
  769. int32_t totalSize = headerSize + spoofDataLength;
  770. if (length < 0) {
  771. return totalSize;
  772. }
  773. //
  774. // Check that length passed in is consistent with length from Spoof data header.
  775. //
  776. if (length < totalSize) {
  777. udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
  778. spoofDataLength);
  779. *status=U_INDEX_OUTOFBOUNDS_ERROR;
  780. return 0;
  781. }
  782. //
  783. // Swap the Data. Do the data itself first, then the Spoof Data Header, because
  784. // we need to reference the header to locate the data, and an
  785. // inplace swap of the header leaves it unusable.
  786. //
  787. uint8_t *outBytes = (uint8_t *)outData + headerSize;
  788. SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;
  789. int32_t sectionStart;
  790. int32_t sectionLength;
  791. //
  792. // If not swapping in place, zero out the output buffer before starting.
  793. // Gaps may exist between the individual sections, and these must be zeroed in
  794. // the output buffer. The simplest way to do that is to just zero the whole thing.
  795. //
  796. if (inBytes != outBytes) {
  797. uprv_memset(outBytes, 0, spoofDataLength);
  798. }
  799. // Confusables Keys Section (fCFUKeys)
  800. sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
  801. sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
  802. ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
  803. // String Index Section
  804. sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
  805. sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
  806. ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
  807. // String Table Section
  808. sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
  809. sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
  810. ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
  811. // And, last, swap the header itself.
  812. // int32_t fMagic // swap this
  813. // uint8_t fFormatVersion[4] // Do not swap this, just copy
  814. // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
  815. //
  816. uint32_t magic = ds->readUInt32(spoofDH->fMagic);
  817. ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
  818. if (inBytes != outBytes) {
  819. uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
  820. }
  821. // swap starting at fLength
  822. ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
  823. return totalSize;
  824. }
  825. #endif