uspoof.h 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ***************************************************************************
  5. * Copyright (C) 2008-2016, International Business Machines Corporation
  6. * and others. All Rights Reserved.
  7. ***************************************************************************
  8. * file name: uspoof.h
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2008Feb13
  14. * created by: Andy Heninger
  15. *
  16. * Unicode Spoof Detection
  17. */
  18. #ifndef USPOOF_H
  19. #define USPOOF_H
  20. #include "unicode/utypes.h"
  21. #include "unicode/uset.h"
  22. #include "unicode/parseerr.h"
  23. #if !UCONFIG_NO_NORMALIZATION
  24. #if U_SHOW_CPLUSPLUS_API
  25. #include "unicode/localpointer.h"
  26. #include "unicode/unistr.h"
  27. #include "unicode/uniset.h"
  28. #endif
  29. /**
  30. * \file
  31. * \brief C API: Unicode Security and Spoofing Detection
  32. *
  33. * <p>
  34. * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
  35. * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
  36. *
  37. * <ol>
  38. * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
  39. * &quot;&Eta;arvest&quot;, where the second string starts with the Greek capital letter Eta.</li>
  40. * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
  41. * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
  42. * </ol>
  43. *
  44. * <p>
  45. * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
  46. * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
  47. * content filters.
  48. *
  49. * <p>
  50. * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++.
  51. *
  52. * <h2>Confusables</h2>
  53. *
  54. * <p>
  55. * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings:
  56. *
  57. * \code{.c}
  58. * UErrorCode status = U_ZERO_ERROR;
  59. * UChar* str1 = (UChar*) u"Harvest";
  60. * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
  61. *
  62. * USpoofChecker* sc = uspoof_open(&status);
  63. * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
  64. *
  65. * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
  66. * UBool result = bitmask != 0;
  67. * // areConfusable: 1 (status: U_ZERO_ERROR)
  68. * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
  69. * uspoof_close(sc);
  70. * \endcode
  71. *
  72. * <p>
  73. * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
  74. * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
  75. * confusability test; and the following line extracts the result out of the return value. For best performance,
  76. * the instance should be created once (e.g., upon application startup), and the efficient
  77. * {@link uspoof_areConfusable} method can be used at runtime.
  78. *
  79. * <p>
  80. * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
  81. * {@link uspoof_close} when the object goes out of scope:
  82. *
  83. * \code{.cpp}
  84. * UErrorCode status = U_ZERO_ERROR;
  85. * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
  86. * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status);
  87. * // ...
  88. * \endcode
  89. *
  90. * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can
  91. * be thought of as a "hash code". {@link uspoof_getSkeleton} computes the skeleton for a particular string, so
  92. * the following snippet is equivalent to the example above:
  93. *
  94. * \code{.c}
  95. * UErrorCode status = U_ZERO_ERROR;
  96. * UChar* str1 = (UChar*) u"Harvest";
  97. * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
  98. *
  99. * USpoofChecker* sc = uspoof_open(&status);
  100. * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
  101. *
  102. * // Get skeleton 1
  103. * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
  104. * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
  105. * status = U_ZERO_ERROR;
  106. * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
  107. *
  108. * // Get skeleton 2
  109. * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
  110. * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
  111. * status = U_ZERO_ERROR;
  112. * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
  113. *
  114. * // Are the skeletons the same?
  115. * UBool result = u_strcmp(skel1, skel2) == 0;
  116. * // areConfusable: 1 (status: U_ZERO_ERROR)
  117. * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
  118. * uspoof_close(sc);
  119. * free(skel1);
  120. * free(skel2);
  121. * \endcode
  122. *
  123. * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
  124. * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below:
  125. *
  126. * \code{.c}
  127. * UErrorCode status = U_ZERO_ERROR;
  128. * #define DICTIONARY_LENGTH 2
  129. * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
  130. * UChar* skeletons[DICTIONARY_LENGTH];
  131. * UChar* str = (UChar*) u"1orern";
  132. *
  133. * // Setup:
  134. * USpoofChecker* sc = uspoof_open(&status);
  135. * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
  136. * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
  137. * UChar* word = dictionary[i];
  138. * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
  139. * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
  140. * status = U_ZERO_ERROR;
  141. * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
  142. * }
  143. *
  144. * // Live Check:
  145. * {
  146. * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
  147. * UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
  148. * status = U_ZERO_ERROR;
  149. * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
  150. * UBool result = false;
  151. * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
  152. * result = u_strcmp(skel, skeletons[i]) == 0;
  153. * if (result == true) { break; }
  154. * }
  155. * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
  156. * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
  157. * free(skel);
  158. * }
  159. *
  160. * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
  161. * free(skeletons[i]);
  162. * }
  163. * uspoof_close(sc);
  164. * \endcode
  165. *
  166. * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
  167. * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
  168. * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
  169. *
  170. * <h2>Spoof Detection</h2>
  171. *
  172. * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a
  173. * string:
  174. *
  175. * \code{.c}
  176. * UErrorCode status = U_ZERO_ERROR;
  177. * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
  178. *
  179. * // Get the default set of allowable characters:
  180. * USet* allowed = uset_openEmpty();
  181. * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
  182. * uset_addAll(allowed, uspoof_getInclusionSet(&status));
  183. *
  184. * USpoofChecker* sc = uspoof_open(&status);
  185. * uspoof_setAllowedChars(sc, allowed, &status);
  186. * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
  187. *
  188. * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
  189. * UBool result = bitmask != 0;
  190. * // fails checks: 1 (status: U_ZERO_ERROR)
  191. * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
  192. * uspoof_close(sc);
  193. * uset_close(allowed);
  194. * \endcode
  195. *
  196. * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at
  197. * startup, and call the cheaper {@link uspoof_check} online. We specify the set of
  198. * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39.
  199. *
  200. * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings,
  201. * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers.
  202. *
  203. * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks
  204. * is available in the returned bitmask. For complete information, use the {@link uspoof_check2} class of functions
  205. * with a {@link USpoofCheckResult} parameter:
  206. *
  207. * \code{.c}
  208. * UErrorCode status = U_ZERO_ERROR;
  209. * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
  210. *
  211. * // Get the default set of allowable characters:
  212. * USet* allowed = uset_openEmpty();
  213. * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
  214. * uset_addAll(allowed, uspoof_getInclusionSet(&status));
  215. *
  216. * USpoofChecker* sc = uspoof_open(&status);
  217. * uspoof_setAllowedChars(sc, allowed, &status);
  218. * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
  219. *
  220. * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
  221. * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status);
  222. *
  223. * int32_t failures1 = bitmask;
  224. * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
  225. * assert(failures1 == failures2);
  226. * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
  227. * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
  228. *
  229. * // Cleanup:
  230. * uspoof_close(sc);
  231. * uset_close(allowed);
  232. * uspoof_closeCheckResult(checkResult);
  233. * \endcode
  234. *
  235. * C++ users can take advantage of a few syntactical conveniences. The following snippet is functionally
  236. * equivalent to the one above:
  237. *
  238. * \code{.cpp}
  239. * UErrorCode status = U_ZERO_ERROR;
  240. * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
  241. *
  242. * // Get the default set of allowable characters:
  243. * UnicodeSet allowed;
  244. * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
  245. * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
  246. *
  247. * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
  248. * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
  249. * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
  250. *
  251. * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
  252. * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
  253. *
  254. * int32_t failures1 = bitmask;
  255. * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
  256. * assert(failures1 == failures2);
  257. * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
  258. * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
  259. *
  260. * // Explicit cleanup not necessary.
  261. * \endcode
  262. *
  263. * The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
  264. * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
  265. *
  266. * <ul>
  267. * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
  268. * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
  269. * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
  270. * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
  271. * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
  272. * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
  273. * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li>
  274. * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
  275. * </ul>
  276. *
  277. * <p>
  278. * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
  279. * INVISIBLE and MIXED_NUMBERS conditions, you could do:
  280. *
  281. * \code{.c}
  282. * UErrorCode status = U_ZERO_ERROR;
  283. * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR
  284. *
  285. * USpoofChecker* sc = uspoof_open(&status);
  286. * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
  287. *
  288. * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
  289. * UBool result = bitmask != 0;
  290. * // fails checks: 1 (status: U_ZERO_ERROR)
  291. * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
  292. * uspoof_close(sc);
  293. * \endcode
  294. *
  295. * Here is an example in C++ showing how to compute the restriction level of a string:
  296. *
  297. * \code{.cpp}
  298. * UErrorCode status = U_ZERO_ERROR;
  299. * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
  300. *
  301. * // Get the default set of allowable characters:
  302. * UnicodeSet allowed;
  303. * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
  304. * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
  305. *
  306. * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
  307. * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
  308. * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
  309. * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status);
  310. *
  311. * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
  312. * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
  313. *
  314. * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
  315. * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
  316. * assert((restrictionLevel & bitmask) == restrictionLevel);
  317. * // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
  318. * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
  319. * \endcode
  320. *
  321. * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
  322. * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
  323. *
  324. * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
  325. * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
  326. * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
  327. * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
  328. * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
  329. * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
  330. * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of
  331. * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code
  332. * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
  333. * scripts.
  334. *
  335. * <h2>Additional Information</h2>
  336. *
  337. * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
  338. *
  339. * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
  340. * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
  341. * using the same USpoofChecker instance.
  342. *
  343. * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
  344. * thread safe. Those that take a non-const USpoofChecker are not thread safe.
  345. *
  346. * @stable ICU 4.6
  347. */
  348. U_CDECL_BEGIN
  /**
   * Opaque handle for a spoof checker.
   *
   * Only a forward declaration is provided, so callers can hold and pass a
   * <code>USpoofChecker *</code> but cannot access its contents directly.
   * Handles are returned by uspoof_open() and uspoof_openFromSerialized();
   * the usage examples in this header release them with uspoof_close().
   */
  struct USpoofChecker;
  /**
   * @stable ICU 4.2
   */
  typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */
  /**
   * Opaque handle for the detailed result of a spoof check.
   *
   * Only a forward declaration is provided. In the usage examples in this
   * header a result object is created with uspoof_openCheckResult(), filled
   * in by the uspoof_check2() family of functions, queried with
   * uspoof_getCheckResultChecks() / uspoof_getCheckResultRestrictionLevel(),
   * and released with uspoof_closeCheckResult().
   */
  struct USpoofCheckResult;
  /**
   * @see uspoof_openCheckResult
   * @stable ICU 58
   */
  typedef struct USpoofCheckResult USpoofCheckResult;
  /**
   * Enum for the kinds of checks that USpoofChecker can perform.
   * These enum values are used both to select the set of checks that
   * will be performed, and to report results from the check function.
   *
   * Apart from the combined values USPOOF_CONFUSABLE and USPOOF_ALL_CHECKS,
   * and the deprecated alias USPOOF_SINGLE_SCRIPT, every enumerator is a
   * distinct single bit. Checks can therefore be combined with bitwise OR
   * (for example <code>USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS</code>) and
   * tested in a returned bitmask with bitwise AND.
   *
   * @stable ICU 4.2
   */
  typedef enum USpoofChecks {
      /**
       * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
       * that the two strings are visually confusable and that they are from the same script, according to UTS 39
       * section 4.
       *
       * @see uspoof_areConfusable
       * @stable ICU 4.2
       */
      USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1,

      /**
       * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
       * that the two strings are visually confusable and that they are <b>not</b> from the same script, according
       * to UTS 39 section 4.
       *
       * @see uspoof_areConfusable
       * @stable ICU 4.2
       */
      USPOOF_MIXED_SCRIPT_CONFUSABLE = 2,

      /**
       * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
       * that the two strings are visually confusable and that they are not from the same script but both of them
       * are single-script strings, according to UTS 39 section 4.
       *
       * @see uspoof_areConfusable
       * @stable ICU 4.2
       */
      USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4,

      /**
       * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables. You may set
       * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE
       * to make {@link uspoof_areConfusable} return only those types of confusables.
       *
       * This is the bitwise OR of the three confusable flags above (numeric value 7).
       *
       * @see uspoof_areConfusable
       * @see uspoof_getSkeleton
       * @stable ICU 58
       */
      USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,

  #ifndef U_HIDE_DEPRECATED_API
      /**
       * This flag is deprecated and no longer affects the behavior of SpoofChecker.
       *
       * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was
       * deprecated.
       */
      USPOOF_ANY_CASE = 8,
  #endif /* U_HIDE_DEPRECATED_API */

      /**
       * Check that an identifier is no looser than the specified RestrictionLevel.
       * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE.
       *
       * If USPOOF_AUX_INFO is enabled the actual restriction level of the
       * identifier being tested will also be returned by uspoof_check().
       *
       * @see URestrictionLevel
       * @see uspoof_setRestrictionLevel
       * @see USPOOF_AUX_INFO
       *
       * @stable ICU 51
       */
      USPOOF_RESTRICTION_LEVEL = 16,

  #ifndef U_HIDE_DEPRECATED_API
      /**
       * Check that an identifier contains only characters from a
       * single script (plus chars from the common and inherited scripts.)
       * Applies to checks of a single identifier check only.
       *
       * This is an alias with the same numeric value as USPOOF_RESTRICTION_LEVEL.
       *
       * @deprecated ICU 51 Use RESTRICTION_LEVEL instead.
       */
      USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL,
  #endif /* U_HIDE_DEPRECATED_API */

      /**
       * Check an identifier for the presence of invisible characters,
       * such as zero-width spaces, or character sequences that are
       * likely not to display, such as multiple occurrences of the same
       * non-spacing mark. This check does not test the input string as a whole
       * for conformance to any particular syntax for identifiers.
       */
      USPOOF_INVISIBLE = 32,

      /**
       * Check that an identifier contains only characters from a specified set
       * of acceptable characters. See {@link uspoof_setAllowedChars} and
       * {@link uspoof_setAllowedLocales}. Note that a string that fails this check
       * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
       */
      USPOOF_CHAR_LIMIT = 64,

      /**
       * Check that an identifier does not mix numbers from different numbering systems.
       * For more information, see UTS 39 section 5.3.
       *
       * @stable ICU 51
       */
      USPOOF_MIXED_NUMBERS = 128,

      /**
       * Check that an identifier does not have a combining character following a character in which that
       * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
       *
       * More specifically, the following characters are forbidden from preceding a U+0307:
       * <ul>
       * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
       * <li>Latin lowercase letter 'l'</li>
       * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
       * <li>Any character whose confusable prototype ends with such a character
       * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
       * </ul>
       * In addition, combining characters are allowed between the above characters and U+0307 except those
       * with combining class 0 or combining class "Above" (230, same class as U+0307).
       *
       * This list and the number of combining characters considered by this check may grow over time.
       *
       * @stable ICU 62
       */
      USPOOF_HIDDEN_OVERLAY = 256,

      /**
       * Enable all spoof checks.
       *
       * The value covers the low 16 bits, which includes every individual check
       * flag declared above; it does not include USPOOF_AUX_INFO.
       *
       * @stable ICU 4.6
       */
      USPOOF_ALL_CHECKS = 0xFFFF,

      /**
       * Enable the return of auxiliary (non-error) information in the
       * upper bits of the check results value.
       *
       * If this "check" is not enabled, the results of {@link uspoof_check} will be
       * zero when an identifier passes all of the enabled checks.
       *
       * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will
       * be zero when an identifier passes all checks.
       *
       * @stable ICU 51
       */
      USPOOF_AUX_INFO = 0x40000000
  } USpoofChecks;
  /**
   * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
   * for returned identifier restriction levels in check results.
   *
   * The numeric values increase from the most restrictive level (USPOOF_ASCII)
   * to the least restrictive one (USPOOF_UNRESTRICTIVE). All levels lie within
   * the bits selected by USPOOF_RESTRICTION_LEVEL_MASK, which is how a level is
   * extracted from a check result bitmask.
   *
   * @stable ICU 51
   *
   * @see uspoof_setRestrictionLevel
   * @see uspoof_check
   */
  typedef enum URestrictionLevel {
      /**
       * All characters in the string are in the identifier profile and all characters in the string are in the
       * ASCII range.
       *
       * @stable ICU 51
       */
      USPOOF_ASCII = 0x10000000,

      /**
       * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and
       * the string is single-script, according to the definition in UTS 39 section 5.1.
       *
       * @stable ICU 53
       */
      USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000,

      /**
       * The string classifies as Single Script, or all characters in the string are in the identifier profile and
       * the string is covered by any of the following sets of scripts, according to the definition in UTS 39
       * section 5.1:
       * <ul>
       * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
       * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
       * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
       * </ul>
       * This is the default restriction in ICU.
       *
       * @stable ICU 51
       */
      USPOOF_HIGHLY_RESTRICTIVE = 0x30000000,

      /**
       * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
       * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
       * Greek, and Cherokee.
       *
       * @stable ICU 51
       */
      USPOOF_MODERATELY_RESTRICTIVE = 0x40000000,

      /**
       * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts.
       *
       * @stable ICU 51
       */
      USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000,

      /**
       * Any valid identifiers, including characters outside of the Identifier Profile.
       *
       * @stable ICU 51
       */
      USPOOF_UNRESTRICTIVE = 0x60000000,

      /**
       * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}.
       *
       * @stable ICU 53
       */
      USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000,

  #ifndef U_HIDE_INTERNAL_API
      /**
       * An undefined restriction level.
       * @internal
       */
      USPOOF_UNDEFINED_RESTRICTIVE = -1
  #endif /* U_HIDE_INTERNAL_API */
  } URestrictionLevel;
  567. /**
  568. * Create a Unicode Spoof Checker, configured to perform all
  569. * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
  570. * Note that additional checks may be added in the future,
  571. * resulting in changes to the default checking behavior.
  572. *
  573. * @param status The error code, set if this function encounters a problem.
  574. * @return the newly created Spoof Checker
  575. * @stable ICU 4.2
  576. */
  577. U_CAPI USpoofChecker * U_EXPORT2
  578. uspoof_open(UErrorCode *status);
  579. /**
  580. * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory.
  581. * Inverse of uspoof_serialize().
  582. * The memory containing the serialized data must remain valid and unchanged
  583. * as long as the spoof checker, or any cloned copies of the spoof checker,
  584. * are in use. Ownership of the memory remains with the caller.
  585. * The spoof checker (and any clones) must be closed prior to deleting the
  586. * serialized data.
  587. *
  588. * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data
  589. * @param length the number of bytes available at data;
  590. * can be more than necessary
  591. * @param pActualLength receives the actual number of bytes at data taken up by the data;
  592. * can be NULL
  593. * @param pErrorCode ICU error code
  594. * @return the spoof checker.
  595. *
  596. * @see uspoof_open
  597. * @see uspoof_serialize
  598. * @stable ICU 4.2
  599. */
  600. U_CAPI USpoofChecker * U_EXPORT2
  601. uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
  602. UErrorCode *pErrorCode);
  603. /**
  604. * Open a Spoof Checker from the source form of the spoof data.
  605. * The input corresponds to the Unicode data file confusables.txt
  606. * as described in Unicode UTS #39. The syntax of the source data
  607. * is as described in UTS #39 for this file, and the content of
  608. * this file is acceptable input.
  609. *
  610. * The character encoding of the (char *) input text is UTF-8.
  611. *
  612. * @param confusables a pointer to the confusable characters definitions,
  613. * as found in file confusables.txt from unicode.org.
  614. * @param confusablesLen The length of the confusables text, or -1 if the
  615. * input string is zero terminated.
  616. * @param confusablesWholeScript
  617. * Deprecated in ICU 58. No longer used.
  618. * @param confusablesWholeScriptLen
  619. * Deprecated in ICU 58. No longer used.
  620. * @param errType In the event of an error in the input, indicates
  621. * which of the input files contains the error.
  622. * The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or
  623. * USPOOF_WHOLE_SCRIPT_CONFUSABLE, or
  624. * zero if no errors are found.
  625. * @param pe In the event of an error in the input, receives the position
  626. * in the input text (line, offset) of the error.
  627. * @param status an in/out ICU UErrorCode. Among the possible errors is
  628. * U_PARSE_ERROR, which is used to report syntax errors
  629. * in the input.
  630. * @return A spoof checker that uses the rules from the input files.
  631. * @stable ICU 4.2
  632. */
  633. U_CAPI USpoofChecker * U_EXPORT2
  634. uspoof_openFromSource(const char *confusables, int32_t confusablesLen,
  635. const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
  636. int32_t *errType, UParseError *pe, UErrorCode *status);
  637. /**
  638. * Close a Spoof Checker, freeing any memory that was being held by
  639. * its implementation.
  640. * @stable ICU 4.2
  641. */
  642. U_CAPI void U_EXPORT2
  643. uspoof_close(USpoofChecker *sc);
  644. /**
  645. * Clone a Spoof Checker. The clone will be set to perform the same checks
  646. * as the original source.
  647. *
  648. * @param sc The source USpoofChecker
  649. * @param status The error code, set if this function encounters a problem.
  650. * @return A clone of the source Spoof Checker.
  651. * @stable ICU 4.2
  652. */
  653. U_CAPI USpoofChecker * U_EXPORT2
  654. uspoof_clone(const USpoofChecker *sc, UErrorCode *status);
  655. /**
  656. * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method
  657. * overwrites any checks that may have already been enabled. By default, all checks are enabled.
  658. *
  659. * To enable specific checks and disable all others,
  660. * OR together only the bit constants for the desired checks.
  661. * For example, to fail strings containing characters outside of
  662. * the set specified by {@link uspoof_setAllowedChars} and
  663. * also strings that contain digits from mixed numbering systems:
  664. *
  665. * <pre>
  666. * {@code
  667. * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status);
  668. * }
  669. * </pre>
  670. *
  671. * To disable specific checks and enable all others,
  672. * start with USPOOF_ALL_CHECKS and "AND away" the not-desired checks.
  673. * For example, if you are not planning to use the {@link uspoof_areConfusable} functionality,
  674. * it is good practice to disable the CONFUSABLE check:
  675. *
  676. * <pre>
  677. * {@code
  678. * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status);
  679. * }
  680. * </pre>
  681. *
  682. * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and
  683. * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
  684. * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
  685. * methods.
  686. *
  687. * @param sc The USpoofChecker
  688. * @param checks The set of checks that this spoof checker will perform.
  689. * The value is a bit set, obtained by OR-ing together
  690. * values from enum USpoofChecks.
  691. * @param status The error code, set if this function encounters a problem.
  692. * @stable ICU 4.2
  693. *
  694. */
  695. U_CAPI void U_EXPORT2
  696. uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status);
  697. /**
  698. * Get the set of checks that this Spoof Checker has been configured to perform.
  699. *
  700. * @param sc The USpoofChecker
  701. * @param status The error code, set if this function encounters a problem.
  702. * @return The set of checks that this spoof checker will perform.
  703. * The value is a bit set, obtained by OR-ing together
  704. * values from enum USpoofChecks.
  705. * @stable ICU 4.2
  706. *
  707. */
  708. U_CAPI int32_t U_EXPORT2
  709. uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
  710. /**
  711. * Set the loosest restriction level allowed for strings. The default if this is not called is
  712. * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
  713. * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
  714. * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
  715. *
  716. * @param sc The USpoofChecker
  717. * @param restrictionLevel The loosest restriction level allowed.
  718. * @see URestrictionLevel
  719. * @stable ICU 51
  720. */
  721. U_CAPI void U_EXPORT2
  722. uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
  723. /**
  724. * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}.
  725. *
  726. * @return The restriction level
  727. * @see URestrictionLevel
  728. * @stable ICU 51
  729. */
  730. U_CAPI URestrictionLevel U_EXPORT2
  731. uspoof_getRestrictionLevel(const USpoofChecker *sc);
  732. /**
  733. * Limit characters that are acceptable in identifiers being checked to those
  734. * normally used with the languages associated with the specified locales.
  735. * Any previously specified list of locales is replaced by the new settings.
  736. *
  737. * A set of languages is determined from the locale(s), and
  738. * from those a set of acceptable Unicode scripts is determined.
  739. * Characters from this set of scripts, along with characters from
  740. * the "common" and "inherited" Unicode Script categories
  741. * will be permitted.
  742. *
  743. * Supplying an empty string removes all restrictions;
  744. * characters from any script will be allowed.
  745. *
  746. * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this
  747. * USpoofChecker when calling this function with a non-empty list
  748. * of locales.
  749. *
  750. * The Unicode Set of characters that will be allowed is accessible
  751. * via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales()
  752. * will <i>replace</i> any previously applied set of allowed characters.
  753. *
  754. * Adjustments, such as additions or deletions of certain classes of characters,
  755. * can be made to the result of uspoof_setAllowedLocales() by
  756. * fetching the resulting set with uspoof_getAllowedChars(),
  757. * manipulating it with the Unicode Set API, then resetting the
  758. * spoof detector's limits with uspoof_setAllowedChars().
  759. *
  760. * @param sc The USpoofChecker
  761. * @param localesList A list of locales, from which the language
  762. * and associated script are extracted. The locales
  763. * are comma-separated if there is more than one.
  764. * White space may not appear within an individual locale,
  765. * but is ignored otherwise.
  766. * The locales are syntactically like those from the
  767. * HTTP Accept-Language header.
  768. * If the localesList is empty, no restrictions will be placed on
  769. * the allowed characters.
  770. *
  771. * @param status The error code, set if this function encounters a problem.
  772. * @stable ICU 4.2
  773. */
  774. U_CAPI void U_EXPORT2
  775. uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status);
  776. /**
  777. * Get a list of locales for the scripts that are acceptable in strings
  778. * to be checked. If no limitations on scripts have been specified,
  779. * an empty string will be returned.
  780. *
  781. * uspoof_setAllowedChars() will reset the list of allowed locales to be empty.
  782. *
  783. * The format of the returned list is the same as that supplied to
  784. * uspoof_setAllowedLocales(), but the returned list may not be identical
  785. * to the originally specified string; the string may be reformatted,
  786. * and information other than languages from
  787. * the originally specified locales may be omitted.
  788. *
  789. * @param sc The USpoofChecker
  790. * @param status The error code, set if this function encounters a problem.
  791. * @return A string containing a list of locales corresponding
  792. * to the acceptable scripts, formatted like an
  793. * HTTP Accept-Language value.
  794. *
  795. * @stable ICU 4.2
  796. */
  797. U_CAPI const char * U_EXPORT2
  798. uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status);
  799. /**
  800. * Limit the acceptable characters to those specified by a Unicode Set.
  801. * Any previously specified character limit is
  802. * replaced by the new settings. This includes limits on
  803. * characters that were set with the uspoof_setAllowedLocales() function.
  804. *
  805. * The USPOOF_CHAR_LIMIT test is automatically enabled for this
  806. * USpoofChecker by this function.
  807. *
  808. * @param sc The USpoofChecker
  809. * @param chars A Unicode Set containing the list of
  810. * characters that are permitted. Ownership of the set
  811. * remains with the caller. The incoming set is cloned by
  812. * this function, so there are no restrictions on modifying
  813. * or deleting the USet after calling this function.
  814. * @param status The error code, set if this function encounters a problem.
  815. * @stable ICU 4.2
  816. */
  817. U_CAPI void U_EXPORT2
  818. uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status);
  819. /**
  820. * Get a USet for the characters permitted in an identifier.
  821. * This corresponds to the limits imposed by the Set Allowed Characters
  822. * functions. Limitations imposed by other checks will not be
  823. * reflected in the set returned by this function.
  824. *
  825. * The returned set will be frozen, meaning that it cannot be modified
  826. * by the caller.
  827. *
  828. * Ownership of the returned set remains with the Spoof Detector. The
  829. * returned set will become invalid if the spoof detector is closed,
  830. * or if a new set of allowed characters is specified.
  831. *
  832. *
  833. * @param sc The USpoofChecker
  834. * @param status The error code, set if this function encounters a problem.
  835. * @return A USet containing the characters that are permitted by
  836. * the USPOOF_CHAR_LIMIT test.
  837. * @stable ICU 4.2
  838. */
  839. U_CAPI const USet * U_EXPORT2
  840. uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
  841. /**
  842. * Check the specified string for possible security issues.
  843. * The text to be checked will typically be an identifier of some sort.
  844. * The set of checks to be performed is specified with uspoof_setChecks().
  845. *
  846. * \note
  847. * Consider using the newer API, {@link uspoof_check2}, instead.
  848. * The newer API exposes additional information from the check procedure
  849. * and is otherwise identical to this method.
  850. *
  851. * @param sc The USpoofChecker
  852. * @param id The identifier to be checked for possible security issues,
  853. * in UTF-16 format.
  854. * @param length the length of the string to be checked, expressed in
  855. * 16 bit UTF-16 code units, or -1 if the string is
  856. * zero terminated.
  857. * @param position Deprecated in ICU 51. Always returns zero.
  858. * Originally, an out parameter for the index of the first
  859. * string position that failed a check.
  860. * This parameter may be NULL.
  861. * @param status The error code, set if an error occurred while attempting to
  862. * perform the check.
  863. * Spoofing or security issues detected with the input string are
  864. * not reported here, but through the function's return value.
  865. * @return An integer value with bits set for any potential security
  866. * or spoofing issues detected. The bits are defined by
  867. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  868. * will be zero if the input string passes all of the
  869. * enabled checks.
  870. * @see uspoof_check2
  871. * @stable ICU 4.2
  872. */
  873. U_CAPI int32_t U_EXPORT2
  874. uspoof_check(const USpoofChecker *sc,
  875. const UChar *id, int32_t length,
  876. int32_t *position,
  877. UErrorCode *status);
  878. /**
  879. * Check the specified string for possible security issues.
  880. * The text to be checked will typically be an identifier of some sort.
  881. * The set of checks to be performed is specified with uspoof_setChecks().
  882. *
  883. * \note
  884. * Consider using the newer API, {@link uspoof_check2UTF8}, instead.
  885. * The newer API exposes additional information from the check procedure
  886. * and is otherwise identical to this method.
  887. *
  888. * @param sc The USpoofChecker
  889. * @param id An identifier to be checked for possible security issues, in UTF8 format.
  890. * @param length the length of the string to be checked, or -1 if the string is
  891. * zero terminated.
  892. * @param position Deprecated in ICU 51. Always returns zero.
  893. * Originally, an out parameter for the index of the first
  894. * string position that failed a check.
  895. * This parameter may be NULL.
  896. * @param status The error code, set if an error occurred while attempting to
  897. * perform the check.
  898. * Spoofing or security issues detected with the input string are
  899. * not reported here, but through the function's return value.
  900. * If the input contains invalid UTF-8 sequences,
  901. * a status of U_INVALID_CHAR_FOUND will be returned.
  902. * @return An integer value with bits set for any potential security
  903. * or spoofing issues detected. The bits are defined by
  904. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  905. * will be zero if the input string passes all of the
  906. * enabled checks.
  907. * @see uspoof_check2UTF8
  908. * @stable ICU 4.2
  909. */
  910. U_CAPI int32_t U_EXPORT2
  911. uspoof_checkUTF8(const USpoofChecker *sc,
  912. const char *id, int32_t length,
  913. int32_t *position,
  914. UErrorCode *status);
  915. /**
  916. * Check the specified string for possible security issues.
  917. * The text to be checked will typically be an identifier of some sort.
  918. * The set of checks to be performed is specified with uspoof_setChecks().
  919. *
  920. * @param sc The USpoofChecker
  921. * @param id The identifier to be checked for possible security issues,
  922. * in UTF-16 format.
  923. * @param length the length of the string to be checked, or -1 if the string is
  924. * zero terminated.
  925. * @param checkResult An instance of USpoofCheckResult to be filled with
  926. * details about the identifier. Can be NULL.
  927. * @param status The error code, set if an error occurred while attempting to
  928. * perform the check.
  929. * Spoofing or security issues detected with the input string are
  930. * not reported here, but through the function's return value.
  931. * @return An integer value with bits set for any potential security
  932. * or spoofing issues detected. The bits are defined by
  933. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  934. * will be zero if the input string passes all of the
  935. * enabled checks. Any information in this bitmask will be
  936. * consistent with the information saved in the optional
  937. * checkResult parameter.
  938. * @see uspoof_openCheckResult
  939. * @see uspoof_check2UTF8
  940. * @see uspoof_check2UnicodeString
  941. * @stable ICU 58
  942. */
  943. U_CAPI int32_t U_EXPORT2
  944. uspoof_check2(const USpoofChecker *sc,
  945. const UChar* id, int32_t length,
  946. USpoofCheckResult* checkResult,
  947. UErrorCode *status);
  948. /**
  949. * Check the specified string for possible security issues.
  950. * The text to be checked will typically be an identifier of some sort.
  951. * The set of checks to be performed is specified with uspoof_setChecks().
  952. *
  953. * This version of {@link uspoof_check} accepts a USpoofCheckResult, which
  954. * returns additional information about the identifier. For more
  955. * information, see {@link uspoof_openCheckResult}.
  956. *
  957. * @param sc The USpoofChecker
  958. * @param id An identifier to be checked for possible security issues, in UTF8 format.
  959. * @param length the length of the string to be checked, or -1 if the string is
  960. * zero terminated.
  961. * @param checkResult An instance of USpoofCheckResult to be filled with
  962. * details about the identifier. Can be NULL.
  963. * @param status The error code, set if an error occurred while attempting to
  964. * perform the check.
  965. * Spoofing or security issues detected with the input string are
  966. * not reported here, but through the function's return value.
  967. * @return An integer value with bits set for any potential security
  968. * or spoofing issues detected. The bits are defined by
  969. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  970. * will be zero if the input string passes all of the
  971. * enabled checks. Any information in this bitmask will be
  972. * consistent with the information saved in the optional
  973. * checkResult parameter.
  974. * @see uspoof_openCheckResult
  975. * @see uspoof_check2
  976. * @see uspoof_check2UnicodeString
  977. * @stable ICU 58
  978. */
  979. U_CAPI int32_t U_EXPORT2
  980. uspoof_check2UTF8(const USpoofChecker *sc,
  981. const char *id, int32_t length,
  982. USpoofCheckResult* checkResult,
  983. UErrorCode *status);
  984. /**
  985. * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return
  986. * information about the identifier. Information includes:
  987. * <ul>
  988. * <li>A bitmask of the checks that failed</li>
  989. * <li>The identifier's restriction level (UTS 39 section 5.2)</li>
  990. * <li>The set of numerics in the string (UTS 39 section 5.3)</li>
  991. * </ul>
  992. * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call
  993. * of {@link uspoof_check2}.
  994. *
  995. * @param status The error code, set if this function encounters a problem.
  996. * @return the newly created USpoofCheckResult
  997. * @see uspoof_check2
  998. * @see uspoof_check2UTF8
  999. * @see uspoof_check2UnicodeString
  1000. * @stable ICU 58
  1001. */
  1002. U_CAPI USpoofCheckResult* U_EXPORT2
  1003. uspoof_openCheckResult(UErrorCode *status);
  1004. /**
  1005. * Close a USpoofCheckResult, freeing any memory that was being held by
  1006. * its implementation.
  1007. *
  1008. * @param checkResult The instance of USpoofCheckResult to close
  1009. * @stable ICU 58
  1010. */
  1011. U_CAPI void U_EXPORT2
  1012. uspoof_closeCheckResult(USpoofCheckResult *checkResult);
  1013. /**
  1014. * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
  1015. * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on.
  1016. *
  1017. * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
  1018. * @param status The error code, set if an error occurred.
  1019. * @return An integer value with bits set for any potential security
  1020. * or spoofing issues detected. The bits are defined by
  1021. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  1022. * will be zero if the input string passes all of the
  1023. * enabled checks.
  1024. * @see uspoof_setChecks
  1025. * @stable ICU 58
  1026. */
  1027. U_CAPI int32_t U_EXPORT2
  1028. uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status);
  1029. /**
  1030. * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check
  1031. * was enabled; otherwise, undefined.
  1032. *
  1033. * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
  1034. * @param status The error code, set if an error occurred.
  1035. * @return The restriction level contained in the USpoofCheckResult
  1036. * @see uspoof_setRestrictionLevel
  1037. * @stable ICU 58
  1038. */
  1039. U_CAPI URestrictionLevel U_EXPORT2
  1040. uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status);
  1041. /**
  1042. * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled;
  1043. * otherwise, undefined. The set will contain the zero digit from each decimal number system found
  1044. * in the input string. Ownership of the returned USet remains with the USpoofCheckResult.
  1045. * The USet will be freed when {@link uspoof_closeCheckResult} is called.
  1046. *
  1047. * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
  1048. * @return The set of numerics contained in the USpoofCheckResult
  1049. * @param status The error code, set if an error occurred.
  1050. * @stable ICU 58
  1051. */
  1052. U_CAPI const USet* U_EXPORT2
  1053. uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status);
  1054. /**
  1055. * Check whether two specified strings are visually confusable.
  1056. *
  1057. * If the strings are confusable, the return value will be nonzero, as long as
  1058. * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
  1059. *
  1060. * The bits in the return value correspond to flags for each of the classes of
  1061. * confusables applicable to the two input strings. According to UTS 39
  1062. * section 4, the possible flags are:
  1063. *
  1064. * <ul>
  1065. * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
  1066. * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
  1067. * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
  1068. * </ul>
  1069. *
  1070. * If one or more of the above flags were not listed in uspoof_setChecks(), this
  1071. * function will never report that class of confusable. The check
  1072. * {@link USPOOF_CONFUSABLE} enables all three flags.
  1073. *
  1074. *
  1075. * @param sc The USpoofChecker
  1076. * @param id1 The first of the two identifiers to be compared for
  1077. * confusability. The strings are in UTF-16 format.
  1078. * @param length1 the length of the first identifier, expressed in
  1079. * 16 bit UTF-16 code units, or -1 if the string is
  1080. * nul terminated.
  1081. * @param id2 The second of the two identifiers to be compared for
  1082. * confusability. The identifiers are in UTF-16 format.
  1083. * @param length2 The length of the second identifier, expressed in
  1084. * 16 bit UTF-16 code units, or -1 if the string is
  1085. * nul terminated.
  1086. * @param status The error code, set if an error occurred while attempting to
  1087. * perform the check.
  1088. * Confusability of the identifiers is not reported here,
  1089. * but through this function's return value.
  1090. * @return An integer value with bit(s) set corresponding to
  1091. * the type of confusability found, as defined by
  1092. * enum USpoofChecks. Zero is returned if the identifiers
  1093. * are not confusable.
  1094. *
  1095. * @stable ICU 4.2
  1096. */
  1097. U_CAPI int32_t U_EXPORT2
  1098. uspoof_areConfusable(const USpoofChecker *sc,
  1099. const UChar *id1, int32_t length1,
  1100. const UChar *id2, int32_t length2,
  1101. UErrorCode *status);
  1102. /**
  1103. * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
  1104. *
  1105. * @param sc The USpoofChecker
  1106. * @param id1 The first of the two identifiers to be compared for
  1107. * confusability. The strings are in UTF-8 format.
  1108. * @param length1 the length of the first identifier, in bytes, or -1
  1109. * if the string is nul terminated.
  1110. * @param id2 The second of the two identifiers to be compared for
  1111. * confusability. The strings are in UTF-8 format.
  1112. * @param length2 The length of the second string in bytes, or -1
  1113. * if the string is nul terminated.
  1114. * @param status The error code, set if an error occurred while attempting to
  1115. * perform the check.
  1116. * Confusability of the strings is not reported here,
  1117. * but through this function's return value.
  1118. * @return An integer value with bit(s) set corresponding to
  1119. * the type of confusability found, as defined by
  1120. * enum USpoofChecks. Zero is returned if the strings
  1121. * are not confusable.
  1122. *
  1123. * @stable ICU 4.2
  1124. *
  1125. * @see uspoof_areConfusable
  1126. */
  1127. U_CAPI int32_t U_EXPORT2
  1128. uspoof_areConfusableUTF8(const USpoofChecker *sc,
  1129. const char *id1, int32_t length1,
  1130. const char *id2, int32_t length2,
  1131. UErrorCode *status);
  1132. /**
  1133. * Get the "skeleton" for an identifier.
  1134. * Skeletons are a transformation of the input identifier;
  1135. * Two identifiers are confusable if their skeletons are identical.
  1136. * See Unicode UTS #39 for additional information.
  1137. *
  1138. * Using skeletons directly makes it possible to quickly check
  1139. * whether an identifier is confusable with any of some large
  1140. * set of existing identifiers, by creating an efficiently
  1141. * searchable collection of the skeletons.
  1142. *
  1143. * @param sc The USpoofChecker
  1144. * @param type Deprecated in ICU 58. You may pass any number.
  1145. * Originally, controlled which of the Unicode confusable data
  1146. * tables to use.
  1147. * @param id The input identifier whose skeleton will be computed.
  1148. * @param length The length of the input identifier, expressed in 16 bit
  1149. * UTF-16 code units, or -1 if the string is zero terminated.
  1150. * @param dest The output buffer, to receive the skeleton string.
  1151. * @param destCapacity The length of the output buffer, in 16 bit units.
  1152. * The destCapacity may be zero, in which case the function will
  1153. * return the actual length of the skeleton.
  1154. * @param status The error code, set if an error occurred while attempting to
  1155. * perform the check.
  1156. * @return The length of the skeleton string. The returned length
  1157. * is always that of the complete skeleton, even when the
  1158. * supplied buffer is too small (or of zero length)
  1159. *
  1160. * @stable ICU 4.2
  1161. * @see uspoof_areConfusable
  1162. */
  1163. U_CAPI int32_t U_EXPORT2
  1164. uspoof_getSkeleton(const USpoofChecker *sc,
  1165. uint32_t type,
  1166. const UChar *id, int32_t length,
  1167. UChar *dest, int32_t destCapacity,
  1168. UErrorCode *status);
  1169. /**
  1170. * Get the "skeleton" for an identifier.
  1171. * Skeletons are a transformation of the input identifier;
  1172. * Two identifiers are confusable if their skeletons are identical.
  1173. * See Unicode UTS #39 for additional information.
  1174. *
  1175. * Using skeletons directly makes it possible to quickly check
  1176. * whether an identifier is confusable with any of some large
  1177. * set of existing identifiers, by creating an efficiently
  1178. * searchable collection of the skeletons.
  1179. *
  1180. * @param sc The USpoofChecker
  1181. * @param type Deprecated in ICU 58. You may pass any number.
  1182. * Originally, controlled which of the Unicode confusable data
  1183. * tables to use.
  1184. * @param id The UTF-8 format identifier whose skeleton will be computed.
  1185. * @param length The length of the input string, in bytes,
  1186. * or -1 if the string is zero terminated.
  1187. * @param dest The output buffer, to receive the skeleton string.
  1188. * @param destCapacity The length of the output buffer, in bytes.
  1189. * The destCapacity may be zero, in which case the function will
  1190. * return the actual length of the skeleton.
  1191. * @param status The error code, set if an error occurred while attempting to
  1192. * perform the check. Possible Errors include U_INVALID_CHAR_FOUND
  1193. * for invalid UTF-8 sequences, and
  1194. * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
  1195. * to hold the complete skeleton.
  1196. * @return The length of the skeleton string, in bytes. The returned length
  1197. * is always that of the complete skeleton, even when the
  1198. * supplied buffer is too small (or of zero length)
  1199. *
  1200. * @stable ICU 4.2
  1201. */
  1202. U_CAPI int32_t U_EXPORT2
  1203. uspoof_getSkeletonUTF8(const USpoofChecker *sc,
  1204. uint32_t type,
  1205. const char *id, int32_t length,
  1206. char *dest, int32_t destCapacity,
  1207. UErrorCode *status);
  1208. /**
  1209. * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
  1210. * in http://unicode.org/Public/security/latest/xidmodifications.txt
  1211. * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
  1212. *
  1213. * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
  1214. * be deleted by the caller.
  1215. *
  1216. * @param status The error code, set if a problem occurs while creating the set.
  1217. *
  1218. * @stable ICU 51
  1219. */
  1220. U_CAPI const USet * U_EXPORT2
  1221. uspoof_getInclusionSet(UErrorCode *status);
  1222. /**
  1223. * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
  1224. * in http://unicode.org/Public/security/latest/xidmodifications.txt
  1225. * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
  1226. *
  1227. * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
  1228. * be deleted by the caller.
  1229. *
  1230. * @param status The error code, set if a problem occurs while creating the set.
  1231. * @return The frozen USet of recommended characters; it remains owned by the ICU library.
  1232. * @stable ICU 51
  1233. */
  1234. U_CAPI const USet * U_EXPORT2
  1235. uspoof_getRecommendedSet(UErrorCode *status);
  1236. /**
  1237. * Serialize the data for a spoof detector into a chunk of memory.
  1238. * The flattened spoof detection tables can later be used to efficiently
  1239. * instantiate a new Spoof Detector.
  1240. *
  1241. * The serialized spoof checker includes only the data compiled from the
  1242. * Unicode data tables by uspoof_openFromSource(); it does not include
  1243. * any other state or configuration that may have been set.
  1244. *
  1245. * @param sc the Spoof Detector whose data is to be serialized.
  1246. * @param data a pointer to 32-bit-aligned memory to be filled with the data,
  1247. * can be NULL if capacity==0
  1248. * @param capacity the number of bytes available at data,
  1249. * or 0 for preflighting
  1250. * @param status an in/out ICU UErrorCode; possible errors include:
  1251. * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
  1252. * - U_ILLEGAL_ARGUMENT_ERROR the data or capacity parameters are bad
  1253. * @return the number of bytes written or needed for the spoof data
  1254. *
  1255. * @see utrie2_openFromSerialized()
  1256. * @stable ICU 4.2
  1257. */
  1258. U_CAPI int32_t U_EXPORT2
  1259. uspoof_serialize(USpoofChecker *sc,
  1260. void *data, int32_t capacity,
  1261. UErrorCode *status);
  1262. U_CDECL_END
  1263. #if U_SHOW_CPLUSPLUS_API
  1264. U_NAMESPACE_BEGIN
  1265. /**
  1266. * \class LocalUSpoofCheckerPointer
  1267. * "Smart pointer" class, closes a USpoofChecker via `uspoof_close()`.
  1268. * For most methods see the LocalPointerBase base class.
  1269. *
  1270. * @see LocalPointerBase
  1271. * @see LocalPointer
  1272. * @stable ICU 4.4
  1273. */
  1274. /**
  1275. * \cond
  1276. * Note: Doxygen reports a bogus warning for this U_DEFINE_LOCAL_OPEN_POINTER
  1277. * invocation. For now, it is hidden from Doxygen inside a cond/endcond section.
  1278. */
  1279. U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close);
  1280. /** \endcond */
  1281. /**
  1282. * \class LocalUSpoofCheckResultPointer
  1283. * "Smart pointer" class, closes a USpoofCheckResult via `uspoof_closeCheckResult()`.
  1284. * For most methods see the LocalPointerBase base class.
  1285. *
  1286. * @see LocalPointerBase
  1287. * @see LocalPointer
  1288. * @stable ICU 58
  1289. */
  1290. /**
  1291. * \cond
  1292. * Note: Doxygen reports a bogus warning for this U_DEFINE_LOCAL_OPEN_POINTER
  1293. * invocation. For now, it is hidden from Doxygen inside a cond/endcond section.
  1294. */
  1295. U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult);
  1296. /** \endcond */
  1297. U_NAMESPACE_END
  1298. /**
  1299. * Limit the acceptable characters to those specified by a Unicode Set.
  1300. * Any previously specified character limit is
  1301. * replaced by the new settings. This includes limits on
  1302. * characters that were set with the uspoof_setAllowedLocales() function.
  1303. *
  1304. * The USPOOF_CHAR_LIMIT test is automatically enabled for this
  1305. * USpoofChecker by this function.
  1306. *
  1307. * @param sc The USpoofChecker
  1308. * @param chars A Unicode Set containing the list of
  1309. * characters that are permitted. Ownership of the set
  1310. * remains with the caller. The incoming set is cloned by
  1311. * this function, so there are no restrictions on modifying
  1312. * or deleting the UnicodeSet after calling this function.
  1313. * @param status The error code, set if this function encounters a problem.
  1314. * @stable ICU 4.2
  1315. */
  1316. U_CAPI void U_EXPORT2
  1317. uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status);
  1318. /**
  1319. * Get a UnicodeSet for the characters permitted in an identifier.
  1320. * This corresponds to the limits imposed by the Set Allowed Characters /
  1321. * UnicodeSet functions. Limitations imposed by other checks will not be
  1322. * reflected in the set returned by this function.
  1323. *
  1324. * The returned set will be frozen, meaning that it cannot be modified
  1325. * by the caller.
  1326. *
  1327. * Ownership of the returned set remains with the Spoof Detector. The
  1328. * returned set will become invalid if the spoof detector is closed,
  1329. * or if a new set of allowed characters is specified.
  1330. *
  1331. *
  1332. * @param sc The USpoofChecker whose set of allowed characters is requested.
  1333. * @param status The error code, set if this function encounters a problem.
  1334. * @return A UnicodeSet containing the characters that are permitted by
  1335. * the USPOOF_CHAR_LIMIT test.
  1336. * @stable ICU 4.2
  1337. */
  1338. U_CAPI const icu::UnicodeSet * U_EXPORT2
  1339. uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
  1340. /**
  1341. * Check the specified string for possible security issues.
  1342. * The text to be checked will typically be an identifier of some sort.
  1343. * The set of checks to be performed is specified with uspoof_setChecks().
  1344. *
  1345. * \note
  1346. * Consider using the newer API, {@link uspoof_check2UnicodeString}, instead.
  1347. * The newer API exposes additional information from the check procedure
  1348. * and is otherwise identical to this method.
  1349. *
  1350. * @param sc The USpoofChecker
  1351. * @param id An identifier to be checked for possible security issues.
  1352. * @param position Deprecated in ICU 51. Always returns zero.
  1353. * Originally, an out parameter for the index of the first
  1354. * string position that failed a check.
  1355. * This parameter may be nullptr.
  1356. * @param status The error code, set if an error occurred while attempting to
  1357. * perform the check.
  1358. * Spoofing or security issues detected with the input string are
  1359. * not reported here, but through the function's return value.
  1360. * @return An integer value with bits set for any potential security
  1361. * or spoofing issues detected. The bits are defined by
  1362. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  1363. * will be zero if the input string passes all of the
  1364. * enabled checks.
  1365. * @see uspoof_check2UnicodeString
  1366. * @stable ICU 4.2
  1367. */
  1368. U_CAPI int32_t U_EXPORT2
  1369. uspoof_checkUnicodeString(const USpoofChecker *sc,
  1370. const icu::UnicodeString &id,
  1371. int32_t *position,
  1372. UErrorCode *status);
  1373. /**
  1374. * Check the specified string for possible security issues.
  1375. * The text to be checked will typically be an identifier of some sort.
  1376. * The set of checks to be performed is specified with uspoof_setChecks().
  1377. *
  1378. * @param sc The USpoofChecker
  1379. * @param id An identifier to be checked for possible security issues.
  1380. * @param checkResult An instance of USpoofCheckResult to be filled with
  1381. * details about the identifier. Can be nullptr.
  1382. * @param status The error code, set if an error occurred while attempting to
  1383. * perform the check.
  1384. * Spoofing or security issues detected with the input string are
  1385. * not reported here, but through the function's return value.
  1386. * @return An integer value with bits set for any potential security
  1387. * or spoofing issues detected. The bits are defined by
  1388. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  1389. * will be zero if the input string passes all of the
  1390. * enabled checks. Any information in this bitmask will be
  1391. * consistent with the information saved in the optional
  1392. * checkResult parameter.
  1393. * @see uspoof_openCheckResult
  1394. * @see uspoof_check2
  1395. * @see uspoof_check2UTF8
  1396. * @stable ICU 58
  1397. */
  1398. U_CAPI int32_t U_EXPORT2
  1399. uspoof_check2UnicodeString(const USpoofChecker *sc,
  1400. const icu::UnicodeString &id,
  1401. USpoofCheckResult* checkResult,
  1402. UErrorCode *status);
  1403. /**
  1404. * A version of {@link uspoof_areConfusable} accepting UnicodeStrings.
  1405. *
  1406. * @param sc The USpoofChecker
  1407. * @param s1 The first of the two identifiers to be compared for
  1408. * confusability, passed as a UnicodeString.
  1409. * @param s2 The second of the two identifiers to be compared for
  1410. * confusability, passed as a UnicodeString.
  1411. * @param status The error code, set if an error occurred while attempting to
  1412. * perform the check.
  1413. * Confusability of the identifiers is not reported here,
  1414. * but through this function's return value.
  1415. * @return An integer value with bit(s) set corresponding to
  1416. * the type of confusability found, as defined by
  1417. * enum USpoofChecks. Zero is returned if the identifiers
  1418. * are not confusable.
  1419. *
  1420. * @stable ICU 4.2
  1421. *
  1422. * @see uspoof_areConfusable
  1423. */
  1424. U_CAPI int32_t U_EXPORT2
  1425. uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
  1426. const icu::UnicodeString &s1,
  1427. const icu::UnicodeString &s2,
  1428. UErrorCode *status);
  1429. /**
  1430. * Get the "skeleton" for an identifier.
  1431. * Skeletons are a transformation of the input identifier;
  1432. * two identifiers are confusable if their skeletons are identical.
  1433. * See Unicode UAX #39 for additional information.
  1434. *
  1435. * Using skeletons directly makes it possible to quickly check
  1436. * whether an identifier is confusable with any of some large
  1437. * set of existing identifiers, by creating an efficiently
  1438. * searchable collection of the skeletons.
  1439. *
  1440. * @param sc The USpoofChecker.
  1441. * @param type Deprecated in ICU 58. You may pass any number.
  1442. * Originally, controlled which of the Unicode confusable data
  1443. * tables to use.
  1444. * @param id The input identifier whose skeleton will be computed.
  1445. * @param dest The output identifier, to receive the skeleton string.
  1446. * @param status The error code, set if an error occurred while attempting to
  1447. * perform the check.
  1448. * @return A reference to the destination (skeleton) string.
  1449. *
  1450. * @stable ICU 4.2
  1451. */
  1452. U_I18N_API icu::UnicodeString & U_EXPORT2
  1453. uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
  1454. uint32_t type,
  1455. const icu::UnicodeString &id,
  1456. icu::UnicodeString &dest,
  1457. UErrorCode *status);
  1458. /**
  1459. * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
  1460. * in http://unicode.org/Public/security/latest/xidmodifications.txt
  1461. * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
  1462. *
  1463. * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
  1464. * be deleted by the caller.
  1465. *
  1466. * @param status The error code, set if a problem occurs while creating the set.
  1467. * @return The frozen UnicodeSet of candidate characters; it remains owned by the ICU library.
  1468. * @stable ICU 51
  1469. */
  1470. U_CAPI const icu::UnicodeSet * U_EXPORT2
  1471. uspoof_getInclusionUnicodeSet(UErrorCode *status);
  1472. /**
  1473. * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
  1474. * in http://unicode.org/Public/security/latest/xidmodifications.txt
  1475. * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
  1476. *
  1477. * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
  1478. * be deleted by the caller.
  1479. *
  1480. * @param status The error code, set if a problem occurs while creating the set.
  1481. * @return The frozen UnicodeSet of recommended characters; it remains owned by the ICU library.
  1482. * @stable ICU 51
  1483. */
  1484. U_CAPI const icu::UnicodeSet * U_EXPORT2
  1485. uspoof_getRecommendedUnicodeSet(UErrorCode *status);
  1486. #endif /* U_SHOW_CPLUSPLUS_API */
  1487. #endif /* UCONFIG_NO_NORMALIZATION */
  1488. #endif /* USPOOF_H */