loclikely.cpp 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 1997-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: loclikely.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2010feb25
  16. * created by: Markus W. Scherer
  17. *
  18. * Code for likely and minimized locale subtags, separated out from other .cpp files
  19. * that then do not depend on resource bundle code and likely-subtags data.
  20. */
  21. #include "unicode/bytestream.h"
  22. #include "unicode/utypes.h"
  23. #include "unicode/locid.h"
  24. #include "unicode/putil.h"
  25. #include "unicode/uchar.h"
  26. #include "unicode/uloc.h"
  27. #include "unicode/ures.h"
  28. #include "unicode/uscript.h"
  29. #include "bytesinkutil.h"
  30. #include "charstr.h"
  31. #include "cmemory.h"
  32. #include "cstring.h"
  33. #include "ulocimp.h"
  34. #include "ustr_imp.h"
  35. /**
  36. * These are the canonical strings for unknown languages, scripts and regions.
  37. **/
  38. static const char* const unknownLanguage = "und";
  39. static const char* const unknownScript = "Zzzz";
  40. static const char* const unknownRegion = "ZZ";
  41. /**
  42. * This function looks for the localeID in the likelySubtags resource.
  43. *
  44. * @param localeID The tag to find.
  45. * @param buffer A buffer to hold the matching entry
  46. * @param bufferLength The length of the output buffer
  47. * @return A pointer to "buffer" if found, or a null pointer if not.
  48. */
  49. static const char* U_CALLCONV
  50. findLikelySubtags(const char* localeID,
  51. char* buffer,
  52. int32_t bufferLength,
  53. UErrorCode* err) {
  54. const char* result = nullptr;
  55. if (!U_FAILURE(*err)) {
  56. int32_t resLen = 0;
  57. const char16_t* s = nullptr;
  58. UErrorCode tmpErr = U_ZERO_ERROR;
  59. icu::LocalUResourceBundlePointer subtags(ures_openDirect(nullptr, "likelySubtags", &tmpErr));
  60. if (U_SUCCESS(tmpErr)) {
  61. icu::CharString und;
  62. if (localeID != nullptr) {
  63. if (*localeID == '\0') {
  64. localeID = unknownLanguage;
  65. } else if (*localeID == '_') {
  66. und.append(unknownLanguage, *err);
  67. und.append(localeID, *err);
  68. if (U_FAILURE(*err)) {
  69. return nullptr;
  70. }
  71. localeID = und.data();
  72. }
  73. }
  74. s = ures_getStringByKey(subtags.getAlias(), localeID, &resLen, &tmpErr);
  75. if (U_FAILURE(tmpErr)) {
  76. /*
  77. * If a resource is missing, it's not really an error, it's
  78. * just that we don't have any data for that particular locale ID.
  79. */
  80. if (tmpErr != U_MISSING_RESOURCE_ERROR) {
  81. *err = tmpErr;
  82. }
  83. }
  84. else if (resLen >= bufferLength) {
  85. /* The buffer should never overflow. */
  86. *err = U_INTERNAL_PROGRAM_ERROR;
  87. }
  88. else {
  89. u_UCharsToChars(s, buffer, resLen + 1);
  90. if (resLen >= 3 &&
  91. uprv_strnicmp(buffer, unknownLanguage, 3) == 0 &&
  92. (resLen == 3 || buffer[3] == '_')) {
  93. uprv_memmove(buffer, buffer + 3, resLen - 3 + 1);
  94. }
  95. result = buffer;
  96. }
  97. } else {
  98. *err = tmpErr;
  99. }
  100. }
  101. return result;
  102. }
  103. /**
  104. * Append a tag to a buffer, adding the separator if necessary. The buffer
  105. * must be large enough to contain the resulting tag plus any separator
  106. * necessary. The tag must not be a zero-length string.
  107. *
  108. * @param tag The tag to add.
  109. * @param tagLength The length of the tag.
  110. * @param buffer The output buffer.
  111. * @param bufferLength The length of the output buffer. This is an input/output parameter.
  112. **/
  113. static void U_CALLCONV
  114. appendTag(
  115. const char* tag,
  116. int32_t tagLength,
  117. char* buffer,
  118. int32_t* bufferLength,
  119. UBool withSeparator) {
  120. if (withSeparator) {
  121. buffer[*bufferLength] = '_';
  122. ++(*bufferLength);
  123. }
  124. uprv_memmove(
  125. &buffer[*bufferLength],
  126. tag,
  127. tagLength);
  128. *bufferLength += tagLength;
  129. }
  130. /**
  131. * Create a tag string from the supplied parameters. The lang, script and region
  132. * parameters may be nullptr pointers. If they are, their corresponding length parameters
  133. * must be less than or equal to 0.
  134. *
  135. * If any of the language, script or region parameters are empty, and the alternateTags
  136. * parameter is not nullptr, it will be parsed for potential language, script and region tags
  137. * to be used when constructing the new tag. If the alternateTags parameter is nullptr, or
  138. * it contains no language tag, the default tag for the unknown language is used.
  139. *
  140. * If the length of the new string exceeds the capacity of the output buffer,
  141. * the function copies as many bytes to the output buffer as it can, and returns
  142. * the error U_BUFFER_OVERFLOW_ERROR.
  143. *
  144. * If an illegal argument is provided, the function returns the error
  145. * U_ILLEGAL_ARGUMENT_ERROR.
  146. *
  147. * Note that this function can return the warning U_STRING_NOT_TERMINATED_WARNING if
  148. * the tag string fits in the output buffer, but the null terminator doesn't.
  149. *
  150. * @param lang The language tag to use.
  151. * @param langLength The length of the language tag.
  152. * @param script The script tag to use.
  153. * @param scriptLength The length of the script tag.
  154. * @param region The region tag to use.
  155. * @param regionLength The length of the region tag.
  156. * @param trailing Any trailing data to append to the new tag.
  157. * @param trailingLength The length of the trailing data.
  158. * @param alternateTags A string containing any alternate tags.
  159. * @param sink The output sink receiving the tag string.
  160. * @param err A pointer to a UErrorCode for error reporting.
  161. **/
  162. static void U_CALLCONV
  163. createTagStringWithAlternates(
  164. const char* lang,
  165. int32_t langLength,
  166. const char* script,
  167. int32_t scriptLength,
  168. const char* region,
  169. int32_t regionLength,
  170. const char* trailing,
  171. int32_t trailingLength,
  172. const char* alternateTags,
  173. icu::ByteSink& sink,
  174. UErrorCode* err) {
  175. if (U_FAILURE(*err)) {
  176. goto error;
  177. }
  178. else if (langLength >= ULOC_LANG_CAPACITY ||
  179. scriptLength >= ULOC_SCRIPT_CAPACITY ||
  180. regionLength >= ULOC_COUNTRY_CAPACITY) {
  181. goto error;
  182. }
  183. else {
  184. /**
  185. * ULOC_FULLNAME_CAPACITY will provide enough capacity
  186. * that we can build a string that contains the language,
  187. * script and region code without worrying about overrunning
  188. * the user-supplied buffer.
  189. **/
  190. char tagBuffer[ULOC_FULLNAME_CAPACITY];
  191. int32_t tagLength = 0;
  192. UBool regionAppended = false;
  193. if (langLength > 0) {
  194. appendTag(
  195. lang,
  196. langLength,
  197. tagBuffer,
  198. &tagLength,
  199. /*withSeparator=*/false);
  200. }
  201. else if (alternateTags == nullptr) {
  202. /*
  203. * Use the empty string for an unknown language, if
  204. * we found no language.
  205. */
  206. }
  207. else {
  208. /*
  209. * Parse the alternateTags string for the language.
  210. */
  211. char alternateLang[ULOC_LANG_CAPACITY];
  212. int32_t alternateLangLength = sizeof(alternateLang);
  213. alternateLangLength =
  214. uloc_getLanguage(
  215. alternateTags,
  216. alternateLang,
  217. alternateLangLength,
  218. err);
  219. if(U_FAILURE(*err) ||
  220. alternateLangLength >= ULOC_LANG_CAPACITY) {
  221. goto error;
  222. }
  223. else if (alternateLangLength == 0) {
  224. /*
  225. * Use the empty string for an unknown language, if
  226. * we found no language.
  227. */
  228. }
  229. else {
  230. appendTag(
  231. alternateLang,
  232. alternateLangLength,
  233. tagBuffer,
  234. &tagLength,
  235. /*withSeparator=*/false);
  236. }
  237. }
  238. if (scriptLength > 0) {
  239. appendTag(
  240. script,
  241. scriptLength,
  242. tagBuffer,
  243. &tagLength,
  244. /*withSeparator=*/true);
  245. }
  246. else if (alternateTags != nullptr) {
  247. /*
  248. * Parse the alternateTags string for the script.
  249. */
  250. char alternateScript[ULOC_SCRIPT_CAPACITY];
  251. const int32_t alternateScriptLength =
  252. uloc_getScript(
  253. alternateTags,
  254. alternateScript,
  255. sizeof(alternateScript),
  256. err);
  257. if (U_FAILURE(*err) ||
  258. alternateScriptLength >= ULOC_SCRIPT_CAPACITY) {
  259. goto error;
  260. }
  261. else if (alternateScriptLength > 0) {
  262. appendTag(
  263. alternateScript,
  264. alternateScriptLength,
  265. tagBuffer,
  266. &tagLength,
  267. /*withSeparator=*/true);
  268. }
  269. }
  270. if (regionLength > 0) {
  271. appendTag(
  272. region,
  273. regionLength,
  274. tagBuffer,
  275. &tagLength,
  276. /*withSeparator=*/true);
  277. regionAppended = true;
  278. }
  279. else if (alternateTags != nullptr) {
  280. /*
  281. * Parse the alternateTags string for the region.
  282. */
  283. char alternateRegion[ULOC_COUNTRY_CAPACITY];
  284. const int32_t alternateRegionLength =
  285. uloc_getCountry(
  286. alternateTags,
  287. alternateRegion,
  288. sizeof(alternateRegion),
  289. err);
  290. if (U_FAILURE(*err) ||
  291. alternateRegionLength >= ULOC_COUNTRY_CAPACITY) {
  292. goto error;
  293. }
  294. else if (alternateRegionLength > 0) {
  295. appendTag(
  296. alternateRegion,
  297. alternateRegionLength,
  298. tagBuffer,
  299. &tagLength,
  300. /*withSeparator=*/true);
  301. regionAppended = true;
  302. }
  303. }
  304. /**
  305. * Copy the partial tag from our internal buffer to the supplied
  306. * target.
  307. **/
  308. sink.Append(tagBuffer, tagLength);
  309. if (trailingLength > 0) {
  310. if (*trailing != '@') {
  311. sink.Append("_", 1);
  312. if (!regionAppended) {
  313. /* extra separator is required */
  314. sink.Append("_", 1);
  315. }
  316. }
  317. /*
  318. * Copy the trailing data into the supplied buffer.
  319. */
  320. sink.Append(trailing, trailingLength);
  321. }
  322. return;
  323. }
  324. error:
  325. /**
  326. * An overflow indicates the locale ID passed in
  327. * is ill-formed. If we got here, and there was
  328. * no previous error, it's an implicit overflow.
  329. **/
  330. if (*err == U_BUFFER_OVERFLOW_ERROR ||
  331. U_SUCCESS(*err)) {
  332. *err = U_ILLEGAL_ARGUMENT_ERROR;
  333. }
  334. }
  335. /**
  336. * Create a tag string from the supplied parameters. The lang, script and region
  337. * parameters may be nullptr pointers. If they are, their corresponding length parameters
  338. * must be less than or equal to 0. If the lang parameter is an empty string, the
  339. * default value for an unknown language is written to the output buffer.
  340. *
  341. * If the length of the new string exceeds the capacity of the output buffer,
  342. * the function copies as many bytes to the output buffer as it can, and returns
  343. * the error U_BUFFER_OVERFLOW_ERROR.
  344. *
  345. * If an illegal argument is provided, the function returns the error
  346. * U_ILLEGAL_ARGUMENT_ERROR.
  347. *
  348. * @param lang The language tag to use.
  349. * @param langLength The length of the language tag.
  350. * @param script The script tag to use.
  351. * @param scriptLength The length of the script tag.
  352. * @param region The region tag to use.
  353. * @param regionLength The length of the region tag.
  354. * @param trailing Any trailing data to append to the new tag.
  355. * @param trailingLength The length of the trailing data.
  356. * @param sink The output sink receiving the tag string.
  357. * @param err A pointer to a UErrorCode for error reporting.
  358. **/
  359. static void U_CALLCONV
  360. createTagString(
  361. const char* lang,
  362. int32_t langLength,
  363. const char* script,
  364. int32_t scriptLength,
  365. const char* region,
  366. int32_t regionLength,
  367. const char* trailing,
  368. int32_t trailingLength,
  369. icu::ByteSink& sink,
  370. UErrorCode* err)
  371. {
  372. createTagStringWithAlternates(
  373. lang,
  374. langLength,
  375. script,
  376. scriptLength,
  377. region,
  378. regionLength,
  379. trailing,
  380. trailingLength,
  381. nullptr,
  382. sink,
  383. err);
  384. }
  385. /**
  386. * Parse the language, script, and region subtags from a tag string, and copy the
  387. * results into the corresponding output parameters. The buffers are null-terminated,
  388. * unless overflow occurs.
  389. *
  390. * The langLength, scriptLength, and regionLength parameters are input/output
  391. * parameters, and must contain the capacity of their corresponding buffers on
  392. * input. On output, they will contain the actual length of the buffers, not
  393. * including the null terminator.
  394. *
  395. * If the length of any of the output subtags exceeds the capacity of the corresponding
  396. * buffer, the function copies as many bytes to the output buffer as it can, and returns
  397. * the error U_BUFFER_OVERFLOW_ERROR. It will not parse any more subtags once overflow
  398. * occurs.
  399. *
  400. * If an illegal argument is provided, the function returns the error
  401. * U_ILLEGAL_ARGUMENT_ERROR.
  402. *
  403. * @param localeID The locale ID to parse.
  404. * @param lang The language tag buffer.
  405. * @param langLength The length of the language tag.
  406. * @param script The script tag buffer.
  407. * @param scriptLength The length of the script tag.
  408. * @param region The region tag buffer.
  409. * @param regionLength The length of the region tag.
  410. * @param err A pointer to a UErrorCode for error reporting.
  411. * @return The number of chars of the localeID parameter consumed.
  412. **/
  413. static int32_t U_CALLCONV
  414. parseTagString(
  415. const char* localeID,
  416. char* lang,
  417. int32_t* langLength,
  418. char* script,
  419. int32_t* scriptLength,
  420. char* region,
  421. int32_t* regionLength,
  422. UErrorCode* err)
  423. {
  424. const char* position = localeID;
  425. int32_t subtagLength = 0;
  426. if(U_FAILURE(*err) ||
  427. localeID == nullptr ||
  428. lang == nullptr ||
  429. langLength == nullptr ||
  430. script == nullptr ||
  431. scriptLength == nullptr ||
  432. region == nullptr ||
  433. regionLength == nullptr) {
  434. goto error;
  435. }
  436. subtagLength = ulocimp_getLanguage(position, &position, *err).extract(lang, *langLength, *err);
  437. /*
  438. * Note that we explicit consider U_STRING_NOT_TERMINATED_WARNING
  439. * to be an error, because it indicates the user-supplied tag is
  440. * not well-formed.
  441. */
  442. if(U_FAILURE(*err)) {
  443. goto error;
  444. }
  445. *langLength = subtagLength;
  446. /*
  447. * If no language was present, use the empty string instead.
  448. * Otherwise, move past any separator.
  449. */
  450. if (_isIDSeparator(*position)) {
  451. ++position;
  452. }
  453. subtagLength = ulocimp_getScript(position, &position, *err).extract(script, *scriptLength, *err);
  454. if(U_FAILURE(*err)) {
  455. goto error;
  456. }
  457. *scriptLength = subtagLength;
  458. if (*scriptLength > 0) {
  459. if (uprv_strnicmp(script, unknownScript, *scriptLength) == 0) {
  460. /**
  461. * If the script part is the "unknown" script, then don't return it.
  462. **/
  463. *scriptLength = 0;
  464. }
  465. /*
  466. * Move past any separator.
  467. */
  468. if (_isIDSeparator(*position)) {
  469. ++position;
  470. }
  471. }
  472. subtagLength = ulocimp_getCountry(position, &position, *err).extract(region, *regionLength, *err);
  473. if(U_FAILURE(*err)) {
  474. goto error;
  475. }
  476. *regionLength = subtagLength;
  477. if (*regionLength > 0) {
  478. if (uprv_strnicmp(region, unknownRegion, *regionLength) == 0) {
  479. /**
  480. * If the region part is the "unknown" region, then don't return it.
  481. **/
  482. *regionLength = 0;
  483. }
  484. } else if (*position != 0 && *position != '@') {
  485. /* back up over consumed trailing separator */
  486. --position;
  487. }
  488. exit:
  489. return (int32_t)(position - localeID);
  490. error:
  491. /**
  492. * If we get here, we have no explicit error, it's the result of an
  493. * illegal argument.
  494. **/
  495. if (!U_FAILURE(*err)) {
  496. *err = U_ILLEGAL_ARGUMENT_ERROR;
  497. }
  498. goto exit;
  499. }
  500. static UBool U_CALLCONV
  501. createLikelySubtagsString(
  502. const char* lang,
  503. int32_t langLength,
  504. const char* script,
  505. int32_t scriptLength,
  506. const char* region,
  507. int32_t regionLength,
  508. const char* variants,
  509. int32_t variantsLength,
  510. icu::ByteSink& sink,
  511. UErrorCode* err) {
  512. /**
  513. * ULOC_FULLNAME_CAPACITY will provide enough capacity
  514. * that we can build a string that contains the language,
  515. * script and region code without worrying about overrunning
  516. * the user-supplied buffer.
  517. **/
  518. char likelySubtagsBuffer[ULOC_FULLNAME_CAPACITY];
  519. if(U_FAILURE(*err)) {
  520. goto error;
  521. }
  522. /**
  523. * Try the language with the script and region first.
  524. **/
  525. if (scriptLength > 0 && regionLength > 0) {
  526. const char* likelySubtags = nullptr;
  527. icu::CharString tagBuffer;
  528. {
  529. icu::CharStringByteSink sink(&tagBuffer);
  530. createTagString(
  531. lang,
  532. langLength,
  533. script,
  534. scriptLength,
  535. region,
  536. regionLength,
  537. nullptr,
  538. 0,
  539. sink,
  540. err);
  541. }
  542. if(U_FAILURE(*err)) {
  543. goto error;
  544. }
  545. likelySubtags =
  546. findLikelySubtags(
  547. tagBuffer.data(),
  548. likelySubtagsBuffer,
  549. sizeof(likelySubtagsBuffer),
  550. err);
  551. if(U_FAILURE(*err)) {
  552. goto error;
  553. }
  554. if (likelySubtags != nullptr) {
  555. /* Always use the language tag from the
  556. maximal string, since it may be more
  557. specific than the one provided. */
  558. createTagStringWithAlternates(
  559. nullptr,
  560. 0,
  561. nullptr,
  562. 0,
  563. nullptr,
  564. 0,
  565. variants,
  566. variantsLength,
  567. likelySubtags,
  568. sink,
  569. err);
  570. return true;
  571. }
  572. }
  573. /**
  574. * Try the language with just the script.
  575. **/
  576. if (scriptLength > 0) {
  577. const char* likelySubtags = nullptr;
  578. icu::CharString tagBuffer;
  579. {
  580. icu::CharStringByteSink sink(&tagBuffer);
  581. createTagString(
  582. lang,
  583. langLength,
  584. script,
  585. scriptLength,
  586. nullptr,
  587. 0,
  588. nullptr,
  589. 0,
  590. sink,
  591. err);
  592. }
  593. if(U_FAILURE(*err)) {
  594. goto error;
  595. }
  596. likelySubtags =
  597. findLikelySubtags(
  598. tagBuffer.data(),
  599. likelySubtagsBuffer,
  600. sizeof(likelySubtagsBuffer),
  601. err);
  602. if(U_FAILURE(*err)) {
  603. goto error;
  604. }
  605. if (likelySubtags != nullptr) {
  606. /* Always use the language tag from the
  607. maximal string, since it may be more
  608. specific than the one provided. */
  609. createTagStringWithAlternates(
  610. nullptr,
  611. 0,
  612. nullptr,
  613. 0,
  614. region,
  615. regionLength,
  616. variants,
  617. variantsLength,
  618. likelySubtags,
  619. sink,
  620. err);
  621. return true;
  622. }
  623. }
  624. /**
  625. * Try the language with just the region.
  626. **/
  627. if (regionLength > 0) {
  628. const char* likelySubtags = nullptr;
  629. icu::CharString tagBuffer;
  630. {
  631. icu::CharStringByteSink sink(&tagBuffer);
  632. createTagString(
  633. lang,
  634. langLength,
  635. nullptr,
  636. 0,
  637. region,
  638. regionLength,
  639. nullptr,
  640. 0,
  641. sink,
  642. err);
  643. }
  644. if(U_FAILURE(*err)) {
  645. goto error;
  646. }
  647. likelySubtags =
  648. findLikelySubtags(
  649. tagBuffer.data(),
  650. likelySubtagsBuffer,
  651. sizeof(likelySubtagsBuffer),
  652. err);
  653. if(U_FAILURE(*err)) {
  654. goto error;
  655. }
  656. if (likelySubtags != nullptr) {
  657. /* Always use the language tag from the
  658. maximal string, since it may be more
  659. specific than the one provided. */
  660. createTagStringWithAlternates(
  661. nullptr,
  662. 0,
  663. script,
  664. scriptLength,
  665. nullptr,
  666. 0,
  667. variants,
  668. variantsLength,
  669. likelySubtags,
  670. sink,
  671. err);
  672. return true;
  673. }
  674. }
  675. /**
  676. * Finally, try just the language.
  677. **/
  678. {
  679. const char* likelySubtags = nullptr;
  680. icu::CharString tagBuffer;
  681. {
  682. icu::CharStringByteSink sink(&tagBuffer);
  683. createTagString(
  684. lang,
  685. langLength,
  686. nullptr,
  687. 0,
  688. nullptr,
  689. 0,
  690. nullptr,
  691. 0,
  692. sink,
  693. err);
  694. }
  695. if(U_FAILURE(*err)) {
  696. goto error;
  697. }
  698. likelySubtags =
  699. findLikelySubtags(
  700. tagBuffer.data(),
  701. likelySubtagsBuffer,
  702. sizeof(likelySubtagsBuffer),
  703. err);
  704. if(U_FAILURE(*err)) {
  705. goto error;
  706. }
  707. if (likelySubtags != nullptr) {
  708. /* Always use the language tag from the
  709. maximal string, since it may be more
  710. specific than the one provided. */
  711. createTagStringWithAlternates(
  712. nullptr,
  713. 0,
  714. script,
  715. scriptLength,
  716. region,
  717. regionLength,
  718. variants,
  719. variantsLength,
  720. likelySubtags,
  721. sink,
  722. err);
  723. return true;
  724. }
  725. }
  726. return false;
  727. error:
  728. if (!U_FAILURE(*err)) {
  729. *err = U_ILLEGAL_ARGUMENT_ERROR;
  730. }
  731. return false;
  732. }
  733. #define CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength) UPRV_BLOCK_MACRO_BEGIN { \
  734. int32_t count = 0; \
  735. int32_t i; \
  736. for (i = 0; i < trailingLength; i++) { \
  737. if (trailing[i] == '-' || trailing[i] == '_') { \
  738. count = 0; \
  739. if (count > 8) { \
  740. goto error; \
  741. } \
  742. } else if (trailing[i] == '@') { \
  743. break; \
  744. } else if (count > 8) { \
  745. goto error; \
  746. } else { \
  747. count++; \
  748. } \
  749. } \
  750. } UPRV_BLOCK_MACRO_END
  751. static UBool
  752. _uloc_addLikelySubtags(const char* localeID,
  753. icu::ByteSink& sink,
  754. UErrorCode* err) {
  755. char lang[ULOC_LANG_CAPACITY];
  756. int32_t langLength = sizeof(lang);
  757. char script[ULOC_SCRIPT_CAPACITY];
  758. int32_t scriptLength = sizeof(script);
  759. char region[ULOC_COUNTRY_CAPACITY];
  760. int32_t regionLength = sizeof(region);
  761. const char* trailing = "";
  762. int32_t trailingLength = 0;
  763. int32_t trailingIndex = 0;
  764. UBool success = false;
  765. if(U_FAILURE(*err)) {
  766. goto error;
  767. }
  768. if (localeID == nullptr) {
  769. goto error;
  770. }
  771. trailingIndex = parseTagString(
  772. localeID,
  773. lang,
  774. &langLength,
  775. script,
  776. &scriptLength,
  777. region,
  778. &regionLength,
  779. err);
  780. if(U_FAILURE(*err)) {
  781. /* Overflow indicates an illegal argument error */
  782. if (*err == U_BUFFER_OVERFLOW_ERROR) {
  783. *err = U_ILLEGAL_ARGUMENT_ERROR;
  784. }
  785. goto error;
  786. }
  787. /* Find the length of the trailing portion. */
  788. while (_isIDSeparator(localeID[trailingIndex])) {
  789. trailingIndex++;
  790. }
  791. trailing = &localeID[trailingIndex];
  792. trailingLength = (int32_t)uprv_strlen(trailing);
  793. CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
  794. success =
  795. createLikelySubtagsString(
  796. lang,
  797. langLength,
  798. script,
  799. scriptLength,
  800. region,
  801. regionLength,
  802. trailing,
  803. trailingLength,
  804. sink,
  805. err);
  806. if (!success) {
  807. const int32_t localIDLength = (int32_t)uprv_strlen(localeID);
  808. /*
  809. * If we get here, we need to return localeID.
  810. */
  811. sink.Append(localeID, localIDLength);
  812. }
  813. return success;
  814. error:
  815. if (!U_FAILURE(*err)) {
  816. *err = U_ILLEGAL_ARGUMENT_ERROR;
  817. }
  818. return false;
  819. }
  820. // Add likely subtags to the sink
  821. // return true if the value in the sink is produced by a match during the lookup
  822. // return false if the value in the sink is the same as input because there are
  823. // no match after the lookup.
  824. static UBool _ulocimp_addLikelySubtags(const char*, icu::ByteSink&, UErrorCode*);
  825. static void
  826. _uloc_minimizeSubtags(const char* localeID,
  827. icu::ByteSink& sink,
  828. UErrorCode* err) {
  829. icu::CharString maximizedTagBuffer;
  830. char lang[ULOC_LANG_CAPACITY];
  831. int32_t langLength = sizeof(lang);
  832. char script[ULOC_SCRIPT_CAPACITY];
  833. int32_t scriptLength = sizeof(script);
  834. char region[ULOC_COUNTRY_CAPACITY];
  835. int32_t regionLength = sizeof(region);
  836. const char* trailing = "";
  837. int32_t trailingLength = 0;
  838. int32_t trailingIndex = 0;
  839. UBool successGetMax = false;
  840. if(U_FAILURE(*err)) {
  841. goto error;
  842. }
  843. else if (localeID == nullptr) {
  844. goto error;
  845. }
  846. trailingIndex =
  847. parseTagString(
  848. localeID,
  849. lang,
  850. &langLength,
  851. script,
  852. &scriptLength,
  853. region,
  854. &regionLength,
  855. err);
  856. if(U_FAILURE(*err)) {
  857. /* Overflow indicates an illegal argument error */
  858. if (*err == U_BUFFER_OVERFLOW_ERROR) {
  859. *err = U_ILLEGAL_ARGUMENT_ERROR;
  860. }
  861. goto error;
  862. }
  863. /* Find the spot where the variants or the keywords begin, if any. */
  864. while (_isIDSeparator(localeID[trailingIndex])) {
  865. trailingIndex++;
  866. }
  867. trailing = &localeID[trailingIndex];
  868. trailingLength = (int32_t)uprv_strlen(trailing);
  869. CHECK_TRAILING_VARIANT_SIZE(trailing, trailingLength);
  870. {
  871. icu::CharString base;
  872. {
  873. icu::CharStringByteSink baseSink(&base);
  874. createTagString(
  875. lang,
  876. langLength,
  877. script,
  878. scriptLength,
  879. region,
  880. regionLength,
  881. nullptr,
  882. 0,
  883. baseSink,
  884. err);
  885. }
  886. /**
  887. * First, we need to first get the maximization
  888. * from AddLikelySubtags.
  889. **/
  890. {
  891. icu::CharStringByteSink maxSink(&maximizedTagBuffer);
  892. successGetMax = _ulocimp_addLikelySubtags(base.data(), maxSink, err);
  893. }
  894. }
  895. if(U_FAILURE(*err)) {
  896. goto error;
  897. }
  898. if (!successGetMax) {
  899. /**
  900. * If we got here, return the locale ID parameter unchanged.
  901. **/
  902. const int32_t localeIDLength = (int32_t)uprv_strlen(localeID);
  903. sink.Append(localeID, localeIDLength);
  904. return;
  905. }
  906. // In the following, the lang, script, region are referring to those in
  907. // the maximizedTagBuffer, not the one in the localeID.
  908. langLength = sizeof(lang);
  909. scriptLength = sizeof(script);
  910. regionLength = sizeof(region);
  911. parseTagString(
  912. maximizedTagBuffer.data(),
  913. lang,
  914. &langLength,
  915. script,
  916. &scriptLength,
  917. region,
  918. &regionLength,
  919. err);
  920. if(U_FAILURE(*err)) {
  921. goto error;
  922. }
  923. /**
  924. * Start first with just the language.
  925. **/
  926. {
  927. icu::CharString tagBuffer;
  928. {
  929. icu::CharStringByteSink tagSink(&tagBuffer);
  930. createLikelySubtagsString(
  931. lang,
  932. langLength,
  933. nullptr,
  934. 0,
  935. nullptr,
  936. 0,
  937. nullptr,
  938. 0,
  939. tagSink,
  940. err);
  941. }
  942. if(U_FAILURE(*err)) {
  943. goto error;
  944. }
  945. else if (!tagBuffer.isEmpty() &&
  946. uprv_strnicmp(
  947. maximizedTagBuffer.data(),
  948. tagBuffer.data(),
  949. tagBuffer.length()) == 0) {
  950. createTagString(
  951. lang,
  952. langLength,
  953. nullptr,
  954. 0,
  955. nullptr,
  956. 0,
  957. trailing,
  958. trailingLength,
  959. sink,
  960. err);
  961. return;
  962. }
  963. }
  964. /**
  965. * Next, try the language and region.
  966. **/
  967. if (regionLength > 0) {
  968. icu::CharString tagBuffer;
  969. {
  970. icu::CharStringByteSink tagSink(&tagBuffer);
  971. createLikelySubtagsString(
  972. lang,
  973. langLength,
  974. nullptr,
  975. 0,
  976. region,
  977. regionLength,
  978. nullptr,
  979. 0,
  980. tagSink,
  981. err);
  982. }
  983. if(U_FAILURE(*err)) {
  984. goto error;
  985. }
  986. else if (!tagBuffer.isEmpty() &&
  987. uprv_strnicmp(
  988. maximizedTagBuffer.data(),
  989. tagBuffer.data(),
  990. tagBuffer.length()) == 0) {
  991. createTagString(
  992. lang,
  993. langLength,
  994. nullptr,
  995. 0,
  996. region,
  997. regionLength,
  998. trailing,
  999. trailingLength,
  1000. sink,
  1001. err);
  1002. return;
  1003. }
  1004. }
  1005. /**
  1006. * Finally, try the language and script. This is our last chance,
  1007. * since trying with all three subtags would only yield the
  1008. * maximal version that we already have.
  1009. **/
  1010. if (scriptLength > 0) {
  1011. icu::CharString tagBuffer;
  1012. {
  1013. icu::CharStringByteSink tagSink(&tagBuffer);
  1014. createLikelySubtagsString(
  1015. lang,
  1016. langLength,
  1017. script,
  1018. scriptLength,
  1019. nullptr,
  1020. 0,
  1021. nullptr,
  1022. 0,
  1023. tagSink,
  1024. err);
  1025. }
  1026. if(U_FAILURE(*err)) {
  1027. goto error;
  1028. }
  1029. else if (!tagBuffer.isEmpty() &&
  1030. uprv_strnicmp(
  1031. maximizedTagBuffer.data(),
  1032. tagBuffer.data(),
  1033. tagBuffer.length()) == 0) {
  1034. createTagString(
  1035. lang,
  1036. langLength,
  1037. script,
  1038. scriptLength,
  1039. nullptr,
  1040. 0,
  1041. trailing,
  1042. trailingLength,
  1043. sink,
  1044. err);
  1045. return;
  1046. }
  1047. }
  1048. {
  1049. /**
  1050. * If we got here, return the max + trail.
  1051. **/
  1052. createTagString(
  1053. lang,
  1054. langLength,
  1055. script,
  1056. scriptLength,
  1057. region,
  1058. regionLength,
  1059. trailing,
  1060. trailingLength,
  1061. sink,
  1062. err);
  1063. return;
  1064. }
  1065. error:
  1066. if (!U_FAILURE(*err)) {
  1067. *err = U_ILLEGAL_ARGUMENT_ERROR;
  1068. }
  1069. }
  1070. static int32_t
  1071. do_canonicalize(const char* localeID,
  1072. char* buffer,
  1073. int32_t bufferCapacity,
  1074. UErrorCode* err)
  1075. {
  1076. int32_t canonicalizedSize = uloc_canonicalize(
  1077. localeID,
  1078. buffer,
  1079. bufferCapacity,
  1080. err);
  1081. if (*err == U_STRING_NOT_TERMINATED_WARNING ||
  1082. *err == U_BUFFER_OVERFLOW_ERROR) {
  1083. return canonicalizedSize;
  1084. }
  1085. else if (U_FAILURE(*err)) {
  1086. return -1;
  1087. }
  1088. else {
  1089. return canonicalizedSize;
  1090. }
  1091. }
  1092. U_CAPI int32_t U_EXPORT2
  1093. uloc_addLikelySubtags(const char* localeID,
  1094. char* maximizedLocaleID,
  1095. int32_t maximizedLocaleIDCapacity,
  1096. UErrorCode* status) {
  1097. if (U_FAILURE(*status)) {
  1098. return 0;
  1099. }
  1100. icu::CheckedArrayByteSink sink(
  1101. maximizedLocaleID, maximizedLocaleIDCapacity);
  1102. ulocimp_addLikelySubtags(localeID, sink, status);
  1103. int32_t reslen = sink.NumberOfBytesAppended();
  1104. if (U_FAILURE(*status)) {
  1105. return sink.Overflowed() ? reslen : -1;
  1106. }
  1107. if (sink.Overflowed()) {
  1108. *status = U_BUFFER_OVERFLOW_ERROR;
  1109. } else {
  1110. u_terminateChars(
  1111. maximizedLocaleID, maximizedLocaleIDCapacity, reslen, status);
  1112. }
  1113. return reslen;
  1114. }
  1115. static UBool
  1116. _ulocimp_addLikelySubtags(const char* localeID,
  1117. icu::ByteSink& sink,
  1118. UErrorCode* status) {
  1119. PreflightingLocaleIDBuffer localeBuffer;
  1120. do {
  1121. localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
  1122. localeBuffer.getCapacity(), status);
  1123. } while (localeBuffer.needToTryAgain(status));
  1124. if (U_SUCCESS(*status)) {
  1125. return _uloc_addLikelySubtags(localeBuffer.getBuffer(), sink, status);
  1126. } else {
  1127. return false;
  1128. }
  1129. }
  1130. U_CAPI void U_EXPORT2
  1131. ulocimp_addLikelySubtags(const char* localeID,
  1132. icu::ByteSink& sink,
  1133. UErrorCode* status) {
  1134. _ulocimp_addLikelySubtags(localeID, sink, status);
  1135. }
  1136. U_CAPI int32_t U_EXPORT2
  1137. uloc_minimizeSubtags(const char* localeID,
  1138. char* minimizedLocaleID,
  1139. int32_t minimizedLocaleIDCapacity,
  1140. UErrorCode* status) {
  1141. if (U_FAILURE(*status)) {
  1142. return 0;
  1143. }
  1144. icu::CheckedArrayByteSink sink(
  1145. minimizedLocaleID, minimizedLocaleIDCapacity);
  1146. ulocimp_minimizeSubtags(localeID, sink, status);
  1147. int32_t reslen = sink.NumberOfBytesAppended();
  1148. if (U_FAILURE(*status)) {
  1149. return sink.Overflowed() ? reslen : -1;
  1150. }
  1151. if (sink.Overflowed()) {
  1152. *status = U_BUFFER_OVERFLOW_ERROR;
  1153. } else {
  1154. u_terminateChars(
  1155. minimizedLocaleID, minimizedLocaleIDCapacity, reslen, status);
  1156. }
  1157. return reslen;
  1158. }
  1159. U_CAPI void U_EXPORT2
  1160. ulocimp_minimizeSubtags(const char* localeID,
  1161. icu::ByteSink& sink,
  1162. UErrorCode* status) {
  1163. PreflightingLocaleIDBuffer localeBuffer;
  1164. do {
  1165. localeBuffer.requestedCapacity = do_canonicalize(localeID, localeBuffer.getBuffer(),
  1166. localeBuffer.getCapacity(), status);
  1167. } while (localeBuffer.needToTryAgain(status));
  1168. _uloc_minimizeSubtags(localeBuffer.getBuffer(), sink, status);
  1169. }
  1170. // Pairs of (language subtag, + or -) for finding out fast if common languages
  1171. // are LTR (minus) or RTL (plus).
  1172. static const char LANG_DIR_STRING[] =
  1173. "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
  1174. // Implemented here because this calls ulocimp_addLikelySubtags().
  1175. U_CAPI UBool U_EXPORT2
  1176. uloc_isRightToLeft(const char *locale) {
  1177. UErrorCode errorCode = U_ZERO_ERROR;
  1178. char script[8];
  1179. int32_t scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &errorCode);
  1180. if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
  1181. scriptLength == 0) {
  1182. // Fastpath: We know the likely scripts and their writing direction
  1183. // for some common languages.
  1184. errorCode = U_ZERO_ERROR;
  1185. char lang[8];
  1186. int32_t langLength = uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &errorCode);
  1187. if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
  1188. return false;
  1189. }
  1190. if (langLength > 0) {
  1191. const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang);
  1192. if (langPtr != nullptr) {
  1193. switch (langPtr[langLength]) {
  1194. case '-': return false;
  1195. case '+': return true;
  1196. default: break; // partial match of a longer code
  1197. }
  1198. }
  1199. }
  1200. // Otherwise, find the likely script.
  1201. errorCode = U_ZERO_ERROR;
  1202. icu::CharString likely;
  1203. {
  1204. icu::CharStringByteSink sink(&likely);
  1205. ulocimp_addLikelySubtags(locale, sink, &errorCode);
  1206. }
  1207. if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING) {
  1208. return false;
  1209. }
  1210. scriptLength = uloc_getScript(likely.data(), script, UPRV_LENGTHOF(script), &errorCode);
  1211. if (U_FAILURE(errorCode) || errorCode == U_STRING_NOT_TERMINATED_WARNING ||
  1212. scriptLength == 0) {
  1213. return false;
  1214. }
  1215. }
  1216. UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script);
  1217. return uscript_isRightToLeft(scriptCode);
  1218. }
  1219. U_NAMESPACE_BEGIN
  1220. UBool
  1221. Locale::isRightToLeft() const {
  1222. return uloc_isRightToLeft(getBaseName());
  1223. }
  1224. U_NAMESPACE_END
  1225. // The following must at least allow for rg key value (6) plus terminator (1).
  1226. #define ULOC_RG_BUFLEN 8
  1227. U_CAPI int32_t U_EXPORT2
  1228. ulocimp_getRegionForSupplementalData(const char *localeID, UBool inferRegion,
  1229. char *region, int32_t regionCapacity, UErrorCode* status) {
  1230. if (U_FAILURE(*status)) {
  1231. return 0;
  1232. }
  1233. char rgBuf[ULOC_RG_BUFLEN];
  1234. UErrorCode rgStatus = U_ZERO_ERROR;
  1235. // First check for rg keyword value
  1236. int32_t rgLen = uloc_getKeywordValue(localeID, "rg", rgBuf, ULOC_RG_BUFLEN, &rgStatus);
  1237. if (U_FAILURE(rgStatus) || rgLen != 6) {
  1238. rgLen = 0;
  1239. } else {
  1240. // rgBuf guaranteed to be zero terminated here, with text len 6
  1241. char *rgPtr = rgBuf;
  1242. for (; *rgPtr!= 0; rgPtr++) {
  1243. *rgPtr = uprv_toupper(*rgPtr);
  1244. }
  1245. rgLen = (uprv_strcmp(rgBuf+2, "ZZZZ") == 0)? 2: 0;
  1246. }
  1247. if (rgLen == 0) {
  1248. // No valid rg keyword value, try for unicode_region_subtag
  1249. rgLen = uloc_getCountry(localeID, rgBuf, ULOC_RG_BUFLEN, status);
  1250. if (U_FAILURE(*status)) {
  1251. rgLen = 0;
  1252. } else if (rgLen == 0 && inferRegion) {
  1253. // no unicode_region_subtag but inferRegion true, try likely subtags
  1254. rgStatus = U_ZERO_ERROR;
  1255. icu::CharString locBuf;
  1256. {
  1257. icu::CharStringByteSink sink(&locBuf);
  1258. ulocimp_addLikelySubtags(localeID, sink, &rgStatus);
  1259. }
  1260. if (U_SUCCESS(rgStatus)) {
  1261. rgLen = uloc_getCountry(locBuf.data(), rgBuf, ULOC_RG_BUFLEN, status);
  1262. if (U_FAILURE(*status)) {
  1263. rgLen = 0;
  1264. }
  1265. }
  1266. }
  1267. }
  1268. rgBuf[rgLen] = 0;
  1269. uprv_strncpy(region, rgBuf, regionCapacity);
  1270. return u_terminateChars(region, regionCapacity, rgLen, status);
  1271. }