simpleformatter.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. * Copyright (C) 2014-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. ******************************************************************************
  8. * simpleformatter.cpp
  9. */
  10. #include "unicode/utypes.h"
  11. #include "unicode/simpleformatter.h"
  12. #include "unicode/unistr.h"
  13. #include "uassert.h"
  14. U_NAMESPACE_BEGIN
  15. namespace {
  16. /**
  17. * Argument numbers must be smaller than this limit.
  18. * Text segment lengths are offset by this much.
  19. * This is currently the only unused char value in compiled patterns,
  20. * except it is the maximum value of the first unit (max arg +1).
  21. */
  22. const int32_t ARG_NUM_LIMIT = 0x100;
  23. /**
  24. * Initial and maximum char/char16_t value set for a text segment.
  25. * Segment length char values are from ARG_NUM_LIMIT+1 to this value here.
  26. * Normally 0xffff, but can be as small as ARG_NUM_LIMIT+1 for testing.
  27. */
  28. const char16_t SEGMENT_LENGTH_PLACEHOLDER_CHAR = 0xffff;
  29. /**
  30. * Maximum length of a text segment. Longer segments are split into shorter ones.
  31. */
  32. const int32_t MAX_SEGMENT_LENGTH = SEGMENT_LENGTH_PLACEHOLDER_CHAR - ARG_NUM_LIMIT;
  33. enum {
  34. APOS = 0x27,
  35. DIGIT_ZERO = 0x30,
  36. DIGIT_ONE = 0x31,
  37. DIGIT_NINE = 0x39,
  38. OPEN_BRACE = 0x7b,
  39. CLOSE_BRACE = 0x7d
  40. };
  41. inline UBool isInvalidArray(const void *array, int32_t length) {
  42. return (length < 0 || (array == nullptr && length != 0));
  43. }
  44. } // namespace
  45. SimpleFormatter &SimpleFormatter::operator=(const SimpleFormatter& other) {
  46. if (this == &other) {
  47. return *this;
  48. }
  49. compiledPattern = other.compiledPattern;
  50. return *this;
  51. }
  52. SimpleFormatter::~SimpleFormatter() {}
  53. UBool SimpleFormatter::applyPatternMinMaxArguments(
  54. const UnicodeString &pattern,
  55. int32_t min, int32_t max,
  56. UErrorCode &errorCode) {
  57. if (U_FAILURE(errorCode)) {
  58. return false;
  59. }
  60. // Parse consistent with MessagePattern, but
  61. // - support only simple numbered arguments
  62. // - build a simple binary structure into the result string
  63. const char16_t *patternBuffer = pattern.getBuffer();
  64. int32_t patternLength = pattern.length();
  65. // Reserve the first char for the number of arguments.
  66. compiledPattern.setTo((char16_t)0);
  67. int32_t textLength = 0;
  68. int32_t maxArg = -1;
  69. UBool inQuote = false;
  70. for (int32_t i = 0; i < patternLength;) {
  71. char16_t c = patternBuffer[i++];
  72. if (c == APOS) {
  73. if (i < patternLength && (c = patternBuffer[i]) == APOS) {
  74. // double apostrophe, skip the second one
  75. ++i;
  76. } else if (inQuote) {
  77. // skip the quote-ending apostrophe
  78. inQuote = false;
  79. continue;
  80. } else if (c == OPEN_BRACE || c == CLOSE_BRACE) {
  81. // Skip the quote-starting apostrophe, find the end of the quoted literal text.
  82. ++i;
  83. inQuote = true;
  84. } else {
  85. // The apostrophe is part of literal text.
  86. c = APOS;
  87. }
  88. } else if (!inQuote && c == OPEN_BRACE) {
  89. if (textLength > 0) {
  90. compiledPattern.setCharAt(compiledPattern.length() - textLength - 1,
  91. (char16_t)(ARG_NUM_LIMIT + textLength));
  92. textLength = 0;
  93. }
  94. int32_t argNumber;
  95. if ((i + 1) < patternLength &&
  96. 0 <= (argNumber = patternBuffer[i] - DIGIT_ZERO) && argNumber <= 9 &&
  97. patternBuffer[i + 1] == CLOSE_BRACE) {
  98. i += 2;
  99. } else {
  100. // Multi-digit argument number (no leading zero) or syntax error.
  101. // MessagePattern permits PatternProps.skipWhiteSpace(pattern, index)
  102. // around the number, but this class does not.
  103. argNumber = -1;
  104. if (i < patternLength && DIGIT_ONE <= (c = patternBuffer[i++]) && c <= DIGIT_NINE) {
  105. argNumber = c - DIGIT_ZERO;
  106. while (i < patternLength &&
  107. DIGIT_ZERO <= (c = patternBuffer[i++]) && c <= DIGIT_NINE) {
  108. argNumber = argNumber * 10 + (c - DIGIT_ZERO);
  109. if (argNumber >= ARG_NUM_LIMIT) {
  110. break;
  111. }
  112. }
  113. }
  114. if (argNumber < 0 || c != CLOSE_BRACE) {
  115. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  116. return false;
  117. }
  118. }
  119. if (argNumber > maxArg) {
  120. maxArg = argNumber;
  121. }
  122. compiledPattern.append((char16_t)argNumber);
  123. continue;
  124. } // else: c is part of literal text
  125. // Append c and track the literal-text segment length.
  126. if (textLength == 0) {
  127. // Reserve a char for the length of a new text segment, preset the maximum length.
  128. compiledPattern.append(SEGMENT_LENGTH_PLACEHOLDER_CHAR);
  129. }
  130. compiledPattern.append(c);
  131. if (++textLength == MAX_SEGMENT_LENGTH) {
  132. textLength = 0;
  133. }
  134. }
  135. if (textLength > 0) {
  136. compiledPattern.setCharAt(compiledPattern.length() - textLength - 1,
  137. (char16_t)(ARG_NUM_LIMIT + textLength));
  138. }
  139. int32_t argCount = maxArg + 1;
  140. if (argCount < min || max < argCount) {
  141. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  142. return false;
  143. }
  144. compiledPattern.setCharAt(0, (char16_t)argCount);
  145. return true;
  146. }
  147. UnicodeString& SimpleFormatter::format(
  148. const UnicodeString &value0,
  149. UnicodeString &appendTo, UErrorCode &errorCode) const {
  150. const UnicodeString *values[] = { &value0 };
  151. return formatAndAppend(values, 1, appendTo, nullptr, 0, errorCode);
  152. }
  153. UnicodeString& SimpleFormatter::format(
  154. const UnicodeString &value0,
  155. const UnicodeString &value1,
  156. UnicodeString &appendTo, UErrorCode &errorCode) const {
  157. const UnicodeString *values[] = { &value0, &value1 };
  158. return formatAndAppend(values, 2, appendTo, nullptr, 0, errorCode);
  159. }
  160. UnicodeString& SimpleFormatter::format(
  161. const UnicodeString &value0,
  162. const UnicodeString &value1,
  163. const UnicodeString &value2,
  164. UnicodeString &appendTo, UErrorCode &errorCode) const {
  165. const UnicodeString *values[] = { &value0, &value1, &value2 };
  166. return formatAndAppend(values, 3, appendTo, nullptr, 0, errorCode);
  167. }
  168. UnicodeString& SimpleFormatter::formatAndAppend(
  169. const UnicodeString *const *values, int32_t valuesLength,
  170. UnicodeString &appendTo,
  171. int32_t *offsets, int32_t offsetsLength, UErrorCode &errorCode) const {
  172. if (U_FAILURE(errorCode)) {
  173. return appendTo;
  174. }
  175. if (isInvalidArray(values, valuesLength) || isInvalidArray(offsets, offsetsLength) ||
  176. valuesLength < getArgumentLimit()) {
  177. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  178. return appendTo;
  179. }
  180. return format(compiledPattern.getBuffer(), compiledPattern.length(), values,
  181. appendTo, nullptr, true,
  182. offsets, offsetsLength, errorCode);
  183. }
  184. UnicodeString &SimpleFormatter::formatAndReplace(
  185. const UnicodeString *const *values, int32_t valuesLength,
  186. UnicodeString &result,
  187. int32_t *offsets, int32_t offsetsLength, UErrorCode &errorCode) const {
  188. if (U_FAILURE(errorCode)) {
  189. return result;
  190. }
  191. if (isInvalidArray(values, valuesLength) || isInvalidArray(offsets, offsetsLength)) {
  192. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  193. return result;
  194. }
  195. const char16_t *cp = compiledPattern.getBuffer();
  196. int32_t cpLength = compiledPattern.length();
  197. if (valuesLength < getArgumentLimit(cp, cpLength)) {
  198. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  199. return result;
  200. }
  201. // If the pattern starts with an argument whose value is the same object
  202. // as the result, then we keep the result contents and append to it.
  203. // Otherwise we replace its contents.
  204. int32_t firstArg = -1;
  205. // If any non-initial argument value is the same object as the result,
  206. // then we first copy its contents and use that instead while formatting.
  207. UnicodeString resultCopy;
  208. if (getArgumentLimit(cp, cpLength) > 0) {
  209. for (int32_t i = 1; i < cpLength;) {
  210. int32_t n = cp[i++];
  211. if (n < ARG_NUM_LIMIT) {
  212. if (values[n] == &result) {
  213. if (i == 2) {
  214. firstArg = n;
  215. } else if (resultCopy.isEmpty() && !result.isEmpty()) {
  216. resultCopy = result;
  217. }
  218. }
  219. } else {
  220. i += n - ARG_NUM_LIMIT;
  221. }
  222. }
  223. }
  224. if (firstArg < 0) {
  225. result.remove();
  226. }
  227. return format(cp, cpLength, values,
  228. result, &resultCopy, false,
  229. offsets, offsetsLength, errorCode);
  230. }
  231. UnicodeString SimpleFormatter::getTextWithNoArguments(
  232. const char16_t *compiledPattern,
  233. int32_t compiledPatternLength,
  234. int32_t* offsets,
  235. int32_t offsetsLength) {
  236. for (int32_t i = 0; i < offsetsLength; i++) {
  237. offsets[i] = -1;
  238. }
  239. int32_t capacity = compiledPatternLength - 1 -
  240. getArgumentLimit(compiledPattern, compiledPatternLength);
  241. UnicodeString sb(capacity, 0, 0); // Java: StringBuilder
  242. for (int32_t i = 1; i < compiledPatternLength;) {
  243. int32_t n = compiledPattern[i++];
  244. if (n > ARG_NUM_LIMIT) {
  245. n -= ARG_NUM_LIMIT;
  246. sb.append(compiledPattern + i, n);
  247. i += n;
  248. } else if (n < offsetsLength) {
  249. // TODO(ICU-20406): This does not distinguish between "{0}{1}" and "{1}{0}".
  250. // Consider removing this function and replacing it with an iterator interface.
  251. offsets[n] = sb.length();
  252. }
  253. }
  254. return sb;
  255. }
  256. UnicodeString &SimpleFormatter::format(
  257. const char16_t *compiledPattern, int32_t compiledPatternLength,
  258. const UnicodeString *const *values,
  259. UnicodeString &result, const UnicodeString *resultCopy, UBool forbidResultAsValue,
  260. int32_t *offsets, int32_t offsetsLength,
  261. UErrorCode &errorCode) {
  262. if (U_FAILURE(errorCode)) {
  263. return result;
  264. }
  265. for (int32_t i = 0; i < offsetsLength; i++) {
  266. offsets[i] = -1;
  267. }
  268. for (int32_t i = 1; i < compiledPatternLength;) {
  269. int32_t n = compiledPattern[i++];
  270. if (n < ARG_NUM_LIMIT) {
  271. const UnicodeString *value = values[n];
  272. if (value == nullptr) {
  273. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  274. return result;
  275. }
  276. if (value == &result) {
  277. if (forbidResultAsValue) {
  278. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  279. return result;
  280. }
  281. if (i == 2) {
  282. // We are appending to result which is also the first value object.
  283. if (n < offsetsLength) {
  284. offsets[n] = 0;
  285. }
  286. } else {
  287. if (n < offsetsLength) {
  288. offsets[n] = result.length();
  289. }
  290. result.append(*resultCopy);
  291. }
  292. } else {
  293. if (n < offsetsLength) {
  294. offsets[n] = result.length();
  295. }
  296. result.append(*value);
  297. }
  298. } else {
  299. int32_t length = n - ARG_NUM_LIMIT;
  300. result.append(compiledPattern + i, length);
  301. i += length;
  302. }
  303. }
  304. return result;
  305. }
  306. U_NAMESPACE_END