edits.h 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. // edits.h
  4. // created: 2016dec30 Markus W. Scherer
  5. #ifndef __EDITS_H__
  6. #define __EDITS_H__
  7. #include "unicode/utypes.h"
  8. #if U_SHOW_CPLUSPLUS_API
  9. #include "unicode/uobject.h"
  10. /**
  11. * \file
  12. * \brief C++ API: C++ class Edits for low-level string transformations on styled text.
  13. */
  14. U_NAMESPACE_BEGIN
  15. class UnicodeString;
  16. /**
  17. * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions
  18. * in linear progression. Does not support moving/reordering of text.
  19. *
  20. * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to
  21. * instances of this class using {@link #addReplace(int32_t, int32_t)} (for change edits) and
  22. * {@link #addUnchanged(int32_t)} (for no-change edits). Change edits are retained with full granularity,
  23. * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one
  24. * mapping between code points in the source and destination strings.
  25. *
  26. * After all edits have been added, instances of this class should be considered immutable, and an
  27. * {@link Edits::Iterator} can be used for queries.
  28. *
  29. * There are four flavors of Edits::Iterator:
  30. *
  31. * <ul>
  32. * <li>{@link #getFineIterator()} retains full granularity of change edits.
  33. * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling
  34. * next() on the iterator, skips over no-change edits (unchanged regions).
  35. * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change
  36. * edits are automatically merged during the construction phase.)
  37. * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when
  38. * calling next() on the iterator, skips over no-change edits (unchanged regions).
  39. * </ul>
  40. *
  41. * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the
  42. * following fine edits:
  43. * <ul>
  44. * <li>abc ⇨ abc (no-change)
  45. * <li>ß ⇨ ss (change)
  46. * <li>D ⇨ d (change)
  47. * <li>e ⇨ e (no-change)
  48. * <li>F ⇨ f (change)
  49. * </ul>
  50. * and the following coarse edits (note how adjacent change edits get merged together):
  51. * <ul>
  52. * <li>abc ⇨ abc (no-change)
  53. * <li>ßD ⇨ ssd (change)
  54. * <li>e ⇨ e (no-change)
  55. * <li>F ⇨ f (change)
  56. * </ul>
  57. *
  58. * The "fine changes" and "coarse changes" iterators will step through only the change edits when their
  59. * `Edits::Iterator::next()` methods are called. They are identical to the non-change iterators when
  60. * their `Edits::Iterator::findSourceIndex()` or `Edits::Iterator::findDestinationIndex()`
  61. * methods are used to walk through the string.
  62. *
  63. * For examples of how to use this class, see the test `TestCaseMapEditsIteratorDocs` in
  64. * UCharacterCaseTest.java.
  65. *
  66. * An Edits object tracks a separate UErrorCode, but ICU string transformation functions
  67. * (e.g., case mapping functions) merge any such errors into their API's UErrorCode.
  68. *
  69. * @stable ICU 59
  70. */
  71. class U_COMMON_API Edits final : public UMemory {
  72. public:
  73. /**
  74. * Constructs an empty object.
  75. * @stable ICU 59
  76. */
  77. Edits() :
  78. array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),
  79. errorCode_(U_ZERO_ERROR) {}
  80. /**
  81. * Copy constructor.
  82. * @param other source edits
  83. * @stable ICU 60
  84. */
  85. Edits(const Edits &other) :
  86. array(stackArray), capacity(STACK_CAPACITY), length(other.length),
  87. delta(other.delta), numChanges(other.numChanges),
  88. errorCode_(other.errorCode_) {
  89. copyArray(other);
  90. }
  91. /**
  92. * Move constructor, might leave src empty.
  93. * This object will have the same contents that the source object had.
  94. * @param src source edits
  95. * @stable ICU 60
  96. */
  97. Edits(Edits &&src) noexcept :
  98. array(stackArray), capacity(STACK_CAPACITY), length(src.length),
  99. delta(src.delta), numChanges(src.numChanges),
  100. errorCode_(src.errorCode_) {
  101. moveArray(src);
  102. }
  103. /**
  104. * Destructor.
  105. * @stable ICU 59
  106. */
  107. ~Edits();
  108. /**
  109. * Assignment operator.
  110. * @param other source edits
  111. * @return *this
  112. * @stable ICU 60
  113. */
  114. Edits &operator=(const Edits &other);
  115. /**
  116. * Move assignment operator, might leave src empty.
  117. * This object will have the same contents that the source object had.
  118. * The behavior is undefined if *this and src are the same object.
  119. * @param src source edits
  120. * @return *this
  121. * @stable ICU 60
  122. */
  123. Edits &operator=(Edits &&src) noexcept;
  124. /**
  125. * Resets the data but may not release memory.
  126. * @stable ICU 59
  127. */
  128. void reset() noexcept;
  129. /**
  130. * Adds a no-change edit: a record for an unchanged segment of text.
  131. * Normally called from inside ICU string transformation functions, not user code.
  132. * @stable ICU 59
  133. */
  134. void addUnchanged(int32_t unchangedLength);
  135. /**
  136. * Adds a change edit: a record for a text replacement/insertion/deletion.
  137. * Normally called from inside ICU string transformation functions, not user code.
  138. * @stable ICU 59
  139. */
  140. void addReplace(int32_t oldLength, int32_t newLength);
  141. /**
  142. * Sets the UErrorCode if an error occurred while recording edits.
  143. * Preserves older error codes in the outErrorCode.
  144. * Normally called from inside ICU string transformation functions, not user code.
  145. * @param outErrorCode Set to an error code if it does not contain one already
  146. * and an error occurred while recording edits.
  147. * Otherwise unchanged.
  148. * @return true if U_FAILURE(outErrorCode)
  149. * @stable ICU 59
  150. */
  151. UBool copyErrorTo(UErrorCode &outErrorCode) const;
  152. /**
  153. * How much longer is the new text compared with the old text?
  154. * @return new length minus old length
  155. * @stable ICU 59
  156. */
  157. int32_t lengthDelta() const { return delta; }
  158. /**
  159. * @return true if there are any change edits
  160. * @stable ICU 59
  161. */
  162. UBool hasChanges() const { return numChanges != 0; }
  163. /**
  164. * @return the number of change edits
  165. * @stable ICU 60
  166. */
  167. int32_t numberOfChanges() const { return numChanges; }
  168. /**
  169. * Access to the list of edits.
  170. *
  171. * At any moment in time, an instance of this class points to a single edit: a "window" into a span
  172. * of the source string and the corresponding span of the destination string. The source string span
  173. * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string
  174. * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars.
  175. *
  176. * The iterator can be moved between edits using the `next()`, `findSourceIndex(int32_t, UErrorCode &)`,
  177. * and `findDestinationIndex(int32_t, UErrorCode &)` methods.
  178. * Calling any of these methods mutates the iterator to make it point to the corresponding edit.
  179. *
  180. * For more information, see the documentation for {@link Edits}.
  181. *
  182. * @see getCoarseIterator
  183. * @see getFineIterator
  184. * @stable ICU 59
  185. */
  186. struct U_COMMON_API Iterator final : public UMemory {
  187. /**
  188. * Default constructor, empty iterator.
  189. * @stable ICU 60
  190. */
  191. Iterator() :
  192. array(nullptr), index(0), length(0),
  193. remaining(0), onlyChanges_(false), coarse(false),
  194. dir(0), changed(false), oldLength_(0), newLength_(0),
  195. srcIndex(0), replIndex(0), destIndex(0) {}
  196. /**
  197. * Copy constructor.
  198. * @stable ICU 59
  199. */
  200. Iterator(const Iterator &other) = default;
  201. /**
  202. * Assignment operator.
  203. * @stable ICU 59
  204. */
  205. Iterator &operator=(const Iterator &other) = default;
  206. /**
  207. * Advances the iterator to the next edit.
  208. * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
  209. * or else the function returns immediately. Check for U_FAILURE()
  210. * on output or use with function chaining. (See User Guide for details.)
  211. * @return true if there is another edit
  212. * @stable ICU 59
  213. */
  214. UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); }
  215. /**
  216. * Moves the iterator to the edit that contains the source index.
  217. * The source index may be found in a no-change edit
  218. * even if normal iteration would skip no-change edits.
  219. * Normal iteration can continue from a found edit.
  220. *
  221. * The iterator state before this search logically does not matter.
  222. * (It may affect the performance of the search.)
  223. *
  224. * The iterator state after this search is undefined
  225. * if the source index is out of bounds for the source string.
  226. *
  227. * @param i source index
  228. * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
  229. * or else the function returns immediately. Check for U_FAILURE()
  230. * on output or use with function chaining. (See User Guide for details.)
  231. * @return true if the edit for the source index was found
  232. * @stable ICU 59
  233. */
  234. UBool findSourceIndex(int32_t i, UErrorCode &errorCode) {
  235. return findIndex(i, true, errorCode) == 0;
  236. }
  237. /**
  238. * Moves the iterator to the edit that contains the destination index.
  239. * The destination index may be found in a no-change edit
  240. * even if normal iteration would skip no-change edits.
  241. * Normal iteration can continue from a found edit.
  242. *
  243. * The iterator state before this search logically does not matter.
  244. * (It may affect the performance of the search.)
  245. *
  246. * The iterator state after this search is undefined
  247. * if the source index is out of bounds for the source string.
  248. *
  249. * @param i destination index
  250. * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
  251. * or else the function returns immediately. Check for U_FAILURE()
  252. * on output or use with function chaining. (See User Guide for details.)
  253. * @return true if the edit for the destination index was found
  254. * @stable ICU 60
  255. */
  256. UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) {
  257. return findIndex(i, false, errorCode) == 0;
  258. }
  259. /**
  260. * Computes the destination index corresponding to the given source index.
  261. * If the source index is inside a change edit (not at its start),
  262. * then the destination index at the end of that edit is returned,
  263. * since there is no information about index mapping inside a change edit.
  264. *
  265. * (This means that indexes to the start and middle of an edit,
  266. * for example around a grapheme cluster, are mapped to indexes
  267. * encompassing the entire edit.
  268. * The alternative, mapping an interior index to the start,
  269. * would map such an interval to an empty one.)
  270. *
  271. * This operation will usually but not always modify this object.
  272. * The iterator state after this search is undefined.
  273. *
  274. * @param i source index
  275. * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
  276. * or else the function returns immediately. Check for U_FAILURE()
  277. * on output or use with function chaining. (See User Guide for details.)
  278. * @return destination index; undefined if i is not 0..string length
  279. * @stable ICU 60
  280. */
  281. int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode);
  282. /**
  283. * Computes the source index corresponding to the given destination index.
  284. * If the destination index is inside a change edit (not at its start),
  285. * then the source index at the end of that edit is returned,
  286. * since there is no information about index mapping inside a change edit.
  287. *
  288. * (This means that indexes to the start and middle of an edit,
  289. * for example around a grapheme cluster, are mapped to indexes
  290. * encompassing the entire edit.
  291. * The alternative, mapping an interior index to the start,
  292. * would map such an interval to an empty one.)
  293. *
  294. * This operation will usually but not always modify this object.
  295. * The iterator state after this search is undefined.
  296. *
  297. * @param i destination index
  298. * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
  299. * or else the function returns immediately. Check for U_FAILURE()
  300. * on output or use with function chaining. (See User Guide for details.)
  301. * @return source index; undefined if i is not 0..string length
  302. * @stable ICU 60
  303. */
  304. int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode);
  305. /**
  306. * Returns whether the edit currently represented by the iterator is a change edit.
  307. *
  308. * @return true if this edit replaces oldLength() units with newLength() different ones.
  309. * false if oldLength units remain unchanged.
  310. * @stable ICU 59
  311. */
  312. UBool hasChange() const { return changed; }
  313. /**
  314. * The length of the current span in the source string, which starts at {@link #sourceIndex}.
  315. *
  316. * @return the number of units in the original string which are replaced or remain unchanged.
  317. * @stable ICU 59
  318. */
  319. int32_t oldLength() const { return oldLength_; }
  320. /**
  321. * The length of the current span in the destination string, which starts at
  322. * {@link #destinationIndex}, or in the replacement string, which starts at
  323. * {@link #replacementIndex}.
  324. *
  325. * @return the number of units in the modified string, if hasChange() is true.
  326. * Same as oldLength if hasChange() is false.
  327. * @stable ICU 59
  328. */
  329. int32_t newLength() const { return newLength_; }
  330. /**
  331. * The start index of the current span in the source string; the span has length
  332. * {@link #oldLength}.
  333. *
  334. * @return the current index into the source string
  335. * @stable ICU 59
  336. */
  337. int32_t sourceIndex() const { return srcIndex; }
  338. /**
  339. * The start index of the current span in the replacement string; the span has length
  340. * {@link #newLength}. Well-defined only if the current edit is a change edit.
  341. *
  342. * The *replacement string* is the concatenation of all substrings of the destination
  343. * string corresponding to change edits.
  344. *
  345. * This method is intended to be used together with operations that write only replacement
  346. * characters (e.g. operations specifying the \ref U_OMIT_UNCHANGED_TEXT option).
  347. * The source string can then be modified in-place.
  348. *
  349. * @return the current index into the replacement-characters-only string,
  350. * not counting unchanged spans
  351. * @stable ICU 59
  352. */
  353. int32_t replacementIndex() const {
  354. // TODO: Throw an exception if we aren't in a change edit?
  355. return replIndex;
  356. }
  357. /**
  358. * The start index of the current span in the destination string; the span has length
  359. * {@link #newLength}.
  360. *
  361. * @return the current index into the full destination string
  362. * @stable ICU 59
  363. */
  364. int32_t destinationIndex() const { return destIndex; }
  365. #ifndef U_HIDE_INTERNAL_API
  366. /**
  367. * A string representation of the current edit represented by the iterator for debugging. You
  368. * should not depend on the contents of the return string.
  369. * @internal
  370. */
  371. UnicodeString& toString(UnicodeString& appendTo) const;
  372. #endif // U_HIDE_INTERNAL_API
  373. private:
  374. friend class Edits;
  375. Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
  376. int32_t readLength(int32_t head);
  377. void updateNextIndexes();
  378. void updatePreviousIndexes();
  379. UBool noNext();
  380. UBool next(UBool onlyChanges, UErrorCode &errorCode);
  381. UBool previous(UErrorCode &errorCode);
  382. /** @return -1: error or i<0; 0: found; 1: i>=string length */
  383. int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);
  384. const uint16_t *array;
  385. int32_t index, length;
  386. // 0 if we are not within compressed equal-length changes.
  387. // Otherwise the number of remaining changes, including the current one.
  388. int32_t remaining;
  389. UBool onlyChanges_, coarse;
  390. int8_t dir; // iteration direction: back(<0), initial(0), forward(>0)
  391. UBool changed;
  392. int32_t oldLength_, newLength_;
  393. int32_t srcIndex, replIndex, destIndex;
  394. };
  395. /**
  396. * Returns an Iterator for coarse-grained change edits
  397. * (adjacent change edits are treated as one).
  398. * Can be used to perform simple string updates.
  399. * Skips no-change edits.
  400. * @return an Iterator that merges adjacent changes.
  401. * @stable ICU 59
  402. */
  403. Iterator getCoarseChangesIterator() const {
  404. return Iterator(array, length, true, true);
  405. }
  406. /**
  407. * Returns an Iterator for coarse-grained change and no-change edits
  408. * (adjacent change edits are treated as one).
  409. * Can be used to perform simple string updates.
  410. * Adjacent change edits are treated as one edit.
  411. * @return an Iterator that merges adjacent changes.
  412. * @stable ICU 59
  413. */
  414. Iterator getCoarseIterator() const {
  415. return Iterator(array, length, false, true);
  416. }
  417. /**
  418. * Returns an Iterator for fine-grained change edits
  419. * (full granularity of change edits is retained).
  420. * Can be used for modifying styled text.
  421. * Skips no-change edits.
  422. * @return an Iterator that separates adjacent changes.
  423. * @stable ICU 59
  424. */
  425. Iterator getFineChangesIterator() const {
  426. return Iterator(array, length, true, false);
  427. }
  428. /**
  429. * Returns an Iterator for fine-grained change and no-change edits
  430. * (full granularity of change edits is retained).
  431. * Can be used for modifying styled text.
  432. * @return an Iterator that separates adjacent changes.
  433. * @stable ICU 59
  434. */
  435. Iterator getFineIterator() const {
  436. return Iterator(array, length, false, false);
  437. }
  438. /**
  439. * Merges the two input Edits and appends the result to this object.
  440. *
  441. * Consider two string transformations (for example, normalization and case mapping)
  442. * where each records Edits in addition to writing an output string.<br>
  443. * Edits ab reflect how substrings of input string a
  444. * map to substrings of intermediate string b.<br>
  445. * Edits bc reflect how substrings of intermediate string b
  446. * map to substrings of output string c.<br>
  447. * This function merges ab and bc such that the additional edits
  448. * recorded in this object reflect how substrings of input string a
  449. * map to substrings of output string c.
  450. *
  451. * If unrelated Edits are passed in where the output string of the first
  452. * has a different length than the input string of the second,
  453. * then a U_ILLEGAL_ARGUMENT_ERROR is reported.
  454. *
  455. * @param ab reflects how substrings of input string a
  456. * map to substrings of intermediate string b.
  457. * @param bc reflects how substrings of intermediate string b
  458. * map to substrings of output string c.
  459. * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
  460. * or else the function returns immediately. Check for U_FAILURE()
  461. * on output or use with function chaining. (See User Guide for details.)
  462. * @return *this, with the merged edits appended
  463. * @stable ICU 60
  464. */
  465. Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);
  466. private:
  467. void releaseArray() noexcept;
  468. Edits &copyArray(const Edits &other);
  469. Edits &moveArray(Edits &src) noexcept;
  470. void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }
  471. int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
  472. void append(int32_t r);
  473. UBool growArray();
  474. static const int32_t STACK_CAPACITY = 100;
  475. uint16_t *array;
  476. int32_t capacity;
  477. int32_t length;
  478. int32_t delta;
  479. int32_t numChanges;
  480. UErrorCode errorCode_;
  481. uint16_t stackArray[STACK_CAPACITY];
  482. };
  483. U_NAMESPACE_END
  484. #endif /* U_SHOW_CPLUSPLUS_API */
  485. #endif // __EDITS_H__