IrishCasing.cpp 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  5. /******************************************************************************
  6. This file provides a finite state machine to support Irish Gaelic uppercasing
  7. rules.
  8. The caller will need to iterate through a string, passing a State variable
  9. along with the current character to each UpperCase call and checking the flags
  10. that are returned:
  11. If aMarkPos is true, caller must remember the current index in the string as
  12. a possible target for a future action.
  13. If aAction is non-zero, then one or more characters from the marked index are
  14. to be modified:
  15. 1 lowercase the marked letter
  16. 2 lowercase the marked letter and its successor
  17. 3 lowercase the marked letter, and delete its successor
  18. ### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639,
  19. ### comments 1 and 4:
  20. v = [a,á,e,é,i,í,o,ó,u,ú]
  21. V = [A,Á,E,É,I,Í,O,Ó,U,Ú]
  22. bhf -> bhF
  23. bhF -> bhF
  24. bp -> bP
  25. bP -> bP
  26. dt -> dT
  27. dT -> dT
  28. gc -> gC
  29. gC -> gC
  30. h{V} -> h{V}
  31. mb -> mB
  32. mB -> mB
  33. n-{v} -> n{V}
  34. n{V} -> n{V}
  35. nd -> nD
  36. nD -> nD
  37. ng -> nG
  38. nG -> nG
  39. t-{v} -> t{V}
  40. t{V} -> t{V}
  41. ts{v} -> tS{V}
  42. tS{v} -> tS{V}
  43. tS{V} -> tS{V}
  44. tsl -> tSL
  45. tSl -> tSL
  46. tSL -> tSL
  47. tsn -> tSN
  48. tSn -> tSN
  49. tSN -> tSN
  50. tsr -> tSR
  51. tSr -> tSR
  52. tSR -> tSR
  53. ### Create table of states and actions for each input class.
  54. Start (non-word) state is #; generic in-word state is _, once we know there's
  55. no special action to do in this word.
  56. # _ b bh d g h m n n- t t- ts
  57. input\state
  58. b b' _ _ _ _ _ _ 1 _ _ _ _ _
  59. B _ _ _ _ _ _ _ 1 _ _ _ _ _
  60. c _ _ _ _ _ 1 _ _ _ _ _ _ _
  61. C _ _ _ _ _ 1 _ _ _ _ _ _ _
  62. d d' _ _ _ _ _ _ _ 1 _ _ _ _
  63. D _ _ _ _ _ _ _ _ 1 _ _ _ _
  64. f _ _ _ 2 _ _ _ _ _ _ _ _ _
  65. F _ _ _ 2 _ _ _ _ _ _ _ _ _
  66. g g' _ _ _ _ _ _ _ 1 _ _ _ _
  67. G _ _ _ _ _ _ _ _ 1 _ _ _ _
  68. h h' _ bh _ _ _ _ _ _ _ _ _ _
  69. l _ _ _ _ _ _ _ _ _ _ _ _ 1
  70. L _ _ _ _ _ _ _ _ _ _ _ _ 1
  71. m m' _ _ _ _ _ _ _ _ _ _ _ _
  72. n n' _ _ _ _ _ _ _ _ _ _ _ 1
  73. N _ _ _ _ _ _ _ _ _ _ _ _ 1
  74. p _ _ 1 _ _ _ _ _ _ _ _ _ _
  75. P _ _ 1 _ _ _ _ _ _ _ _ _ _
  76. r _ _ _ _ _ _ _ _ _ _ _ _ 1
  77. R _ _ _ _ _ _ _ _ _ _ _ _ 1
  78. s _ _ _ _ _ _ _ _ _ _ ts _ _
  79. S _ _ _ _ _ _ _ _ _ _ ts _ _
  80. t t' _ _ _ 1 _ _ _ _ _ _ _ _
  81. T _ _ _ _ 1 _ _ _ _ _ _ _ _
  82. vowel _ _ _ _ _ _ _ _ _ 1d _ 1d 1
  83. Vowel _ _ _ _ _ _ 1 _ 1 _ 1 _ 1
  84. hyph _ _ _ _ _ _ _ _ n- _ t- _ _
  85. letter _ _ _ _ _ _ _ _ _ _ _ _ _
  86. other # # # # # # # # # # # # #
  87. Actions:
  88. 1 lowercase one letter at start of word
  89. 2 lowercase two letters at start of word
  90. 1d lowercase one letter at start of word, and delete next
  91. (and then go to state _, nothing further to do in this word)
  92. else just go to the given state; suffix ' indicates mark start-of-word.
  93. ### Consolidate identical states and classes:
  94. 0 1 2 3 4 5 6 7 8 9 A B
  95. # _ b bh d g h m n [nt]- t ts
  96. input\state
  97. b b' _ _ _ _ _ _ 1 _ _ _ _
  98. B _ _ _ _ _ _ _ 1 _ _ _ _
  99. [cC] _ _ _ _ _ 1 _ _ _ _ _ _
  100. d d' _ _ _ _ _ _ _ 1 _ _ _
  101. [DG] _ _ _ _ _ _ _ _ 1 _ _ _
  102. [fF] _ _ _ 2 _ _ _ _ _ _ _ _
  103. g g' _ _ _ _ _ _ _ 1 _ _ _
  104. h h' _ bh _ _ _ _ _ _ _ _ _
  105. [lLNrR] _ _ _ _ _ _ _ _ _ _ _ 1
  106. m m' _ _ _ _ _ _ _ _ _ _ _
  107. n n' _ _ _ _ _ _ _ _ _ _ 1
  108. [pP] _ _ 1 _ _ _ _ _ _ _ _ _
  109. [sS] _ _ _ _ _ _ _ _ _ _ ts _
  110. t t' _ _ _ 1 _ _ _ _ _ _ _
  111. T _ _ _ _ 1 _ _ _ _ _ _ _
  112. vowel _ _ _ _ _ _ _ _ _ 1d _ 1
  113. Vowel _ _ _ _ _ _ 1 _ 1 _ 1 1
  114. hyph _ _ _ _ _ _ _ _ [nt]- _ [nt]- _
  115. letter _ _ _ _ _ _ _ _ _ _ _ _
  116. other # # # # # # # # # # # #
  117. So we have 20 input classes, and 12 states.
  118. State table array will contain bytes that encode action and new state:
  119. 0x80 - bit flag: mark start-of-word position
  120. 0x40 - currently unused
  121. 0x30 - action mask: 4 values
  122. 0x00 - do nothing
  123. 0x10 - lowercase one letter
  124. 0x20 - lowercase two letters
  125. 0x30 - lowercase one, delete one
  126. 0x0F - next-state mask
  127. ******************************************************************************/
  128. #include "IrishCasing.h"
  129. #include "nsUnicodeProperties.h"
  130. #include "nsUnicharUtils.h"
  131. namespace mozilla {
  132. const uint8_t
  133. IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = {
  134. // # _ b bh d g h m n [nt]- t ts
  135. { 0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // b
  136. { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // B
  137. { 0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [cC]
  138. { 0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // d
  139. { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // [DG]
  140. { 0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [fF]
  141. { 0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // g
  142. { 0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // h
  143. { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // [lLNrR]
  144. { 0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // m
  145. { 0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // n
  146. { 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [pP]
  147. { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, 0x01 }, // [sS]
  148. { 0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // t
  149. { 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // T
  150. { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01, 0x11 }, // vowel
  151. { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11, 0x11 }, // Vowel
  152. { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09, 0x01 }, // hyph
  153. { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // letter
  154. { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } // other
  155. };
  // Non-ASCII hyphen code points that GetClass treats like ASCII '-'.
  #define HYPHEN          0x2010  // U+2010 HYPHEN
  #define NO_BREAK_HYPHEN 0x2011  // U+2011 NON-BREAKING HYPHEN

  // Acute-accented vowels (Latin-1 code points), listed as lowercase/uppercase
  // pairs.  The lowercase forms map to kClass_vowel and the uppercase forms to
  // kClass_Vowel in GetClass.
  #define a_ACUTE 0x00E1
  #define A_ACUTE 0x00C1

  #define e_ACUTE 0x00E9
  #define E_ACUTE 0x00C9

  #define i_ACUTE 0x00ED
  #define I_ACUTE 0x00CD

  #define o_ACUTE 0x00F3
  #define O_ACUTE 0x00D3

  #define u_ACUTE 0x00FA
  #define U_ACUTE 0x00DA
  168. const uint8_t IrishCasing::sLcClasses[26] = {
  169. kClass_vowel, kClass_b, kClass_cC, kClass_d, kClass_vowel,
  170. kClass_fF, kClass_g, kClass_h, kClass_vowel, kClass_letter,
  171. kClass_letter, kClass_lLNrR, kClass_m, kClass_n, kClass_vowel,
  172. kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_t,
  173. kClass_vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
  174. kClass_letter
  175. };
  176. const uint8_t IrishCasing::sUcClasses[26] = {
  177. kClass_Vowel, kClass_B, kClass_cC, kClass_DG, kClass_Vowel,
  178. kClass_fF, kClass_DG, kClass_letter, kClass_Vowel, kClass_letter,
  179. kClass_letter, kClass_lLNrR, kClass_letter, kClass_lLNrR, kClass_Vowel,
  180. kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_T,
  181. kClass_Vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
  182. kClass_letter
  183. };
  184. uint8_t
  185. IrishCasing::GetClass(uint32_t aCh)
  186. {
  187. using mozilla::unicode::GetGenCategory;
  188. if (aCh >= 'a' && aCh <= 'z') {
  189. return sLcClasses[aCh - 'a'];
  190. } else if (aCh >= 'A' && aCh <= 'Z') {
  191. return sUcClasses[aCh - 'A'];
  192. } else if (GetGenCategory(aCh) == nsIUGenCategory::kLetter) {
  193. if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE ||
  194. aCh == o_ACUTE || aCh == u_ACUTE) {
  195. return kClass_vowel;
  196. } else if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE ||
  197. aCh == O_ACUTE || aCh == U_ACUTE) {
  198. return kClass_Vowel;
  199. } else {
  200. return kClass_letter;
  201. }
  202. } else if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) {
  203. return kClass_hyph;
  204. } else {
  205. return kClass_other;
  206. }
  207. }
  208. uint32_t
  209. IrishCasing::UpperCase(uint32_t aCh, State& aState,
  210. bool& aMarkPos, uint8_t& aAction)
  211. {
  212. uint8_t cls = GetClass(aCh);
  213. uint8_t stateEntry = sUppercaseStateTable[cls][aState];
  214. aMarkPos = !!(stateEntry & kMarkPositionFlag);
  215. aAction = (stateEntry & kActionMask) >> kActionShift;
  216. aState = State(stateEntry & kNextStateMask);
  217. return ToUpperCase(aCh);
  218. }
  219. } // namespace mozilla