nsBidiUtils.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  5. #ifndef nsBidiUtils_h__
  6. #define nsBidiUtils_h__
  7. #include "nsStringGlue.h"
  8. /**
  9. * Read ftp://ftp.unicode.org/Public/UNIDATA/ReadMe-Latest.txt
  10. * section BIDIRECTIONAL PROPERTIES
  11. * for the detailed definition of the following categories
  12. *
  13. * The values here must match the equivalents in %bidicategorycode in
  14. * mozilla/intl/unicharutil/tools/genUnicodePropertyData.pl,
  15. * and must also match the values used by ICU's UCharDirection.
  16. */
  17. enum nsCharType {
  18. eCharType_LeftToRight = 0,
  19. eCharType_RightToLeft = 1,
  20. eCharType_EuropeanNumber = 2,
  21. eCharType_EuropeanNumberSeparator = 3,
  22. eCharType_EuropeanNumberTerminator = 4,
  23. eCharType_ArabicNumber = 5,
  24. eCharType_CommonNumberSeparator = 6,
  25. eCharType_BlockSeparator = 7,
  26. eCharType_SegmentSeparator = 8,
  27. eCharType_WhiteSpaceNeutral = 9,
  28. eCharType_OtherNeutral = 10,
  29. eCharType_LeftToRightEmbedding = 11,
  30. eCharType_LeftToRightOverride = 12,
  31. eCharType_RightToLeftArabic = 13,
  32. eCharType_RightToLeftEmbedding = 14,
  33. eCharType_RightToLeftOverride = 15,
  34. eCharType_PopDirectionalFormat = 16,
  35. eCharType_DirNonSpacingMark = 17,
  36. eCharType_BoundaryNeutral = 18,
  37. eCharType_FirstStrongIsolate = 19,
  38. eCharType_LeftToRightIsolate = 20,
  39. eCharType_RightToLeftIsolate = 21,
  40. eCharType_PopDirectionalIsolate = 22,
  41. eCharType_CharTypeCount
  42. };
  43. /**
  44. * This specifies the language directional property of a character set.
  45. */
  46. typedef enum nsCharType nsCharType;
  47. /**
  48. * Find the direction of an embedding level or paragraph level set by
  49. * the Unicode Bidi Algorithm. (Even levels are left-to-right, odd
  50. * levels right-to-left.
  51. */
  52. #define IS_LEVEL_RTL(level) (((level) & 1) == 1)
  53. /**
  54. * Check whether two bidi levels have the same parity and thus the same
  55. * directionality
  56. */
  57. #define IS_SAME_DIRECTION(level1, level2) (((level1 ^ level2) & 1) == 0)
  58. /**
  59. * Convert from nsBidiLevel to nsBidiDirection
  60. */
  61. #define DIRECTION_FROM_LEVEL(level) ((IS_LEVEL_RTL(level)) \
  62. ? NSBIDI_RTL : NSBIDI_LTR)
  63. /**
  64. * definitions of bidirection character types by category
  65. */
  66. #define CHARTYPE_IS_RTL(val) ( ( (val) == eCharType_RightToLeft) || ( (val) == eCharType_RightToLeftArabic) )
  67. #define CHARTYPE_IS_WEAK(val) ( ( (val) == eCharType_EuropeanNumberSeparator) \
  68. || ( (val) == eCharType_EuropeanNumberTerminator) \
  69. || ( ( (val) > eCharType_ArabicNumber) && ( (val) != eCharType_RightToLeftArabic) ) )
/**
 * Inspects a Unichar and returns it, with a digit converted to its Arabic
 * or Hindi form as requested.
 * @param aChar is the character
 * @param aPrevCharArabic is true if the previous character in the string is an Arabic char
 * @param aNumFlag specifies the conversion to perform:
 *        IBMBIDI_NUMERAL_NOMINAL:      don't do any conversion
 *        IBMBIDI_NUMERAL_HINDI:        convert to Hindi forms (Unicode 0660-0669)
 *        IBMBIDI_NUMERAL_ARABIC:       convert to Arabic forms (Unicode 0030-0039)
 *        IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to Hindi, otherwise to Arabic
 *        The other IBMBIDI_NUMERAL_* values defined in this header (REGULAR,
 *        PERSIANCONTEXT, PERSIAN) are not described here; see the implementation.
 * @return the converted Unichar
 */
  81. char16_t HandleNumberInChar(char16_t aChar, bool aPrevCharArabic, uint32_t aNumFlag);
/**
 * Scans a Unichar string, converting numbers to Arabic or Hindi forms in place.
 * @param aBuffer is the string
 * @param aSize is the size of aBuffer
 * @param aNumFlag specifies the conversion to perform:
 *        IBMBIDI_NUMERAL_NOMINAL:      don't do any conversion
 *        IBMBIDI_NUMERAL_HINDI:        convert to Hindi forms (Unicode 0660-0669)
 *        IBMBIDI_NUMERAL_ARABIC:       convert to Arabic forms (Unicode 0030-0039)
 *        IBMBIDI_NUMERAL_HINDICONTEXT: convert numbers in Arabic text to Hindi, otherwise to Arabic
 *        The other IBMBIDI_NUMERAL_* values defined in this header (REGULAR,
 *        PERSIANCONTEXT, PERSIAN) are not described here; see the implementation.
 */
  92. nsresult HandleNumbers(char16_t* aBuffer, uint32_t aSize, uint32_t aNumFlag);
  93. /**
  94. * Give a UTF-32 codepoint
  95. * return true if the codepoint is a Bidi control character (LRM, RLM, ALM;
  96. * LRE, RLE, PDF, LRO, RLO; LRI, RLI, FSI, PDI).
  97. * Return false, otherwise
  98. */
  99. #define LRM_CHAR 0x200e
  100. #define RLM_CHAR 0x200f
  101. #define LRE_CHAR 0x202a
  102. #define RLE_CHAR 0x202b
  103. #define PDF_CHAR 0x202c
  104. #define LRO_CHAR 0x202d
  105. #define RLO_CHAR 0x202e
  106. #define LRI_CHAR 0x2066
  107. #define RLI_CHAR 0x2067
  108. #define FSI_CHAR 0x2068
  109. #define PDI_CHAR 0x2069
  110. #define ALM_CHAR 0x061C
  111. inline bool IsBidiControl(uint32_t aChar) {
  112. return ((LRE_CHAR <= aChar && aChar <= RLO_CHAR) ||
  113. (LRI_CHAR <= aChar && aChar <= PDI_CHAR) ||
  114. (aChar == ALM_CHAR) ||
  115. (aChar & 0xfffffe) == LRM_CHAR);
  116. }
  117. /**
  118. * Give a UTF-32 codepoint
  119. * Return true if the codepoint is a Bidi control character that may result
  120. * in RTL directionality and therefore needs to trigger bidi resolution;
  121. * return false otherwise.
  122. */
  123. inline bool IsBidiControlRTL(uint32_t aChar) {
  124. return aChar == RLM_CHAR ||
  125. aChar == RLE_CHAR ||
  126. aChar == RLO_CHAR ||
  127. aChar == RLI_CHAR ||
  128. aChar == ALM_CHAR;
  129. }
/**
 * Given a string (nsAString),
 * @return true if the string contains right-to-left characters
 */
  134. bool HasRTLChars(const nsAString& aString);
  135. // These values are shared with Preferences dialog
  136. // ------------------
  137. // If Pref values are to be changed
  138. // in the XUL file of Prefs. the values
  139. // Must be changed here too..
  140. // ------------------
  141. //
  142. #define IBMBIDI_TEXTDIRECTION_STR "bidi.direction"
  143. #define IBMBIDI_TEXTTYPE_STR "bidi.texttype"
  144. #define IBMBIDI_NUMERAL_STR "bidi.numeral"
  145. // ------------------
  146. // Text Direction
  147. // ------------------
  148. // bidi.direction
  149. #define IBMBIDI_TEXTDIRECTION_LTR 1 // 1 = directionLTRBidi *
  150. #define IBMBIDI_TEXTDIRECTION_RTL 2 // 2 = directionRTLBidi
  151. // ------------------
  152. // Text Type
  153. // ------------------
  154. // bidi.texttype
  155. #define IBMBIDI_TEXTTYPE_CHARSET 1 // 1 = charsettexttypeBidi *
  156. #define IBMBIDI_TEXTTYPE_LOGICAL 2 // 2 = logicaltexttypeBidi
  157. #define IBMBIDI_TEXTTYPE_VISUAL 3 // 3 = visualtexttypeBidi
  158. // ------------------
  159. // Numeral Style
  160. // ------------------
  161. // bidi.numeral
  162. #define IBMBIDI_NUMERAL_NOMINAL 0 // 0 = nominalnumeralBidi *
  163. #define IBMBIDI_NUMERAL_REGULAR 1 // 1 = regularcontextnumeralBidi
  164. #define IBMBIDI_NUMERAL_HINDICONTEXT 2 // 2 = hindicontextnumeralBidi
  165. #define IBMBIDI_NUMERAL_ARABIC 3 // 3 = arabicnumeralBidi
  166. #define IBMBIDI_NUMERAL_HINDI 4 // 4 = hindinumeralBidi
  167. #define IBMBIDI_NUMERAL_PERSIANCONTEXT 5 // 5 = persiancontextnumeralBidi
  168. #define IBMBIDI_NUMERAL_PERSIAN 6 // 6 = persiannumeralBidi
  169. #define IBMBIDI_DEFAULT_BIDI_OPTIONS \
  170. ((IBMBIDI_TEXTDIRECTION_LTR<<0) | \
  171. (IBMBIDI_TEXTTYPE_CHARSET<<4) | \
  172. (IBMBIDI_NUMERAL_NOMINAL<<8))
  173. #define GET_BIDI_OPTION_DIRECTION(bo) (((bo)>>0) & 0x0000000F) /* 4 bits for DIRECTION */
  174. #define GET_BIDI_OPTION_TEXTTYPE(bo) (((bo)>>4) & 0x0000000F) /* 4 bits for TEXTTYPE */
  175. #define GET_BIDI_OPTION_NUMERAL(bo) (((bo)>>8) & 0x0000000F) /* 4 bits for NUMERAL */
  176. #define SET_BIDI_OPTION_DIRECTION(bo, dir) {(bo)=((bo) & 0xFFFFFFF0)|(((dir)& 0x0000000F)<<0);}
  177. #define SET_BIDI_OPTION_TEXTTYPE(bo, tt) {(bo)=((bo) & 0xFFFFFF0F)|(((tt)& 0x0000000F)<<4);}
  178. #define SET_BIDI_OPTION_NUMERAL(bo, num) {(bo)=((bo) & 0xFFFFF0FF)|(((num)& 0x0000000F)<<8);}
  179. /* Constants related to the position of numerics in the codepage */
  180. #define START_HINDI_DIGITS 0x0660
  181. #define END_HINDI_DIGITS 0x0669
  182. #define START_ARABIC_DIGITS 0x0030
  183. #define END_ARABIC_DIGITS 0x0039
  184. #define START_FARSI_DIGITS 0x06f0
  185. #define END_FARSI_DIGITS 0x06f9
  186. #define IS_HINDI_DIGIT(u) ( ( (u) >= START_HINDI_DIGITS ) && ( (u) <= END_HINDI_DIGITS ) )
  187. #define IS_ARABIC_DIGIT(u) ( ( (u) >= START_ARABIC_DIGITS ) && ( (u) <= END_ARABIC_DIGITS ) )
  188. #define IS_FARSI_DIGIT(u) ( ( (u) >= START_FARSI_DIGITS ) && ( (u) <= END_FARSI_DIGITS ) )
  189. /**
  190. * Arabic numeric separator and numeric formatting characters:
  191. * U+0600;ARABIC NUMBER SIGN
  192. * U+0601;ARABIC SIGN SANAH
  193. * U+0602;ARABIC FOOTNOTE MARKER
  194. * U+0603;ARABIC SIGN SAFHA
  195. * U+066A;ARABIC PERCENT SIGN
  196. * U+066B;ARABIC DECIMAL SEPARATOR
  197. * U+066C;ARABIC THOUSANDS SEPARATOR
  198. * U+06DD;ARABIC END OF AYAH
  199. */
  200. #define IS_ARABIC_SEPARATOR(u) ( ( /*(u) >= 0x0600 &&*/ (u) <= 0x0603 ) || \
  201. ( (u) >= 0x066A && (u) <= 0x066C ) || \
  202. ( (u) == 0x06DD ) )
  203. #define IS_BIDI_DIACRITIC(u) ( \
  204. ( (u) >= 0x0591 && (u) <= 0x05A1) || ( (u) >= 0x05A3 && (u) <= 0x05B9) \
  205. || ( (u) >= 0x05BB && (u) <= 0x05BD) || ( (u) == 0x05BF) || ( (u) == 0x05C1) \
  206. || ( (u) == 0x05C2) || ( (u) == 0x05C4) \
  207. || ( (u) >= 0x064B && (u) <= 0x0652) || ( (u) == 0x0670) \
  208. || ( (u) >= 0x06D7 && (u) <= 0x06E4) || ( (u) == 0x06E7) || ( (u) == 0x06E8) \
  209. || ( (u) >= 0x06EA && (u) <= 0x06ED) )
  210. #define IS_HEBREW_CHAR(c) (((0x0590 <= (c)) && ((c) <= 0x05FF)) || (((c) >= 0xfb1d) && ((c) <= 0xfb4f)))
  211. #define IS_ARABIC_CHAR(c) ( (0x0600 <= (c) && (c) <= 0x08FF) && \
  212. ( (c) <= 0x06ff || \
  213. ((c) >= 0x0750 && (c) <= 0x077f) || \
  214. (c) >= 0x08a0 ) )
  215. #define IS_ARABIC_ALPHABETIC(c) (IS_ARABIC_CHAR(c) && \
  216. !(IS_HINDI_DIGIT(c) || IS_FARSI_DIGIT(c) || IS_ARABIC_SEPARATOR(c)))
  217. /**
  218. * The codepoint ranges in the following macros are based on the blocks
  219. * allocated, or planned to be allocated, to right-to-left characters in the
  220. * BMP (Basic Multilingual Plane) and SMP (Supplementary Multilingual Plane)
  221. * according to
  222. * http://unicode.org/Public/UNIDATA/extracted/DerivedBidiClass.txt and
  223. * http://www.unicode.org/roadmaps/
  224. */
  225. #define IS_IN_BMP_RTL_BLOCK(c) ((0x590 <= (c)) && ((c) <= 0x8ff))
  226. #define IS_RTL_PRESENTATION_FORM(c) (((0xfb1d <= (c)) && ((c) <= 0xfdff)) || \
  227. ((0xfe70 <= (c)) && ((c) <= 0xfefc)))
  228. #define IS_IN_SMP_RTL_BLOCK(c) (((0x10800 <= (c)) && ((c) <= 0x10fff)) || \
  229. ((0x1e800 <= (c)) && ((c) <= 0x1eFFF)))
  230. #define UCS2_CHAR_IS_BIDI(c) ((IS_IN_BMP_RTL_BLOCK(c)) || \
  231. (IS_RTL_PRESENTATION_FORM(c)) || \
  232. (c) == 0xD802 || (c) == 0xD803)
  233. #define UTF32_CHAR_IS_BIDI(c) ((IS_IN_BMP_RTL_BLOCK(c)) || \
  234. (IS_RTL_PRESENTATION_FORM(c)) || \
  235. (IS_IN_SMP_RTL_BLOCK(c)))
  236. #endif /* nsBidiUtils_h__ */