12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998 |
- /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
- /* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
- #include "nsJISx4051LineBreaker.h"
- #include "jisx4051class.h"
- #include "nsComplexBreaker.h"
- #include "nsTArray.h"
- #include "nsUnicodeProperties.h"
- /*
- Simplification of Pair Table in JIS X 4051
- 1. The Original Table - in 4.1.3
- In JIS X 4051, the pair table is defined as below
- Class of
- Leading Class of Trailing Char Class
- Char
- 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20
- * # * #
- 1 X X X X X X X X X X X X X X X X X X X X X E
- 2 X X X X X X
- 3 X X X X X X
- 4 X X X X X X
- 5 X X X X X X
- 6 X X X X X X
- 7 X X X X X X X
- 8 X X X X X X E
- 9 X X X X X X
- 10 X X X X X X
- 11 X X X X X X
- 12 X X X X X X
- 13 X X X X X X X
- 14 X X X X X X X
- 15 X X X X X X X X X
- 16 X X X X X X X X
- 17 X X X X X E
- 18 X X X X X X X X X
- 19 X E E E E E X X X X X X X X X X X X E X E E
- 20 X X X X X E
- * Same Char
- # Other Char
- X Cannot Break
- The classes mean:
- 1: Open parenthesis
- 2: Close parenthesis
- 3: Prohibit a line break before
- 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
- 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
- 6: Full stop
- 7: Non-breakable between same characters
- 8: Prefix (e.g., "$", "NO.")
- 9: Postfix (e.g., "%")
- 10: Ideographic space
- 11: Hiragana
- 12: Japanese characters (except class 11)
- 13: Subscript
- 14: Ruby
- 15: Numeric
- 16: Alphabet
- 17: Space for Western language
- 18: Western characters (except class 17)
- 19: Split line note (Warichu) begin quote
- 20: Split line note (Warichu) end quote
- 2. Simplified by removing the classes which we do not care about
- However, since we do not care about class 13 (Subscript), 14 (Ruby),
- 16 (Alphabet), 19 (split line note begin quote), and 20 (split line note end
- quote), we can simplify this pair table into the following
- Class of
- Leading Class of Trailing Char Class
- Char
- 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18
- 1 X X X X X X X X X X X X X X X
- 2 X X X X X
- 3 X X X X X
- 4 X X X X X
- 5 X X X X X
- 6 X X X X X
- 7 X X X X X X
- 8 X X X X X X
- 9 X X X X X
- 10 X X X X X
- 11 X X X X X
- 12 X X X X X
- 15 X X X X X X X X
- 17 X X X X X
- 18 X X X X X X X
- 3. Simplified by merging classes
- After the second simplification, the pair table has some duplication
- a. classes 2, 3, 4, 5, 6 are the same - we can merge them
- b. classes 10, 11, 12, 17 are the same - we can merge them
- Class of
- Leading Class of Trailing Char Class
- Char
- 1 [a] 7 8 9 [b]15 18
- 1 X X X X X X X X
- [a] X
- 7 X X
- 8 X X
- 9 X
- [b] X
- 15 X X X X
- 18 X X X
- 4. We add COMPLEX characters and make them breakable with all other classes
- except after class 1 and before class [a]
- Class of
- Leading Class of Trailing Char Class
- Char
- 1 [a] 7 8 9 [b]15 18 COMPLEX
- 1 X X X X X X X X X
- [a] X
- 7 X X
- 8 X X
- 9 X
- [b] X
- 15 X X X X
- 18 X X X
- COMPLEX X T
- T : need special handling
- 5. However, we need two special classes for some punctuation/parentheses
- whose breaking rules are like those of the character class (18), see bug 389056.
- We also need them for punctuation-like characters that behave the same as
- class 18 but are not letters in any language (e.g., '_').
- [c]. Based on open parenthesis class (1), but it is not breakable after
- character class (18) or numeric class (15).
- [d]. Based on close parenthesis (or punctuation) class (2), but it is not
- breakable before character class (18) or numeric class (15).
- Class of
- Leading Class of Trailing Char Class
- Char
- 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d]
- 1 X X X X X X X X X X X
- [a] X X X
- 7 X X
- 8 X X
- 9 X
- [b] X X
- 15 X X X X X X
- 18 X X X X X
- COMPLEX X T
- [c] X X X X X X X X X X X
- [d] X X X X
- 6. And Unicode has "NON-BREAK" characters. Lines should not be broken around
- them. But JIS X 4051 has no such class; therefore, we create [e].
- Class of
- Leading Class of Trailing Char Class
- Char
- 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
- 1 X X X X X X X X X X X X
- [a] X X X
- 7 X X X
- 8 X X X
- 9 X X
- [b] X X X
- 15 X X X X X X X
- 18 X X X X X X
- COMPLEX X T X
- [c] X X X X X X X X X X X X
- [d] X X X X X
- [e] X X X X X X X X X X X X
- 7. Now we use one bit to encode whether it is breakable, and use 2 bytes
- for one row, then the bit table will look like:
- 18 <- 1
- 1 0000 1111 1111 1111 = 0x0FFF
- [a] 0000 1100 0000 0010 = 0x0C02
- 7 0000 1000 0000 0110 = 0x0806
- 8 0000 1000 0100 0010 = 0x0842
- 9 0000 1000 0000 0010 = 0x0802
- [b] 0000 1100 0000 0010 = 0x0C02
- 15 0000 1110 1101 0010 = 0x0ED2
- 18 0000 1110 1100 0010 = 0x0EC2
- COMPLEX 0000 1001 0000 0010 = 0x0902
- [c] 0000 1111 1111 1111 = 0x0FFF
- [d] 0000 1100 1100 0010 = 0x0CC2
- [e] 0000 1111 1111 1111 = 0x0FFF
- */
// Number of simplified character classes; each pair table has one row per
// class.
#define MAX_CLASSES 12

// Normal pair table. The row index is the class of the leading character;
// bit N of a row corresponds to trailing class N. A set bit means the pair
// cannot be broken (GetPair() reports "breakable" when the bit is clear).
static const uint16_t gPair[MAX_CLASSES] = {
  0x0FFF,  // 0x0: 1
  0x0C02,  // 0x1: [a]
  0x0806,  // 0x2: 7
  0x0842,  // 0x3: 8
  0x0802,  // 0x4: 9
  0x0C02,  // 0x5: [b]
  0x0ED2,  // 0x6: 15
  0x0EC2,  // 0x7: 18
  0x0902,  // 0x8: COMPLEX
  0x0FFF,  // 0x9: [c]
  0x0CC2,  // 0xA: [d]
  0x0FFF   // 0xB: [e]
};
- /*
- 8. And if the character is not far enough from the word start, the word end
- and another break point, we should not break in non-CJK languages.
- I.e., don't break around 15, 18, [c] and [d], but don't change
- that if they are related to [b].
- Class of
- Leading Class of Trailing Char Class
- Char
- 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e]
- 1 X X X X X X X X X X X X
- [a] X X X X X X
- 7 X X X X X X X
- 8 X X X X X X
- 9 X X X X X X
- [b] X X X
- 15 X X X X X X X X X X X
- 18 X X X X X X X X X X X
- COMPLEX X X X T X X X
- [c] X X X X X X X X X X X X
- [d] X X X X X X X X X X X
- [e] X X X X X X X X X X X X
- 18 <- 1
- 1 0000 1111 1111 1111 = 0x0FFF
- [a] 0000 1110 1100 0010 = 0x0EC2
- 7 0000 1110 1100 0110 = 0x0EC6
- 8 0000 1110 1100 0010 = 0x0EC2
- 9 0000 1110 1100 0010 = 0x0EC2
- [b] 0000 1100 0000 0010 = 0x0C02
- 15 0000 1111 1101 1111 = 0x0FDF
- 18 0000 1111 1101 1111 = 0x0FDF
- COMPLEX 0000 1111 1100 0010 = 0x0FC2
- [c] 0000 1111 1111 1111 = 0x0FFF
- [d] 0000 1111 1101 1111 = 0x0FDF
- [e] 0000 1111 1111 1111 = 0x0FFF
- */
- static const uint16_t gPairConservative[MAX_CLASSES] = {
- 0x0FFF,
- 0x0EC2,
- 0x0EC6,
- 0x0EC2,
- 0x0EC2,
- 0x0C02,
- 0x0FDF,
- 0x0FDF,
- 0x0FC2,
- 0x0FFF,
- 0x0FDF,
- 0x0FFF
- };
- /*
- 9. Now we map the class to number
- 0: 1
- 1: [a]- 2, 3, 4, 5, 6
- 2: 7
- 3: 8
- 4: 9
- 5: [b]- 10, 11, 12, 17
- 6: 15
- 7: 18
- 8: COMPLEX
- 9: [c]
- A: [d]
- B: [e]
- and they mean:
- 0: Open parenthesis
- 1: Punctuation that prohibits break before
- 2: Non-breakable between same classes
- 3: Prefix
- 4: Postfix
- 5: Breakable character (Spaces and Most Japanese characters)
- 6: Numeric
- 7: Characters
- 8: Need special handling characters (E.g., Thai)
- 9: Open parentheses like Character (See bug 389056)
- A: Close parenthese (or punctuations) like Character (See bug 389056)
- B: Non breakable (See bug 390920)
- */
// Sentinel used before any character has been classified; it is not one of
// the MAX_CLASSES table classes.
#define CLASS_NONE                             INT8_MAX

// Simplified character classes. The values are used as row and bit indices
// of the pair tables.
#define CLASS_OPEN                             0x00
#define CLASS_CLOSE                            0x01
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
#define CLASS_PREFIX                           0x03
#define CLASS_POSTFFIX                         0x04
#define CLASS_BREAKABLE                        0x05
#define CLASS_NUMERIC                          0x06
#define CLASS_CHARACTER                        0x07
#define CLASS_COMPLEX                          0x08
#define CLASS_OPEN_LIKE_CHARACTER              0x09
#define CLASS_CLOSE_LIKE_CHARACTER             0x0A
#define CLASS_NON_BREAKABLE                    0x0B

// Individual UTF-16 code units referred to by the contextual rules.
#define U_NULL              char16_t(0x0000)
#define U_SLASH             char16_t('/')
#define U_SPACE             char16_t(' ')
#define U_HYPHEN            char16_t('-')
#define U_EQUAL             char16_t('=')
#define U_PERCENT           char16_t('%')
#define U_AMPERSAND         char16_t('&')
#define U_SEMICOLON         char16_t(';')
#define U_BACKSLASH         char16_t('\\')
#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
#define U_OPEN_GUILLEMET    char16_t(0x00AB)

// True for the characters whose class depends on their neighbours; such
// characters are classified by ContextualAnalysis() rather than GetClass().
// Note that the argument is evaluated more than once.
#define NEED_CONTEXTUAL_ANALYSIS(c) (IS_HYPHEN(c)                || \
                                     (c) == U_SLASH              || \
                                     (c) == U_PERCENT            || \
                                     (c) == U_AMPERSAND          || \
                                     (c) == U_SEMICOLON          || \
                                     (c) == U_BACKSLASH          || \
                                     (c) == U_OPEN_SINGLE_QUOTE  || \
                                     (c) == U_OPEN_DOUBLE_QUOTE  || \
                                     (c) == U_OPEN_GUILLEMET)

// True for U+0030..U+0039. Note that the argument is evaluated twice.
#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
// Looks up the 4-bit class stored for index |l| in the packed table |t|.
// Every uint32_t element holds eight entries, the lowest nibble first.
static inline int
GETCLASSFROMTABLE(const uint32_t* t, uint16_t l)
{
  const uint32_t packed = t[l >> 3];         // element holding entry |l|
  const int nibbleShift = (l & 0x0007) << 2; // 4 bits per entry
  return (packed >> nibbleShift) & 0x000f;
}
// Non-zero when |u| lies in U+FF66..U+FF70; GetClass() maps these halfwidth
// code points to CLASS_CLOSE ("jis x4051 class 3").
static inline int
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u)
{
  if (u < 0xff66) {
    return 0;
  }
  return u <= 0xff70;
}
// Non-zero when |u| falls in one of the four code-unit ranges this file
// treats as CJK text.
static inline int
IS_CJK_CHAR(char16_t u)
{
  if (u >= 0x1100 && u <= 0x11ff) {
    return 1;
  }
  if (u >= 0x2e80 && u <= 0xd7ff) {
    return 1;
  }
  if (u >= 0xf900 && u <= 0xfaff) {
    return 1;
  }
  return u >= 0xff00 && u <= 0xffef;
}
// True for the two space characters that must not introduce a break.
static inline bool
IS_NONBREAKABLE_SPACE(char16_t u)
{
  switch (u) {
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2007:  // FIGURE SPACE
      return true;
    default:
      return false;
  }
}
// True when |u| is one of the hyphen/dash characters that are classified by
// contextual analysis.
//
// The parameter is a full code point rather than a UTF-16 code unit:
// NEED_CONTEXTUAL_ANALYSIS() is applied to UCS-4 values, and with a 16-bit
// parameter a supplementary-plane character such as U+1002D or U+12010 would
// be truncated and wrongly reported as a hyphen. Callers passing char16_t
// are unaffected.
static inline bool
IS_HYPHEN(uint32_t u)
{
  return (u == 0x002D || // HYPHEN-MINUS (same value as U_HYPHEN)
          u == 0x058A || // ARMENIAN HYPHEN
          u == 0x2010 || // HYPHEN
          u == 0x2012 || // FIGURE DASH
          u == 0x2013);  // EN DASH
}
- static int8_t
- GetClass(uint32_t u)
- {
- if (u < 0x10000) {
- uint16_t h = u & 0xFF00;
- uint16_t l = u & 0x00ff;
- // Handle 3 range table first
- if (0x0000 == h) {
- return GETCLASSFROMTABLE(gLBClass00, l);
- }
- if (0x1700 == h) {
- return GETCLASSFROMTABLE(gLBClass17, l);
- }
- if (NS_NeedsPlatformNativeHandling(u)) {
- return CLASS_COMPLEX;
- }
- if (0x0E00 == h) {
- return GETCLASSFROMTABLE(gLBClass0E, l);
- }
- if (0x2000 == h) {
- return GETCLASSFROMTABLE(gLBClass20, l);
- }
- if (0x2100 == h) {
- return GETCLASSFROMTABLE(gLBClass21, l);
- }
- if (0x3000 == h) {
- return GETCLASSFROMTABLE(gLBClass30, l);
- }
- if (0xff00 == h) {
- if (l < 0x0060) { // Fullwidth ASCII variant
- return GETCLASSFROMTABLE(gLBClass00, (l+0x20));
- }
- if (l < 0x00a0) { // Halfwidth Katakana variants
- switch (l) {
- case 0x61: return GetClass(0x3002);
- case 0x62: return GetClass(0x300c);
- case 0x63: return GetClass(0x300d);
- case 0x64: return GetClass(0x3001);
- case 0x65: return GetClass(0x30fb);
- case 0x9e: return GetClass(0x309b);
- case 0x9f: return GetClass(0x309c);
- default:
- if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) {
- return CLASS_CLOSE; // jis x4051 class 3
- }
- return CLASS_BREAKABLE; // jis x4051 class 11
- }
- }
- if (l < 0x00e0) {
- return CLASS_CHARACTER; // Halfwidth Hangul variants
- }
- if (l < 0x00f0) {
- static char16_t NarrowFFEx[16] = {
- 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
- 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000
- };
- return GetClass(NarrowFFEx[l - 0x00e0]);
- }
- } else if (0x3100 == h) {
- if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun
- // XXX: This is per UAX #14, but UAX #14 may change
- // the line breaking rules about Kanbun and Bopomofo.
- return CLASS_BREAKABLE;
- }
- if (l >= 0xf0) { // Katakana small letters for Ainu
- return CLASS_CLOSE;
- }
- } else if (0x0300 == h) {
- if (0x4F == l || (0x5C <= l && l <= 0x62)) {
- return CLASS_NON_BREAKABLE;
- }
- } else if (0x0500 == h) {
- // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
- if (l == 0x8A) {
- return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
- }
- } else if (0x0F00 == h) {
- if (0x08 == l || 0x0C == l || 0x12 == l) {
- return CLASS_NON_BREAKABLE;
- }
- } else if (0x1800 == h) {
- if (0x0E == l) {
- return CLASS_NON_BREAKABLE;
- }
- } else if (0x1600 == h) {
- if (0x80 == l) { // U+1680 OGHAM SPACE MARK
- return CLASS_BREAKABLE;
- }
- } else if (u == 0xfeff) {
- return CLASS_NON_BREAKABLE;
- }
- }
- // Mapping for Unicode LineBreak.txt classes to the (simplified) set of
- // character classes used here.
- // XXX The mappings here were derived by comparing the Unicode LineBreak
- // values of BMP characters to the classes our existing GetClass returns
- // for the same codepoints; in cases where characters with the same
- // LineBreak class mapped to various classes here, I picked what seemed
- // the most prevalent equivalence.
- // Some of these are unclear to me, but currently they are ONLY used
- // for characters not handled by the old code above, so all the JISx405
- // special cases should already be accounted for.
- static const int8_t sUnicodeLineBreakToClass[] = {
- /* UNKNOWN = 0, [XX] */ CLASS_CHARACTER,
- /* AMBIGUOUS = 1, [AI] */ CLASS_CHARACTER,
- /* ALPHABETIC = 2, [AL] */ CLASS_CHARACTER,
- /* BREAK_BOTH = 3, [B2] */ CLASS_CHARACTER,
- /* BREAK_AFTER = 4, [BA] */ CLASS_CHARACTER,
- /* BREAK_BEFORE = 5, [BB] */ CLASS_OPEN_LIKE_CHARACTER,
- /* MANDATORY_BREAK = 6, [BK] */ CLASS_CHARACTER,
- /* CONTINGENT_BREAK = 7, [CB] */ CLASS_CHARACTER,
- /* CLOSE_PUNCTUATION = 8, [CL] */ CLASS_CHARACTER,
- /* COMBINING_MARK = 9, [CM] */ CLASS_CHARACTER,
- /* CARRIAGE_RETURN = 10, [CR] */ CLASS_BREAKABLE,
- /* EXCLAMATION = 11, [EX] */ CLASS_CHARACTER,
- /* GLUE = 12, [GL] */ CLASS_NON_BREAKABLE,
- /* HYPHEN = 13, [HY] */ CLASS_CHARACTER,
- /* IDEOGRAPHIC = 14, [ID] */ CLASS_BREAKABLE,
- /* INSEPARABLE = 15, [IN] */ CLASS_CLOSE_LIKE_CHARACTER,
- /* INFIX_NUMERIC = 16, [IS] */ CLASS_CHARACTER,
- /* LINE_FEED = 17, [LF] */ CLASS_BREAKABLE,
- /* NONSTARTER = 18, [NS] */ CLASS_CLOSE_LIKE_CHARACTER,
- /* NUMERIC = 19, [NU] */ CLASS_CHARACTER,
- /* OPEN_PUNCTUATION = 20, [OP] */ CLASS_CHARACTER,
- /* POSTFIX_NUMERIC = 21, [PO] */ CLASS_CHARACTER,
- /* PREFIX_NUMERIC = 22, [PR] */ CLASS_CHARACTER,
- /* QUOTATION = 23, [QU] */ CLASS_CHARACTER,
- /* COMPLEX_CONTEXT = 24, [SA] */ CLASS_CHARACTER,
- /* SURROGATE = 25, [SG] */ CLASS_CHARACTER,
- /* SPACE = 26, [SP] */ CLASS_BREAKABLE,
- /* BREAK_SYMBOLS = 27, [SY] */ CLASS_CHARACTER,
- /* ZWSPACE = 28, [ZW] */ CLASS_BREAKABLE,
- /* NEXT_LINE = 29, [NL] */ CLASS_CHARACTER,
- /* WORD_JOINER = 30, [WJ] */ CLASS_NON_BREAKABLE,
- /* H2 = 31, [H2] */ CLASS_BREAKABLE,
- /* H3 = 32, [H3] */ CLASS_BREAKABLE,
- /* JL = 33, [JL] */ CLASS_CHARACTER,
- /* JT = 34, [JT] */ CLASS_CHARACTER,
- /* JV = 35, [JV] */ CLASS_CHARACTER,
- /* CLOSE_PARENTHESIS = 36, [CP] */ CLASS_CLOSE_LIKE_CHARACTER,
- /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE,
- /* HEBREW_LETTER = 38, [HL] */ CLASS_CHARACTER,
- /* REGIONAL_INDICATOR = 39, [RI] */ CLASS_CHARACTER,
- /* E_BASE = 40, [EB] */ CLASS_BREAKABLE,
- /* E_MODIFIER = 41, [EM] */ CLASS_CHARACTER,
- /* ZWJ = 42, [ZWJ]*/ CLASS_CHARACTER
- };
- static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass),
- "Gecko vs ICU LineBreak class mismatch");
- auto cls = mozilla::unicode::GetLineBreakClass(u);
- MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass));
- return sUnicodeLineBreakToClass[cls];
- }
- static bool
- GetPair(int8_t c1, int8_t c2)
- {
- NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
- NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
- return (0 == ((gPair[c1] >> c2) & 0x0001));
- }
- static bool
- GetPairConservative(int8_t c1, int8_t c2)
- {
- NS_ASSERTION(c1 < MAX_CLASSES ,"illegal classes 1");
- NS_ASSERTION(c2 < MAX_CLASSES ,"illegal classes 2");
- return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
- }
- nsJISx4051LineBreaker::nsJISx4051LineBreaker()
- {
- }
- nsJISx4051LineBreaker::~nsJISx4051LineBreaker()
- {
- }
- NS_IMPL_ISUPPORTS(nsJISx4051LineBreaker, nsILineBreaker)
- class ContextState {
- public:
- ContextState(const char16_t* aText, uint32_t aLength) {
- mUniText = aText;
- mText = nullptr;
- mLength = aLength;
- Init();
- }
- ContextState(const uint8_t* aText, uint32_t aLength) {
- mUniText = nullptr;
- mText = aText;
- mLength = aLength;
- Init();
- }
- uint32_t Length() { return mLength; }
- uint32_t Index() { return mIndex; }
- char16_t GetCharAt(uint32_t aIndex) {
- NS_ASSERTION(aIndex < mLength, "Out of range!");
- return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
- }
- void AdvanceIndex() {
- ++mIndex;
- }
- void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
- // A word of western language should not be broken. But even if the word has
- // only ASCII characters, non-natural context words should be broken, e.g.,
- // URL and file path. For protecting the natural words, we should use
- // conservative breaking rules at following conditions:
- // 1. at near the start of word
- // 2. at near the end of word
- // 3. at near the latest broken point
- // CONSERVATIVE_BREAK_RANGE define the 'near' in characters.
- #define CONSERVATIVE_BREAK_RANGE 6
- bool UseConservativeBreaking(uint32_t aOffset = 0) {
- if (mHasCJKChar)
- return false;
- uint32_t index = mIndex + aOffset;
- bool result = (index < CONSERVATIVE_BREAK_RANGE ||
- mLength - index < CONSERVATIVE_BREAK_RANGE ||
- index - mLastBreakIndex < CONSERVATIVE_BREAK_RANGE);
- if (result || !mHasNonbreakableSpace)
- return result;
- // This text has no-breakable space, we need to check whether the index
- // is near it.
- // Note that index is always larger than CONSERVATIVE_BREAK_RANGE here.
- for (uint32_t i = index; index - CONSERVATIVE_BREAK_RANGE < i; --i) {
- if (IS_NONBREAKABLE_SPACE(GetCharAt(i - 1)))
- return true;
- }
- // Note that index is always less than mLength - CONSERVATIVE_BREAK_RANGE.
- for (uint32_t i = index + 1; i < index + CONSERVATIVE_BREAK_RANGE; ++i) {
- if (IS_NONBREAKABLE_SPACE(GetCharAt(i)))
- return true;
- }
- return false;
- }
- bool HasPreviousEqualsSign() const {
- return mHasPreviousEqualsSign;
- }
- void NotifySeenEqualsSign() {
- mHasPreviousEqualsSign = true;
- }
- bool HasPreviousSlash() const {
- return mHasPreviousSlash;
- }
- void NotifySeenSlash() {
- mHasPreviousSlash = true;
- }
- bool HasPreviousBackslash() const {
- return mHasPreviousBackslash;
- }
- void NotifySeenBackslash() {
- mHasPreviousBackslash = true;
- }
- uint32_t GetPreviousNonHyphenCharacter() const {
- return mPreviousNonHyphenCharacter;
- }
- void NotifyNonHyphenCharacter(uint32_t ch) {
- mPreviousNonHyphenCharacter = ch;
- }
- private:
- void Init() {
- mIndex = 0;
- mLastBreakIndex = 0;
- mPreviousNonHyphenCharacter = U_NULL;
- mHasCJKChar = 0;
- mHasNonbreakableSpace = 0;
- mHasPreviousEqualsSign = false;
- mHasPreviousSlash = false;
- mHasPreviousBackslash = false;
- for (uint32_t i = 0; i < mLength; ++i) {
- char16_t u = GetCharAt(i);
- if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u))
- mHasNonbreakableSpace = 1;
- else if (mUniText && !mHasCJKChar && IS_CJK_CHAR(u))
- mHasCJKChar = 1;
- }
- }
- const char16_t* mUniText;
- const uint8_t* mText;
- uint32_t mIndex;
- uint32_t mLength; // length of text
- uint32_t mLastBreakIndex;
- uint32_t mPreviousNonHyphenCharacter; // The last character we have seen
- // which is not U_HYPHEN
- bool mHasCJKChar; // if the text has CJK character, this is true.
- bool mHasNonbreakableSpace; // if the text has no-breakable space,
- // this is true.
- bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL
- bool mHasPreviousSlash; // True if we have seen a U_SLASH
- bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH
- };
- static int8_t
- ContextualAnalysis(char16_t prev, char16_t cur, char16_t next,
- ContextState &aState)
- {
- // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE.
- if (IS_HYPHEN(cur)) {
- // If next character is hyphen, we don't need to break between them.
- if (IS_HYPHEN(next))
- return CLASS_CHARACTER;
- // If prev and next characters are numeric, it may be in Math context.
- // So, we should not break here.
- bool prevIsNum = IS_ASCII_DIGIT(prev);
- bool nextIsNum = IS_ASCII_DIGIT(next);
- if (prevIsNum && nextIsNum)
- return CLASS_NUMERIC;
- // If one side is numeric and the other is a character, or if both sides are
- // characters, the hyphen should be breakable.
- if (!aState.UseConservativeBreaking(1)) {
- char16_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
- if (prevOfHyphen && next) {
- int8_t prevClass = GetClass(prevOfHyphen);
- int8_t nextClass = GetClass(next);
- bool prevIsNumOrCharOrClose =
- prevIsNum ||
- (prevClass == CLASS_CHARACTER &&
- !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
- prevClass == CLASS_CLOSE ||
- prevClass == CLASS_CLOSE_LIKE_CHARACTER;
- bool nextIsNumOrCharOrOpen =
- nextIsNum ||
- (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
- nextClass == CLASS_OPEN ||
- nextClass == CLASS_OPEN_LIKE_CHARACTER ||
- next == U_OPEN_SINGLE_QUOTE ||
- next == U_OPEN_DOUBLE_QUOTE ||
- next == U_OPEN_GUILLEMET;
- if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
- return CLASS_CLOSE;
- }
- }
- }
- } else {
- aState.NotifyNonHyphenCharacter(cur);
- if (cur == U_SLASH || cur == U_BACKSLASH) {
- // If this is immediately after same char, we should not break here.
- if (prev == cur)
- return CLASS_CHARACTER;
- // If this text has two or more (BACK)SLASHs, this may be file path or URL.
- // Make sure to compute shouldReturn before we notify on this slash.
- bool shouldReturn = !aState.UseConservativeBreaking() &&
- (cur == U_SLASH ?
- aState.HasPreviousSlash() : aState.HasPreviousBackslash());
- if (cur == U_SLASH) {
- aState.NotifySeenSlash();
- } else {
- aState.NotifySeenBackslash();
- }
- if (shouldReturn)
- return CLASS_OPEN;
- } else if (cur == U_PERCENT) {
- // If this is a part of the param of URL, we should break before.
- if (!aState.UseConservativeBreaking()) {
- if (aState.Index() >= 3 &&
- aState.GetCharAt(aState.Index() - 3) == U_PERCENT)
- return CLASS_OPEN;
- if (aState.Index() + 3 < aState.Length() &&
- aState.GetCharAt(aState.Index() + 3) == U_PERCENT)
- return CLASS_OPEN;
- }
- } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
- // If this may be a separator of params of URL, we should break after.
- if (!aState.UseConservativeBreaking(1) &&
- aState.HasPreviousEqualsSign())
- return CLASS_CLOSE;
- } else if (cur == U_OPEN_SINGLE_QUOTE ||
- cur == U_OPEN_DOUBLE_QUOTE ||
- cur == U_OPEN_GUILLEMET) {
- // for CJK usage, we treat these as openers to allow a break before them,
- // but otherwise treat them as normal characters because quote mark usage
- // in various Western languages varies too much; see bug #450088 discussion.
- if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
- return CLASS_OPEN;
- } else {
- NS_ERROR("Forgot to handle the current character!");
- }
- }
- return GetClass(cur);
- }
- int32_t
- nsJISx4051LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
- uint32_t aPos, int8_t aDirection)
- {
- bool textNeedsJISx4051 = false;
- int32_t begin, end;
- for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
- if (IS_CJK_CHAR(aText[begin]) || NS_NeedsPlatformNativeHandling(aText[begin])) {
- textNeedsJISx4051 = true;
- }
- }
- for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
- if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
- textNeedsJISx4051 = true;
- }
- }
- int32_t ret;
- AutoTArray<uint8_t, 2000> breakState;
- if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
- // No complex text character, do not try to do complex line break.
- // (This is required for serializers. See Bug #344816.)
- // Also fall back to this when out of memory.
- if (aDirection < 0) {
- ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
- } else {
- ret = end;
- }
- } else {
- GetJISx4051Breaks(aText + begin, end - begin, nsILineBreaker::kWordBreak_Normal,
- breakState.Elements());
- ret = aPos;
- do {
- ret += aDirection;
- } while (begin < ret && ret < end && !breakState[ret - begin]);
- }
- return ret;
- }
- int32_t
- nsJISx4051LineBreaker::Next(const char16_t* aText, uint32_t aLen,
- uint32_t aPos)
- {
- NS_ASSERTION(aText, "aText shouldn't be null");
- NS_ASSERTION(aLen > aPos, "Bad position passed to nsJISx4051LineBreaker::Next");
- int32_t nextPos = WordMove(aText, aLen, aPos, 1);
- return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
- }
- int32_t
- nsJISx4051LineBreaker::Prev(const char16_t* aText, uint32_t aLen,
- uint32_t aPos)
- {
- NS_ASSERTION(aText, "aText shouldn't be null");
- NS_ASSERTION(aLen >= aPos && aPos > 0,
- "Bad position passed to nsJISx4051LineBreaker::Prev");
- int32_t prevPos = WordMove(aText, aLen, aPos, -1);
- return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
- }
- void
- nsJISx4051LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
- uint8_t aWordBreak,
- uint8_t* aBreakBefore)
- {
- uint32_t cur;
- int8_t lastClass = CLASS_NONE;
- ContextState state(aChars, aLength);
- for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
- uint32_t ch = aChars[cur];
- if (NS_IS_HIGH_SURROGATE(ch)) {
- if (cur + 1 < aLength && NS_IS_LOW_SURROGATE(aChars[cur + 1])) {
- ch = SURROGATE_TO_UCS4(ch, aChars[cur + 1]);
- }
- }
- int8_t cl;
- if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
- cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
- ch,
- cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
- state);
- } else {
- if (ch == U_EQUAL)
- state.NotifySeenEqualsSign();
- state.NotifyNonHyphenCharacter(ch);
- cl = GetClass(ch);
- }
- bool allowBreak = false;
- if (cur > 0) {
- NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
- "Loop should have prevented adjacent complex chars here");
- if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
- allowBreak = (state.UseConservativeBreaking()) ?
- GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
- } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
- allowBreak = true;
- }
- }
- aBreakBefore[cur] = allowBreak;
- if (allowBreak)
- state.NotifyBreakBefore();
- lastClass = cl;
- if (CLASS_COMPLEX == cl) {
- uint32_t end = cur + 1;
- while (end < aLength && CLASS_COMPLEX == GetClass(aChars[end])) {
- ++end;
- }
- NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
- // We have to consider word-break value again for complex characters
- if (aWordBreak != nsILineBreaker::kWordBreak_Normal) {
- // Respect word-break property
- for (uint32_t i = cur; i < end; i++)
- aBreakBefore[i] = (aWordBreak == nsILineBreaker::kWordBreak_BreakAll);
- }
- // restore breakability at chunk begin, which was always set to false
- // by the complex line breaker
- aBreakBefore[cur] = allowBreak;
- cur = end - 1;
- }
- if (ch > 0xffff) {
- // Supplementary-plane character: mark that we cannot break before the
- // trailing low surrogate, and advance past it.
- ++cur;
- aBreakBefore[cur] = false;
- state.AdvanceIndex();
- }
- }
- }
- void
- nsJISx4051LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
- uint8_t aWordBreak,
- uint8_t* aBreakBefore)
- {
- uint32_t cur;
- int8_t lastClass = CLASS_NONE;
- ContextState state(aChars, aLength);
- for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
- char16_t ch = aChars[cur];
- int8_t cl;
- if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
- cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL,
- ch,
- cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
- state);
- } else {
- if (ch == U_EQUAL)
- state.NotifySeenEqualsSign();
- state.NotifyNonHyphenCharacter(ch);
- cl = GetClass(ch);
- }
- bool allowBreak = false;
- if (cur > 0) {
- if (aWordBreak == nsILineBreaker::kWordBreak_Normal) {
- allowBreak = (state.UseConservativeBreaking()) ?
- GetPairConservative(lastClass, cl) : GetPair(lastClass, cl);
- } else if (aWordBreak == nsILineBreaker::kWordBreak_BreakAll) {
- allowBreak = true;
- }
- }
- aBreakBefore[cur] = allowBreak;
- if (allowBreak)
- state.NotifyBreakBefore();
- lastClass = cl;
- }
- }
|