dictbe.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /**
  4. *******************************************************************************
  5. * Copyright (C) 2006-2014, International Business Machines Corporation *
  6. * and others. All Rights Reserved. *
  7. *******************************************************************************
  8. */
  9. #ifndef DICTBE_H
  10. #define DICTBE_H
  11. #include "unicode/utypes.h"
  12. #include "unicode/uniset.h"
  13. #include "unicode/utext.h"
  14. #include "brkeng.h"
  15. #include "hash.h"
  16. #include "mlbe.h"
  17. #include "uvectr32.h"
  18. U_NAMESPACE_BEGIN
  19. class DictionaryMatcher;
  20. class MlBreakEngine;
  21. class Normalizer2;
  22. /*******************************************************************
  23. * DictionaryBreakEngine
  24. */
  25. /**
  26. * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
  27. * dictionary to determine language-specific breaks.</p>
  28. *
  29. * <p>After it is constructed a DictionaryBreakEngine may be shared between
  30. * threads without synchronization.</p>
  31. */
  32. class DictionaryBreakEngine : public LanguageBreakEngine {
  33. private:
  34. /**
  35. * The set of characters handled by this engine
  36. * @internal
  37. */
  38. UnicodeSet fSet;
  39. public:
  40. /**
  41. * <p>Constructor </p>
  42. */
  43. DictionaryBreakEngine();
  44. /**
  45. * <p>Virtual destructor.</p>
  46. */
  47. virtual ~DictionaryBreakEngine();
  48. /**
  49. * <p>Indicate whether this engine handles a particular character for
  50. * a particular kind of break.</p>
  51. *
  52. * @param c A character which begins a run that the engine might handle
  53. * @return true if this engine handles the particular character and break
  54. * type.
  55. */
  56. virtual UBool handles(UChar32 c) const override;
  57. /**
  58. * <p>Find any breaks within a run in the supplied text.</p>
  59. *
  60. * @param text A UText representing the text. The iterator is left at
  61. * the end of the run of characters which the engine is capable of handling
  62. * that starts from the first character in the range.
  63. * @param startPos The start of the run within the supplied text.
  64. * @param endPos The end of the run within the supplied text.
  65. * @param foundBreaks vector of int32_t to receive the break positions
  66. * @param status Information on any errors encountered.
  67. * @return The number of breaks found.
  68. */
  69. virtual int32_t findBreaks( UText *text,
  70. int32_t startPos,
  71. int32_t endPos,
  72. UVector32 &foundBreaks,
  73. UBool isPhraseBreaking,
  74. UErrorCode& status ) const override;
  75. protected:
  76. /**
  77. * <p>Set the character set handled by this engine.</p>
  78. *
  79. * @param set A UnicodeSet of the set of characters handled by the engine
  80. */
  81. virtual void setCharacters( const UnicodeSet &set );
  82. /**
  83. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  84. *
  85. * @param text A UText representing the text
  86. * @param rangeStart The start of the range of dictionary characters
  87. * @param rangeEnd The end of the range of dictionary characters
  88. * @param foundBreaks Output of C array of int32_t break positions, or 0
  89. * @param status Information on any errors encountered.
  90. * @return The number of breaks found
  91. */
  92. virtual int32_t divideUpDictionaryRange( UText *text,
  93. int32_t rangeStart,
  94. int32_t rangeEnd,
  95. UVector32 &foundBreaks,
  96. UBool isPhraseBreaking,
  97. UErrorCode& status) const = 0;
  98. };
  99. /*******************************************************************
  100. * ThaiBreakEngine
  101. */
  102. /**
  103. * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
  104. * dictionary and heuristics to determine Thai-specific breaks.</p>
  105. *
  106. * <p>After it is constructed a ThaiBreakEngine may be shared between
  107. * threads without synchronization.</p>
  108. */
  109. class ThaiBreakEngine : public DictionaryBreakEngine {
  110. private:
  111. /**
  112. * The set of characters handled by this engine
  113. * @internal
  114. */
  115. UnicodeSet fEndWordSet;
  116. UnicodeSet fBeginWordSet;
  117. UnicodeSet fSuffixSet;
  118. UnicodeSet fMarkSet;
  119. DictionaryMatcher *fDictionary;
  120. public:
  121. /**
  122. * <p>Default constructor.</p>
  123. *
  124. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  125. * engine is deleted.
  126. */
  127. ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  128. /**
  129. * <p>Virtual destructor.</p>
  130. */
  131. virtual ~ThaiBreakEngine();
  132. protected:
  133. /**
  134. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  135. *
  136. * @param text A UText representing the text
  137. * @param rangeStart The start of the range of dictionary characters
  138. * @param rangeEnd The end of the range of dictionary characters
  139. * @param foundBreaks Output of C array of int32_t break positions, or 0
  140. * @param status Information on any errors encountered.
  141. * @return The number of breaks found
  142. */
  143. virtual int32_t divideUpDictionaryRange( UText *text,
  144. int32_t rangeStart,
  145. int32_t rangeEnd,
  146. UVector32 &foundBreaks,
  147. UBool isPhraseBreaking,
  148. UErrorCode& status) const override;
  149. };
  150. /*******************************************************************
  151. * LaoBreakEngine
  152. */
  153. /**
  154. * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
  155. * dictionary and heuristics to determine Lao-specific breaks.</p>
  156. *
  157. * <p>After it is constructed a LaoBreakEngine may be shared between
  158. * threads without synchronization.</p>
  159. */
  160. class LaoBreakEngine : public DictionaryBreakEngine {
  161. private:
  162. /**
  163. * The set of characters handled by this engine
  164. * @internal
  165. */
  166. UnicodeSet fEndWordSet;
  167. UnicodeSet fBeginWordSet;
  168. UnicodeSet fMarkSet;
  169. DictionaryMatcher *fDictionary;
  170. public:
  171. /**
  172. * <p>Default constructor.</p>
  173. *
  174. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  175. * engine is deleted.
  176. */
  177. LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  178. /**
  179. * <p>Virtual destructor.</p>
  180. */
  181. virtual ~LaoBreakEngine();
  182. protected:
  183. /**
  184. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  185. *
  186. * @param text A UText representing the text
  187. * @param rangeStart The start of the range of dictionary characters
  188. * @param rangeEnd The end of the range of dictionary characters
  189. * @param foundBreaks Output of C array of int32_t break positions, or 0
  190. * @param status Information on any errors encountered.
  191. * @return The number of breaks found
  192. */
  193. virtual int32_t divideUpDictionaryRange( UText *text,
  194. int32_t rangeStart,
  195. int32_t rangeEnd,
  196. UVector32 &foundBreaks,
  197. UBool isPhraseBreaking,
  198. UErrorCode& status) const override;
  199. };
  200. /*******************************************************************
  201. * BurmeseBreakEngine
  202. */
  203. /**
  204. * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
  205. * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
  206. *
  207. * <p>After it is constructed a BurmeseBreakEngine may be shared between
  208. * threads without synchronization.</p>
  209. */
  210. class BurmeseBreakEngine : public DictionaryBreakEngine {
  211. private:
  212. /**
  213. * The set of characters handled by this engine
  214. * @internal
  215. */
  216. UnicodeSet fEndWordSet;
  217. UnicodeSet fBeginWordSet;
  218. UnicodeSet fMarkSet;
  219. DictionaryMatcher *fDictionary;
  220. public:
  221. /**
  222. * <p>Default constructor.</p>
  223. *
  224. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  225. * engine is deleted.
  226. */
  227. BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  228. /**
  229. * <p>Virtual destructor.</p>
  230. */
  231. virtual ~BurmeseBreakEngine();
  232. protected:
  233. /**
  234. * <p>Divide up a range of known dictionary characters.</p>
  235. *
  236. * @param text A UText representing the text
  237. * @param rangeStart The start of the range of dictionary characters
  238. * @param rangeEnd The end of the range of dictionary characters
  239. * @param foundBreaks Output of C array of int32_t break positions, or 0
  240. * @param status Information on any errors encountered.
  241. * @return The number of breaks found
  242. */
  243. virtual int32_t divideUpDictionaryRange( UText *text,
  244. int32_t rangeStart,
  245. int32_t rangeEnd,
  246. UVector32 &foundBreaks,
  247. UBool isPhraseBreaking,
  248. UErrorCode& status) const override;
  249. };
  250. /*******************************************************************
  251. * KhmerBreakEngine
  252. */
  253. /**
  254. * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
  255. * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
  256. *
  257. * <p>After it is constructed a KhmerBreakEngine may be shared between
  258. * threads without synchronization.</p>
  259. */
  260. class KhmerBreakEngine : public DictionaryBreakEngine {
  261. private:
  262. /**
  263. * The set of characters handled by this engine
  264. * @internal
  265. */
  266. UnicodeSet fEndWordSet;
  267. UnicodeSet fBeginWordSet;
  268. UnicodeSet fMarkSet;
  269. DictionaryMatcher *fDictionary;
  270. public:
  271. /**
  272. * <p>Default constructor.</p>
  273. *
  274. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  275. * engine is deleted.
  276. */
  277. KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  278. /**
  279. * <p>Virtual destructor.</p>
  280. */
  281. virtual ~KhmerBreakEngine();
  282. protected:
  283. /**
  284. * <p>Divide up a range of known dictionary characters.</p>
  285. *
  286. * @param text A UText representing the text
  287. * @param rangeStart The start of the range of dictionary characters
  288. * @param rangeEnd The end of the range of dictionary characters
  289. * @param foundBreaks Output of C array of int32_t break positions, or 0
  290. * @param status Information on any errors encountered.
  291. * @return The number of breaks found
  292. */
  293. virtual int32_t divideUpDictionaryRange( UText *text,
  294. int32_t rangeStart,
  295. int32_t rangeEnd,
  296. UVector32 &foundBreaks,
  297. UBool isPhraseBreaking,
  298. UErrorCode& status) const override;
  299. };
  300. #if !UCONFIG_NO_NORMALIZATION
  301. /*******************************************************************
  302. * CjkBreakEngine
  303. */
  304. //indicates language/script that the CjkBreakEngine will handle
  305. enum LanguageType {
  306. kKorean,
  307. kChineseJapanese
  308. };
  309. /**
  310. * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
  311. * dictionary with costs associated with each word and
  312. * Viterbi decoding to determine CJK-specific breaks.</p>
  313. */
  314. class CjkBreakEngine : public DictionaryBreakEngine {
  315. protected:
  316. /**
  317. * The set of characters handled by this engine
  318. * @internal
  319. */
  320. UnicodeSet fHangulWordSet;
  321. UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
  322. UnicodeSet fClosePunctuationSet;
  323. DictionaryMatcher *fDictionary;
  324. const Normalizer2 *nfkcNorm2;
  325. MlBreakEngine *fMlBreakEngine;
  326. bool isCj;
  327. private:
  328. // Load Japanese extensions.
  329. void loadJapaneseExtensions(UErrorCode& error);
  330. // Load Japanese Hiragana.
  331. void loadHiragana(UErrorCode& error);
  332. // Initialize fSkipSet by loading Japanese Hiragana and extensions.
  333. void initJapanesePhraseParameter(UErrorCode& error);
  334. Hashtable fSkipSet;
  335. public:
  336. /**
  337. * <p>Default constructor.</p>
  338. *
  339. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
  340. * engine is deleted. The DictionaryMatcher must contain costs for each word
  341. * in order for the dictionary to work properly.
  342. */
  343. CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
  344. /**
  345. * <p>Virtual destructor.</p>
  346. */
  347. virtual ~CjkBreakEngine();
  348. protected:
  349. /**
  350. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  351. *
  352. * @param text A UText representing the text
  353. * @param rangeStart The start of the range of dictionary characters
  354. * @param rangeEnd The end of the range of dictionary characters
  355. * @param foundBreaks Output of C array of int32_t break positions, or 0
  356. * @param status Information on any errors encountered.
  357. * @return The number of breaks found
  358. */
  359. virtual int32_t divideUpDictionaryRange( UText *text,
  360. int32_t rangeStart,
  361. int32_t rangeEnd,
  362. UVector32 &foundBreaks,
  363. UBool isPhraseBreaking,
  364. UErrorCode& status) const override;
  365. };
  366. #endif
  367. U_NAMESPACE_END
  368. /* DICTBE_H */
  369. #endif