lstmbe.h 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. // © 2021 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. #ifndef LSTMBE_H
  4. #define LSTMBE_H
  5. #include "unicode/utypes.h"
  6. #if !UCONFIG_NO_BREAK_ITERATION
  7. #include "unicode/uniset.h"
  8. #include "unicode/ures.h"
  9. #include "unicode/utext.h"
  10. #include "unicode/utypes.h"
  11. #include "brkeng.h"
  12. #include "dictbe.h"
  13. #include "uvectr32.h"
  14. U_NAMESPACE_BEGIN
  15. class Vectorizer;
  16. struct LSTMData;
  17. /*******************************************************************
  18. * LSTMBreakEngine
  19. */
  20. /**
  21. * <p>LSTMBreakEngine is a kind of DictionaryBreakEngine that uses a
  22. * LSTM to determine language-specific breaks.</p>
  23. *
  24. * <p>After it is constructed a LSTMBreakEngine may be shared between
  25. * threads without synchronization.</p>
  26. */
  27. class LSTMBreakEngine : public DictionaryBreakEngine {
  28. public:
  29. /**
  30. * <p>Constructor.</p>
  31. */
  32. LSTMBreakEngine(const LSTMData* data, const UnicodeSet& set, UErrorCode &status);
  33. /**
  34. * <p>Virtual destructor.</p>
  35. */
  36. virtual ~LSTMBreakEngine();
  37. virtual const char16_t* name() const;
  38. protected:
  39. /**
  40. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  41. *
  42. * @param text A UText representing the text
  43. * @param rangeStart The start of the range of dictionary characters
  44. * @param rangeEnd The end of the range of dictionary characters
  45. * @param foundBreaks Output of C array of int32_t break positions, or 0
  46. * @param status Information on any errors encountered.
  47. * @return The number of breaks found
  48. */
  49. virtual int32_t divideUpDictionaryRange(UText *text,
  50. int32_t rangeStart,
  51. int32_t rangeEnd,
  52. UVector32 &foundBreaks,
  53. UBool isPhraseBreaking,
  54. UErrorCode& status) const override;
  55. private:
  56. const LSTMData* fData;
  57. const Vectorizer* fVectorizer;
  58. };
  59. U_CAPI const LanguageBreakEngine* U_EXPORT2 CreateLSTMBreakEngine(
  60. UScriptCode script, const LSTMData* data, UErrorCode& status);
  61. U_CAPI const LSTMData* U_EXPORT2 CreateLSTMData(
  62. UResourceBundle* rb, UErrorCode& status);
  63. U_CAPI const LSTMData* U_EXPORT2 CreateLSTMDataForScript(
  64. UScriptCode script, UErrorCode& status);
  65. U_CAPI void U_EXPORT2 DeleteLSTMData(const LSTMData* data);
  66. U_CAPI const char16_t* U_EXPORT2 LSTMDataName(const LSTMData* data);
  67. U_NAMESPACE_END
  68. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  69. #endif /* LSTMBE_H */