HtmlToTextParser.h 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. /*
  2. * Copyright 2005 - 2016 Zarafa and its licensors
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. *
  16. */
  17. #pragma once
  18. #include <kopano/zcdefs.h>
  19. #include <string>
  20. #include <map>
  21. #include <stack>
  22. namespace KC {
  23. class _kc_export CHtmlToTextParser _kc_final {
  24. public:
  25. CHtmlToTextParser(void);
  26. bool Parse(const WCHAR *lpwHTML);
  27. std::wstring& GetText();
  28. protected:
  29. _kc_hidden void Init(void);
  30. _kc_hidden void parseTag(const wchar_t *&);
  31. _kc_hidden bool parseEntity(const wchar_t *&);
  32. _kc_hidden void parseAttributes(const wchar_t *&);
  33. _kc_hidden void addChar(wchar_t);
  34. _kc_hidden void addNewLine(bool force_line);
  35. _kc_hidden bool addURLAttribute(const wchar_t *attr, bool spaces = false);
  36. _kc_hidden void addSpace(bool force);
  37. //Parse tags
  38. _kc_hidden void parseTagP(void);
  39. _kc_hidden void parseTagBP(void);
  40. _kc_hidden void parseTagBR(void);
  41. _kc_hidden void parseTagTR(void);
  42. _kc_hidden void parseTagBTR(void);
  43. _kc_hidden void parseTagTDTH(void);
  44. _kc_hidden void parseTagIMG(void);
  45. _kc_hidden void parseTagA(void);
  46. _kc_hidden void parseTagBA(void);
  47. _kc_hidden void parseTagSCRIPT(void);
  48. _kc_hidden void parseTagBSCRIPT(void);
  49. _kc_hidden void parseTagSTYLE(void);
  50. _kc_hidden void parseTagBSTYLE(void);
  51. _kc_hidden void parseTagHEAD(void);
  52. _kc_hidden void parseTagBHEAD(void);
  53. _kc_hidden void parseTagNewLine(void);
  54. _kc_hidden void parseTagHR(void);
  55. _kc_hidden void parseTagHeading(void);
  56. _kc_hidden void parseTagPRE(void);
  57. _kc_hidden void parseTagBPRE(void);
  58. _kc_hidden void parseTagOL(void);
  59. _kc_hidden void parseTagUL(void);
  60. _kc_hidden void parseTagLI(void);
  61. _kc_hidden void parseTagPopList(void);
  62. _kc_hidden void parseTagDL(void);
  63. _kc_hidden void parseTagDT(void);
  64. _kc_hidden void parseTagDD(void);
  65. std::wstring strText;
  66. bool fScriptMode = false;
  67. bool fHeadMode = false;
  68. short cNewlines = 0;
  69. bool fStyleMode = false;
  70. bool fTDTHMode = false;
  71. bool fPreMode = false;
  72. bool fTextMode = false;
  73. bool fAddSpace = false;
  74. typedef void ( CHtmlToTextParser::*ParseMethodType )( void );
  75. struct _kc_hidden tagParser {
  76. tagParser(void) = default;
  77. tagParser(bool bParseAttrs, ParseMethodType parserMethod){
  78. this->bParseAttrs = bParseAttrs;
  79. this->parserMethod = parserMethod;
  80. };
  81. bool bParseAttrs = false;
  82. ParseMethodType parserMethod = nullptr;
  83. };
  84. struct _TableRow {
  85. bool bFirstCol;
  86. };
  87. enum eListMode { lmDefinition, lmOrdered, lmUnordered };
  88. struct ListInfo {
  89. eListMode mode = lmDefinition;
  90. unsigned int count = 0;
  91. };
  92. typedef std::map<std::wstring, tagParser> MapParser;
  93. typedef std::map<std::wstring, std::wstring> MapAttrs;
  94. typedef std::stack<MapAttrs> StackMapAttrs;
  95. typedef std::stack<_TableRow> StackTableRow;
  96. typedef std::stack<ListInfo> ListInfoStack;
  97. StackTableRow stackTableRow;
  98. MapParser tagMap;
  99. StackMapAttrs stackAttrs;
  100. ListInfo listInfo;
  101. ListInfoStack listInfoStack;
  102. };
  103. } /* namespace */