HtmlToTextParser.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
  1. /*
  2. * Copyright 2005 - 2016 Zarafa and its licensors
  3. *
  4. * This program is free software: you can redistribute it and/or modify
  5. * it under the terms of the GNU Affero General Public License, version 3,
  6. * as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU Affero General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU Affero General Public License
  14. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. *
  16. */
  17. #include <kopano/platform.h>
  18. #include <algorithm>
  19. #include <cwctype>
  20. #include <utility>
  21. #include "HtmlToTextParser.h"
  22. #include "HtmlEntity.h"
  23. #include <cwctype>
  24. namespace KC {
  25. CHtmlToTextParser::CHtmlToTextParser(void)
  26. {
  27. tagMap[L"head"] = tagParser(false, &CHtmlToTextParser::parseTagHEAD);
  28. tagMap[L"/head"] = tagParser(false, &CHtmlToTextParser::parseTagBHEAD);
  29. tagMap[L"style"] = tagParser(false, &CHtmlToTextParser::parseTagSTYLE);
  30. tagMap[L"/style"] = tagParser(false, &CHtmlToTextParser::parseTagBSTYLE);
  31. tagMap[L"script"] = tagParser(false, &CHtmlToTextParser::parseTagSCRIPT);
  32. tagMap[L"/script"] = tagParser(false, &CHtmlToTextParser::parseTagBSCRIPT);
  33. tagMap[L"pre"] = tagParser(false, &CHtmlToTextParser::parseTagPRE);
  34. tagMap[L"/pre"] = tagParser(false, &CHtmlToTextParser::parseTagBPRE);
  35. tagMap[L"p"] = tagParser(false, &CHtmlToTextParser::parseTagP);
  36. tagMap[L"/p"] = tagParser(false, &CHtmlToTextParser::parseTagBP);
  37. tagMap[L"a"] = tagParser(true, &CHtmlToTextParser::parseTagA);
  38. tagMap[L"/a"] = tagParser(false, &CHtmlToTextParser::parseTagBA);
  39. tagMap[L"br"] = tagParser(false, &CHtmlToTextParser::parseTagBR);
  40. tagMap[L"tr"] = tagParser(false, &CHtmlToTextParser::parseTagTR);
  41. tagMap[L"/tr"] = tagParser(false, &CHtmlToTextParser::parseTagBTR);
  42. tagMap[L"td"] = tagParser(false, &CHtmlToTextParser::parseTagTDTH);
  43. tagMap[L"th"] = tagParser(false, &CHtmlToTextParser::parseTagTDTH);
  44. tagMap[L"img"] = tagParser(true, &CHtmlToTextParser::parseTagIMG);
  45. tagMap[L"div"] = tagParser(false, &CHtmlToTextParser::parseTagNewLine);
  46. tagMap[L"/div"] = tagParser(false, &CHtmlToTextParser::parseTagNewLine);
  47. tagMap[L"hr"] = tagParser(false, &CHtmlToTextParser::parseTagHR);
  48. tagMap[L"h1"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
  49. tagMap[L"h2"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
  50. tagMap[L"h3"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
  51. tagMap[L"h4"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
  52. tagMap[L"h5"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
  53. tagMap[L"h6"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
  54. tagMap[L"ol"] = tagParser(false, &CHtmlToTextParser::parseTagOL);
  55. tagMap[L"/ol"] = tagParser(false, &CHtmlToTextParser::parseTagPopList);
  56. tagMap[L"ul"] = tagParser(false, &CHtmlToTextParser::parseTagUL);
  57. tagMap[L"/ul"] = tagParser(false, &CHtmlToTextParser::parseTagPopList);
  58. tagMap[L"li"] = tagParser(false, &CHtmlToTextParser::parseTagLI);
  59. tagMap[L"/dl"] = tagParser(false, &CHtmlToTextParser::parseTagPopList);
  60. tagMap[L"dt"] = tagParser(false, &CHtmlToTextParser::parseTagDT);
  61. tagMap[L"dd"] = tagParser(false, &CHtmlToTextParser::parseTagDD);
  62. tagMap[L"dl"] = tagParser(false, &CHtmlToTextParser::parseTagDL);
  63. // @todo check span
  64. }
  65. void CHtmlToTextParser::Init()
  66. {
  67. fScriptMode = false;
  68. fHeadMode = false;
  69. cNewlines = 0;
  70. fStyleMode = false;
  71. fTDTHMode = false;
  72. fPreMode = false;
  73. fTextMode = false;
  74. fAddSpace = false;
  75. strText.clear();
  76. }
  77. bool CHtmlToTextParser::Parse(const WCHAR *lpwHTML)
  78. {
  79. Init();
  80. while(*lpwHTML != 0)
  81. {
  82. if((*lpwHTML == '\n' || *lpwHTML == '\r' || *lpwHTML == '\t') && !fPreMode) {// ignore tabs and newlines
  83. if(fTextMode && !fTDTHMode && !fScriptMode && !fHeadMode && !fStyleMode && (*lpwHTML == '\n' || *lpwHTML == '\r'))
  84. fAddSpace = true;
  85. else
  86. fAddSpace = false;
  87. ++lpwHTML;
  88. continue;
  89. } else if(*lpwHTML == '<' && *lpwHTML+1 != ' ') { // The next char can not be a space!
  90. ++lpwHTML;
  91. parseTag(lpwHTML);
  92. continue;
  93. } else if(*lpwHTML == ' ' && !fPreMode) {
  94. fTextMode = true;
  95. addSpace(false);
  96. ++lpwHTML;
  97. continue;
  98. }
  99. if (fTextMode && fAddSpace)
  100. addSpace(false);
  101. fAddSpace = false;
  102. fTextMode = true;
  103. // if (skippable and not parsed)
  104. if (!(fScriptMode || fHeadMode || fStyleMode)) {
  105. if (parseEntity(lpwHTML))
  106. continue;
  107. addChar(*lpwHTML);
  108. }
  109. ++lpwHTML;
  110. }
  111. return true;
  112. }
  113. std::wstring& CHtmlToTextParser::GetText() {
  114. /*
  115. * Remove all trailing whitespace, but remember if there was the usual
  116. * final newline (since it too counts as whitespace) and retain/restore
  117. * it afterwards.
  118. */
  119. bool lf = false;
  120. auto r = strText.rbegin();
  121. for (; r != strText.rend() && iswspace(*r); ++r)
  122. if (*r == L'\n')
  123. /* \n is sufficient — no need to test for \r too */
  124. lf = true;
  125. strText.erase(r.base(), strText.end());
  126. if (lf)
  127. strText += L"\r\n";
  128. return strText;
  129. }
  130. void CHtmlToTextParser::addNewLine(bool forceLine) {
  131. if (strText.empty())
  132. return;
  133. if (forceLine || cNewlines == 0)
  134. strText += L"\r\n";
  135. ++cNewlines;
  136. }
  137. void CHtmlToTextParser::addChar(WCHAR c) {
  138. if (fScriptMode || fHeadMode || fStyleMode)
  139. return;
  140. strText.push_back(c);
  141. cNewlines = 0;
  142. fTDTHMode = false;
  143. }
  144. void CHtmlToTextParser::addSpace(bool force) {
  145. if(force || (!strText.empty() && *strText.rbegin() != ' ') )
  146. addChar(' ');
  147. }
  148. /**
  149. * @todo validate the entity!!
  150. */
  151. bool CHtmlToTextParser::parseEntity(const WCHAR* &lpwHTML)
  152. {
  153. std::wstring entity;
  154. if(*lpwHTML != '&')
  155. return false;
  156. ++lpwHTML;
  157. if (*lpwHTML == '#') {
  158. int base = 10;
  159. ++lpwHTML;
  160. if (*lpwHTML == 'x') {
  161. ++lpwHTML;
  162. base = 16;
  163. }
  164. for (int i = 0; iswxdigit(*lpwHTML) && *lpwHTML != ';' && i < 10; ++i) {
  165. entity += *lpwHTML;
  166. ++lpwHTML;
  167. }
  168. strText.push_back(wcstoul(entity.c_str(), NULL, base));
  169. } else {
  170. for (int i = 0; *lpwHTML != ';' && *lpwHTML != 0 && i < 10; ++i) {
  171. entity += *lpwHTML;
  172. ++lpwHTML;
  173. }
  174. WCHAR code = CHtmlEntity::toChar(entity.c_str());
  175. if (code > 0)
  176. strText.push_back( code );
  177. }
  178. if(*lpwHTML == ';')
  179. ++lpwHTML;
  180. return true;
  181. }
  182. void CHtmlToTextParser::parseTag(const WCHAR* &lpwHTML)
  183. {
  184. bool bTagName = true;
  185. bool bTagEnd = false;
  186. bool bParseAttrs = false;
  187. MapParser::const_iterator iterTag;
  188. std::wstring tagName;
  189. while (*lpwHTML != 0 && !bTagEnd)
  190. {
  191. if (bTagName && *lpwHTML == '!') {
  192. // HTML comment or doctype detect, ignore all the text
  193. bool fCommentMode = false;
  194. ++lpwHTML;
  195. if (*lpwHTML == '-' && *(lpwHTML+1) == '-') {
  196. fCommentMode = true;
  197. lpwHTML += 2; // Skip over the initial "<!--"
  198. }
  199. while (*lpwHTML != 0) {
  200. if (*lpwHTML != '>') {
  201. ++lpwHTML;
  202. continue;
  203. }
  204. if (!fCommentMode) {
  205. ++lpwHTML; // all others end on the first >
  206. return;
  207. }
  208. if (*(lpwHTML-1) == '-' && *(lpwHTML-2) == '-' ) {
  209. ++lpwHTML; // comment ends with -->
  210. return;
  211. }
  212. ++lpwHTML;
  213. }
  214. } else if (*lpwHTML == '>') {
  215. if(!bTagEnd){
  216. iterTag = tagMap.find(tagName);
  217. bTagEnd = true;
  218. bTagName = false;
  219. }
  220. } else if (*lpwHTML == '<') {
  221. return; // Possible broken HTML, ignore data before
  222. } else if (bTagName) {
  223. if (*lpwHTML == ' ') {
  224. bTagName = false;
  225. iterTag = tagMap.find(tagName);
  226. if (iterTag != tagMap.cend())
  227. bParseAttrs = iterTag->second.bParseAttrs;
  228. } else {
  229. tagName.push_back(towlower(*lpwHTML));
  230. }
  231. } else if (bParseAttrs) {
  232. parseAttributes(lpwHTML);
  233. break;
  234. }
  235. ++lpwHTML;
  236. }
  237. // Parse tag
  238. if (!bTagName && iterTag != tagMap.cend()) {
  239. (this->*iterTag->second.parserMethod)();
  240. fTextMode = false;
  241. }
  242. }
  243. void CHtmlToTextParser::parseAttributes(const WCHAR* &lpwHTML)
  244. {
  245. std::wstring attrName;
  246. std::wstring attrValue;
  247. bool bAttrName = true;
  248. bool bAttrValue = false;
  249. bool bEndTag = false;
  250. MapAttrs mapAttrs;
  251. WCHAR firstQuote = 0;
  252. while(*lpwHTML != 0 && !bEndTag) {
  253. if(*lpwHTML == '>' && bAttrValue) {
  254. bAttrValue = false;
  255. bEndTag = true;
  256. } else if(*lpwHTML == '>' && bAttrName) {
  257. ++lpwHTML;
  258. break; // No attributes or broken attribute detect
  259. } else if(*lpwHTML == '=' && bAttrName) {
  260. bAttrName = false;
  261. bAttrValue = true;
  262. } else if(*lpwHTML == ' ' && bAttrValue && firstQuote == 0) {
  263. if (!attrValue.empty())
  264. bAttrValue = false;
  265. // ignore space
  266. } else if (bAttrValue) {
  267. if(*lpwHTML == '\'' || *lpwHTML == '\"') {
  268. if (firstQuote == 0) {
  269. firstQuote = *lpwHTML++;
  270. continue; // Don't add the quote!
  271. } else if (firstQuote == *lpwHTML) {
  272. bAttrValue = false;
  273. }
  274. }
  275. if(bAttrValue)
  276. attrValue.push_back(*lpwHTML);
  277. } else if (bAttrName) {
  278. attrName.push_back(towlower(*lpwHTML));
  279. }
  280. if(!bAttrName && !bAttrValue) {
  281. mapAttrs[std::move(attrName)] = std::move(attrValue);
  282. firstQuote = 0;
  283. bAttrName = true;
  284. bAttrValue = false;
  285. attrValue.clear();
  286. attrName.clear();
  287. }
  288. ++lpwHTML;
  289. }
  290. stackAttrs.push(std::move(mapAttrs));
  291. }
  292. void CHtmlToTextParser::parseTagP()
  293. {
  294. if (cNewlines < 2 && !fTDTHMode) {
  295. addNewLine( false );
  296. addNewLine( true );
  297. }
  298. }
  299. void CHtmlToTextParser::parseTagBP() {
  300. addNewLine( false );
  301. addNewLine( true );
  302. }
  303. void CHtmlToTextParser::parseTagBR()
  304. {
  305. addNewLine( true );
  306. }
  307. void CHtmlToTextParser::parseTagTR()
  308. {
  309. _TableRow t;
  310. t.bFirstCol = true;
  311. addNewLine( false );
  312. stackTableRow.push(t);
  313. }
  314. void CHtmlToTextParser::parseTagBTR()
  315. {
  316. if(!stackTableRow.empty())
  317. stackTableRow.pop();
  318. }
  319. void CHtmlToTextParser::parseTagTDTH()
  320. {
  321. if (!stackTableRow.empty() && stackTableRow.top().bFirstCol == true)
  322. stackTableRow.top().bFirstCol = false;
  323. else
  324. addChar('\t');
  325. fTDTHMode = true;
  326. }
  327. void CHtmlToTextParser::parseTagIMG()
  328. {
  329. if (addURLAttribute(L"src", true)) {
  330. cNewlines = 0;
  331. fTDTHMode = false;
  332. }
  333. if (!stackAttrs.empty())
  334. stackAttrs.pop();
  335. }
  336. void CHtmlToTextParser::parseTagA() {
  337. // nothing todo, only because we want to parse the tag A attributes
  338. }
  339. void CHtmlToTextParser::parseTagBA()
  340. {
  341. if (addURLAttribute(L"href")) {
  342. cNewlines = 0;
  343. fTDTHMode = false;
  344. }
  345. if(!stackAttrs.empty())
  346. stackAttrs.pop();
  347. }
  348. bool CHtmlToTextParser::addURLAttribute(const WCHAR *lpattr, bool bSpaces) {
  349. MapAttrs::const_iterator iter;
  350. if (stackAttrs.empty())
  351. return false;
  352. iter = stackAttrs.top().find(lpattr);
  353. if (iter == stackAttrs.top().cend())
  354. return false;
  355. if (wcsncasecmp(iter->second.c_str(), L"http:", 5) != 0 &&
  356. wcsncasecmp(iter->second.c_str(), L"ftp:", 4) != 0 &&
  357. wcsncasecmp(iter->second.c_str(), L"mailto:", 7) != 0)
  358. return false;
  359. addSpace(false);
  360. strText.append(L"<");
  361. strText.append(iter->second);
  362. strText.append(L">");
  363. addSpace(false);
  364. return true;
  365. }
  366. void CHtmlToTextParser::parseTagSCRIPT() {
  367. fScriptMode = true;
  368. }
  369. void CHtmlToTextParser::parseTagBSCRIPT() {
  370. fScriptMode = false;
  371. }
  372. void CHtmlToTextParser::parseTagSTYLE() {
  373. fStyleMode = true;
  374. }
  375. void CHtmlToTextParser::parseTagBSTYLE() {
  376. fStyleMode = false;
  377. }
  378. void CHtmlToTextParser::parseTagHEAD() {
  379. fHeadMode = true;
  380. }
  381. void CHtmlToTextParser::parseTagBHEAD() {
  382. fHeadMode = false;
  383. }
  384. void CHtmlToTextParser::parseTagNewLine() {
  385. addNewLine( false );
  386. }
  387. void CHtmlToTextParser::parseTagHR() {
  388. addNewLine( false );
  389. strText.append(L"--------------------------------");
  390. addNewLine( true );
  391. }
  392. void CHtmlToTextParser::parseTagHeading() {
  393. addNewLine( false );
  394. addNewLine( true );
  395. }
  396. void CHtmlToTextParser::parseTagPopList() {
  397. if (!listInfoStack.empty())
  398. listInfoStack.pop();
  399. addNewLine( false );
  400. }
  401. void CHtmlToTextParser::parseTagOL() {
  402. listInfo.mode = lmOrdered;
  403. listInfo.count = 1;
  404. listInfoStack.push(listInfo);
  405. }
  406. void CHtmlToTextParser::parseTagUL() {
  407. listInfo.mode = lmUnordered;
  408. listInfo.count = 1;
  409. listInfoStack.push(listInfo);
  410. }
  411. static std::wstring inttostring(unsigned int x) {
  412. WCHAR buf[33];
  413. swprintf(buf, 33, L"%u", x);
  414. return buf;
  415. }
  416. void CHtmlToTextParser::parseTagLI() {
  417. addNewLine( false );
  418. if (listInfoStack.empty())
  419. return;
  420. for (size_t i = 0; i < listInfoStack.size() - 1; ++i)
  421. strText.append(L"\t");
  422. if (listInfoStack.top().mode == lmOrdered)
  423. strText += inttostring(listInfoStack.top().count++) + L".";
  424. else
  425. strText.append(L"*");
  426. strText.append(L"\t");
  427. cNewlines = 0;
  428. fTDTHMode = false;
  429. }
  430. void CHtmlToTextParser::parseTagDT() {
  431. addNewLine( false );
  432. if (listInfoStack.empty())
  433. return;
  434. for (size_t i = 0; i < listInfoStack.size() - 1; ++i)
  435. strText.append(L"\t");
  436. }
  437. void CHtmlToTextParser::parseTagDD() {
  438. addNewLine( false );
  439. if (listInfoStack.empty())
  440. return;
  441. for (size_t i = 0; i < listInfoStack.size(); ++i)
  442. strText.append(L"\t");
  443. }
  444. void CHtmlToTextParser::parseTagDL() {
  445. listInfo.mode = lmDefinition;
  446. listInfo.count = 1;
  447. listInfoStack.push(listInfo);
  448. }
  449. void CHtmlToTextParser::parseTagPRE() {
  450. fPreMode = true;
  451. addNewLine( false );
  452. addNewLine( true );
  453. }
  454. void CHtmlToTextParser::parseTagBPRE() {
  455. fPreMode = false;
  456. addNewLine( false );
  457. addNewLine( true );
  458. }
  459. } /* namespace */