123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544 |
- /*
- * Copyright 2005 - 2016 Zarafa and its licensors
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- */
- #include <kopano/platform.h>
- #include <algorithm>
- #include <cwctype>
- #include <utility>
- #include "HtmlToTextParser.h"
- #include "HtmlEntity.h"
- #include <cwctype>
- namespace KC {
- CHtmlToTextParser::CHtmlToTextParser(void)
- {
- tagMap[L"head"] = tagParser(false, &CHtmlToTextParser::parseTagHEAD);
- tagMap[L"/head"] = tagParser(false, &CHtmlToTextParser::parseTagBHEAD);
- tagMap[L"style"] = tagParser(false, &CHtmlToTextParser::parseTagSTYLE);
- tagMap[L"/style"] = tagParser(false, &CHtmlToTextParser::parseTagBSTYLE);
- tagMap[L"script"] = tagParser(false, &CHtmlToTextParser::parseTagSCRIPT);
- tagMap[L"/script"] = tagParser(false, &CHtmlToTextParser::parseTagBSCRIPT);
- tagMap[L"pre"] = tagParser(false, &CHtmlToTextParser::parseTagPRE);
- tagMap[L"/pre"] = tagParser(false, &CHtmlToTextParser::parseTagBPRE);
- tagMap[L"p"] = tagParser(false, &CHtmlToTextParser::parseTagP);
- tagMap[L"/p"] = tagParser(false, &CHtmlToTextParser::parseTagBP);
- tagMap[L"a"] = tagParser(true, &CHtmlToTextParser::parseTagA);
- tagMap[L"/a"] = tagParser(false, &CHtmlToTextParser::parseTagBA);
- tagMap[L"br"] = tagParser(false, &CHtmlToTextParser::parseTagBR);
- tagMap[L"tr"] = tagParser(false, &CHtmlToTextParser::parseTagTR);
- tagMap[L"/tr"] = tagParser(false, &CHtmlToTextParser::parseTagBTR);
- tagMap[L"td"] = tagParser(false, &CHtmlToTextParser::parseTagTDTH);
- tagMap[L"th"] = tagParser(false, &CHtmlToTextParser::parseTagTDTH);
- tagMap[L"img"] = tagParser(true, &CHtmlToTextParser::parseTagIMG);
- tagMap[L"div"] = tagParser(false, &CHtmlToTextParser::parseTagNewLine);
- tagMap[L"/div"] = tagParser(false, &CHtmlToTextParser::parseTagNewLine);
- tagMap[L"hr"] = tagParser(false, &CHtmlToTextParser::parseTagHR);
- tagMap[L"h1"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
- tagMap[L"h2"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
- tagMap[L"h3"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
- tagMap[L"h4"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
- tagMap[L"h5"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
- tagMap[L"h6"] = tagParser(false, &CHtmlToTextParser::parseTagHeading);
- tagMap[L"ol"] = tagParser(false, &CHtmlToTextParser::parseTagOL);
- tagMap[L"/ol"] = tagParser(false, &CHtmlToTextParser::parseTagPopList);
- tagMap[L"ul"] = tagParser(false, &CHtmlToTextParser::parseTagUL);
- tagMap[L"/ul"] = tagParser(false, &CHtmlToTextParser::parseTagPopList);
- tagMap[L"li"] = tagParser(false, &CHtmlToTextParser::parseTagLI);
-
- tagMap[L"/dl"] = tagParser(false, &CHtmlToTextParser::parseTagPopList);
- tagMap[L"dt"] = tagParser(false, &CHtmlToTextParser::parseTagDT);
- tagMap[L"dd"] = tagParser(false, &CHtmlToTextParser::parseTagDD);
- tagMap[L"dl"] = tagParser(false, &CHtmlToTextParser::parseTagDL);
-
- // @todo check span
- }
- void CHtmlToTextParser::Init()
- {
- fScriptMode = false;
- fHeadMode = false;
- cNewlines = 0;
- fStyleMode = false;
- fTDTHMode = false;
- fPreMode = false;
- fTextMode = false;
- fAddSpace = false;
- strText.clear();
- }
- bool CHtmlToTextParser::Parse(const WCHAR *lpwHTML)
- {
- Init();
- while(*lpwHTML != 0)
- {
- if((*lpwHTML == '\n' || *lpwHTML == '\r' || *lpwHTML == '\t') && !fPreMode) {// ignore tabs and newlines
- if(fTextMode && !fTDTHMode && !fScriptMode && !fHeadMode && !fStyleMode && (*lpwHTML == '\n' || *lpwHTML == '\r'))
- fAddSpace = true;
- else
- fAddSpace = false;
- ++lpwHTML;
- continue;
- } else if(*lpwHTML == '<' && *lpwHTML+1 != ' ') { // The next char can not be a space!
- ++lpwHTML;
- parseTag(lpwHTML);
- continue;
- } else if(*lpwHTML == ' ' && !fPreMode) {
- fTextMode = true;
- addSpace(false);
- ++lpwHTML;
- continue;
- }
- if (fTextMode && fAddSpace)
- addSpace(false);
- fAddSpace = false;
- fTextMode = true;
- // if (skippable and not parsed)
- if (!(fScriptMode || fHeadMode || fStyleMode)) {
- if (parseEntity(lpwHTML))
- continue;
- addChar(*lpwHTML);
- }
- ++lpwHTML;
- }
- return true;
- }
- std::wstring& CHtmlToTextParser::GetText() {
- /*
- * Remove all trailing whitespace, but remember if there was the usual
- * final newline (since it too counts as whitespace) and retain/restore
- * it afterwards.
- */
- bool lf = false;
- auto r = strText.rbegin();
- for (; r != strText.rend() && iswspace(*r); ++r)
- if (*r == L'\n')
- /* \n is sufficient — no need to test for \r too */
- lf = true;
- strText.erase(r.base(), strText.end());
- if (lf)
- strText += L"\r\n";
- return strText;
- }
- void CHtmlToTextParser::addNewLine(bool forceLine) {
- if (strText.empty())
- return;
- if (forceLine || cNewlines == 0)
- strText += L"\r\n";
- ++cNewlines;
- }
- void CHtmlToTextParser::addChar(WCHAR c) {
- if (fScriptMode || fHeadMode || fStyleMode)
- return;
- strText.push_back(c);
- cNewlines = 0;
- fTDTHMode = false;
- }
- void CHtmlToTextParser::addSpace(bool force) {
-
- if(force || (!strText.empty() && *strText.rbegin() != ' ') )
- addChar(' ');
- }
- /**
- * @todo validate the entity!!
- */
- bool CHtmlToTextParser::parseEntity(const WCHAR* &lpwHTML)
- {
- std::wstring entity;
- if(*lpwHTML != '&')
- return false;
- ++lpwHTML;
- if (*lpwHTML == '#') {
- int base = 10;
- ++lpwHTML;
- if (*lpwHTML == 'x') {
- ++lpwHTML;
- base = 16;
- }
- for (int i = 0; iswxdigit(*lpwHTML) && *lpwHTML != ';' && i < 10; ++i) {
- entity += *lpwHTML;
- ++lpwHTML;
- }
- strText.push_back(wcstoul(entity.c_str(), NULL, base));
- } else {
- for (int i = 0; *lpwHTML != ';' && *lpwHTML != 0 && i < 10; ++i) {
- entity += *lpwHTML;
- ++lpwHTML;
- }
- WCHAR code = CHtmlEntity::toChar(entity.c_str());
- if (code > 0)
- strText.push_back( code );
- }
- if(*lpwHTML == ';')
- ++lpwHTML;
- return true;
- }
- void CHtmlToTextParser::parseTag(const WCHAR* &lpwHTML)
- {
- bool bTagName = true;
- bool bTagEnd = false;
- bool bParseAttrs = false;
- MapParser::const_iterator iterTag;
- std::wstring tagName;
- while (*lpwHTML != 0 && !bTagEnd)
- {
- if (bTagName && *lpwHTML == '!') {
-
- // HTML comment or doctype detect, ignore all the text
- bool fCommentMode = false;
- ++lpwHTML;
- if (*lpwHTML == '-' && *(lpwHTML+1) == '-') {
- fCommentMode = true;
- lpwHTML += 2; // Skip over the initial "<!--"
- }
- while (*lpwHTML != 0) {
- if (*lpwHTML != '>') {
- ++lpwHTML;
- continue;
- }
- if (!fCommentMode) {
- ++lpwHTML; // all others end on the first >
- return;
- }
- if (*(lpwHTML-1) == '-' && *(lpwHTML-2) == '-' ) {
- ++lpwHTML; // comment ends with -->
- return;
- }
- ++lpwHTML;
- }
- } else if (*lpwHTML == '>') {
- if(!bTagEnd){
- iterTag = tagMap.find(tagName);
- bTagEnd = true;
- bTagName = false;
- }
- } else if (*lpwHTML == '<') {
- return; // Possible broken HTML, ignore data before
- } else if (bTagName) {
- if (*lpwHTML == ' ') {
- bTagName = false;
- iterTag = tagMap.find(tagName);
- if (iterTag != tagMap.cend())
- bParseAttrs = iterTag->second.bParseAttrs;
- } else {
- tagName.push_back(towlower(*lpwHTML));
- }
- } else if (bParseAttrs) {
- parseAttributes(lpwHTML);
- break;
- }
- ++lpwHTML;
- }
- // Parse tag
- if (!bTagName && iterTag != tagMap.cend()) {
- (this->*iterTag->second.parserMethod)();
- fTextMode = false;
- }
- }
- void CHtmlToTextParser::parseAttributes(const WCHAR* &lpwHTML)
- {
- std::wstring attrName;
- std::wstring attrValue;
- bool bAttrName = true;
- bool bAttrValue = false;
- bool bEndTag = false;
- MapAttrs mapAttrs;
- WCHAR firstQuote = 0;
- while(*lpwHTML != 0 && !bEndTag) {
- if(*lpwHTML == '>' && bAttrValue) {
- bAttrValue = false;
- bEndTag = true;
- } else if(*lpwHTML == '>' && bAttrName) {
- ++lpwHTML;
- break; // No attributes or broken attribute detect
- } else if(*lpwHTML == '=' && bAttrName) {
- bAttrName = false;
- bAttrValue = true;
- } else if(*lpwHTML == ' ' && bAttrValue && firstQuote == 0) {
- if (!attrValue.empty())
- bAttrValue = false;
- // ignore space
- } else if (bAttrValue) {
- if(*lpwHTML == '\'' || *lpwHTML == '\"') {
- if (firstQuote == 0) {
- firstQuote = *lpwHTML++;
- continue; // Don't add the quote!
- } else if (firstQuote == *lpwHTML) {
- bAttrValue = false;
- }
- }
- if(bAttrValue)
- attrValue.push_back(*lpwHTML);
- } else if (bAttrName) {
- attrName.push_back(towlower(*lpwHTML));
- }
- if(!bAttrName && !bAttrValue) {
- mapAttrs[std::move(attrName)] = std::move(attrValue);
- firstQuote = 0;
- bAttrName = true;
- bAttrValue = false;
- attrValue.clear();
- attrName.clear();
- }
- ++lpwHTML;
- }
- stackAttrs.push(std::move(mapAttrs));
- }
- void CHtmlToTextParser::parseTagP()
- {
- if (cNewlines < 2 && !fTDTHMode) {
- addNewLine( false );
- addNewLine( true );
- }
- }
- void CHtmlToTextParser::parseTagBP() {
- addNewLine( false );
- addNewLine( true );
- }
- void CHtmlToTextParser::parseTagBR()
- {
- addNewLine( true );
- }
- void CHtmlToTextParser::parseTagTR()
- {
- _TableRow t;
- t.bFirstCol = true;
- addNewLine( false );
- stackTableRow.push(t);
- }
- void CHtmlToTextParser::parseTagBTR()
- {
- if(!stackTableRow.empty())
- stackTableRow.pop();
- }
- void CHtmlToTextParser::parseTagTDTH()
- {
- if (!stackTableRow.empty() && stackTableRow.top().bFirstCol == true)
- stackTableRow.top().bFirstCol = false;
- else
- addChar('\t');
- fTDTHMode = true;
- }
- void CHtmlToTextParser::parseTagIMG()
- {
- if (addURLAttribute(L"src", true)) {
- cNewlines = 0;
- fTDTHMode = false;
- }
- if (!stackAttrs.empty())
- stackAttrs.pop();
- }
- void CHtmlToTextParser::parseTagA() {
- // nothing todo, only because we want to parse the tag A attributes
- }
- void CHtmlToTextParser::parseTagBA()
- {
- if (addURLAttribute(L"href")) {
- cNewlines = 0;
- fTDTHMode = false;
- }
- if(!stackAttrs.empty())
- stackAttrs.pop();
- }
- bool CHtmlToTextParser::addURLAttribute(const WCHAR *lpattr, bool bSpaces) {
- MapAttrs::const_iterator iter;
- if (stackAttrs.empty())
- return false;
- iter = stackAttrs.top().find(lpattr);
- if (iter == stackAttrs.top().cend())
- return false;
- if (wcsncasecmp(iter->second.c_str(), L"http:", 5) != 0 &&
- wcsncasecmp(iter->second.c_str(), L"ftp:", 4) != 0 &&
- wcsncasecmp(iter->second.c_str(), L"mailto:", 7) != 0)
- return false;
- addSpace(false);
- strText.append(L"<");
- strText.append(iter->second);
- strText.append(L">");
- addSpace(false);
- return true;
- }
- void CHtmlToTextParser::parseTagSCRIPT() {
- fScriptMode = true;
- }
- void CHtmlToTextParser::parseTagBSCRIPT() {
- fScriptMode = false;
- }
- void CHtmlToTextParser::parseTagSTYLE() {
- fStyleMode = true;
- }
- void CHtmlToTextParser::parseTagBSTYLE() {
- fStyleMode = false;
- }
- void CHtmlToTextParser::parseTagHEAD() {
- fHeadMode = true;
- }
- void CHtmlToTextParser::parseTagBHEAD() {
- fHeadMode = false;
- }
- void CHtmlToTextParser::parseTagNewLine() {
- addNewLine( false );
- }
- void CHtmlToTextParser::parseTagHR() {
- addNewLine( false );
- strText.append(L"--------------------------------");
- addNewLine( true );
- }
- void CHtmlToTextParser::parseTagHeading() {
- addNewLine( false );
- addNewLine( true );
- }
- void CHtmlToTextParser::parseTagPopList() {
- if (!listInfoStack.empty())
- listInfoStack.pop();
- addNewLine( false );
- }
- void CHtmlToTextParser::parseTagOL() {
- listInfo.mode = lmOrdered;
- listInfo.count = 1;
- listInfoStack.push(listInfo);
- }
- void CHtmlToTextParser::parseTagUL() {
- listInfo.mode = lmUnordered;
- listInfo.count = 1;
- listInfoStack.push(listInfo);
- }
/**
 * Format an unsigned integer as a decimal wide string.
 *
 * Uses std::to_wstring() instead of formatting into a fixed-size buffer
 * with swprintf(); the result for unsigned values is the same "%u"
 * rendering.
 *
 * @param x	value to format
 * @return	decimal representation of @x
 */
static std::wstring inttostring(unsigned int x) {
	return std::to_wstring(x);
}
- void CHtmlToTextParser::parseTagLI() {
- addNewLine( false );
- if (listInfoStack.empty())
- return;
- for (size_t i = 0; i < listInfoStack.size() - 1; ++i)
- strText.append(L"\t");
- if (listInfoStack.top().mode == lmOrdered)
- strText += inttostring(listInfoStack.top().count++) + L".";
- else
- strText.append(L"*");
- strText.append(L"\t");
- cNewlines = 0;
- fTDTHMode = false;
- }
- void CHtmlToTextParser::parseTagDT() {
- addNewLine( false );
- if (listInfoStack.empty())
- return;
- for (size_t i = 0; i < listInfoStack.size() - 1; ++i)
- strText.append(L"\t");
- }
- void CHtmlToTextParser::parseTagDD() {
- addNewLine( false );
- if (listInfoStack.empty())
- return;
- for (size_t i = 0; i < listInfoStack.size(); ++i)
- strText.append(L"\t");
- }
- void CHtmlToTextParser::parseTagDL() {
- listInfo.mode = lmDefinition;
- listInfo.count = 1;
- listInfoStack.push(listInfo);
- }
- void CHtmlToTextParser::parseTagPRE() {
- fPreMode = true;
- addNewLine( false );
- addNewLine( true );
- }
- void CHtmlToTextParser::parseTagBPRE() {
- fPreMode = false;
- addNewLine( false );
- addNewLine( true );
- }
- } /* namespace */
|