/*
 * Copyright 2005 - 2016 Zarafa and its licensors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

#include <kopano/platform.h>
#include <algorithm>
#include <cwctype>
#include <string>
#include <utility>
#include "HtmlToTextParser.h"
#include "HtmlEntity.h"

namespace KC {

/**
 * Populate the tag dispatch table.
 *
 * Keys are lower-case tag names; closing tags are registered with their
 * leading '/'. Each entry names the handler to run and whether the tag's
 * attributes have to be collected before the handler is called.
 */
CHtmlToTextParser::CHtmlToTextParser(void)
{
	typedef void (CHtmlToTextParser::*Handler)();

	// Tag whose attributes are skipped.
	const auto plain = [this](const wchar_t *tag, Handler fn) {
		tagMap[tag] = tagParser(false, fn);
	};
	// Tag whose attributes are parsed and pushed onto the attribute stack.
	const auto withAttrs = [this](const wchar_t *tag, Handler fn) {
		tagMap[tag] = tagParser(true, fn);
	};

	// Sections whose character data is suppressed
	plain(L"head", &CHtmlToTextParser::parseTagHEAD);
	plain(L"/head", &CHtmlToTextParser::parseTagBHEAD);
	plain(L"style", &CHtmlToTextParser::parseTagSTYLE);
	plain(L"/style", &CHtmlToTextParser::parseTagBSTYLE);
	plain(L"script", &CHtmlToTextParser::parseTagSCRIPT);
	plain(L"/script", &CHtmlToTextParser::parseTagBSCRIPT);

	// Text flow
	plain(L"pre", &CHtmlToTextParser::parseTagPRE);
	plain(L"/pre", &CHtmlToTextParser::parseTagBPRE);
	plain(L"p", &CHtmlToTextParser::parseTagP);
	plain(L"/p", &CHtmlToTextParser::parseTagBP);
	withAttrs(L"a", &CHtmlToTextParser::parseTagA);
	plain(L"/a", &CHtmlToTextParser::parseTagBA);
	plain(L"br", &CHtmlToTextParser::parseTagBR);

	// Tables
	plain(L"tr", &CHtmlToTextParser::parseTagTR);
	plain(L"/tr", &CHtmlToTextParser::parseTagBTR);
	plain(L"td", &CHtmlToTextParser::parseTagTDTH);
	plain(L"th", &CHtmlToTextParser::parseTagTDTH);

	withAttrs(L"img", &CHtmlToTextParser::parseTagIMG);
	plain(L"div", &CHtmlToTextParser::parseTagNewLine);
	plain(L"/div", &CHtmlToTextParser::parseTagNewLine);
	plain(L"hr", &CHtmlToTextParser::parseTagHR);

	// Headings
	plain(L"h1", &CHtmlToTextParser::parseTagHeading);
	plain(L"h2", &CHtmlToTextParser::parseTagHeading);
	plain(L"h3", &CHtmlToTextParser::parseTagHeading);
	plain(L"h4", &CHtmlToTextParser::parseTagHeading);
	plain(L"h5", &CHtmlToTextParser::parseTagHeading);
	plain(L"h6", &CHtmlToTextParser::parseTagHeading);

	// Ordered / unordered lists
	plain(L"ol", &CHtmlToTextParser::parseTagOL);
	plain(L"/ol", &CHtmlToTextParser::parseTagPopList);
	plain(L"ul", &CHtmlToTextParser::parseTagUL);
	plain(L"/ul", &CHtmlToTextParser::parseTagPopList);
	plain(L"li", &CHtmlToTextParser::parseTagLI);

	// Definition lists
	plain(L"/dl", &CHtmlToTextParser::parseTagPopList);
	plain(L"dt", &CHtmlToTextParser::parseTagDT);
	plain(L"dd", &CHtmlToTextParser::parseTagDD);
	plain(L"dl", &CHtmlToTextParser::parseTagDL);

	// @todo check span
}

void CHtmlToTextParser::Init()
{
	fScriptMode = false;
	fHeadMode = false;
	cNewlines = 0;
	fStyleMode = false;
	fTDTHMode = false;
	fPreMode = false;
	fTextMode = false;
	fAddSpace = false;

	strText.clear();
}

/**
 * Convert an HTML document to plain text.
 *
 * The parser state is reset with Init() and the input is then scanned once.
 * Tags are dispatched through parseTag(), character references through
 * parseEntity(), and everything else is appended to the output buffer
 * unless a script, head or style section is active. The result is
 * available from GetText().
 *
 * @param lpwHTML  NUL-terminated wide-character HTML input.
 * @return always true.
 */
bool CHtmlToTextParser::Parse(const WCHAR *lpwHTML)
{
	Init();

	while(*lpwHTML != 0)
	{
		if((*lpwHTML == '\n' || *lpwHTML == '\r' || *lpwHTML == '\t') && !fPreMode) {// ignore tabs and newlines
			// A line break inside running text is remembered so that one
			// space is emitted before the next visible character; a tab
			// cancels a pending space.
			if(fTextMode && !fTDTHMode && !fScriptMode && !fHeadMode && !fStyleMode && (*lpwHTML == '\n' || *lpwHTML == '\r'))
				fAddSpace = true;
			else
				fAddSpace = false;

			++lpwHTML;
			continue;
		} else if(*lpwHTML == '<' && *(lpwHTML + 1) != ' ') { // The next char can not be a space!
			// Look at the character after '<'. Reading it is safe: *lpwHTML
			// is non-zero here, so the next element is at worst the
			// terminator. A '<' followed by a space is treated as text.
			++lpwHTML;
			parseTag(lpwHTML);
			continue;
		} else if(*lpwHTML == ' ' && !fPreMode) {
			// Outside <pre>, addSpace(false) collapses runs of spaces.
			fTextMode = true;
			addSpace(false);
			++lpwHTML;
			continue;
		}
		if (fTextMode && fAddSpace)
			addSpace(false);
		fAddSpace = false;
		fTextMode = true;

		// if (skippable and not parsed)
		if (!(fScriptMode || fHeadMode || fStyleMode)) {
			// parseEntity() advances the pointer itself when it succeeds.
			if (parseEntity(lpwHTML))
				continue;
			addChar(*lpwHTML);
		}
		++lpwHTML;
	}

	return true;
}

std::wstring& CHtmlToTextParser::GetText() {
	/*
	 * Remove all trailing whitespace, but remember if there was the usual
	 * final newline (since it too counts as whitespace) and retain/restore
	 * it afterwards.
	 */
	bool lf = false;
	auto r = strText.rbegin();
	for (; r != strText.rend() && iswspace(*r); ++r)
		if (*r == L'\n')
			/* \n is sufficient — no need to test for \r too */
			lf = true;
	strText.erase(r.base(), strText.end());
	if (lf)
		strText += L"\r\n";
	return strText;
}

/**
 * Request a line break in the output.
 *
 * Nothing happens while the output is still empty. Otherwise "\r\n" is
 * appended when forced, or when no line break has been requested since the
 * last character was added; the request counter is incremented either way.
 *
 * @param forceLine  append the line break even if one was already requested.
 */
void CHtmlToTextParser::addNewLine(bool forceLine) {
	if (!strText.empty()) {
		const bool emit = forceLine || cNewlines == 0;
		if (emit)
			strText.append(L"\r\n");
		++cNewlines;
	}
}

/**
 * Append one character to the output.
 *
 * The character is dropped while a script, head or style section is
 * active. A stored character resets the newline counter and leaves
 * table-cell mode.
 *
 * @param c  character to append.
 */
void CHtmlToTextParser::addChar(WCHAR c) {
	const bool suppressed = fScriptMode || fHeadMode || fStyleMode;
	if (!suppressed) {
		strText += c;
		cNewlines = 0;
		fTDTHMode = false;
	}
}

/**
 * Append a single space through addChar().
 *
 * Unless forced, the space is skipped when the output is empty or already
 * ends in a space, which collapses runs of spaces.
 *
 * @param force  append the space unconditionally.
 */
void CHtmlToTextParser::addSpace(bool force) {
	if (!force) {
		if (strText.empty() || strText.back() == L' ')
			return;
	}
	addChar(L' ');
}

/**
 * @todo validate the entity!!
 */
bool CHtmlToTextParser::parseEntity(const WCHAR* &lpwHTML)
{
	std::wstring entity;

	if(*lpwHTML != '&')
		return false;

	++lpwHTML;

	if (*lpwHTML == '#') {
		int base = 10;

		++lpwHTML;
		if (*lpwHTML == 'x') {
			++lpwHTML;
			base = 16;
		}

		for (int i = 0; iswxdigit(*lpwHTML) && *lpwHTML != ';' && i < 10; ++i) {
			entity += *lpwHTML;
			++lpwHTML;
		}

		strText.push_back(wcstoul(entity.c_str(), NULL, base));
	} else {
		for (int i = 0; *lpwHTML != ';' && *lpwHTML != 0 && i < 10; ++i) {
			entity += *lpwHTML;
			++lpwHTML;
		}

		WCHAR code = CHtmlEntity::toChar(entity.c_str());
		if (code > 0)
			strText.push_back( code );
	}

	if(*lpwHTML == ';')
		++lpwHTML;

	return true;
}

void CHtmlToTextParser::parseTag(const WCHAR* &lpwHTML)
{
	bool bTagName = true;
	bool bTagEnd = false;
	bool bParseAttrs = false;
	MapParser::const_iterator iterTag;

	std::wstring tagName;

	while (*lpwHTML != 0 && !bTagEnd) 
	{
		if (bTagName && *lpwHTML == '!') {
			
			// HTML comment or doctype detect, ignore all the text
			bool fCommentMode = false;
			++lpwHTML;

			if (*lpwHTML == '-' && *(lpwHTML+1) == '-') {
				fCommentMode = true;
				lpwHTML += 2; // Skip over the initial "<!--"
			}

			while (*lpwHTML != 0) {
				if (*lpwHTML != '>') {
					++lpwHTML;
					continue;
				}
				if (!fCommentMode) {
					++lpwHTML; // all others end on the first >
					return;
				}
				if (*(lpwHTML-1) == '-' && *(lpwHTML-2) == '-' ) {
					++lpwHTML; // comment ends with -->
					return;
				}
				++lpwHTML;
			}
		} else if (*lpwHTML == '>') {
			if(!bTagEnd){
				iterTag = tagMap.find(tagName);
				bTagEnd = true;
				bTagName = false;
			}
		} else if (*lpwHTML == '<') {
			return; // Possible broken HTML, ignore data before
		} else if (bTagName) {
			if (*lpwHTML == ' ') {
				bTagName = false;
				iterTag = tagMap.find(tagName);
				if (iterTag != tagMap.cend())
					bParseAttrs = iterTag->second.bParseAttrs;
			} else {
				tagName.push_back(towlower(*lpwHTML));
			}
		} else if (bParseAttrs) {
			parseAttributes(lpwHTML);
			break;
		}

		++lpwHTML;
	}

	// Parse tag
	if (!bTagName && iterTag != tagMap.cend()) {
		(this->*iterTag->second.parserMethod)();
		fTextMode = false;
	}
}

void CHtmlToTextParser::parseAttributes(const WCHAR* &lpwHTML)
{
	std::wstring attrName;
	std::wstring attrValue;
	bool bAttrName = true;
	bool bAttrValue = false;
	bool bEndTag = false;
	MapAttrs mapAttrs;

	WCHAR firstQuote = 0;

	while(*lpwHTML != 0 && !bEndTag) {
		if(*lpwHTML == '>' && bAttrValue) {
				bAttrValue = false;
				bEndTag = true;
		} else if(*lpwHTML == '>' && bAttrName) {
			++lpwHTML;
			break; // No attributes or broken attribute detect
		} else if(*lpwHTML == '=' && bAttrName) {
			bAttrName = false;
			bAttrValue = true;
		} else if(*lpwHTML == ' ' && bAttrValue && firstQuote == 0) {

			if (!attrValue.empty())
				bAttrValue = false;
			// ignore space
		} else if (bAttrValue) {
			if(*lpwHTML == '\'' || *lpwHTML == '\"') {
				if (firstQuote == 0) {
					firstQuote = *lpwHTML++;
					continue; // Don't add the quote!
				} else if (firstQuote == *lpwHTML) {
					bAttrValue = false;
				}
			}

			if(bAttrValue)
				attrValue.push_back(*lpwHTML);
		} else if (bAttrName) {
			attrName.push_back(towlower(*lpwHTML));
		}

		if(!bAttrName && !bAttrValue) {
			mapAttrs[std::move(attrName)] = std::move(attrValue);
			firstQuote = 0;
			bAttrName = true;
			bAttrValue = false;
			attrValue.clear();
			attrName.clear();
		}

		++lpwHTML;
	}

	stackAttrs.push(std::move(mapAttrs));
}

/**
 * Handler for <p>: start a new paragraph.
 *
 * Skipped inside a table cell or when two line breaks were already
 * requested. Otherwise the current line is ended if needed and one forced
 * line break follows.
 */
void CHtmlToTextParser::parseTagP()
{
	if (fTDTHMode || cNewlines >= 2)
		return;

	addNewLine(false);
	addNewLine(true);
}

/**
 * Handler for </p>: end the current line if needed, then add one forced
 * line break.
 */
void CHtmlToTextParser::parseTagBP() {
	addNewLine( false );
	addNewLine( true );
}

/** Handler for <br>: always add a line break (unless the output is still empty). */
void CHtmlToTextParser::parseTagBR()
{
	addNewLine( true );
}

void CHtmlToTextParser::parseTagTR()
{
	_TableRow t;
	t.bFirstCol = true;

	addNewLine( false );
	stackTableRow.push(t);
}

/** Handler for </tr>: drop the innermost row record, if there is one. */
void CHtmlToTextParser::parseTagBTR()
{
	if (stackTableRow.empty())
		return;
	stackTableRow.pop();
}

/**
 * Handler for <td> and <th>.
 *
 * The first cell of the current row only clears the row's first-column
 * flag; every other cell (or a cell outside any row) is preceded by a tab.
 * Table-cell mode is switched on afterwards.
 */
void CHtmlToTextParser::parseTagTDTH()
{
	const bool firstCell = !stackTableRow.empty() && stackTableRow.top().bFirstCol;

	if (firstCell)
		stackTableRow.top().bFirstCol = false;
	else
		addChar(L'\t');

	// Set after addChar(), which clears the flag.
	fTDTHMode = true;
}

/**
 * Handler for <img>: emit the "src" URL if addURLAttribute() accepts it,
 * then discard the attribute map on top of the attribute stack.
 */
void CHtmlToTextParser::parseTagIMG()
{
	const bool emitted = addURLAttribute(L"src", true);
	if (emitted) {
		// The URL was appended directly, so update the flags by hand.
		cNewlines = 0;
		fTDTHMode = false;
	}

	if (stackAttrs.empty())
		return;
	stackAttrs.pop();
}

/**
 * Handler for <a>. Intentionally empty: the tag is registered only so that
 * its attributes are parsed and pushed; the URL is emitted by the handler
 * for </a>.
 */
void CHtmlToTextParser::parseTagA() {
	// Nothing to do; registered only so that the <a> attributes get parsed.
}

/**
 * Handler for </a>: emit the "href" URL from the attribute map on top of
 * the attribute stack if addURLAttribute() accepts it, then discard that
 * map.
 */
void CHtmlToTextParser::parseTagBA()
{
	const bool emitted = addURLAttribute(L"href");
	if (emitted) {
		// The URL was appended directly, so update the flags by hand.
		cNewlines = 0;
		fTDTHMode = false;
	}

	if (stackAttrs.empty())
		return;
	stackAttrs.pop();
}

/**
 * Append the URL held in an attribute of the innermost parsed tag as
 * "<url>", separated from surrounding text by single spaces.
 *
 * Only values starting with "http:", "https:", "ftp:" or "mailto:"
 * (compared case-insensitively) are emitted. "https:" needs its own entry
 * because it does not match the five-character "http:" prefix.
 *
 * @param lpattr   attribute name to look up (lower case, e.g. L"href").
 * @param bSpaces  not used by this implementation.
 * @return true if a URL was appended, false if there are no attributes,
 *         the attribute is missing, or its scheme is not accepted.
 */
bool CHtmlToTextParser::addURLAttribute(const WCHAR *lpattr, bool bSpaces) {
	if (stackAttrs.empty())
		return false;

	const auto &attrs = stackAttrs.top();
	const auto iter = attrs.find(lpattr);
	if (iter == attrs.cend())
		return false;

	const std::wstring &url = iter->second;
	if (wcsncasecmp(url.c_str(), L"http:", 5) != 0 &&
	    wcsncasecmp(url.c_str(), L"https:", 6) != 0 &&
	    wcsncasecmp(url.c_str(), L"ftp:", 4) != 0 &&
	    wcsncasecmp(url.c_str(), L"mailto:", 7) != 0)
		return false;

	addSpace(false);
	strText.append(L"<");
	strText.append(url);
	strText.append(L">");
	addSpace(false);
	return true;
}

/** Handler for <script>: enter script mode, in which addChar() drops characters. */
void CHtmlToTextParser::parseTagSCRIPT() {
	fScriptMode = true;
}

/** Handler for </script>: leave script mode. */
void CHtmlToTextParser::parseTagBSCRIPT() {
	fScriptMode = false;
}

/** Handler for <style>: enter style mode, in which addChar() drops characters. */
void CHtmlToTextParser::parseTagSTYLE() {
	fStyleMode = true;
}

/** Handler for </style>: leave style mode. */
void CHtmlToTextParser::parseTagBSTYLE() {
	fStyleMode = false;
}

/** Handler for <head>: enter head mode, in which addChar() drops characters. */
void CHtmlToTextParser::parseTagHEAD() {
	fHeadMode = true;
}

/** Handler for </head>: leave head mode. */
void CHtmlToTextParser::parseTagBHEAD() {
	fHeadMode = false;
}

/**
 * Handler for <div> and </div>: end the current line unless a line break
 * was already requested since the last character.
 */
void CHtmlToTextParser::parseTagNewLine() {
	addNewLine( false );
}

void CHtmlToTextParser::parseTagHR() {
	addNewLine( false );
	strText.append(L"--------------------------------");
	addNewLine( true );
}
/**
 * Handler for the opening tags <h1> to <h6>: end the current line if
 * needed, then add one forced line break before the heading text.
 */
void CHtmlToTextParser::parseTagHeading() {
	addNewLine( false );
	addNewLine( true );
}

/**
 * Handler for </ol>, </ul> and </dl>: leave the innermost list (if any)
 * and end the current line.
 */
void CHtmlToTextParser::parseTagPopList() {
	// Guard against a closing tag without a matching opening tag.
	if (!listInfoStack.empty())
		listInfoStack.pop();
	addNewLine( false );
}

/**
 * Handler for <ol>: push an ordered-list record whose item counter starts
 * at 1. The member listInfo is used as scratch space for the new record.
 */
void CHtmlToTextParser::parseTagOL() {
	listInfo.mode = lmOrdered;
	listInfo.count = 1;
	listInfoStack.push(listInfo);
}

/**
 * Handler for <ul>: push an unordered-list record. The member listInfo is
 * used as scratch space for the new record.
 */
void CHtmlToTextParser::parseTagUL() {
	listInfo.mode = lmUnordered;
	listInfo.count = 1;
	listInfoStack.push(listInfo);
}

/**
 * Format an unsigned integer as a decimal wide string.
 *
 * Uses std::to_wstring instead of a fixed-size swprintf buffer; the
 * result is the same "%u" representation.
 *
 * @param x  value to format.
 * @return decimal representation of x.
 */
static std::wstring inttostring(unsigned int x) {
	return std::to_wstring(x);
}

/**
 * Handler for <li>: start a new line and write the item marker.
 *
 * Outside any list only the line break is produced. Inside a list the
 * item is indented by one tab per enclosing list, followed by "N." for an
 * ordered list (the counter is advanced) or "*" otherwise, and a tab.
 */
void CHtmlToTextParser::parseTagLI() {
	addNewLine(false);
	if (listInfoStack.empty())
		return;

	// One tab for every list around the innermost one.
	strText.append(listInfoStack.size() - 1, L'\t');

	auto &current = listInfoStack.top();
	if (current.mode == lmOrdered) {
		strText += inttostring(current.count);
		strText += L".";
		++current.count;
	} else {
		strText += L"*";
	}
	strText += L"\t";

	// The marker was appended directly, so update the flags by hand.
	cNewlines = 0;
	fTDTHMode = false;
}

/**
 * Handler for <dt>: start a new line and, inside a list, indent by one tab
 * per enclosing list.
 */
void CHtmlToTextParser::parseTagDT() {
	addNewLine(false);
	if (!listInfoStack.empty())
		strText.append(listInfoStack.size() - 1, L'\t');
}

/**
 * Handler for <dd>: start a new line and, inside a list, indent by one tab
 * per list level (one level deeper than the handler for <dt>).
 */
void CHtmlToTextParser::parseTagDD() {
	addNewLine(false);
	if (!listInfoStack.empty())
		strText.append(listInfoStack.size(), L'\t');
}

/**
 * Handler for <dl>: push a definition-list record. The member listInfo is
 * used as scratch space for the new record.
 */
void CHtmlToTextParser::parseTagDL() {
	listInfo.mode = lmDefinition;
	listInfo.count = 1;
	listInfoStack.push(listInfo);
}

/**
 * Handler for <pre>: switch on preformatted mode, in which Parse() copies
 * tabs, line breaks and spaces verbatim, and separate the block from the
 * preceding text with a forced line break.
 */
void CHtmlToTextParser::parseTagPRE() {
	fPreMode = true;
    addNewLine( false );
    addNewLine( true );
}

/**
 * Handler for </pre>: switch off preformatted mode and separate the block
 * from the following text with a forced line break.
 */
void CHtmlToTextParser::parseTagBPRE() {
	fPreMode = false;
	addNewLine( false );
	addNewLine( true );
}

} /* namespace */