12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800 |
- /*
- * Timeless dependency
- * Copyright (C) <2019> <alkeon> [alkeon@autistici.org]
- *
- * Texdi is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Texdi is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with timeless. If not, see <http://www.gnu.org/licenses/>.
- *
- */
- #include <iostream>
- #include <string>
- #include <fstream>
- #include <sstream>
- #include <map>
- #include <curl/curl.h>
- #include "parser.h"
- #include "datalite.h"
- using namespace std;
// Tag identifiers produced by parser::next_tag() / parser::next_tag_atom()
// and dispatched on by parser::rss() / parser::atom().
#define TITLE       0
#define LINK        1
#define DESCRIPTION 2
#define START_ITEM  3
#define END_ITEM    4

// User-Agent header sent with every feed request (see CURLOPT_USERAGENT use).
const char *user_agent =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/58.0.3029.110 Safari/537.36";
// Lookup table from a named character reference (the text between '&' and
// ';') to its UTF-8 replacement. Invisible characters are spelled as byte
// escapes. The table is intentionally non-const: parser::decode_html_chars
// looks entries up through a non-const iterator.
std::map<std::string, std::string> ENTITIES_VALUES = {
    {"AElig", "Æ"},
    {"Aacute", "Á"},
    {"Acirc", "Â"},
    {"Agrave", "À"},
    {"Alpha", "Α"},
    {"Aring", "Å"},
    {"Atilde", "Ã"},
    {"Auml", "Ä"},
    {"Beta", "Β"},
    {"Ccedil", "Ç"},
    {"Chi", "Χ"},
    {"Dagger", "‡"},
    {"Delta", "Δ"},
    {"ETH", "Ð"},
    {"Eacute", "É"},
    {"Ecirc", "Ê"},
    {"Egrave", "È"},
    {"Epsilon", "Ε"},
    {"Eta", "Η"},
    {"Euml", "Ë"},
    {"Gamma", "Γ"},
    {"Iacute", "Í"},
    {"Icirc", "Î"},
    {"Igrave", "Ì"},
    {"Iota", "Ι"},
    {"Iuml", "Ï"},
    {"Kappa", "Κ"},
    {"Lambda", "Λ"},
    {"Mu", "Μ"},
    {"Ntilde", "Ñ"},
    {"Nu", "Ν"},
    {"OElig", "Œ"},
    {"Oacute", "Ó"},
    {"Ocirc", "Ô"},
    {"Ograve", "Ò"},
    {"Omega", "Ω"},
    {"Omicron", "Ο"},
    {"Oslash", "Ø"},
    {"Otilde", "Õ"},
    {"Ouml", "Ö"},
    {"Phi", "Φ"},
    {"Pi", "Π"},
    {"Prime", "″"},
    {"Psi", "Ψ"},
    {"Rho", "Ρ"},
    {"Scaron", "Š"},
    {"Sigma", "Σ"},
    {"THORN", "Þ"},
    {"Tau", "Τ"},
    {"Theta", "Θ"},
    {"Uacute", "Ú"},
    {"Ucirc", "Û"},
    {"Ugrave", "Ù"},
    {"Upsilon", "Υ"},
    {"Uuml", "Ü"},
    {"Xi", "Ξ"},
    {"Yacute", "Ý"},
    {"Yuml", "Ÿ"},
    {"Zeta", "Ζ"},
    {"aacute", "á"},
    {"acirc", "â"},
    {"acute", "´"},
    {"aelig", "æ"},
    {"agrave", "à"},
    {"alefsym", "ℵ"},
    {"alpha", "α"},
    {"amp", "&"},
    {"and", "∧"},
    {"ang", "∠"},
    {"apos", "'"},
    {"aring", "å"},
    {"asymp", "≈"},
    {"atilde", "ã"},
    {"auml", "ä"},
    {"bdquo", "„"},
    {"beta", "β"},
    {"brvbar", "¦"},
    {"bull", "•"},
    {"cap", "∩"},
    {"ccedil", "ç"},
    {"cedil", "¸"},
    {"cent", "¢"},
    {"chi", "χ"},
    {"circ", "ˆ"},
    {"clubs", "♣"},
    {"cong", "≅"},
    {"copy", "©"},
    {"crarr", "↵"},
    {"cup", "∪"},
    {"curren", "¤"},
    {"dArr", "⇓"},
    {"dagger", "†"},
    {"darr", "↓"},
    {"deg", "°"},
    {"delta", "δ"},
    {"diams", "♦"},
    {"divide", "÷"},
    {"eacute", "é"},
    {"ecirc", "ê"},
    {"egrave", "è"},
    {"empty", "∅"},
    {"emsp", "\xE2\x80\x83"},
    {"ensp", "\xE2\x80\x82"},
    {"epsilon", "ε"},
    {"equiv", "≡"},
    {"eta", "η"},
    {"eth", "ð"},
    {"euml", "ë"},
    {"euro", "€"},
    {"exist", "∃"},
    {"fnof", "ƒ"},
    {"forall", "∀"},
    {"frac12", "½"},
    {"frac14", "¼"},
    {"frac34", "¾"},
    {"frasl", "⁄"},
    {"gamma", "γ"},
    {"ge", "≥"},
    {"gt", ">"},
    {"hArr", "⇔"},
    {"harr", "↔"},
    {"hearts", "♥"},
    {"hellip", "…"},
    {"iacute", "í"},
    {"icirc", "î"},
    {"iexcl", "¡"},
    {"igrave", "ì"},
    {"image", "ℑ"},
    {"infin", "∞"},
    {"int", "∫"},
    {"iota", "ι"},
    {"iquest", "¿"},
    {"isin", "∈"},
    {"iuml", "ï"},
    {"kappa", "κ"},
    {"lArr", "⇐"},
    {"lambda", "λ"},
    {"lang", "〈"},
    {"laquo", "«"},
    {"larr", "←"},
    {"lceil", "⌈"},
    {"ldquo", "“"},
    {"le", "≤"},
    {"lfloor", "⌊"},
    {"lowast", "∗"},
    {"loz", "◊"},
    {"lrm", "\xE2\x80\x8E"},
    {"lsaquo", "‹"},
    {"lsquo", "‘"},
    {"lt", "<"},
    {"macr", "¯"},
    {"mdash", "—"},
    {"micro", "µ"},
    {"middot", "·"},
    {"minus", "−"},
    {"mu", "μ"},
    {"nabla", "∇"},
    {"nbsp", "\xC2\xA0"},
    {"ndash", "–"},
    {"ne", "≠"},
    {"ni", "∋"},
    {"not", "¬"},
    {"notin", "∉"},
    {"nsub", "⊄"},
    {"ntilde", "ñ"},
    {"nu", "ν"},
    {"oacute", "ó"},
    {"ocirc", "ô"},
    {"oelig", "œ"},
    {"ograve", "ò"},
    {"oline", "‾"},
    {"omega", "ω"},
    {"omicron", "ο"},
    {"oplus", "⊕"},
    {"or", "∨"},
    {"ordf", "ª"},
    {"ordm", "º"},
    {"oslash", "ø"},
    {"otilde", "õ"},
    {"otimes", "⊗"},
    {"ouml", "ö"},
    {"para", "¶"},
    {"part", "∂"},
    {"permil", "‰"},
    {"perp", "⊥"},
    {"phi", "φ"},
    {"pi", "π"},
    {"piv", "ϖ"},
    {"plusmn", "±"},
    {"pound", "£"},
    {"prime", "′"},
    {"prod", "∏"},
    {"prop", "∝"},
    {"psi", "ψ"},
    {"quot", "\""},
    {"rArr", "⇒"},
    {"radic", "√"},
    {"rang", "〉"},
    {"raquo", "»"},
    {"rarr", "→"},
    {"rceil", "⌉"},
    {"rdquo", "”"},
    {"real", "ℜ"},
    {"reg", "®"},
    {"rfloor", "⌋"},
    {"rho", "ρ"},
    {"rlm", "\xE2\x80\x8F"},
    {"rsaquo", "›"},
    {"rsquo", "’"},
    {"sbquo", "‚"},
    {"scaron", "š"},
    {"sdot", "⋅"},
    {"sect", "§"},
    {"shy", "\xC2\xAD"},
    {"sigma", "σ"},
    {"sigmaf", "ς"},
    {"sim", "∼"},
    {"spades", "♠"},
    {"sub", "⊂"},
    {"sube", "⊆"},
    {"sum", "∑"},
    {"sup1", "¹"},
    {"sup2", "²"},
    {"sup3", "³"},
    {"sup", "⊃"},
    {"supe", "⊇"},
    {"szlig", "ß"},
    {"tau", "τ"},
    {"there4", "∴"},
    {"theta", "θ"},
    {"thetasym", "ϑ"},
    {"thinsp", "\xE2\x80\x89"},
    {"thorn", "þ"},
    {"tilde", "˜"},
    {"times", "×"},
    {"trade", "™"},
    {"uArr", "⇑"},
    {"uacute", "ú"},
    {"uarr", "↑"},
    {"ucirc", "û"},
    {"ugrave", "ù"},
    {"uml", "¨"},
    {"upsih", "ϒ"},
    {"upsilon", "υ"},
    {"uuml", "ü"},
    {"weierp", "℘"},
    {"xi", "ξ"},
    {"yacute", "ý"},
    {"yen", "¥"},
    {"yuml", "ÿ"},
    {"zeta", "ζ"},
    {"zwj", "\xE2\x80\x8D"},
    {"zwnj", "\xE2\x80\x8C"},
};
/*
 * libcurl write callback: appends the received chunk to a std::string.
 *
 * ptr    - start of the received bytes (not NUL-terminated)
 * size   - size of one element
 * nmemb  - number of elements; the chunk is size * nmemb bytes long
 * stream - the CURLOPT_WRITEDATA pointer, which must address a std::string
 *
 * Returns the number of bytes consumed. Returning anything other than
 * size * nmemb makes libcurl abort the transfer, which is what happens when
 * no destination string was supplied.
 *
 * libcurl hands the user pointer over as a plain pointer, so the last
 * parameter has to be a pointer too: declaring it as a std::string taken by
 * value is undefined behaviour and at best appends to a temporary copy.
 */
size_t write_to_string(void *ptr, size_t size, size_t nmemb, void *stream){
    const size_t realsize = size * nmemb;
    if(stream == nullptr)
        return 0;
    if(ptr != nullptr && realsize > 0)
        static_cast<std::string*>(stream)->append(static_cast<const char*>(ptr), realsize);
    return realsize;
}
- void parser::get_news(){
- datalite news;
- news.set_file("news");
- cout << "Downloading news" << endl;
- while(news.is_valid_channel()){
- string channel = news.get_new_channel();
- cout << channel << endl;
- cout << "\tDownloading" << endl;
- this->download_news(channel);
- }
- }
- void parser::download_news(const string& xml){
- CURL * curl;
- curl = curl_easy_init();
- if (curl) {
- string response;
- curl_easy_setopt(curl, CURLOPT_URL, xml.c_str());
- curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
- curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 3L);
- curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L);
- curl_easy_setopt(curl, CURLOPT_USERAGENT, user_agent);
- curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 10L);
- curl_easy_setopt(curl, CURLOPT_USE_SSL, CURLUSESSL_TRY);
- curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_to_string);
- curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
- curl_easy_perform(curl);
- curl_easy_cleanup(curl);
- if(response.length() > 0){
- cout << "\tParsing" << endl;
- parser p;
- p.detect_rss_standard(istringstream(response).str());
- }
- }else
- curl_easy_cleanup(curl);
- }
- int parser::next_tag(const string& line){
- int tag = -1;
- size_t size = line.size();
- size_t title = line.find("<title>");
- size_t link = line.find("<link>");
- size_t description = line.find("<description>");
- size_t end_description = line.find("</description>");
- size_t start_item = line.find("<item");
- size_t end_item = line.find("</item>");
- if(title != string::npos && title < size){
- size = title;
- tag = TITLE;
- }
- if(link != string::npos && link < size){
- size = link;
- tag = LINK;
- }
- if(description != string::npos && description < size){
- size = description;
- tag = DESCRIPTION;
- }
- if(end_description != string::npos && end_description < size){
- size = end_description;
- tag = DESCRIPTION;
- }
- if(start_item != string::npos && start_item < size){
- size = start_item;
- tag = START_ITEM;
- }
- if(end_item != string::npos && end_item < size){
- size = end_item;
- tag = END_ITEM;
- }
- return tag;
- }
- int parser::next_tag_atom(const string& line){
- int tag = -1;
- size_t size = line.size();
- size_t title = line.find("<title>");
- size_t link = line.find("<link");
- size_t description = line.find("<content type");
- size_t end_description = line.find("</content>");
- size_t start_item = line.find("<entry>");
- size_t end_item = line.find("</entry>");
- if(title != string::npos && title < size){
- size = title;
- tag = TITLE;
- }
- if(link != string::npos && link < size){
- size = link;
- tag = LINK;
- }
- if(description != string::npos && description < size){
- size = description;
- tag = DESCRIPTION;
- }
- if(end_description != string::npos && end_description < size){
- size = end_description;
- tag = DESCRIPTION;
- }
- if(start_item != string::npos && start_item < size){
- size = start_item;
- tag = START_ITEM;
- }
- if(end_item != string::npos && end_item < size){
- size = end_item;
- tag = END_ITEM;
- }
- return tag;
- }
- void parser::detect_rss_standard(const string& rss_text){
- stringstream index(rss_text);
- string line;
- bool rss = false;
- bool atom = false;
- while(getline(index, line)){
- size_t rss_position = line.find("<rss version");
- if(rss_position != string::npos) rss = true;
- else{
- size_t atom_position = line.find("<feed");
- if(atom_position != string::npos){
- atom = true;
- _item = false;
- }
- size_t channel_position = line.find("<channel");
- if(channel_position != string::npos) rss = true;
- }
- if(atom)
- this->atom(line);
- else if(rss)
- this->rss(line);
- }
- datalite d;
- d.set_file("news");
- d.bulk_insert(_links, _titles, _descriptions);
- }
- void parser::atom(string line){
- int tag = next_tag_atom(line);
- if(tag == -1 && _content){
- atom_description(line);
- }
- while(tag != -1){
- switch(tag){
- case TITLE: title(line);break;
- case LINK: atom_link(line);break;
- case DESCRIPTION: atom_description(line);break;
- case START_ITEM: atom_start_item(line);break;
- case END_ITEM: atom_end_item(line);break;
- }
- tag = next_tag_atom(line);
- }
- atom_description(line);
- }
- void parser::atom_link(string& line){
- size_t link = line.find("<link");
- if(link != string::npos){
- line = line.substr(link + 5);
- link = line.find("href=\"");
- if(link != string::npos){
- line = line.substr(link + 6);
- size_t less_than = line.find("\"");
- if(less_than != string::npos){
- if(_item_link.size() == 0)
- ++_item_tags;
- _item_link = line.substr(0, less_than);
- line = line.substr(less_than);
- }
- }
- }
- }
- void parser::atom_description(string& line){
- size_t content = line.find("<content type");
- if(content != string::npos and !_content){
- content = line.find(">", content);
- line = line.substr(content + 1);
- _item_description = line;
- _content = true;
- atom_end_description(line);
- delete_tags(_item_description);
- }
- if(_content && content == string::npos){
- _item_description += " " + line;
- atom_end_description(line);
- delete_tags(_item_description);
- }
- atom_end_description(line);
- }
- void parser::atom_end_description(string& line){
- size_t content_end_item = _item_description.find("</content>");
- if(content_end_item != string::npos)
- _item_description = _item_description.substr(0, content_end_item);
- size_t content_end = line.find("</content>");
- if(content_end != string::npos && _content){
- line = line.substr(content_end + 10);
- while(line[0]==' ') line.erase(0,1);
- _content = false;
- ++_item_tags;
- }
- }
- void parser::atom_start_item(string& line){
- size_t start_entry = line.find("<entry>");
- if(start_entry != string::npos){
- _item_tags = 0;
- _item_link = string();
- _item = true;
- _content = false;
- line = line.substr(start_entry + 7);
- }
- }
- void parser::atom_end_item(string& line){
- size_t end_entry = line.find("</entry>");
- if(end_entry != string::npos){
- if(_item && _item_tags == 3){
- clean_string(_item_description);
- escape_character_sql(_item_title);
- escape_character_sql(_item_description);
- delete_tags(_item_description);
- _links.push_back(_item_link);
- _titles.push_back(_item_title);
- _descriptions.push_back(_item_description);
- }
- _item = false;
- _item_tags = 0;
- _item_link = string();
- _content = false;
- line = line.substr(end_entry + 8);
- }
- }
- void parser::rss(string line){
- int tag = next_tag(line);
- if(tag == -1 && _content){
- rss_description(line);
- }
- while(tag != -1){
- switch(tag){
- case TITLE: title(line);break;
- case LINK: rss_link(line);break;
- case DESCRIPTION: rss_description(line);break;
- case START_ITEM: rss_start_item(line);break;
- case END_ITEM: rss_end_item(line);break;
- }
- tag = next_tag(line);
- }
- }
/*
 * Encodes one Unicode code point as a UTF-8 byte sequence.
 *
 * codepoint - the value to encode.
 *
 * Returns one to four bytes. Values that are not Unicode scalar values
 * (the surrogate range U+D800..U+DFFF and anything above U+10FFFF) cannot be
 * represented in well-formed UTF-8, so U+FFFD REPLACEMENT CHARACTER is
 * encoded in their place.
 */
std::string UnicodeToUTF8(unsigned int codepoint)
{
    const bool is_surrogate = codepoint >= 0xd800 && codepoint <= 0xdfff;
    if (is_surrogate || codepoint > 0x10ffff)
        codepoint = 0xfffd;

    std::string out;
    if (codepoint <= 0x7f)
    {
        out.append(1, static_cast<char>(codepoint));
    }
    else if (codepoint <= 0x7ff)
    {
        out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
        out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    else if (codepoint <= 0xffff)
    {
        out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
        out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
        out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    else
    {
        out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
        out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
        out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
        out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    return out;
}
- string parser::decode_html_chars(string line){
- size_t amp = line.find("&");
- size_t semicolon = line.find(";");
- while(amp != string::npos && semicolon != string::npos){
- if(amp + 1 < line.length() && line[amp + 1] == '#'){
- string encoded_value;
- if(amp + 2 < line.length() && line[amp + 2] == 'x'){
- try{
- encoded_value = line.substr(amp + 3, semicolon - (amp + 3));
- line = line.erase(amp, semicolon + 1 - amp);
- line = line.insert(amp, UnicodeToUTF8(stoul(encoded_value, nullptr, 16)));
- }catch(...){
- amp += 1;
- }
- }else{
- try{
- encoded_value = line.substr(amp + 2, semicolon - (amp + 2));
- line = line.erase(amp, semicolon + 1 - amp);
- line = line.insert(amp, UnicodeToUTF8(stoi(encoded_value)));
- }catch(...){
- amp += 1;
- }
- }
- } else {
- string encoded_value = line.substr(amp + 1, semicolon - (amp + 1));
- map<string, string>::iterator encoded_value_iterator = ENTITIES_VALUES.find(encoded_value);
- if(encoded_value_iterator != ENTITIES_VALUES.end()){
- line = line.erase(amp, semicolon + 1 - amp);
- line = line.insert(amp, encoded_value_iterator->second);
- }else{
- amp += 1;
- }
- }
- amp = line.find("&", amp);
- semicolon = line.find(";", amp);
- }
-
- return line;
- }
- void parser::clean_string(string& line){
- size_t cdata = line.find("<![CDATA[");
- if(cdata != string::npos)
- line = line.erase(cdata, 9);
- cdata = line.find("]]>");
- if(cdata != string::npos)
- line = line.erase(cdata, 3);
- line = decode_html_chars(line);
- }
- void parser::title(string& line){
- size_t title = line.find("<title>");
- if(title != string::npos){
- line = line.substr(title + 7);
- size_t less_than = line.find("</");
- if(less_than != string::npos){
- _item_title = line.substr(0, less_than);
- clean_string(_item_title);
- ++_item_tags;
- }
- }
- }
- void parser::rss_link(string& line){
- size_t link = line.find("<link>");
- if(link != string::npos){
- line = line.substr(link + 6);
- size_t less_than = line.find("</");
- if(less_than != string::npos){
- _item_link = line.substr(0, less_than);
- clean_string(_item_link);
- ++_item_tags;
- }
- }
- }
- void parser::rss_description(string& line){
- size_t content = line.find("<description>");
- if(content != string::npos && !_content){
- line = line.substr(content + 13);
- _item_description = line;
- _content = true;
- clean_string(_item_description);
- rss_end_description(_item_description);
- delete_tags(_item_description);
- }
-
- if(_content && content == string::npos){
-
- _item_description += " " + line;
- clean_string(_item_description);
- rss_end_description(line);
- delete_tags(_item_description);
- }
- rss_end_description(line);
- }
- void parser::rss_end_description(string& line){
- size_t content_end_item = _item_description.find("</description>");
- if(content_end_item != string::npos)
- _item_description = _item_description.substr(0, content_end_item);
- size_t content_end = line.find("</description>");
- if(content_end != string::npos && _content){
- line = line.substr(content_end + 14);
- while(_item_description[0] == ' ') _item_description.erase(0,1);
- _content = false;
- ++_item_tags;
- }
- }
- void parser::escape_character_sql(string& line){
- if(line.size() > 100) line = line.substr(0, 100);
- for(unsigned i = 0; i < line.size(); ++i){
- if(line[i] == '\''){
- line = line.insert(i, "\'");
- ++i;
- }
- }
- }
- void parser::rss_end_item(string& line){
- size_t end_entry = line.find("</item>");
- if(end_entry != string::npos){
- if(_item && _item_tags == 3){
- clean_string(_item_description);
- escape_character_sql(_item_title);
- escape_character_sql(_item_description);
- delete_tags(_item_description);
- _links.push_back(_item_link);
- _titles.push_back(_item_title);
- _descriptions.push_back(_item_description);
- }
- _item = false;
- _content = false;
- _item_tags = 0;
- line = line.substr(end_entry + 7);
- }
- }
- void parser::rss_start_item(string& line){
- size_t start_entry = line.find("<item");
- if(start_entry != string::npos){
- _item_tags = 0;
- _item_link = string();
- _item = true;
- _content = false;
- line = line.substr(start_entry + 5);
- }
- }
- void parser::delete_tags(string& line){
- size_t less_than = _item_description.find("<");
- size_t great_than = _item_description.find(">");
- while(less_than != string::npos && great_than != string::npos){
- if(less_than > great_than)
- _item_description.erase(great_than, 1);
- else{
- _item_description =
- _item_description.erase(less_than, great_than + 1 - less_than);
- }
- less_than = _item_description.find("<");
- great_than = _item_description.find(">");
- }
- }
|