parse.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. // This file is part of BOINC.
  2. // http://boinc.berkeley.edu
  3. // Copyright (C) 2008 University of California
  4. //
  5. // BOINC is free software; you can redistribute it and/or modify it
  6. // under the terms of the GNU Lesser General Public License
  7. // as published by the Free Software Foundation,
  8. // either version 3 of the License, or (at your option) any later version.
  9. //
  10. // BOINC is distributed in the hope that it will be useful,
  11. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. // See the GNU Lesser General Public License for more details.
  14. //
  15. // You should have received a copy of the GNU Lesser General Public License
  16. // along with BOINC. If not, see <http://www.gnu.org/licenses/>.
  17. #ifndef BOINC_PARSE_H
  18. #define BOINC_PARSE_H
  19. #include <cstdio>
  20. #include <stdlib.h>
  21. #include <string.h>
  22. #include <errno.h>
  23. #include "config.h"
  24. #include "miofile.h"
  25. #include "error_numbers.h"
  26. #include "str_util.h"
  27. // see parse_test.cpp for example usage of XML_PARSER
  28. #define XML_PARSE_COMMENT 1
  29. #define XML_PARSE_EOF 2
  30. #define XML_PARSE_CDATA 3
  31. #define XML_PARSE_TAG 4
  32. #define XML_PARSE_DATA 5
  33. #define XML_PARSE_OVERFLOW 6
  34. #define TAG_BUF_LEN 4096
  35. // max tag length
  36. #define ELEMENT_BUF_LEN 65536
  37. // max element length (matches BLOB_SIZE, max size of XML fields in DB)
  38. struct XML_PARSER {
  39. int scan_comment();
  40. int scan_cdata(char*, int);
  41. char parsed_tag[TAG_BUF_LEN];
  42. bool is_tag;
  43. MIOFILE* f;
  44. XML_PARSER(MIOFILE*);
  45. void init(MIOFILE* mf) {
  46. f = mf;
  47. }
  48. // read and copy text to buf; stop when find a <;
  49. // ungetc() that so we read it again
  50. // Return XML_PARSE_DATA if successful
  51. //
  52. inline int copy_until_tag(char* buf, int len) {
  53. int c;
  54. while (1) {
  55. c = f->_getc();
  56. if (!c || c == EOF) return XML_PARSE_EOF;
  57. if (c == '<') {
  58. f->_ungetc(c);
  59. *buf = 0;
  60. return XML_PARSE_DATA;
  61. }
  62. if (--len <= 0) {
  63. return XML_PARSE_OVERFLOW;
  64. }
  65. *buf++ = (char)c;
  66. }
  67. }
  68. // return true if EOF or error
  69. //
  70. inline bool get(
  71. char* buf, int len, bool& _is_tag, char* attr_buf=0, int attr_len=0
  72. ) {
  73. switch (get_aux(buf, len, attr_buf, attr_len)) {
  74. case XML_PARSE_EOF:
  75. case XML_PARSE_OVERFLOW:
  76. return true;
  77. case XML_PARSE_TAG:
  78. _is_tag = true;
  79. break;
  80. case XML_PARSE_DATA:
  81. case XML_PARSE_CDATA:
  82. default:
  83. _is_tag = false;
  84. break;
  85. }
  86. return false;
  87. }
  88. inline bool get_tag(char* ab=0, int al=0) {
  89. if (get(parsed_tag, sizeof(parsed_tag), is_tag, ab, al)) {
  90. return true;
  91. }
  92. if (strlen(parsed_tag) > TAG_BUF_LEN-10) {
  93. parsed_tag[TAG_BUF_LEN-10] = 0;
  94. }
  95. return false;
  96. }
  97. inline bool match_tag(const char* tag) {
  98. return !strcmp(parsed_tag, tag);
  99. }
  100. // read until find non-whitespace char.
  101. // Return the char in the reference param
  102. // Return true iff reached EOF
  103. //
  104. inline bool scan_nonws(int& first_char) {
  105. int c;
  106. while (1) {
  107. c = f->_getc();
  108. if (!c || c == EOF) return true;
  109. if (isascii(c) && isspace(c)) continue;
  110. first_char = c;
  111. return false;
  112. }
  113. }
  114. // Scan something, either tag or text.
  115. // Strip whitespace at start and end
  116. // (however, the supplied buffer must accommodate this white space).
  117. // Ignore comments.
  118. // Return true iff reached EOF
  119. //
  120. inline int get_aux(
  121. char* buf, int len, char* attr_buf, int attr_len
  122. ) {
  123. bool eof;
  124. int c, retval;
  125. while (1) {
  126. eof = scan_nonws(c);
  127. if (eof) return XML_PARSE_EOF;
  128. if (c == '<') {
  129. retval = scan_tag(buf, len, attr_buf, attr_len);
  130. if (retval == XML_PARSE_EOF) return retval;
  131. if (retval == XML_PARSE_OVERFLOW) return retval;
  132. if (retval == XML_PARSE_COMMENT) continue;
  133. } else {
  134. buf[0] = (char)c;
  135. retval = copy_until_tag(buf+1, len-1);
  136. if (retval != XML_PARSE_DATA) return retval;
  137. }
  138. strip_whitespace(buf);
  139. return retval;
  140. }
  141. }
  142. // we just read a <; read until we find a >.
  143. // Given <tag [attr=val attr=val] [/]>:
  144. // - copy tag (or tag/) to buf
  145. // - copy "attr=val attr=val" to attr_buf
  146. //
  147. // Return either
  148. // XML_PARSE_TAG
  149. // XML_PARSE_COMMENT
  150. // XML_PARSE_EOF
  151. // XML_PARSE_CDATA
  152. //
  153. inline int scan_tag(
  154. char* buf, int _tag_len, char* attr_buf=0, int attr_len=0
  155. ) {
  156. int c;
  157. char* buf_start = buf;
  158. bool found_space = false;
  159. int tag_len = _tag_len;
  160. for (int i=0; ; i++) {
  161. c = f->_getc();
  162. if (!c || c == EOF) return XML_PARSE_EOF;
  163. if (c == '>') {
  164. *buf = 0;
  165. if (attr_buf) *attr_buf = 0;
  166. return XML_PARSE_TAG;
  167. }
  168. if (isascii(c) && isspace(c)) {
  169. if (found_space && attr_buf) {
  170. if (--attr_len > 0) {
  171. *attr_buf++ = (char)c;
  172. }
  173. }
  174. found_space = true;
  175. } else if (c == '/') {
  176. if (--tag_len > 0) {
  177. *buf++ = (char)c;
  178. } else {
  179. return XML_PARSE_OVERFLOW;
  180. }
  181. } else {
  182. if (found_space) {
  183. if (attr_buf) {
  184. if (--attr_len > 0) {
  185. *attr_buf++ = (char)c;
  186. }
  187. }
  188. } else {
  189. if (--tag_len > 0) {
  190. *buf++ = (char)c;
  191. } else {
  192. return XML_PARSE_OVERFLOW;
  193. }
  194. }
  195. }
  196. // check for comment start
  197. //
  198. if (i==2 && !strncmp(buf_start, "!--", 3)) {
  199. return scan_comment();
  200. }
  201. if (i==7 && !strncmp(buf_start, "![CDATA[", 8)) {
  202. return scan_cdata(buf_start, tag_len);
  203. }
  204. }
  205. }
  206. // copy everything up to (but not including) the given end tag.
  207. // The copied text may include XML tags.
  208. // strips start/end whitespace.
  209. //
  210. inline int element_contents(const char* end_tag, char* buf, int buflen) {
  211. int n=0;
  212. int retval=0;
  213. while (1) {
  214. if (n == buflen-1) {
  215. retval = ERR_XML_PARSE;
  216. break;
  217. }
  218. int c = f->_getc();
  219. if (!c || c == EOF) {
  220. retval = ERR_XML_PARSE;
  221. break;
  222. }
  223. buf[n++] = (char)c;
  224. buf[n] = 0;
  225. char* p = strstr(buf, end_tag);
  226. if (p) {
  227. *p = 0;
  228. break;
  229. }
  230. }
  231. buf[n] = 0;
  232. strip_whitespace(buf);
  233. return retval;
  234. }
  235. bool parse_str_aux(const char*, char*, int);
  236. // interface starts here
  237. //
  238. bool parse_start(const char*);
  239. bool parse_str(const char*, char*, int);
  240. bool parse_string(const char*, std::string&);
  241. bool parse_int(const char*, int&);
  242. bool parse_long(const char*, long&);
  243. bool parse_double(const char*, double&);
  244. bool parse_ulong(const char*, unsigned long&);
  245. bool parse_ulonglong(const char*, unsigned long long&);
  246. bool parse_bool(const char*, bool&);
  247. int copy_element(std::string&);
  248. void skip_unexpected(const char*, bool verbose, const char*);
  249. void skip_unexpected(bool verbose=false, const char* msg="") {
  250. skip_unexpected(parsed_tag, verbose, msg);
  251. }
  252. };
  253. extern bool boinc_is_finite(double);
  254. /////////////// START DEPRECATED XML PARSER
  255. // Deprecated because it makes assumptions about
  256. // the format of the XML being parsed
  257. ///////////////
  258. // return true if the tag appears in the line
  259. //
  260. inline bool match_tag(const char* buf, const char* tag) {
  261. if (strstr(buf, tag)) return true;
  262. return false;
  263. }
  264. inline bool match_tag(const std::string &s, const char* tag) {
  265. return match_tag(s.c_str(), tag);
  266. }
  267. #if defined(_WIN32) && !defined(__MINGW32__)
  268. #define boinc_strtoull _strtoui64
  269. #else
  270. #if defined(HAVE_STRTOULL) || defined(__MINGW32__)
  271. #define boinc_strtoull strtoull
  272. #else
  273. inline unsigned long long boinc_strtoull(const char *s, char **, int) {
  274. char buf[64];
  275. char *p;
  276. unsigned long long y;
  277. strncpy(buf, s, sizeof(buf)-1);
  278. strip_whitespace(buf);
  279. p = strstr(buf, "0x");
  280. if (!p) p = strstr(buf, "0X");
  281. if (p) {
  282. sscanf(p, "%llx", &y);
  283. } else {
  284. sscanf(buf, "%llu", &y);
  285. }
  286. return y;
  287. }
  288. #endif
  289. #endif
  290. // parse an integer of the form <tag>1234</tag>
  291. // return true if it's there
  292. // Note: this doesn't check for the end tag
  293. //
  294. inline bool parse_int(const char* buf, const char* tag, int& x) {
  295. const char* p = strstr(buf, tag);
  296. if (!p) return false;
  297. errno = 0;
  298. int y = strtol(p+strlen(tag), 0, 0); // this parses 0xabcd correctly
  299. if (errno) return false;
  300. x = y;
  301. return true;
  302. }
  303. // Same, for doubles
  304. //
  305. inline bool parse_double(const char* buf, const char* tag, double& x) {
  306. double y;
  307. const char* p = strstr(buf, tag);
  308. if (!p) return false;
  309. errno = 0;
  310. y = strtod(p+strlen(tag), NULL);
  311. if (errno) return false;
  312. if (!boinc_is_finite(y)) {
  313. return false;
  314. }
  315. x = y;
  316. return true;
  317. }
  318. extern bool parse(char* , char* );
  319. extern bool parse_str(const char*, const char*, char*, int);
  320. extern bool parse_str(const char* buf, const char* tag, std::string& dest);
  321. extern void parse_attr(const char* buf, const char* attrname, char* out, int len);
  322. extern bool parse_bool(const char*, const char*, bool&);
  323. /////////////// END DEPRECATED XML PARSER
  324. extern int copy_stream(FILE* in, FILE* out);
  325. extern int strcatdup(char*& p, char* buf);
  326. extern int dup_element_contents(FILE* in, const char* end_tag, char** pp);
  327. extern int dup_element(FILE* in, const char* end_tag, char** pp);
  328. extern int copy_element_contents(FILE* in, const char* end_tag, char* p, size_t len);
  329. extern int copy_element_contents(FILE* in, const char* end_tag, std::string&);
  330. extern void replace_element_contents(
  331. char* buf, const char* start, const char* end, const char* replacement
  332. );
  333. extern bool remove_element(char* buf, const char* start, const char* end);
  334. extern bool str_replace(char* str, const char* old, const char* neww);
  335. extern char* sgets(char* buf, int len, char* &in);
  336. extern void non_ascii_escape(const char*, char*, int len);
  337. extern void xml_escape(const char*, char*, int len);
  338. extern void xml_unescape(std::string&);
  339. extern void xml_unescape(char*);
  340. extern void extract_venue(const char*, const char*, char*, int len);
  341. extern int skip_unrecognized(char* buf, MIOFILE&);
  342. #endif