xmlstring.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051
  1. /*
  2. * string.c : an XML string utilities module
  3. *
  4. * This module provides various utility functions for manipulating
  5. * the xmlChar* type. All functions named xmlStr* have been moved here
  6. * from the parser.c file (their original home).
  7. *
  8. * See Copyright for the status of this software.
  9. *
  10. * UTF8 string routines from:
  11. * William Brack <wbrack@mmm.com.hk>
  12. *
  13. * daniel@veillard.com
  14. */
  15. #define IN_LIBXML
#include "libxml.h"

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
  22. /************************************************************************
  23. * *
  24. * Commodity functions to handle xmlChars *
  25. * *
  26. ************************************************************************/
  27. /**
  28. * xmlStrndup:
  29. * @cur: the input xmlChar *
  30. * @len: the len of @cur
  31. *
  32. * a strndup for array of xmlChar's
  33. *
  34. * Returns a new xmlChar * or NULL
  35. */
  36. xmlChar *
  37. xmlStrndup(const xmlChar *cur, int len) {
  38. xmlChar *ret;
  39. if ((cur == NULL) || (len < 0)) return(NULL);
  40. ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
  41. if (ret == NULL) {
  42. xmlErrMemory(NULL, NULL);
  43. return(NULL);
  44. }
  45. memcpy(ret, cur, len * sizeof(xmlChar));
  46. ret[len] = 0;
  47. return(ret);
  48. }
  49. /**
  50. * xmlStrdup:
  51. * @cur: the input xmlChar *
  52. *
  53. * a strdup for array of xmlChar's. Since they are supposed to be
  54. * encoded in UTF-8 or an encoding with 8bit based chars, we assume
  55. * a termination mark of '0'.
  56. *
  57. * Returns a new xmlChar * or NULL
  58. */
  59. xmlChar *
  60. xmlStrdup(const xmlChar *cur) {
  61. const xmlChar *p = cur;
  62. if (cur == NULL) return(NULL);
  63. while (*p != 0) p++; /* non input consuming */
  64. return(xmlStrndup(cur, p - cur));
  65. }
  66. /**
  67. * xmlCharStrndup:
  68. * @cur: the input char *
  69. * @len: the len of @cur
  70. *
  71. * a strndup for char's to xmlChar's
  72. *
  73. * Returns a new xmlChar * or NULL
  74. */
  75. xmlChar *
  76. xmlCharStrndup(const char *cur, int len) {
  77. int i;
  78. xmlChar *ret;
  79. if ((cur == NULL) || (len < 0)) return(NULL);
  80. ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
  81. if (ret == NULL) {
  82. xmlErrMemory(NULL, NULL);
  83. return(NULL);
  84. }
  85. for (i = 0;i < len;i++) {
  86. ret[i] = (xmlChar) cur[i];
  87. if (ret[i] == 0) return(ret);
  88. }
  89. ret[len] = 0;
  90. return(ret);
  91. }
  92. /**
  93. * xmlCharStrdup:
  94. * @cur: the input char *
  95. *
  96. * a strdup for char's to xmlChar's
  97. *
  98. * Returns a new xmlChar * or NULL
  99. */
  100. xmlChar *
  101. xmlCharStrdup(const char *cur) {
  102. const char *p = cur;
  103. if (cur == NULL) return(NULL);
  104. while (*p != '\0') p++; /* non input consuming */
  105. return(xmlCharStrndup(cur, p - cur));
  106. }
  107. /**
  108. * xmlStrcmp:
  109. * @str1: the first xmlChar *
  110. * @str2: the second xmlChar *
  111. *
  112. * a strcmp for xmlChar's
  113. *
  114. * Returns the integer result of the comparison
  115. */
  116. int
  117. xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
  118. if (str1 == str2) return(0);
  119. if (str1 == NULL) return(-1);
  120. if (str2 == NULL) return(1);
  121. #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
  122. return(strcmp((const char *)str1, (const char *)str2));
  123. #else
  124. do {
  125. int tmp = *str1++ - *str2;
  126. if (tmp != 0) return(tmp);
  127. } while (*str2++ != 0);
  128. return 0;
  129. #endif
  130. }
  131. /**
  132. * xmlStrEqual:
  133. * @str1: the first xmlChar *
  134. * @str2: the second xmlChar *
  135. *
  136. * Check if both strings are equal of have same content.
  137. * Should be a bit more readable and faster than xmlStrcmp()
  138. *
  139. * Returns 1 if they are equal, 0 if they are different
  140. */
  141. int
  142. xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
  143. if (str1 == str2) return(1);
  144. if (str1 == NULL) return(0);
  145. if (str2 == NULL) return(0);
  146. #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
  147. return(strcmp((const char *)str1, (const char *)str2) == 0);
  148. #else
  149. do {
  150. if (*str1++ != *str2) return(0);
  151. } while (*str2++);
  152. return(1);
  153. #endif
  154. }
  155. /**
  156. * xmlStrQEqual:
  157. * @pref: the prefix of the QName
  158. * @name: the localname of the QName
  159. * @str: the second xmlChar *
  160. *
  161. * Check if a QName is Equal to a given string
  162. *
  163. * Returns 1 if they are equal, 0 if they are different
  164. */
  165. int
  166. xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
  167. if (pref == NULL) return(xmlStrEqual(name, str));
  168. if (name == NULL) return(0);
  169. if (str == NULL) return(0);
  170. do {
  171. if (*pref++ != *str) return(0);
  172. } while ((*str++) && (*pref));
  173. if (*str++ != ':') return(0);
  174. do {
  175. if (*name++ != *str) return(0);
  176. } while (*str++);
  177. return(1);
  178. }
  179. /**
  180. * xmlStrncmp:
  181. * @str1: the first xmlChar *
  182. * @str2: the second xmlChar *
  183. * @len: the max comparison length
  184. *
  185. * a strncmp for xmlChar's
  186. *
  187. * Returns the integer result of the comparison
  188. */
  189. int
  190. xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
  191. if (len <= 0) return(0);
  192. if (str1 == str2) return(0);
  193. if (str1 == NULL) return(-1);
  194. if (str2 == NULL) return(1);
  195. #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
  196. return(strncmp((const char *)str1, (const char *)str2, len));
  197. #else
  198. do {
  199. int tmp = *str1++ - *str2;
  200. if (tmp != 0 || --len == 0) return(tmp);
  201. } while (*str2++ != 0);
  202. return 0;
  203. #endif
  204. }
  205. static const xmlChar casemap[256] = {
  206. 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
  207. 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
  208. 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
  209. 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
  210. 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
  211. 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
  212. 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
  213. 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
  214. 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
  215. 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
  216. 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
  217. 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
  218. 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
  219. 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
  220. 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
  221. 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
  222. 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
  223. 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
  224. 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
  225. 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
  226. 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
  227. 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
  228. 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
  229. 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
  230. 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
  231. 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
  232. 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
  233. 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
  234. 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
  235. 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
  236. 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
  237. 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
  238. };
  239. /**
  240. * xmlStrcasecmp:
  241. * @str1: the first xmlChar *
  242. * @str2: the second xmlChar *
  243. *
  244. * a strcasecmp for xmlChar's
  245. *
  246. * Returns the integer result of the comparison
  247. */
  248. int
  249. xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
  250. register int tmp;
  251. if (str1 == str2) return(0);
  252. if (str1 == NULL) return(-1);
  253. if (str2 == NULL) return(1);
  254. do {
  255. tmp = casemap[*str1++] - casemap[*str2];
  256. if (tmp != 0) return(tmp);
  257. } while (*str2++ != 0);
  258. return 0;
  259. }
  260. /**
  261. * xmlStrncasecmp:
  262. * @str1: the first xmlChar *
  263. * @str2: the second xmlChar *
  264. * @len: the max comparison length
  265. *
  266. * a strncasecmp for xmlChar's
  267. *
  268. * Returns the integer result of the comparison
  269. */
  270. int
  271. xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
  272. register int tmp;
  273. if (len <= 0) return(0);
  274. if (str1 == str2) return(0);
  275. if (str1 == NULL) return(-1);
  276. if (str2 == NULL) return(1);
  277. do {
  278. tmp = casemap[*str1++] - casemap[*str2];
  279. if (tmp != 0 || --len == 0) return(tmp);
  280. } while (*str2++ != 0);
  281. return 0;
  282. }
  283. /**
  284. * xmlStrchr:
  285. * @str: the xmlChar * array
  286. * @val: the xmlChar to search
  287. *
  288. * a strchr for xmlChar's
  289. *
  290. * Returns the xmlChar * for the first occurrence or NULL.
  291. */
  292. const xmlChar *
  293. xmlStrchr(const xmlChar *str, xmlChar val) {
  294. if (str == NULL) return(NULL);
  295. while (*str != 0) { /* non input consuming */
  296. if (*str == val) return((xmlChar *) str);
  297. str++;
  298. }
  299. return(NULL);
  300. }
  301. /**
  302. * xmlStrstr:
  303. * @str: the xmlChar * array (haystack)
  304. * @val: the xmlChar to search (needle)
  305. *
  306. * a strstr for xmlChar's
  307. *
  308. * Returns the xmlChar * for the first occurrence or NULL.
  309. */
  310. const xmlChar *
  311. xmlStrstr(const xmlChar *str, const xmlChar *val) {
  312. int n;
  313. if (str == NULL) return(NULL);
  314. if (val == NULL) return(NULL);
  315. n = xmlStrlen(val);
  316. if (n == 0) return(str);
  317. while (*str != 0) { /* non input consuming */
  318. if (*str == *val) {
  319. if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
  320. }
  321. str++;
  322. }
  323. return(NULL);
  324. }
  325. /**
  326. * xmlStrcasestr:
  327. * @str: the xmlChar * array (haystack)
  328. * @val: the xmlChar to search (needle)
  329. *
  330. * a case-ignoring strstr for xmlChar's
  331. *
  332. * Returns the xmlChar * for the first occurrence or NULL.
  333. */
  334. const xmlChar *
  335. xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
  336. int n;
  337. if (str == NULL) return(NULL);
  338. if (val == NULL) return(NULL);
  339. n = xmlStrlen(val);
  340. if (n == 0) return(str);
  341. while (*str != 0) { /* non input consuming */
  342. if (casemap[*str] == casemap[*val])
  343. if (!xmlStrncasecmp(str, val, n)) return(str);
  344. str++;
  345. }
  346. return(NULL);
  347. }
  348. /**
  349. * xmlStrsub:
  350. * @str: the xmlChar * array (haystack)
  351. * @start: the index of the first char (zero based)
  352. * @len: the length of the substring
  353. *
  354. * Extract a substring of a given string
  355. *
  356. * Returns the xmlChar * for the first occurrence or NULL.
  357. */
  358. xmlChar *
  359. xmlStrsub(const xmlChar *str, int start, int len) {
  360. int i;
  361. if (str == NULL) return(NULL);
  362. if (start < 0) return(NULL);
  363. if (len < 0) return(NULL);
  364. for (i = 0;i < start;i++) {
  365. if (*str == 0) return(NULL);
  366. str++;
  367. }
  368. if (*str == 0) return(NULL);
  369. return(xmlStrndup(str, len));
  370. }
  371. /**
  372. * xmlStrlen:
  373. * @str: the xmlChar * array
  374. *
  375. * length of a xmlChar's string
  376. *
  377. * Returns the number of xmlChar contained in the ARRAY.
  378. */
  379. int
  380. xmlStrlen(const xmlChar *str) {
  381. int len = 0;
  382. if (str == NULL) return(0);
  383. while (*str != 0) { /* non input consuming */
  384. str++;
  385. len++;
  386. }
  387. return(len);
  388. }
  389. /**
  390. * xmlStrncat:
  391. * @cur: the original xmlChar * array
  392. * @add: the xmlChar * array added
  393. * @len: the length of @add
  394. *
  395. * a strncat for array of xmlChar's, it will extend @cur with the len
  396. * first bytes of @add. Note that if @len < 0 then this is an API error
  397. * and NULL will be returned.
  398. *
  399. * Returns a new xmlChar *, the original @cur is reallocated and should
  400. * not be freed.
  401. */
  402. xmlChar *
  403. xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
  404. int size;
  405. xmlChar *ret;
  406. if ((add == NULL) || (len == 0))
  407. return(cur);
  408. if (len < 0)
  409. return(NULL);
  410. if (cur == NULL)
  411. return(xmlStrndup(add, len));
  412. size = xmlStrlen(cur);
  413. if (size < 0)
  414. return(NULL);
  415. ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
  416. if (ret == NULL) {
  417. xmlErrMemory(NULL, NULL);
  418. return(cur);
  419. }
  420. memcpy(&ret[size], add, len * sizeof(xmlChar));
  421. ret[size + len] = 0;
  422. return(ret);
  423. }
  424. /**
  425. * xmlStrncatNew:
  426. * @str1: first xmlChar string
  427. * @str2: second xmlChar string
  428. * @len: the len of @str2 or < 0
  429. *
  430. * same as xmlStrncat, but creates a new string. The original
  431. * two strings are not freed. If @len is < 0 then the length
  432. * will be calculated automatically.
  433. *
  434. * Returns a new xmlChar * or NULL
  435. */
  436. xmlChar *
  437. xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
  438. int size;
  439. xmlChar *ret;
  440. if (len < 0) {
  441. len = xmlStrlen(str2);
  442. if (len < 0)
  443. return(NULL);
  444. }
  445. if ((str2 == NULL) || (len == 0))
  446. return(xmlStrdup(str1));
  447. if (str1 == NULL)
  448. return(xmlStrndup(str2, len));
  449. size = xmlStrlen(str1);
  450. if (size < 0)
  451. return(NULL);
  452. ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
  453. if (ret == NULL) {
  454. xmlErrMemory(NULL, NULL);
  455. return(xmlStrndup(str1, size));
  456. }
  457. memcpy(ret, str1, size * sizeof(xmlChar));
  458. memcpy(&ret[size], str2, len * sizeof(xmlChar));
  459. ret[size + len] = 0;
  460. return(ret);
  461. }
  462. /**
  463. * xmlStrcat:
  464. * @cur: the original xmlChar * array
  465. * @add: the xmlChar * array added
  466. *
  467. * a strcat for array of xmlChar's. Since they are supposed to be
  468. * encoded in UTF-8 or an encoding with 8bit based chars, we assume
  469. * a termination mark of '0'.
  470. *
  471. * Returns a new xmlChar * containing the concatenated string. The original
  472. * @cur is reallocated and should not be freed.
  473. */
  474. xmlChar *
  475. xmlStrcat(xmlChar *cur, const xmlChar *add) {
  476. const xmlChar *p = add;
  477. if (add == NULL) return(cur);
  478. if (cur == NULL)
  479. return(xmlStrdup(add));
  480. while (*p != 0) p++; /* non input consuming */
  481. return(xmlStrncat(cur, add, p - add));
  482. }
  483. /**
  484. * xmlStrPrintf:
  485. * @buf: the result buffer.
  486. * @len: the result buffer length.
  487. * @msg: the message with printf formatting.
  488. * @...: extra parameters for the message.
  489. *
  490. * Formats @msg and places result into @buf.
  491. *
  492. * Returns the number of characters written to @buf or -1 if an error occurs.
  493. */
  494. int XMLCDECL
  495. xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
  496. va_list args;
  497. int ret;
  498. if((buf == NULL) || (msg == NULL)) {
  499. return(-1);
  500. }
  501. va_start(args, msg);
  502. ret = vsnprintf((char *) buf, len, (const char *) msg, args);
  503. va_end(args);
  504. buf[len - 1] = 0; /* be safe ! */
  505. return(ret);
  506. }
  507. /**
  508. * xmlStrVPrintf:
  509. * @buf: the result buffer.
  510. * @len: the result buffer length.
  511. * @msg: the message with printf formatting.
  512. * @ap: extra parameters for the message.
  513. *
  514. * Formats @msg and places result into @buf.
  515. *
  516. * Returns the number of characters written to @buf or -1 if an error occurs.
  517. */
  518. int
  519. xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
  520. int ret;
  521. if((buf == NULL) || (msg == NULL)) {
  522. return(-1);
  523. }
  524. ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
  525. buf[len - 1] = 0; /* be safe ! */
  526. return(ret);
  527. }
  528. /************************************************************************
  529. * *
  530. * Generic UTF8 handling routines *
  531. * *
  532. * From rfc2044: encoding of the Unicode values on UTF-8: *
  533. * *
  534. * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
  535. * 0000 0000-0000 007F 0xxxxxxx *
  536. * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
  537. * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
  538. * *
  539. * I hope we won't use values > 0xFFFF anytime soon ! *
  540. * *
  541. ************************************************************************/
  542. /**
  543. * xmlUTF8Size:
  544. * @utf: pointer to the UTF8 character
  545. *
  546. * calculates the internal size of a UTF8 character
  547. *
  548. * returns the numbers of bytes in the character, -1 on format error
  549. */
  550. int
  551. xmlUTF8Size(const xmlChar *utf) {
  552. xmlChar mask;
  553. int len;
  554. if (utf == NULL)
  555. return -1;
  556. if (*utf < 0x80)
  557. return 1;
  558. /* check valid UTF8 character */
  559. if (!(*utf & 0x40))
  560. return -1;
  561. /* determine number of bytes in char */
  562. len = 2;
  563. for (mask=0x20; mask != 0; mask>>=1) {
  564. if (!(*utf & mask))
  565. return len;
  566. len++;
  567. }
  568. return -1;
  569. }
  570. /**
  571. * xmlUTF8Charcmp:
  572. * @utf1: pointer to first UTF8 char
  573. * @utf2: pointer to second UTF8 char
  574. *
  575. * compares the two UCS4 values
  576. *
  577. * returns result of the compare as with xmlStrncmp
  578. */
  579. int
  580. xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
  581. if (utf1 == NULL ) {
  582. if (utf2 == NULL)
  583. return 0;
  584. return -1;
  585. }
  586. return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
  587. }
  588. /**
  589. * xmlUTF8Strlen:
  590. * @utf: a sequence of UTF-8 encoded bytes
  591. *
  592. * compute the length of an UTF8 string, it doesn't do a full UTF8
  593. * checking of the content of the string.
  594. *
  595. * Returns the number of characters in the string or -1 in case of error
  596. */
  597. int
  598. xmlUTF8Strlen(const xmlChar *utf) {
  599. int ret = 0;
  600. if (utf == NULL)
  601. return(-1);
  602. while (*utf != 0) {
  603. if (utf[0] & 0x80) {
  604. if ((utf[1] & 0xc0) != 0x80)
  605. return(-1);
  606. if ((utf[0] & 0xe0) == 0xe0) {
  607. if ((utf[2] & 0xc0) != 0x80)
  608. return(-1);
  609. if ((utf[0] & 0xf0) == 0xf0) {
  610. if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
  611. return(-1);
  612. utf += 4;
  613. } else {
  614. utf += 3;
  615. }
  616. } else {
  617. utf += 2;
  618. }
  619. } else {
  620. utf++;
  621. }
  622. ret++;
  623. }
  624. return(ret);
  625. }
  626. /**
  627. * xmlGetUTF8Char:
  628. * @utf: a sequence of UTF-8 encoded bytes
  629. * @len: a pointer to the minimum number of bytes present in
  630. * the sequence. This is used to assure the next character
  631. * is completely contained within the sequence.
  632. *
  633. * Read the first UTF8 character from @utf
  634. *
  635. * Returns the char value or -1 in case of error, and sets *len to
  636. * the actual number of bytes consumed (0 in case of error)
  637. */
  638. int
  639. xmlGetUTF8Char(const unsigned char *utf, int *len) {
  640. unsigned int c;
  641. if (utf == NULL)
  642. goto error;
  643. if (len == NULL)
  644. goto error;
  645. if (*len < 1)
  646. goto error;
  647. c = utf[0];
  648. if (c & 0x80) {
  649. if (*len < 2)
  650. goto error;
  651. if ((utf[1] & 0xc0) != 0x80)
  652. goto error;
  653. if ((c & 0xe0) == 0xe0) {
  654. if (*len < 3)
  655. goto error;
  656. if ((utf[2] & 0xc0) != 0x80)
  657. goto error;
  658. if ((c & 0xf0) == 0xf0) {
  659. if (*len < 4)
  660. goto error;
  661. if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
  662. goto error;
  663. *len = 4;
  664. /* 4-byte code */
  665. c = (utf[0] & 0x7) << 18;
  666. c |= (utf[1] & 0x3f) << 12;
  667. c |= (utf[2] & 0x3f) << 6;
  668. c |= utf[3] & 0x3f;
  669. } else {
  670. /* 3-byte code */
  671. *len = 3;
  672. c = (utf[0] & 0xf) << 12;
  673. c |= (utf[1] & 0x3f) << 6;
  674. c |= utf[2] & 0x3f;
  675. }
  676. } else {
  677. /* 2-byte code */
  678. *len = 2;
  679. c = (utf[0] & 0x1f) << 6;
  680. c |= utf[1] & 0x3f;
  681. }
  682. } else {
  683. /* 1-byte code */
  684. *len = 1;
  685. }
  686. return(c);
  687. error:
  688. if (len != NULL)
  689. *len = 0;
  690. return(-1);
  691. }
  692. /**
  693. * xmlCheckUTF8:
  694. * @utf: Pointer to putative UTF-8 encoded string.
  695. *
  696. * Checks @utf for being valid UTF-8. @utf is assumed to be
  697. * null-terminated. This function is not super-strict, as it will
  698. * allow longer UTF-8 sequences than necessary. Note that Java is
  699. * capable of producing these sequences if provoked. Also note, this
  700. * routine checks for the 4-byte maximum size, but does not check for
  701. * 0x10ffff maximum value.
  702. *
  703. * Return value: true if @utf is valid.
  704. **/
  705. int
  706. xmlCheckUTF8(const unsigned char *utf)
  707. {
  708. int ix;
  709. unsigned char c;
  710. if (utf == NULL)
  711. return(0);
  712. /*
  713. * utf is a string of 1, 2, 3 or 4 bytes. The valid strings
  714. * are as follows (in "bit format"):
  715. * 0xxxxxxx valid 1-byte
  716. * 110xxxxx 10xxxxxx valid 2-byte
  717. * 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte
  718. * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte
  719. */
  720. for (ix = 0; (c = utf[ix]);) { /* string is 0-terminated */
  721. if ((c & 0x80) == 0x00) { /* 1-byte code, starts with 10 */
  722. ix++;
  723. } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
  724. if ((utf[ix+1] & 0xc0 ) != 0x80)
  725. return 0;
  726. ix += 2;
  727. } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
  728. if (((utf[ix+1] & 0xc0) != 0x80) ||
  729. ((utf[ix+2] & 0xc0) != 0x80))
  730. return 0;
  731. ix += 3;
  732. } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
  733. if (((utf[ix+1] & 0xc0) != 0x80) ||
  734. ((utf[ix+2] & 0xc0) != 0x80) ||
  735. ((utf[ix+3] & 0xc0) != 0x80))
  736. return 0;
  737. ix += 4;
  738. } else /* unknown encoding */
  739. return 0;
  740. }
  741. return(1);
  742. }
  743. /**
  744. * xmlUTF8Strsize:
  745. * @utf: a sequence of UTF-8 encoded bytes
  746. * @len: the number of characters in the array
  747. *
  748. * storage size of an UTF8 string
  749. * the behaviour is not guaranteed if the input string is not UTF-8
  750. *
  751. * Returns the storage size of
  752. * the first 'len' characters of ARRAY
  753. */
  754. int
  755. xmlUTF8Strsize(const xmlChar *utf, int len) {
  756. const xmlChar *ptr=utf;
  757. xmlChar ch;
  758. if (utf == NULL)
  759. return(0);
  760. if (len <= 0)
  761. return(0);
  762. while ( len-- > 0) {
  763. if ( !*ptr )
  764. break;
  765. if ( (ch = *ptr++) & 0x80)
  766. while ((ch<<=1) & 0x80 ) {
  767. if (*ptr == 0) break;
  768. ptr++;
  769. }
  770. }
  771. return (ptr - utf);
  772. }
  773. /**
  774. * xmlUTF8Strndup:
  775. * @utf: the input UTF8 *
  776. * @len: the len of @utf (in chars)
  777. *
  778. * a strndup for array of UTF8's
  779. *
  780. * Returns a new UTF8 * or NULL
  781. */
  782. xmlChar *
  783. xmlUTF8Strndup(const xmlChar *utf, int len) {
  784. xmlChar *ret;
  785. int i;
  786. if ((utf == NULL) || (len < 0)) return(NULL);
  787. i = xmlUTF8Strsize(utf, len);
  788. ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
  789. if (ret == NULL) {
  790. xmlGenericError(xmlGenericErrorContext,
  791. "malloc of %ld byte failed\n",
  792. (len + 1) * (long)sizeof(xmlChar));
  793. return(NULL);
  794. }
  795. memcpy(ret, utf, i * sizeof(xmlChar));
  796. ret[i] = 0;
  797. return(ret);
  798. }
  799. /**
  800. * xmlUTF8Strpos:
  801. * @utf: the input UTF8 *
  802. * @pos: the position of the desired UTF8 char (in chars)
  803. *
  804. * a function to provide the equivalent of fetching a
  805. * character from a string array
  806. *
  807. * Returns a pointer to the UTF8 character or NULL
  808. */
  809. const xmlChar *
  810. xmlUTF8Strpos(const xmlChar *utf, int pos) {
  811. xmlChar ch;
  812. if (utf == NULL) return(NULL);
  813. if (pos < 0)
  814. return(NULL);
  815. while (pos--) {
  816. if ((ch=*utf++) == 0) return(NULL);
  817. if ( ch & 0x80 ) {
  818. /* if not simple ascii, verify proper format */
  819. if ( (ch & 0xc0) != 0xc0 )
  820. return(NULL);
  821. /* then skip over remaining bytes for this char */
  822. while ( (ch <<= 1) & 0x80 )
  823. if ( (*utf++ & 0xc0) != 0x80 )
  824. return(NULL);
  825. }
  826. }
  827. return((xmlChar *)utf);
  828. }
  829. /**
  830. * xmlUTF8Strloc:
  831. * @utf: the input UTF8 *
  832. * @utfchar: the UTF8 character to be found
  833. *
  834. * a function to provide the relative location of a UTF8 char
  835. *
  836. * Returns the relative character position of the desired char
  837. * or -1 if not found
  838. */
  839. int
  840. xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
  841. int i, size;
  842. xmlChar ch;
  843. if (utf==NULL || utfchar==NULL) return -1;
  844. size = xmlUTF8Strsize(utfchar, 1);
  845. for(i=0; (ch=*utf) != 0; i++) {
  846. if (xmlStrncmp(utf, utfchar, size)==0)
  847. return(i);
  848. utf++;
  849. if ( ch & 0x80 ) {
  850. /* if not simple ascii, verify proper format */
  851. if ( (ch & 0xc0) != 0xc0 )
  852. return(-1);
  853. /* then skip over remaining bytes for this char */
  854. while ( (ch <<= 1) & 0x80 )
  855. if ( (*utf++ & 0xc0) != 0x80 )
  856. return(-1);
  857. }
  858. }
  859. return(-1);
  860. }
  861. /**
  862. * xmlUTF8Strsub:
  863. * @utf: a sequence of UTF-8 encoded bytes
  864. * @start: relative pos of first char
  865. * @len: total number to copy
  866. *
  867. * Create a substring from a given UTF-8 string
  868. * Note: positions are given in units of UTF-8 chars
  869. *
  870. * Returns a pointer to a newly created string
  871. * or NULL if any problem
  872. */
  873. xmlChar *
  874. xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
  875. int i;
  876. xmlChar ch;
  877. if (utf == NULL) return(NULL);
  878. if (start < 0) return(NULL);
  879. if (len < 0) return(NULL);
  880. /*
  881. * Skip over any leading chars
  882. */
  883. for (i = 0;i < start;i++) {
  884. if ((ch=*utf++) == 0) return(NULL);
  885. if ( ch & 0x80 ) {
  886. /* if not simple ascii, verify proper format */
  887. if ( (ch & 0xc0) != 0xc0 )
  888. return(NULL);
  889. /* then skip over remaining bytes for this char */
  890. while ( (ch <<= 1) & 0x80 )
  891. if ( (*utf++ & 0xc0) != 0x80 )
  892. return(NULL);
  893. }
  894. }
  895. return(xmlUTF8Strndup(utf, len));
  896. }
  897. /**
  898. * xmlEscapeFormatString:
  899. * @msg: a pointer to the string in which to escape '%' characters.
  900. * Must be a heap-allocated buffer created by libxml2 that may be
  901. * returned, or that may be freed and replaced.
  902. *
  903. * Replaces the string pointed to by 'msg' with an escaped string.
  904. * Returns the same string with all '%' characters escaped.
  905. */
  906. xmlChar *
  907. xmlEscapeFormatString(xmlChar **msg)
  908. {
  909. xmlChar *msgPtr = NULL;
  910. xmlChar *result = NULL;
  911. xmlChar *resultPtr = NULL;
  912. size_t count = 0;
  913. size_t msgLen = 0;
  914. size_t resultLen = 0;
  915. if (!msg || !*msg)
  916. return(NULL);
  917. for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
  918. ++msgLen;
  919. if (*msgPtr == '%')
  920. ++count;
  921. }
  922. if (count == 0)
  923. return(*msg);
  924. resultLen = msgLen + count + 1;
  925. result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
  926. if (result == NULL) {
  927. /* Clear *msg to prevent format string vulnerabilities in
  928. out-of-memory situations. */
  929. xmlFree(*msg);
  930. *msg = NULL;
  931. xmlErrMemory(NULL, NULL);
  932. return(NULL);
  933. }
  934. for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
  935. *resultPtr = *msgPtr;
  936. if (*msgPtr == '%')
  937. *(++resultPtr) = '%';
  938. }
  939. result[resultLen - 1] = '\0';
  940. xmlFree(*msg);
  941. *msg = result;
  942. return *msg;
  943. }
  944. #define bottom_xmlstring
  945. #include "elfgcchack.h"