rxp.c 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. /* $NetBSD: rxp.c,v 1.12 2004/01/27 20:30:30 jsm Exp $ */
  2. /*-
  3. * Copyright (c) 1991, 1993
  4. * The Regents of the University of California. All rights reserved.
  5. *
  6. * This code is derived from software contributed to Berkeley by
  7. * Jim R. Oldroyd at The Instruction Set and Keith Gabryelski at
  8. * Commodore Business Machines.
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions
  12. * are met:
  13. * 1. Redistributions of source code must retain the above copyright
  14. * notice, this list of conditions and the following disclaimer.
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in the
  17. * documentation and/or other materials provided with the distribution.
  18. * 3. Neither the name of the University nor the names of its contributors
  19. * may be used to endorse or promote products derived from this software
  20. * without specific prior written permission.
  21. *
  22. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32. * SUCH DAMAGE.
  33. */
  34. #include <sys/cdefs.h>
  35. #ifndef lint
  36. #if 0
  37. static char sccsid[] = "@(#)rxp.c 8.1 (Berkeley) 5/31/93";
  38. #else
  39. __RCSID("$NetBSD: rxp.c,v 1.12 2004/01/27 20:30:30 jsm Exp $");
  40. #endif
  41. #endif /* not lint */
  42. /*
  43. * regular expression parser
  44. *
  45. * external functions and return values are:
  46. * rxp_compile(s)
  47. * TRUE success
  48. * FALSE parse failure; error message will be in char rxperr[]
  49. * metas are:
  50. * {...} optional pattern, equivalent to [...|]
  51. * | alternate pattern
  52. * [...] pattern delimiters
  53. *
  54. * rxp_match(s)
  55. * TRUE string s matches compiled pattern
  56. * FALSE match failure or regexp error
  57. *
  58. * rxp_expand()
  59. * char * reverse-engineered regular expression string
  60. * NULL regexp error
  61. */
  62. #include <stdio.h>
  63. #include <stdlib.h>
  64. #include <ctype.h>
  65. #include "quiz.h"
  66. /* regexp tokens, arg */
  67. #define LIT (-1) /* literal character, char */
  68. #define SOT (-2) /* start text anchor, - */
  69. #define EOT (-3) /* end text anchor, - */
  70. #define GRP_S (-4) /* start alternate grp, ptr_to_end */
  71. #define GRP_E (-5) /* end group, - */
  72. #define ALT_S (-6) /* alternate starts, ptr_to_next */
  73. #define ALT_E (-7) /* alternate ends, - */
  74. #define END (-8) /* end of regexp, - */
  75. typedef short Rxp_t; /* type for regexp tokens */
  76. static Rxp_t rxpbuf[RXP_LINE_SZ]; /* compiled regular expression buffer */
  77. char rxperr[128]; /* parser error message */
  78. static int rxp__compile(const char *, int);
  79. static char *rxp__expand(int);
  80. static int rxp__match(const char *, int, Rxp_t *, Rxp_t *, const char *);
  81. int
  82. rxp_compile(s)
  83. const char * s;
  84. {
  85. return (rxp__compile(s, TRUE));
  86. }
  87. static int
  88. rxp__compile(s, first)
  89. const char *s;
  90. int first;
  91. {
  92. static Rxp_t *rp;
  93. static const char *sp;
  94. Rxp_t *grp_ptr;
  95. Rxp_t *alt_ptr;
  96. int esc, err;
  97. esc = 0;
  98. if (first) {
  99. rp = rxpbuf;
  100. sp = s;
  101. *rp++ = SOT; /* auto-anchor: pat is really ^pat$ */
  102. *rp++ = GRP_S; /* auto-group: ^pat$ is really ^[pat]$ */
  103. *rp++ = 0;
  104. }
  105. *rp++ = ALT_S;
  106. alt_ptr = rp;
  107. *rp++ = 0;
  108. for (; *sp; ++sp) {
  109. if (rp - rxpbuf >= RXP_LINE_SZ - 4) {
  110. (void)snprintf(rxperr, sizeof(rxperr),
  111. "regular expression too long %s", s);
  112. return (FALSE);
  113. }
  114. if (*sp == ':' && !esc)
  115. break;
  116. if (esc) {
  117. *rp++ = LIT;
  118. *rp++ = *sp;
  119. esc = 0;
  120. }
  121. else switch (*sp) {
  122. case '\\':
  123. esc = 1;
  124. break;
  125. case '{':
  126. case '[':
  127. *rp++ = GRP_S;
  128. grp_ptr = rp;
  129. *rp++ = 0;
  130. sp++;
  131. if ((err = rxp__compile(s, FALSE)) != TRUE)
  132. return (err);
  133. *rp++ = GRP_E;
  134. *grp_ptr = rp - rxpbuf;
  135. break;
  136. case '}':
  137. case ']':
  138. case '|':
  139. *rp++ = ALT_E;
  140. *alt_ptr = rp - rxpbuf;
  141. if (*sp != ']') {
  142. *rp++ = ALT_S;
  143. alt_ptr = rp;
  144. *rp++ = 0;
  145. }
  146. if (*sp != '|') {
  147. if (*sp != ']') {
  148. *rp++ = ALT_E;
  149. *alt_ptr = rp - rxpbuf;
  150. }
  151. if (first) {
  152. (void)snprintf(rxperr, sizeof(rxperr),
  153. "unmatched alternator in regexp %s",
  154. s);
  155. return (FALSE);
  156. }
  157. return (TRUE);
  158. }
  159. break;
  160. default:
  161. *rp++ = LIT;
  162. *rp++ = *sp;
  163. esc = 0;
  164. break;
  165. }
  166. }
  167. if (!first) {
  168. (void)snprintf(rxperr, sizeof(rxperr),
  169. "unmatched alternator in regexp %s", s);
  170. return (FALSE);
  171. }
  172. *rp++ = ALT_E;
  173. *alt_ptr = rp - rxpbuf;
  174. *rp++ = GRP_E;
  175. *(rxpbuf + 2) = rp - rxpbuf;
  176. *rp++ = EOT;
  177. *rp = END;
  178. return (TRUE);
  179. }
  180. /*
  181. * match string against compiled regular expression
  182. */
  183. int
  184. rxp_match(s)
  185. const char * s;
  186. {
  187. return (rxp__match(s, TRUE, NULL, NULL, NULL));
  188. }
  189. static int
  190. rxp__match(s, first, j_succ, j_fail, sp_fail)
  191. const char *s;
  192. int first;
  193. Rxp_t *j_succ; /* jump here on successful alt match */
  194. Rxp_t *j_fail; /* jump here on failed match */
  195. const char *sp_fail; /* reset sp to here on failed match */
  196. {
  197. static Rxp_t *rp;
  198. static const char *sp;
  199. int ch;
  200. Rxp_t *grp_end = NULL;
  201. if (first) {
  202. rp = rxpbuf;
  203. sp = s;
  204. }
  205. while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
  206. switch(*rp) {
  207. case LIT:
  208. rp++;
  209. ch = isascii(*rp) && isupper(*rp) ? tolower(*rp) : *rp;
  210. if (ch != *sp++) {
  211. rp = j_fail;
  212. sp = sp_fail;
  213. return (FALSE);
  214. }
  215. rp++;
  216. break;
  217. case SOT:
  218. if (sp != s)
  219. return (FALSE);
  220. rp++;
  221. break;
  222. case EOT:
  223. if (*sp != 0)
  224. return (FALSE);
  225. rp++;
  226. break;
  227. case GRP_S:
  228. rp++;
  229. grp_end = rxpbuf + *rp++;
  230. break;
  231. case ALT_S:
  232. rp++;
  233. rxp__match(sp, FALSE, grp_end, rxpbuf + *rp++, sp);
  234. break;
  235. case ALT_E:
  236. rp = j_succ;
  237. return (TRUE);
  238. case GRP_E:
  239. rp = j_fail;
  240. sp = sp_fail;
  241. return (FALSE);
  242. default:
  243. abort();
  244. }
  245. return (*rp != END ? FALSE : TRUE);
  246. }
  247. /*
  248. * Reverse engineer the regular expression, by picking first of all alternates.
  249. */
  250. char *
  251. rxp_expand()
  252. {
  253. return (rxp__expand(TRUE));
  254. }
  255. static char *
  256. rxp__expand(first)
  257. int first;
  258. {
  259. static char buf[RXP_LINE_SZ/2];
  260. static Rxp_t *rp;
  261. static char *bp;
  262. Rxp_t *grp_ptr;
  263. char *err;
  264. if (first) {
  265. rp = rxpbuf;
  266. bp = buf;
  267. }
  268. while (rp < rxpbuf + RXP_LINE_SZ && *rp != END)
  269. switch(*rp) {
  270. case LIT:
  271. rp++;
  272. *bp++ = *rp++;
  273. break;
  274. case GRP_S:
  275. rp++;
  276. grp_ptr = rxpbuf + *rp;
  277. rp++;
  278. if ((err = rxp__expand(FALSE)) == NULL)
  279. return (err);
  280. rp = grp_ptr;
  281. break;
  282. case ALT_E:
  283. return (buf);
  284. case ALT_S:
  285. rp++;
  286. /* FALLTHROUGH */
  287. case SOT:
  288. case EOT:
  289. case GRP_E:
  290. rp++;
  291. break;
  292. default:
  293. return (NULL);
  294. }
  295. if (first) {
  296. if (*rp != END)
  297. return (NULL);
  298. *bp = '\0';
  299. }
  300. return (buf);
  301. }