wildcard.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. /*
  2. * Wildcard matching engine for use with SFTP-based file transfer
  3. * programs (PSFTP, new-look PSCP): since SFTP has no notion of
  4. * getting the remote side to do globbing (and rightly so) we have
  5. * to do it locally, by retrieving all the filenames in a directory
  6. * and checking each against the wildcard pattern.
  7. */
  8. #include <assert.h>
  9. #include <stdlib.h>
  10. #include <string.h>
  11. #include "putty.h"
  12. /*
  13. * Definition of wildcard syntax:
  14. *
  15. * - * matches any sequence of characters, including zero.
  16. * - ? matches exactly one character which can be anything.
  17. * - [abc] matches exactly one character which is a, b or c.
  18. * - [a-f] matches anything from a through f.
  19. * - [^a-f] matches anything _except_ a through f.
  20. * - [-_] matches - or _; [^-_] matches anything else. (The - is
  21. * non-special if it occurs immediately after the opening
  22. * bracket or ^.)
  23. * - [a^] matches an a or a ^. (The ^ is non-special if it does
  24. * _not_ occur immediately after the opening bracket.)
  25. * - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \.
  26. * - All other characters are non-special and match themselves.
  27. */
  28. /*
  29. * Some notes on differences from POSIX globs (IEEE Std 1003.1, 2003 ed.):
  30. * - backslashes act as escapes even within [] bracket expressions
  31. * - does not support [!...] for non-matching list (POSIX are weird);
  32. * NB POSIX allows [^...] as well via "A bracket expression starting
  33. * with an unquoted circumflex character produces unspecified
  34. * results". If we wanted to allow [!...] we might want to define
  35. * [^!] as having its literal meaning (match '^' or '!').
  36. * - none of the scary [[:class:]] stuff, etc
  37. */
  38. /*
  39. * The wildcard matching technique we use is very simple and
  40. * potentially O(N^2) in running time, but I don't anticipate it
  41. * being that bad in reality (particularly since N will be the size
  42. * of a filename, which isn't all that much). Perhaps one day, once
  43. * PuTTY has grown a regexp matcher for some other reason, I might
  44. * come back and reimplement wildcards by translating them into
  45. * regexps or directly into NFAs; but for the moment, in the
  46. * absence of any other need for the NFA->DFA translation engine,
  47. * anything more than the simplest possible wildcard matcher is
  48. * vast code-size overkill.
  49. *
  50. * Essentially, these wildcards are much simpler than regexps in
  51. * that they consist of a sequence of rigid fragments (? and [...]
  52. * can never match more or less than one character) separated by
  53. * asterisks. It is therefore extremely simple to look at a rigid
  54. * fragment and determine whether or not it begins at a particular
  55. * point in the test string; so we can search along the string
  56. * until we find each fragment, then search for the next. As long
  57. * as we find each fragment in the _first_ place it occurs, there
  58. * will never be a danger of having to backpedal and try to find it
  59. * again somewhere else.
  60. */
  61. enum {
  62. WC_TRAILINGBACKSLASH = 1,
  63. WC_UNCLOSEDCLASS,
  64. WC_INVALIDRANGE
  65. };
  66. /*
  67. * Error reporting is done by returning various negative values
  68. * from the wildcard routines. Passing any such value to wc_error
  69. * will give a human-readable message.
  70. */
  71. const char *wc_error(int value)
  72. {
  73. value = abs(value);
  74. switch (value) {
  75. case WC_TRAILINGBACKSLASH:
  76. return "'\' occurred at end of string (expected another character)";
  77. case WC_UNCLOSEDCLASS:
  78. return "expected ']' to close character class";
  79. case WC_INVALIDRANGE:
  80. return "character range was not terminated (']' just after '-')";
  81. }
  82. return "INTERNAL ERROR: unrecognised wildcard error number";
  83. }
  84. /*
  85. * This is the routine that tests a target string to see if an
  86. * initial substring of it matches a fragment. If successful, it
  87. * returns 1, and advances both `fragment' and `target' past the
  88. * fragment and matching substring respectively. If unsuccessful it
  89. * returns zero. If the wildcard fragment suffers a syntax error,
  90. * it returns <0 and the precise value indexes into wc_error.
  91. */
  92. static int wc_match_fragment(const char **fragment, const char **target)
  93. {
  94. const char *f, *t;
  95. f = *fragment;
  96. t = *target;
  97. /*
  98. * The fragment terminates at either the end of the string, or
  99. * the first (unescaped) *.
  100. */
  101. while (*f && *f != '*' && *t) {
  102. /*
  103. * Extract one character from t, and one character's worth
  104. * of pattern from f, and step along both. Return 0 if they
  105. * fail to match.
  106. */
  107. if (*f == '\\') {
  108. /*
  109. * Backslash, which means f[1] is to be treated as a
  110. * literal character no matter what it is. It may not
  111. * be the end of the string.
  112. */
  113. if (!f[1])
  114. return -WC_TRAILINGBACKSLASH; /* error */
  115. if (f[1] != *t)
  116. return 0; /* failed to match */
  117. f += 2;
  118. } else if (*f == '?') {
  119. /*
  120. * Question mark matches anything.
  121. */
  122. f++;
  123. } else if (*f == '[') {
  124. int invert = 0;
  125. int matched = 0;
  126. /*
  127. * Open bracket introduces a character class.
  128. */
  129. f++;
  130. if (*f == '^') {
  131. invert = 1;
  132. f++;
  133. }
  134. while (*f != ']') {
  135. if (*f == '\\')
  136. f++; /* backslashes still work */
  137. if (!*f)
  138. return -WC_UNCLOSEDCLASS; /* error again */
  139. if (f[1] == '-') {
  140. int lower, upper, ourchr;
  141. lower = (unsigned char) *f++;
  142. f++; /* eat the minus */
  143. if (*f == ']')
  144. return -WC_INVALIDRANGE; /* different error! */
  145. if (*f == '\\')
  146. f++; /* backslashes _still_ work */
  147. if (!*f)
  148. return -WC_UNCLOSEDCLASS; /* error again */
  149. upper = (unsigned char) *f++;
  150. ourchr = (unsigned char) *t;
  151. if (lower > upper) {
  152. int t = lower; lower = upper; upper = t;
  153. }
  154. if (ourchr >= lower && ourchr <= upper)
  155. matched = 1;
  156. } else {
  157. matched |= (*t == *f++);
  158. }
  159. }
  160. if (invert == matched)
  161. return 0; /* failed to match character class */
  162. f++; /* eat the ] */
  163. } else {
  164. /*
  165. * Non-special character matches itself.
  166. */
  167. if (*f != *t)
  168. return 0;
  169. f++;
  170. }
  171. /*
  172. * Now we've done that, increment t past the character we
  173. * matched.
  174. */
  175. t++;
  176. }
  177. if (!*f || *f == '*') {
  178. /*
  179. * We have reached the end of f without finding a mismatch;
  180. * so we're done. Update the caller pointers and return 1.
  181. */
  182. *fragment = f;
  183. *target = t;
  184. return 1;
  185. }
  186. /*
  187. * Otherwise, we must have reached the end of t before we
  188. * reached the end of f; so we've failed. Return 0.
  189. */
  190. return 0;
  191. }
  192. /*
  193. * This is the real wildcard matching routine. It returns 1 for a
  194. * successful match, 0 for an unsuccessful match, and <0 for a
  195. * syntax error in the wildcard.
  196. */
  197. int wc_match(const char *wildcard, const char *target)
  198. {
  199. int ret;
  200. /*
  201. * Every time we see a '*' _followed_ by a fragment, we just
  202. * search along the string for a location at which the fragment
  203. * matches. The only special case is when we see a fragment
  204. * right at the start, in which case we just call the matching
  205. * routine once and give up if it fails.
  206. */
  207. if (*wildcard != '*') {
  208. ret = wc_match_fragment(&wildcard, &target);
  209. if (ret <= 0)
  210. return ret; /* pass back failure or error alike */
  211. }
  212. while (*wildcard) {
  213. assert(*wildcard == '*');
  214. while (*wildcard == '*')
  215. wildcard++;
  216. /*
  217. * It's possible we've just hit the end of the wildcard
  218. * after seeing a *, in which case there's no need to
  219. * bother searching any more because we've won.
  220. */
  221. if (!*wildcard)
  222. return 1;
  223. /*
  224. * Now `wildcard' points at the next fragment. So we
  225. * attempt to match it against `target', and if that fails
  226. * we increment `target' and try again, and so on. When we
  227. * find we're about to try matching against the empty
  228. * string, we give up and return 0.
  229. */
  230. ret = 0;
  231. while (*target) {
  232. const char *save_w = wildcard, *save_t = target;
  233. ret = wc_match_fragment(&wildcard, &target);
  234. if (ret < 0)
  235. return ret; /* syntax error */
  236. if (ret > 0 && !*wildcard && *target) {
  237. /*
  238. * Final special case - literally.
  239. *
  240. * This situation arises when we are matching a
  241. * _terminal_ fragment of the wildcard (that is,
  242. * there is nothing after it, e.g. "*a"), and it
  243. * has matched _too early_. For example, matching
  244. * "*a" against "parka" will match the "a" fragment
  245. * against the _first_ a, and then (if it weren't
  246. * for this special case) matching would fail
  247. * because we're at the end of the wildcard but not
  248. * at the end of the target string.
  249. *
  250. * In this case what we must do is measure the
  251. * length of the fragment in the target (which is
  252. * why we saved `target'), jump straight to that
  253. * distance from the end of the string using
  254. * strlen, and match the same fragment again there
  255. * (which is why we saved `wildcard'). Then we
  256. * return whatever that operation returns.
  257. */
  258. target = save_t + strlen(save_t) - (target - save_t);
  259. wildcard = save_w;
  260. return wc_match_fragment(&wildcard, &target);
  261. }
  262. if (ret > 0)
  263. break;
  264. target++;
  265. }
  266. if (ret > 0)
  267. continue;
  268. return 0;
  269. }
  270. /*
  271. * If we reach here, it must be because we successfully matched
  272. * a fragment and then found ourselves right at the end of the
  273. * wildcard. Hence, we return 1 if and only if we are also
  274. * right at the end of the target.
  275. */
  276. return (*target ? 0 : 1);
  277. }
  278. /*
  279. * Another utility routine that translates a non-wildcard string
  280. * into its raw equivalent by removing any escaping backslashes.
  281. * Expects a target string buffer of anything up to the length of
  282. * the original wildcard. You can also pass NULL as the output
  283. * buffer if you're only interested in the return value.
  284. *
  285. * Returns 1 on success, or 0 if a wildcard character was
  286. * encountered. In the latter case the output string MAY not be
  287. * zero-terminated and you should not use it for anything!
  288. */
  289. int wc_unescape(char *output, const char *wildcard)
  290. {
  291. while (*wildcard) {
  292. if (*wildcard == '\\') {
  293. wildcard++;
  294. /* We are lenient about trailing backslashes in non-wildcards. */
  295. if (*wildcard) {
  296. if (output)
  297. *output++ = *wildcard;
  298. wildcard++;
  299. }
  300. } else if (*wildcard == '*' || *wildcard == '?' ||
  301. *wildcard == '[' || *wildcard == ']') {
  302. return 0; /* it's a wildcard! */
  303. } else {
  304. if (output)
  305. *output++ = *wildcard;
  306. wildcard++;
  307. }
  308. }
  309. *output = '\0';
  310. return 1; /* it's clean */
  311. }
  312. #ifdef TESTMODE
  313. struct test {
  314. const char *wildcard;
  315. const char *target;
  316. int expected_result;
  317. };
  318. const struct test fragment_tests[] = {
  319. /*
  320. * We exhaustively unit-test the fragment matching routine
  321. * itself, which should save us the need to test all its
  322. * intricacies during the full wildcard tests.
  323. */
  324. {"abc", "abc", 1},
  325. {"abc", "abd", 0},
  326. {"abc", "abcd", 1},
  327. {"abcd", "abc", 0},
  328. {"ab[cd]", "abc", 1},
  329. {"ab[cd]", "abd", 1},
  330. {"ab[cd]", "abe", 0},
  331. {"ab[^cd]", "abc", 0},
  332. {"ab[^cd]", "abd", 0},
  333. {"ab[^cd]", "abe", 1},
  334. {"ab\\", "abc", -WC_TRAILINGBACKSLASH},
  335. {"ab\\*", "ab*", 1},
  336. {"ab\\?", "ab*", 0},
  337. {"ab?", "abc", 1},
  338. {"ab?", "ab", 0},
  339. {"ab[", "abc", -WC_UNCLOSEDCLASS},
  340. {"ab[c-", "abb", -WC_UNCLOSEDCLASS},
  341. {"ab[c-]", "abb", -WC_INVALIDRANGE},
  342. {"ab[c-e]", "abb", 0},
  343. {"ab[c-e]", "abc", 1},
  344. {"ab[c-e]", "abd", 1},
  345. {"ab[c-e]", "abe", 1},
  346. {"ab[c-e]", "abf", 0},
  347. {"ab[e-c]", "abb", 0},
  348. {"ab[e-c]", "abc", 1},
  349. {"ab[e-c]", "abd", 1},
  350. {"ab[e-c]", "abe", 1},
  351. {"ab[e-c]", "abf", 0},
  352. {"ab[^c-e]", "abb", 1},
  353. {"ab[^c-e]", "abc", 0},
  354. {"ab[^c-e]", "abd", 0},
  355. {"ab[^c-e]", "abe", 0},
  356. {"ab[^c-e]", "abf", 1},
  357. {"ab[^e-c]", "abb", 1},
  358. {"ab[^e-c]", "abc", 0},
  359. {"ab[^e-c]", "abd", 0},
  360. {"ab[^e-c]", "abe", 0},
  361. {"ab[^e-c]", "abf", 1},
  362. {"ab[a^]", "aba", 1},
  363. {"ab[a^]", "ab^", 1},
  364. {"ab[a^]", "abb", 0},
  365. {"ab[^a^]", "aba", 0},
  366. {"ab[^a^]", "ab^", 0},
  367. {"ab[^a^]", "abb", 1},
  368. {"ab[-c]", "ab-", 1},
  369. {"ab[-c]", "abc", 1},
  370. {"ab[-c]", "abd", 0},
  371. {"ab[^-c]", "ab-", 0},
  372. {"ab[^-c]", "abc", 0},
  373. {"ab[^-c]", "abd", 1},
  374. {"ab[\\[-\\]]", "abZ", 0},
  375. {"ab[\\[-\\]]", "ab[", 1},
  376. {"ab[\\[-\\]]", "ab\\", 1},
  377. {"ab[\\[-\\]]", "ab]", 1},
  378. {"ab[\\[-\\]]", "ab^", 0},
  379. {"ab[^\\[-\\]]", "abZ", 1},
  380. {"ab[^\\[-\\]]", "ab[", 0},
  381. {"ab[^\\[-\\]]", "ab\\", 0},
  382. {"ab[^\\[-\\]]", "ab]", 0},
  383. {"ab[^\\[-\\]]", "ab^", 1},
  384. {"ab[a-fA-F]", "aba", 1},
  385. {"ab[a-fA-F]", "abF", 1},
  386. {"ab[a-fA-F]", "abZ", 0},
  387. };
  388. const struct test full_tests[] = {
  389. {"a", "argh", 0},
  390. {"a", "ba", 0},
  391. {"a", "a", 1},
  392. {"a*", "aardvark", 1},
  393. {"a*", "badger", 0},
  394. {"*a", "park", 0},
  395. {"*a", "pArka", 1},
  396. {"*a", "parka", 1},
  397. {"*a*", "park", 1},
  398. {"*a*", "perk", 0},
  399. {"?b*r?", "abracadabra", 1},
  400. {"?b*r?", "abracadabr", 0},
  401. {"?b*r?", "abracadabzr", 0},
  402. };
  403. int main(void)
  404. {
  405. int i;
  406. int fails, passes;
  407. fails = passes = 0;
  408. for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) {
  409. const char *f, *t;
  410. int eret, aret;
  411. f = fragment_tests[i].wildcard;
  412. t = fragment_tests[i].target;
  413. eret = fragment_tests[i].expected_result;
  414. aret = wc_match_fragment(&f, &t);
  415. if (aret != eret) {
  416. printf("failed test: /%s/ against /%s/ returned %d not %d\n",
  417. fragment_tests[i].wildcard, fragment_tests[i].target,
  418. aret, eret);
  419. fails++;
  420. } else
  421. passes++;
  422. }
  423. for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) {
  424. const char *f, *t;
  425. int eret, aret;
  426. f = full_tests[i].wildcard;
  427. t = full_tests[i].target;
  428. eret = full_tests[i].expected_result;
  429. aret = wc_match(f, t);
  430. if (aret != eret) {
  431. printf("failed test: /%s/ against /%s/ returned %d not %d\n",
  432. full_tests[i].wildcard, full_tests[i].target,
  433. aret, eret);
  434. fails++;
  435. } else
  436. passes++;
  437. }
  438. printf("passed %d, failed %d\n", passes, fails);
  439. return 0;
  440. }
  441. #endif