strfile.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511
  1. /* $NetBSD: strfile.c,v 1.22 2003/08/07 09:37:14 agc Exp $ */
  2. /*-
  3. * Copyright (c) 1989, 1993
  4. * The Regents of the University of California. All rights reserved.
  5. *
  6. * This code is derived from software contributed to Berkeley by
  7. * Ken Arnold.
  8. *
  9. * Redistribution and use in source and binary forms, with or without
  10. * modification, are permitted provided that the following conditions
  11. * are met:
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. * 2. Redistributions in binary form must reproduce the above copyright
  15. * notice, this list of conditions and the following disclaimer in the
  16. * documentation and/or other materials provided with the distribution.
  17. * 3. Neither the name of the University nor the names of its contributors
  18. * may be used to endorse or promote products derived from this software
  19. * without specific prior written permission.
  20. *
  21. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  22. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  25. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31. * SUCH DAMAGE.
  32. */
  33. #if 1
  34. #include <sys/cdefs.h>
  35. #ifndef lint
  36. __COPYRIGHT("@(#) Copyright (c) 1989, 1993\n\
  37. The Regents of the University of California. All rights reserved.\n");
  38. #endif /* not lint */
  39. #ifndef lint
  40. #if 0
  41. static char sccsid[] = "@(#)strfile.c 8.1 (Berkeley) 5/31/93";
  42. #else
  43. __RCSID("$NetBSD: strfile.c,v 1.22 2003/08/07 09:37:14 agc Exp $");
  44. #endif
  45. #endif /* not lint */
  46. #endif /* __NetBSD__ */
# include <sys/types.h>
# include <sys/param.h>
# include <ctype.h>
# include <errno.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <time.h>
# include <unistd.h>
  55. # ifndef u_int32_t
  56. # define u_int32_t unsigned int
  57. # endif
  58. # include "strfile.h"
  59. # ifndef MAXPATHLEN
  60. # define MAXPATHLEN 1024
  61. # endif /* MAXPATHLEN */
  62. static u_int32_t
  63. h2nl(u_int32_t h)
  64. {
  65. unsigned char c[4];
  66. u_int32_t rv;
  67. c[0] = (h >> 24) & 0xff;
  68. c[1] = (h >> 16) & 0xff;
  69. c[2] = (h >> 8) & 0xff;
  70. c[3] = (h >> 0) & 0xff;
  71. memcpy(&rv, c, sizeof rv);
  72. return (rv);
  73. }
  74. /*
  75. * This program takes a file composed of strings separated by
 * lines consisting of a single delimiting character (default
  77. * character is '%') and creates another file which consists of a table
  78. * describing the file (structure from "strfile.h"), a table of seek
  79. * pointers to the start of the strings, and the strings, each terminated
  80. * by a null byte. Usage:
  81. *
  82. * % strfile [-iorsx] [ -cC ] sourcefile [ datafile ]
  83. *
  84. * c - Change delimiting character from '%' to 'C'
  85. * s - Silent. Give no summary of data processed at the end of
  86. * the run.
  87. * o - order the strings in alphabetic order
  88. * i - if ordering, ignore case
  89. * r - randomize the order of the strings
  90. * x - set rotated bit
  91. *
  92. * Ken Arnold Sept. 7, 1978 --
  93. *
  94. * Added ordering options.
  95. */
  96. # define TRUE 1
  97. # define FALSE 0
  98. # define STORING_PTRS (Oflag || Rflag)
  99. # define CHUNKSIZE 512
  100. # define ALLOC(ptr,sz) do { \
  101. if (ptr == NULL) \
  102. ptr = malloc(CHUNKSIZE * sizeof *ptr); \
  103. else if (((sz) + 1) % CHUNKSIZE == 0) \
  104. ptr = realloc(ptr, ((sz) + CHUNKSIZE) * sizeof *ptr); \
  105. if (ptr == NULL) \
  106. die("out of space"); \
  107. } while (0)
  108. typedef struct {
  109. char first;
  110. off_t pos;
  111. } STR;
  112. char *Infile = NULL, /* input file name */
  113. Outfile[MAXPATHLEN] = "", /* output file name */
  114. Delimch = '%'; /* delimiting character */
  115. int Sflag = FALSE; /* silent run flag */
  116. int Oflag = FALSE; /* ordering flag */
  117. int Iflag = FALSE; /* ignore case flag */
  118. int Rflag = FALSE; /* randomize order flag */
  119. int Xflag = FALSE; /* set rotated bit */
  120. long Num_pts = 0; /* number of pointers/strings */
  121. off_t *Seekpts;
  122. FILE *Sort_1, *Sort_2; /* pointers for sorting */
  123. STRFILE Tbl; /* statistics table */
  124. STR *Firstch; /* first chars of each string */
  125. #ifdef __GNUC__
  126. #define NORETURN __attribute__((__noreturn__))
  127. #else
  128. #define NORETURN
  129. #endif
  130. void add_offset(FILE *, off_t);
  131. int cmp_str(const void *, const void *);
  132. void die(const char *) NORETURN;
  133. void dieperror(const char *, char *) NORETURN;
  134. void do_order(void);
  135. void fwrite_be_offt(off_t, FILE *);
  136. void getargs(int, char *[]);
  137. int main(int, char *[]);
  138. void randomize(void);
  139. void usage(void) NORETURN;
  140. /*
  141. * main:
  142. * Drive the sucker. There are two main modes -- either we store
  143. * the seek pointers, if the table is to be sorted or randomized,
  144. * or we write the pointer directly to the file, if we are to stay
  145. * in file order. If the former, we allocate and re-allocate in
  146. * CHUNKSIZE blocks; if the latter, we just write each pointer,
  147. * and then seek back to the beginning to write in the table.
  148. */
  149. int
  150. main(ac, av)
  151. int ac;
  152. char *av[];
  153. {
  154. char *sp, dc;
  155. FILE *inf, *outf;
  156. off_t last_off, length, pos, *p;
  157. int first, cnt;
  158. char *nsp;
  159. STR *fp;
  160. static char string[257];
  161. /* sanity test */
  162. if (sizeof(u_int32_t) != 4)
  163. die("sizeof(unsigned int) != 4");
  164. getargs(ac, av); /* evalute arguments */
  165. dc = Delimch;
  166. if ((inf = fopen(Infile, "r")) == NULL)
  167. dieperror("open `%s'", Infile);
  168. if ((outf = fopen(Outfile, "w")) == NULL)
  169. dieperror("open `%s'", Outfile);
  170. if (!STORING_PTRS)
  171. (void) fseek(outf, sizeof Tbl, SEEK_SET);
  172. /*
  173. * Write the strings onto the file
  174. */
  175. Tbl.str_longlen = 0;
  176. Tbl.str_shortlen = (unsigned int) 0x7fffffff;
  177. Tbl.str_delim = dc;
  178. Tbl.str_version = VERSION;
  179. first = Oflag;
  180. add_offset(outf, ftell(inf));
  181. last_off = 0;
  182. do {
  183. sp = fgets(string, 256, inf);
  184. if (sp == NULL || (sp[0] == dc && sp[1] == '\n')) {
  185. pos = ftell(inf);
  186. length = pos - last_off - (sp ? strlen(sp) : 0);
  187. last_off = pos;
  188. if (!length)
  189. continue;
  190. add_offset(outf, pos);
  191. if ((off_t)Tbl.str_longlen < length)
  192. Tbl.str_longlen = length;
  193. if ((off_t)Tbl.str_shortlen > length)
  194. Tbl.str_shortlen = length;
  195. first = Oflag;
  196. }
  197. else if (first) {
  198. for (nsp = sp; !isalnum(*nsp); nsp++)
  199. continue;
  200. ALLOC(Firstch, Num_pts);
  201. fp = &Firstch[Num_pts - 1];
  202. if (Iflag && isupper(*nsp))
  203. fp->first = tolower(*nsp);
  204. else
  205. fp->first = *nsp;
  206. fp->pos = Seekpts[Num_pts - 1];
  207. first = FALSE;
  208. }
  209. } while (sp != NULL);
  210. /*
  211. * write the tables in
  212. */
  213. (void) fclose(inf);
  214. if (Oflag)
  215. do_order();
  216. else if (Rflag)
  217. randomize();
  218. if (Xflag)
  219. Tbl.str_flags |= STR_ROTATED;
  220. if (!Sflag) {
  221. printf("\"%s\" created\n", Outfile);
  222. if (Num_pts == 2)
  223. puts("There was 1 string");
  224. else
  225. printf("There were %d strings\n", (int)(Num_pts - 1));
  226. printf("Longest string: %lu byte%s\n", (unsigned long)Tbl.str_longlen,
  227. Tbl.str_longlen == 1 ? "" : "s");
  228. printf("Shortest string: %lu byte%s\n", (unsigned long)Tbl.str_shortlen,
  229. Tbl.str_shortlen == 1 ? "" : "s");
  230. }
  231. (void) fseek(outf, (off_t) 0, SEEK_SET);
  232. Tbl.str_version = h2nl(Tbl.str_version);
  233. Tbl.str_numstr = h2nl(Num_pts - 1);
  234. Tbl.str_longlen = h2nl(Tbl.str_longlen);
  235. Tbl.str_shortlen = h2nl(Tbl.str_shortlen);
  236. Tbl.str_flags = h2nl(Tbl.str_flags);
  237. (void) fwrite((char *) &Tbl, sizeof Tbl, 1, outf);
  238. if (STORING_PTRS) {
  239. for (p = Seekpts, cnt = Num_pts; cnt--; ++p)
  240. fwrite_be_offt(*p, outf);
  241. }
  242. fflush(outf);
  243. if (ferror(outf))
  244. dieperror("fwrite %s", Outfile);
  245. (void) fclose(outf);
  246. exit(0);
  247. }
  248. /*
  249. * This routine evaluates arguments from the command line
  250. */
  251. void
  252. getargs(argc, argv)
  253. int argc;
  254. char **argv;
  255. {
  256. int ch;
  257. extern int optind;
  258. extern char *optarg;
  259. while ((ch = getopt(argc, argv, "c:iorsx")) != -1)
  260. switch(ch) {
  261. case 'c': /* new delimiting char */
  262. Delimch = *optarg;
  263. if (!isascii(Delimch)) {
  264. printf("bad delimiting character: '\\%o\n'",
  265. Delimch);
  266. }
  267. break;
  268. case 'i': /* ignore case in ordering */
  269. Iflag++;
  270. break;
  271. case 'o': /* order strings */
  272. Oflag++;
  273. break;
  274. case 'r': /* randomize pointers */
  275. Rflag++;
  276. break;
  277. case 's': /* silent */
  278. Sflag++;
  279. break;
  280. case 'x': /* set the rotated bit */
  281. Xflag++;
  282. break;
  283. case '?':
  284. default:
  285. usage();
  286. }
  287. argv += optind;
  288. if (*argv) {
  289. Infile = *argv;
  290. if (*++argv)
  291. (void) strcpy(Outfile, *argv);
  292. }
  293. if (!Infile) {
  294. puts("No input file name");
  295. usage();
  296. }
  297. if (*Outfile == '\0') {
  298. (void) strcpy(Outfile, Infile);
  299. (void) strcat(Outfile, ".dat");
  300. }
  301. }
  302. void
  303. usage()
  304. {
  305. (void) fprintf(stderr,
  306. "strfile [-iorsx] [-c char] sourcefile [datafile]\n");
  307. exit(1);
  308. }
  309. void
  310. die(str)
  311. const char *str;
  312. {
  313. fprintf(stderr, "strfile: %s\n", str);
  314. exit(1);
  315. }
  316. void
  317. dieperror(fmt, file)
  318. const char *fmt;
  319. char *file;
  320. {
  321. fprintf(stderr, "strfile: ");
  322. fprintf(stderr, fmt, file);
  323. fprintf(stderr, ": ");
  324. perror(NULL);
  325. exit(1);
  326. }
  327. /*
  328. * add_offset:
  329. * Add an offset to the list, or write it out, as appropriate.
  330. */
  331. void
  332. add_offset(fp, off)
  333. FILE *fp;
  334. off_t off;
  335. {
  336. if (!STORING_PTRS) {
  337. fwrite_be_offt(off, fp);
  338. } else {
  339. ALLOC(Seekpts, Num_pts + 1);
  340. Seekpts[Num_pts] = off;
  341. }
  342. Num_pts++;
  343. }
  344. /*
  345. * do_order:
  346. * Order the strings alphabetically (possibly ignoring case).
  347. */
  348. void
  349. do_order()
  350. {
  351. int i;
  352. off_t *lp;
  353. STR *fp;
  354. Sort_1 = fopen(Infile, "r");
  355. Sort_2 = fopen(Infile, "r");
  356. qsort((char *) Firstch, (int) Tbl.str_numstr, sizeof *Firstch, cmp_str);
  357. i = Tbl.str_numstr;
  358. lp = Seekpts;
  359. fp = Firstch;
  360. while (i--)
  361. *lp++ = fp++->pos;
  362. (void) fclose(Sort_1);
  363. (void) fclose(Sort_2);
  364. Tbl.str_flags |= STR_ORDERED;
  365. }
  366. int
  367. cmp_str(vp1, vp2)
  368. const void *vp1, *vp2;
  369. {
  370. const STR *p1, *p2;
  371. int c1, c2;
  372. int n1, n2;
  373. p1 = (const STR *)vp1;
  374. p2 = (const STR *)vp2;
  375. # define SET_N(nf,ch) (nf = (ch == '\n'))
  376. # define IS_END(ch,nf) (ch == Delimch && nf)
  377. c1 = p1->first;
  378. c2 = p2->first;
  379. if (c1 != c2)
  380. return c1 - c2;
  381. (void) fseek(Sort_1, p1->pos, SEEK_SET);
  382. (void) fseek(Sort_2, p2->pos, SEEK_SET);
  383. n1 = FALSE;
  384. n2 = FALSE;
  385. while (!isalnum(c1 = getc(Sort_1)) && c1 != '\0')
  386. SET_N(n1, c1);
  387. while (!isalnum(c2 = getc(Sort_2)) && c2 != '\0')
  388. SET_N(n2, c2);
  389. while (!IS_END(c1, n1) && !IS_END(c2, n2)) {
  390. if (Iflag) {
  391. if (isupper(c1))
  392. c1 = tolower(c1);
  393. if (isupper(c2))
  394. c2 = tolower(c2);
  395. }
  396. if (c1 != c2)
  397. return c1 - c2;
  398. SET_N(n1, c1);
  399. SET_N(n2, c2);
  400. c1 = getc(Sort_1);
  401. c2 = getc(Sort_2);
  402. }
  403. if (IS_END(c1, n1))
  404. c1 = 0;
  405. if (IS_END(c2, n2))
  406. c2 = 0;
  407. return c1 - c2;
  408. }
  409. /*
  410. * randomize:
  411. * Randomize the order of the string table. We must be careful
  412. * not to randomize across delimiter boundaries. All
  413. * randomization is done within each block.
  414. */
  415. void
  416. randomize()
  417. {
  418. int cnt, i;
  419. off_t tmp;
  420. off_t *sp;
  421. srandom((int)(time((time_t *) NULL) + getpid()));
  422. Tbl.str_flags |= STR_RANDOM;
  423. cnt = Tbl.str_numstr;
  424. /*
  425. * move things around randomly
  426. */
  427. for (sp = Seekpts; cnt > 0; cnt--, sp++) {
  428. i = random() % cnt;
  429. tmp = sp[0];
  430. sp[0] = sp[i];
  431. sp[i] = tmp;
  432. }
  433. }
  434. /*
  435. * fwrite_be_offt:
  436. * Write out the off paramater as a 64 bit big endian number
  437. */
  438. void
  439. fwrite_be_offt(off, f)
  440. off_t off;
  441. FILE *f;
  442. {
  443. int i;
  444. unsigned char c[8];
  445. for (i = 7; i >= 0; i--) {
  446. c[i] = off & 0xff;
  447. off >>= 8;
  448. }
  449. fwrite(c, sizeof(c), 1, f);
  450. }