read.c 48 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851
  1. /* Copyright (C) 1995,1996,1997,1999,2000,2001,2003, 2004, 2006, 2007, 2008, 2009, 2010, 2011 Free Software
  2. * Foundation, Inc.
  3. *
  4. * This library is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU Lesser General Public License
  6. * as published by the Free Software Foundation; either version 3 of
  7. * the License, or (at your option) any later version.
  8. *
  9. * This library is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * Lesser General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Lesser General Public
  15. * License along with this library; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  17. * 02110-1301 USA
  18. */
  19. #ifdef HAVE_CONFIG_H
  20. # include <config.h>
  21. #endif
  22. #include <stdio.h>
  23. #include <ctype.h>
  24. #include <string.h>
  25. #include <unistd.h>
  26. #include <unicase.h>
  27. #include <unictype.h>
  28. #include "libguile/_scm.h"
  29. #include "libguile/bytevectors.h"
  30. #include "libguile/chars.h"
  31. #include "libguile/eval.h"
  32. #include "libguile/arrays.h"
  33. #include "libguile/bitvectors.h"
  34. #include "libguile/keywords.h"
  35. #include "libguile/alist.h"
  36. #include "libguile/srcprop.h"
  37. #include "libguile/hashtab.h"
  38. #include "libguile/hash.h"
  39. #include "libguile/ports.h"
  40. #include "libguile/fports.h"
  41. #include "libguile/root.h"
  42. #include "libguile/strings.h"
  43. #include "libguile/strports.h"
  44. #include "libguile/vectors.h"
  45. #include "libguile/validate.h"
  46. #include "libguile/srfi-4.h"
  47. #include "libguile/srfi-13.h"
  48. #include "libguile/read.h"
  49. #include "libguile/private-options.h"
  50. SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
  51. SCM_SYMBOL (scm_keyword_prefix, "prefix");
  52. SCM_SYMBOL (scm_keyword_postfix, "postfix");
  53. SCM_SYMBOL (sym_nil, "nil");
  54. scm_t_option scm_read_opts[] = {
  55. { SCM_OPTION_BOOLEAN, "copy", 0,
  56. "Copy source code expressions." },
  57. { SCM_OPTION_BOOLEAN, "positions", 1,
  58. "Record positions of source code expressions." },
  59. { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
  60. "Convert symbols to lower case."},
  61. { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
  62. "Style of keyword recognition: #f, 'prefix or 'postfix."},
  63. { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
  64. "Use R6RS variable-length character and string hex escapes."},
  65. { SCM_OPTION_BOOLEAN, "square-brackets", 1,
  66. "Treat `[' and `]' as parentheses, for R6RS compatibility."},
  67. { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
  68. "In strings, consume leading whitespace after an escaped end-of-line."},
  69. { 0, },
  70. };
  71. /*
  72. Give meaningful error messages for errors
  73. We use the format
  74. FILE:LINE:COL: MESSAGE
  75. This happened in ....
  76. This is not standard GNU format, but the test-suite likes the real
  77. message to be in front.
  78. */
  79. void
  80. scm_i_input_error (char const *function,
  81. SCM port, const char *message, SCM arg)
  82. {
  83. SCM fn = (scm_is_string (SCM_FILENAME(port))
  84. ? SCM_FILENAME(port)
  85. : scm_from_locale_string ("#<unknown port>"));
  86. SCM string_port = scm_open_output_string ();
  87. SCM string = SCM_EOL;
  88. scm_simple_format (string_port,
  89. scm_from_locale_string ("~A:~S:~S: ~A"),
  90. scm_list_4 (fn,
  91. scm_from_long (SCM_LINUM (port) + 1),
  92. scm_from_int (SCM_COL (port) + 1),
  93. scm_from_locale_string (message)));
  94. string = scm_get_output_string (string_port);
  95. scm_close_output_port (string_port);
  96. scm_error_scm (scm_from_latin1_symbol ("read-error"),
  97. function? scm_from_locale_string (function) : SCM_BOOL_F,
  98. string,
  99. arg,
  100. SCM_BOOL_F);
  101. }
  102. SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
  103. (SCM setting),
  104. "Option interface for the read options. Instead of using\n"
  105. "this procedure directly, use the procedures @code{read-enable},\n"
  106. "@code{read-disable}, @code{read-set!} and @code{read-options}.")
  107. #define FUNC_NAME s_scm_read_options
  108. {
  109. SCM ans = scm_options (setting,
  110. scm_read_opts,
  111. FUNC_NAME);
  112. if (SCM_COPY_SOURCE_P)
  113. SCM_RECORD_POSITIONS_P = 1;
  114. return ans;
  115. }
  116. #undef FUNC_NAME
  117. /* A fluid referring to an association list mapping extra hash
  118. characters to procedures. */
  119. static SCM *scm_i_read_hash_procedures;
  120. static inline SCM
  121. scm_i_read_hash_procedures_ref (void)
  122. {
  123. return scm_fluid_ref (*scm_i_read_hash_procedures);
  124. }
  125. static inline void
  126. scm_i_read_hash_procedures_set_x (SCM value)
  127. {
  128. scm_fluid_set_x (*scm_i_read_hash_procedures, value);
  129. }
  130. /* Token readers. */
  131. /* Size of the C buffer used to read symbols and numbers. */
  132. #define READER_BUFFER_SIZE 128
  133. /* Size of the C buffer used to read strings. */
  134. #define READER_STRING_BUFFER_SIZE 512
  135. /* The maximum size of Scheme character names. */
  136. #define READER_CHAR_NAME_MAX_SIZE 50
  137. /* `isblank' is only in C99. */
  138. #define CHAR_IS_BLANK_(_chr) \
  139. (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
  140. || ((_chr) == '\f') || ((_chr) == '\r'))
  141. #ifdef MSDOS
  142. # define CHAR_IS_BLANK(_chr) \
  143. ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
  144. #else
  145. # define CHAR_IS_BLANK CHAR_IS_BLANK_
  146. #endif
  147. /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
  148. structure''). */
  149. #define CHAR_IS_R5RS_DELIMITER(c) \
  150. (CHAR_IS_BLANK (c) \
  151. || (c == ')') || (c == '(') || (c == ';') || (c == '"') \
  152. || (SCM_SQUARE_BRACKETS_P && ((c == '[') || (c == ']'))))
  153. #define CHAR_IS_DELIMITER CHAR_IS_R5RS_DELIMITER
  154. /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
  155. Structure''. */
  156. #define CHAR_IS_EXPONENT_MARKER(_chr) \
  157. (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
  158. || ((_chr) == 'd') || ((_chr) == 'l'))
  159. /* Read an SCSH block comment. */
  160. static inline SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
  161. static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
  162. static SCM scm_read_commented_expression (scm_t_wchar, SCM);
  163. static SCM scm_read_shebang (scm_t_wchar, SCM);
  164. static SCM scm_get_hash_procedure (int);
  165. /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
  166. result in the pre-allocated buffer BUF. Return zero if the whole token has
  167. fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
  168. bytes actually read. */
  169. static inline int
  170. read_token (SCM port, char *buf, const size_t buf_size, size_t *read)
  171. {
  172. *read = 0;
  173. while (*read < buf_size)
  174. {
  175. int chr;
  176. chr = scm_get_byte_or_eof (port);
  177. if (chr == EOF)
  178. return 0;
  179. else if (CHAR_IS_DELIMITER (chr))
  180. {
  181. scm_unget_byte (chr, port);
  182. return 0;
  183. }
  184. else
  185. {
  186. *buf = (char) chr;
  187. buf++, (*read)++;
  188. }
  189. }
  190. return 1;
  191. }
  192. /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
  193. result in the pre-allocated buffer BUFFER, if the whole token has fewer than
  194. BUFFER_SIZE bytes, or into OVERFLOW_BUFFER, allocated here to be freed by the
  195. caller. Return zero if the token fits in BUFFER, non-zero otherwise. READ
  196. will be set the number of bytes actually read. */
  197. static int
  198. read_complete_token (SCM port, char *buffer, const size_t buffer_size,
  199. char **overflow_buffer, size_t *read)
  200. {
  201. int overflow = 0;
  202. size_t bytes_read, overflow_size;
  203. *overflow_buffer = NULL;
  204. overflow_size = 0;
  205. do
  206. {
  207. overflow = read_token (port, buffer, buffer_size, &bytes_read);
  208. if (bytes_read == 0)
  209. break;
  210. if (overflow || overflow_size != 0)
  211. {
  212. if (overflow_size == 0)
  213. {
  214. *overflow_buffer = scm_malloc (bytes_read);
  215. memcpy (*overflow_buffer, buffer, bytes_read);
  216. overflow_size = bytes_read;
  217. }
  218. else
  219. {
  220. *overflow_buffer = scm_realloc (*overflow_buffer, overflow_size + bytes_read);
  221. memcpy (*overflow_buffer + overflow_size, buffer, bytes_read);
  222. overflow_size += bytes_read;
  223. }
  224. }
  225. }
  226. while (overflow);
  227. if (overflow_size)
  228. *read = overflow_size;
  229. else
  230. *read = bytes_read;
  231. return (overflow_size != 0);
  232. }
  233. /* Skip whitespace from PORT and return the first non-whitespace character
  234. read. Raise an error on end-of-file. */
  235. static int
  236. flush_ws (SCM port, const char *eoferr)
  237. {
  238. register scm_t_wchar c;
  239. while (1)
  240. switch (c = scm_getc (port))
  241. {
  242. case EOF:
  243. goteof:
  244. if (eoferr)
  245. {
  246. scm_i_input_error (eoferr,
  247. port,
  248. "end of file",
  249. SCM_EOL);
  250. }
  251. return c;
  252. case ';':
  253. lp:
  254. switch (c = scm_getc (port))
  255. {
  256. case EOF:
  257. goto goteof;
  258. default:
  259. goto lp;
  260. case SCM_LINE_INCREMENTORS:
  261. break;
  262. }
  263. break;
  264. case '#':
  265. switch (c = scm_getc (port))
  266. {
  267. case EOF:
  268. eoferr = "read_sharp";
  269. goto goteof;
  270. case '!':
  271. scm_read_shebang (c, port);
  272. break;
  273. case ';':
  274. scm_read_commented_expression (c, port);
  275. break;
  276. case '|':
  277. if (scm_is_false (scm_get_hash_procedure (c)))
  278. {
  279. scm_read_r6rs_block_comment (c, port);
  280. break;
  281. }
  282. /* fall through */
  283. default:
  284. scm_ungetc (c, port);
  285. return '#';
  286. }
  287. break;
  288. case SCM_LINE_INCREMENTORS:
  289. case SCM_SINGLE_SPACES:
  290. case '\t':
  291. break;
  292. default:
  293. return c;
  294. }
  295. return 0;
  296. }
  297. /* Token readers. */
  298. static SCM scm_read_expression (SCM port);
  299. static SCM scm_read_sharp (int chr, SCM port);
  300. static SCM recsexpr (SCM obj, long line, int column, SCM filename);
  301. static SCM
  302. scm_read_sexp (scm_t_wchar chr, SCM port)
  303. #define FUNC_NAME "scm_i_lreadparen"
  304. {
  305. register int c;
  306. register SCM tmp;
  307. register SCM tl, ans = SCM_EOL;
  308. SCM tl2 = SCM_EOL, ans2 = SCM_EOL, copy = SCM_BOOL_F;
  309. const int terminating_char = ((chr == '[') ? ']' : ')');
  310. /* Need to capture line and column numbers here. */
  311. long line = SCM_LINUM (port);
  312. int column = SCM_COL (port) - 1;
  313. c = flush_ws (port, FUNC_NAME);
  314. if (terminating_char == c)
  315. return SCM_EOL;
  316. scm_ungetc (c, port);
  317. if (scm_is_eq (scm_sym_dot,
  318. (tmp = scm_read_expression (port))))
  319. {
  320. ans = scm_read_expression (port);
  321. if (terminating_char != (c = flush_ws (port, FUNC_NAME)))
  322. scm_i_input_error (FUNC_NAME, port, "missing close paren",
  323. SCM_EOL);
  324. return ans;
  325. }
  326. /* Build the head of the list structure. */
  327. ans = tl = scm_cons (tmp, SCM_EOL);
  328. if (SCM_COPY_SOURCE_P)
  329. ans2 = tl2 = scm_cons (scm_is_pair (tmp)
  330. ? copy
  331. : tmp,
  332. SCM_EOL);
  333. while (terminating_char != (c = flush_ws (port, FUNC_NAME)))
  334. {
  335. SCM new_tail;
  336. if (c == ')' || (SCM_SQUARE_BRACKETS_P && c == ']'))
  337. scm_i_input_error (FUNC_NAME, port,
  338. "in pair: mismatched close paren: ~A",
  339. scm_list_1 (SCM_MAKE_CHAR (c)));
  340. scm_ungetc (c, port);
  341. tmp = scm_read_expression (port);
  342. if (scm_is_eq (scm_sym_dot, tmp))
  343. {
  344. SCM_SETCDR (tl, tmp = scm_read_expression (port));
  345. if (SCM_COPY_SOURCE_P)
  346. SCM_SETCDR (tl2, scm_cons (scm_is_pair (tmp) ? copy : tmp,
  347. SCM_EOL));
  348. c = flush_ws (port, FUNC_NAME);
  349. if (terminating_char != c)
  350. scm_i_input_error (FUNC_NAME, port,
  351. "in pair: missing close paren", SCM_EOL);
  352. goto exit;
  353. }
  354. new_tail = scm_cons (tmp, SCM_EOL);
  355. SCM_SETCDR (tl, new_tail);
  356. tl = new_tail;
  357. if (SCM_COPY_SOURCE_P)
  358. {
  359. SCM new_tail2 = scm_cons (scm_is_pair (tmp)
  360. ? copy
  361. : tmp, SCM_EOL);
  362. SCM_SETCDR (tl2, new_tail2);
  363. tl2 = new_tail2;
  364. }
  365. }
  366. exit:
  367. if (SCM_RECORD_POSITIONS_P)
  368. scm_hashq_set_x (scm_source_whash,
  369. ans,
  370. scm_make_srcprops (line, column,
  371. SCM_FILENAME (port),
  372. SCM_COPY_SOURCE_P
  373. ? ans2
  374. : SCM_UNDEFINED,
  375. SCM_EOL));
  376. return ans;
  377. }
  378. #undef FUNC_NAME
  379. /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
  380. C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
  381. found. */
  382. #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
  383. do \
  384. { \
  385. scm_t_wchar a; \
  386. size_t i = 0; \
  387. c = 0; \
  388. while (i < ndigits) \
  389. { \
  390. a = scm_getc (port); \
  391. if (a == EOF) \
  392. goto str_eof; \
  393. if (terminator \
  394. && (a == (scm_t_wchar) terminator) \
  395. && (i > 0)) \
  396. break; \
  397. if ('0' <= a && a <= '9') \
  398. a -= '0'; \
  399. else if ('A' <= a && a <= 'F') \
  400. a = a - 'A' + 10; \
  401. else if ('a' <= a && a <= 'f') \
  402. a = a - 'a' + 10; \
  403. else \
  404. { \
  405. c = a; \
  406. goto bad_escaped; \
  407. } \
  408. c = c * 16 + a; \
  409. i ++; \
  410. } \
  411. } while (0)
  412. static void
  413. skip_intraline_whitespace (SCM port)
  414. {
  415. scm_t_wchar c;
  416. do
  417. {
  418. c = scm_getc (port);
  419. if (c == EOF)
  420. return;
  421. }
  422. while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
  423. scm_ungetc (c, port);
  424. }
  425. static SCM
  426. scm_read_string (int chr, SCM port)
  427. #define FUNC_NAME "scm_lreadr"
  428. {
  429. /* For strings smaller than C_STR, this function creates only one Scheme
  430. object (the string returned). */
  431. SCM str = SCM_BOOL_F;
  432. unsigned c_str_len = 0;
  433. scm_t_wchar c;
  434. str = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
  435. while ('"' != (c = scm_getc (port)))
  436. {
  437. if (c == EOF)
  438. {
  439. str_eof:
  440. scm_i_input_error (FUNC_NAME, port,
  441. "end of file in string constant", SCM_EOL);
  442. }
  443. if (c_str_len + 1 >= scm_i_string_length (str))
  444. {
  445. SCM addy = scm_i_make_string (READER_STRING_BUFFER_SIZE, NULL, 0);
  446. str = scm_string_append (scm_list_2 (str, addy));
  447. }
  448. if (c == '\\')
  449. {
  450. switch (c = scm_getc (port))
  451. {
  452. case EOF:
  453. goto str_eof;
  454. case '"':
  455. case '\\':
  456. break;
  457. case '\n':
  458. if (SCM_HUNGRY_EOL_ESCAPES_P)
  459. skip_intraline_whitespace (port);
  460. continue;
  461. case '0':
  462. c = '\0';
  463. break;
  464. case 'f':
  465. c = '\f';
  466. break;
  467. case 'n':
  468. c = '\n';
  469. break;
  470. case 'r':
  471. c = '\r';
  472. break;
  473. case 't':
  474. c = '\t';
  475. break;
  476. case 'a':
  477. c = '\007';
  478. break;
  479. case 'v':
  480. c = '\v';
  481. break;
  482. case 'b':
  483. c = '\010';
  484. break;
  485. case 'x':
  486. if (SCM_R6RS_ESCAPES_P)
  487. SCM_READ_HEX_ESCAPE (10, ';');
  488. else
  489. SCM_READ_HEX_ESCAPE (2, '\0');
  490. break;
  491. case 'u':
  492. if (!SCM_R6RS_ESCAPES_P)
  493. {
  494. SCM_READ_HEX_ESCAPE (4, '\0');
  495. break;
  496. }
  497. case 'U':
  498. if (!SCM_R6RS_ESCAPES_P)
  499. {
  500. SCM_READ_HEX_ESCAPE (6, '\0');
  501. break;
  502. }
  503. default:
  504. bad_escaped:
  505. scm_i_input_error (FUNC_NAME, port,
  506. "illegal character in escape sequence: ~S",
  507. scm_list_1 (SCM_MAKE_CHAR (c)));
  508. }
  509. }
  510. str = scm_i_string_start_writing (str);
  511. scm_i_string_set_x (str, c_str_len++, c);
  512. scm_i_string_stop_writing ();
  513. }
  514. if (c_str_len > 0)
  515. {
  516. return scm_i_substring_copy (str, 0, c_str_len);
  517. }
  518. return scm_nullstr;
  519. }
  520. #undef FUNC_NAME
  521. static SCM
  522. scm_read_number (scm_t_wchar chr, SCM port)
  523. {
  524. SCM result, str = SCM_EOL;
  525. char buffer[READER_BUFFER_SIZE];
  526. char *overflow_buffer = NULL;
  527. size_t bytes_read;
  528. int overflow;
  529. scm_t_port *pt = SCM_PTAB_ENTRY (port);
  530. scm_ungetc (chr, port);
  531. overflow = read_complete_token (port, buffer, sizeof (buffer),
  532. &overflow_buffer, &bytes_read);
  533. if (!overflow)
  534. str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
  535. else
  536. str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
  537. pt->ilseq_handler);
  538. result = scm_string_to_number (str, SCM_UNDEFINED);
  539. if (!scm_is_true (result))
  540. {
  541. /* Return a symbol instead of a number */
  542. if (SCM_CASE_INSENSITIVE_P)
  543. str = scm_string_downcase_x (str);
  544. result = scm_string_to_symbol (str);
  545. }
  546. if (overflow)
  547. free (overflow_buffer);
  548. SCM_COL (port) += scm_i_string_length (str);
  549. return result;
  550. }
  551. static SCM
  552. scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
  553. {
  554. SCM result;
  555. int ends_with_colon = 0;
  556. size_t bytes_read;
  557. int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
  558. int overflow;
  559. char buffer[READER_BUFFER_SIZE], *overflow_buffer;
  560. scm_t_port *pt = SCM_PTAB_ENTRY (port);
  561. SCM str;
  562. scm_ungetc (chr, port);
  563. overflow = read_complete_token (port, buffer, READER_BUFFER_SIZE,
  564. &overflow_buffer, &bytes_read);
  565. if (bytes_read > 0)
  566. {
  567. if (!overflow)
  568. ends_with_colon = buffer[bytes_read - 1] == ':';
  569. else
  570. ends_with_colon = overflow_buffer[bytes_read - 1] == ':';
  571. }
  572. if (postfix && ends_with_colon && (bytes_read > 1))
  573. {
  574. if (!overflow)
  575. str = scm_from_stringn (buffer, bytes_read - 1, pt->encoding, pt->ilseq_handler);
  576. else
  577. str = scm_from_stringn (overflow_buffer, bytes_read - 1, pt->encoding,
  578. pt->ilseq_handler);
  579. if (SCM_CASE_INSENSITIVE_P)
  580. str = scm_string_downcase_x (str);
  581. result = scm_symbol_to_keyword (scm_string_to_symbol (str));
  582. }
  583. else
  584. {
  585. if (!overflow)
  586. str = scm_from_stringn (buffer, bytes_read, pt->encoding, pt->ilseq_handler);
  587. else
  588. str = scm_from_stringn (overflow_buffer, bytes_read, pt->encoding,
  589. pt->ilseq_handler);
  590. if (SCM_CASE_INSENSITIVE_P)
  591. str = scm_string_downcase_x (str);
  592. result = scm_string_to_symbol (str);
  593. }
  594. if (overflow)
  595. free (overflow_buffer);
  596. SCM_COL (port) += scm_i_string_length (str);
  597. return result;
  598. }
  599. static SCM
  600. scm_read_number_and_radix (scm_t_wchar chr, SCM port)
  601. #define FUNC_NAME "scm_lreadr"
  602. {
  603. SCM result;
  604. size_t read;
  605. char buffer[READER_BUFFER_SIZE], *overflow_buffer;
  606. int overflow;
  607. unsigned int radix;
  608. SCM str;
  609. scm_t_port *pt;
  610. switch (chr)
  611. {
  612. case 'B':
  613. case 'b':
  614. radix = 2;
  615. break;
  616. case 'o':
  617. case 'O':
  618. radix = 8;
  619. break;
  620. case 'd':
  621. case 'D':
  622. radix = 10;
  623. break;
  624. case 'x':
  625. case 'X':
  626. radix = 16;
  627. break;
  628. default:
  629. scm_ungetc (chr, port);
  630. scm_ungetc ('#', port);
  631. radix = 10;
  632. }
  633. overflow = read_complete_token (port, buffer, sizeof (buffer),
  634. &overflow_buffer, &read);
  635. pt = SCM_PTAB_ENTRY (port);
  636. if (!overflow)
  637. str = scm_from_stringn (buffer, read, pt->encoding, pt->ilseq_handler);
  638. else
  639. str = scm_from_stringn (overflow_buffer, read, pt->encoding,
  640. pt->ilseq_handler);
  641. result = scm_string_to_number (str, scm_from_uint (radix));
  642. if (overflow)
  643. free (overflow_buffer);
  644. SCM_COL (port) += scm_i_string_length (str);
  645. if (scm_is_true (result))
  646. return result;
  647. scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
  648. return SCM_BOOL_F;
  649. }
  650. #undef FUNC_NAME
  651. static SCM
  652. scm_read_quote (int chr, SCM port)
  653. {
  654. SCM p;
  655. long line = SCM_LINUM (port);
  656. int column = SCM_COL (port) - 1;
  657. switch (chr)
  658. {
  659. case '`':
  660. p = scm_sym_quasiquote;
  661. break;
  662. case '\'':
  663. p = scm_sym_quote;
  664. break;
  665. case ',':
  666. {
  667. scm_t_wchar c;
  668. c = scm_getc (port);
  669. if ('@' == c)
  670. p = scm_sym_uq_splicing;
  671. else
  672. {
  673. scm_ungetc (c, port);
  674. p = scm_sym_unquote;
  675. }
  676. break;
  677. }
  678. default:
  679. fprintf (stderr, "%s: unhandled quote character (%i)\n",
  680. "scm_read_quote", chr);
  681. abort ();
  682. }
  683. p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
  684. if (SCM_RECORD_POSITIONS_P)
  685. scm_hashq_set_x (scm_source_whash, p,
  686. scm_make_srcprops (line, column,
  687. SCM_FILENAME (port),
  688. SCM_COPY_SOURCE_P
  689. ? (scm_cons2 (SCM_CAR (p),
  690. SCM_CAR (SCM_CDR (p)),
  691. SCM_EOL))
  692. : SCM_UNDEFINED,
  693. SCM_EOL));
  694. return p;
  695. }
  696. SCM_SYMBOL (sym_syntax, "syntax");
  697. SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
  698. SCM_SYMBOL (sym_unsyntax, "unsyntax");
  699. SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
  700. static SCM
  701. scm_read_syntax (int chr, SCM port)
  702. {
  703. SCM p;
  704. long line = SCM_LINUM (port);
  705. int column = SCM_COL (port) - 1;
  706. switch (chr)
  707. {
  708. case '`':
  709. p = sym_quasisyntax;
  710. break;
  711. case '\'':
  712. p = sym_syntax;
  713. break;
  714. case ',':
  715. {
  716. int c;
  717. c = scm_getc (port);
  718. if ('@' == c)
  719. p = sym_unsyntax_splicing;
  720. else
  721. {
  722. scm_ungetc (c, port);
  723. p = sym_unsyntax;
  724. }
  725. break;
  726. }
  727. default:
  728. fprintf (stderr, "%s: unhandled syntax character (%i)\n",
  729. "scm_read_syntax", chr);
  730. abort ();
  731. }
  732. p = scm_cons2 (p, scm_read_expression (port), SCM_EOL);
  733. if (SCM_RECORD_POSITIONS_P)
  734. scm_hashq_set_x (scm_source_whash, p,
  735. scm_make_srcprops (line, column,
  736. SCM_FILENAME (port),
  737. SCM_COPY_SOURCE_P
  738. ? (scm_cons2 (SCM_CAR (p),
  739. SCM_CAR (SCM_CDR (p)),
  740. SCM_EOL))
  741. : SCM_UNDEFINED,
  742. SCM_EOL));
  743. return p;
  744. }
  745. static inline SCM
  746. scm_read_nil (int chr, SCM port)
  747. {
  748. SCM id = scm_read_mixed_case_symbol (chr, port);
  749. if (!scm_is_eq (id, sym_nil))
  750. scm_i_input_error ("scm_read_nil", port,
  751. "unexpected input while reading #nil: ~a",
  752. scm_list_1 (id));
  753. return SCM_ELISP_NIL;
  754. }
  755. static inline SCM
  756. scm_read_semicolon_comment (int chr, SCM port)
  757. {
  758. int c;
  759. /* We use the get_byte here because there is no need to get the
  760. locale correct with comment input. This presumes that newline
  761. always represents itself no matter what the encoding is. */
  762. for (c = scm_get_byte_or_eof (port);
  763. (c != EOF) && (c != '\n');
  764. c = scm_get_byte_or_eof (port));
  765. return SCM_UNSPECIFIED;
  766. }
  767. /* Sharp readers, i.e. readers called after a `#' sign has been read. */
  768. static SCM
  769. scm_read_boolean (int chr, SCM port)
  770. {
  771. switch (chr)
  772. {
  773. case 't':
  774. case 'T':
  775. return SCM_BOOL_T;
  776. case 'f':
  777. case 'F':
  778. return SCM_BOOL_F;
  779. }
  780. return SCM_UNSPECIFIED;
  781. }
  782. static SCM
  783. scm_read_character (scm_t_wchar chr, SCM port)
  784. #define FUNC_NAME "scm_lreadr"
  785. {
  786. char buffer[READER_CHAR_NAME_MAX_SIZE];
  787. SCM charname;
  788. size_t charname_len, bytes_read;
  789. scm_t_wchar cp;
  790. int overflow;
  791. scm_t_port *pt;
  792. overflow = read_token (port, buffer, READER_CHAR_NAME_MAX_SIZE, &bytes_read);
  793. if (overflow)
  794. scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
  795. if (bytes_read == 0)
  796. {
  797. chr = scm_getc (port);
  798. if (chr == EOF)
  799. scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
  800. "while reading character", SCM_EOL);
  801. /* CHR must be a token delimiter, like a whitespace. */
  802. return (SCM_MAKE_CHAR (chr));
  803. }
  804. pt = SCM_PTAB_ENTRY (port);
  805. /* Simple ASCII characters can be processed immediately. Also, simple
  806. ISO-8859-1 characters can be processed immediately if the encoding for this
  807. port is ISO-8859-1. */
  808. if (bytes_read == 1 && ((unsigned char) buffer[0] <= 127 || pt->encoding == NULL))
  809. {
  810. SCM_COL (port) += 1;
  811. return SCM_MAKE_CHAR (buffer[0]);
  812. }
  813. /* Otherwise, convert the buffer into a proper scheme string for
  814. processing. */
  815. charname = scm_from_stringn (buffer, bytes_read, pt->encoding,
  816. pt->ilseq_handler);
  817. charname_len = scm_i_string_length (charname);
  818. SCM_COL (port) += charname_len;
  819. cp = scm_i_string_ref (charname, 0);
  820. if (charname_len == 1)
  821. return SCM_MAKE_CHAR (cp);
  822. /* Ignore dotted circles, which may be used to keep combining characters from
  823. combining with the backslash in #\charname. */
  824. if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
  825. return SCM_MAKE_CHAR (scm_i_string_ref (charname, 1));
  826. if (cp >= '0' && cp < '8')
  827. {
  828. /* Dirk:FIXME:: This type of character syntax is not R5RS
  829. * compliant. Further, it should be verified that the constant
  830. * does only consist of octal digits. */
  831. SCM p = scm_string_to_number (charname, scm_from_uint (8));
  832. if (SCM_I_INUMP (p))
  833. {
  834. scm_t_wchar c = scm_to_uint32 (p);
  835. if (SCM_IS_UNICODE_CHAR (c))
  836. return SCM_MAKE_CHAR (c);
  837. else
  838. scm_i_input_error (FUNC_NAME, port,
  839. "out-of-range octal character escape: ~a",
  840. scm_list_1 (charname));
  841. }
  842. }
  843. if (cp == 'x' && (charname_len > 1))
  844. {
  845. SCM p;
  846. /* Convert from hex, skipping the initial 'x' character in CHARNAME */
  847. p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
  848. scm_from_uint (16));
  849. if (SCM_I_INUMP (p))
  850. {
  851. scm_t_wchar c = scm_to_uint32 (p);
  852. if (SCM_IS_UNICODE_CHAR (c))
  853. return SCM_MAKE_CHAR (c);
  854. else
  855. scm_i_input_error (FUNC_NAME, port,
  856. "out-of-range hex character escape: ~a",
  857. scm_list_1 (charname));
  858. }
  859. }
  860. /* The names of characters should never have non-Latin1
  861. characters. */
  862. if (scm_i_is_narrow_string (charname)
  863. || scm_i_try_narrow_string (charname))
  864. { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
  865. charname_len);
  866. if (scm_is_true (ch))
  867. return ch;
  868. }
  869. scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
  870. scm_list_1 (charname));
  871. return SCM_UNSPECIFIED;
  872. }
  873. #undef FUNC_NAME
  874. static inline SCM
  875. scm_read_keyword (int chr, SCM port)
  876. {
  877. SCM symbol;
  878. /* Read the symbol that comprises the keyword. Doing this instead of
  879. invoking a specific symbol reader function allows `scm_read_keyword ()'
  880. to adapt to the delimiters currently valid of symbols.
  881. XXX: This implementation allows sloppy syntaxes like `#: key'. */
  882. symbol = scm_read_expression (port);
  883. if (!scm_is_symbol (symbol))
  884. scm_i_input_error ("scm_read_keyword", port,
  885. "keyword prefix `~a' not followed by a symbol: ~s",
  886. scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
  887. return (scm_symbol_to_keyword (symbol));
  888. }
  889. static inline SCM
  890. scm_read_vector (int chr, SCM port)
  891. {
  892. /* Note: We call `scm_read_sexp ()' rather than READER here in order to
  893. guarantee that it's going to do what we want. After all, this is an
  894. implementation detail of `scm_read_vector ()', not a desirable
  895. property. */
  896. return (scm_vector (scm_read_sexp (chr, port)));
  897. }
  898. static inline SCM
  899. scm_read_srfi4_vector (int chr, SCM port)
  900. {
  901. return scm_i_read_array (port, chr);
  902. }
  903. static SCM
  904. scm_read_bytevector (scm_t_wchar chr, SCM port)
  905. {
  906. chr = scm_getc (port);
  907. if (chr != 'u')
  908. goto syntax;
  909. chr = scm_getc (port);
  910. if (chr != '8')
  911. goto syntax;
  912. chr = scm_getc (port);
  913. if (chr != '(')
  914. goto syntax;
  915. return scm_u8_list_to_bytevector (scm_read_sexp (chr, port));
  916. syntax:
  917. scm_i_input_error ("read_bytevector", port,
  918. "invalid bytevector prefix",
  919. SCM_MAKE_CHAR (chr));
  920. return SCM_UNSPECIFIED;
  921. }
  922. static SCM
  923. scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
  924. {
  925. /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
  926. terribly inefficient but who cares? */
  927. SCM s_bits = SCM_EOL;
  928. for (chr = scm_getc (port);
  929. (chr != EOF) && ((chr == '0') || (chr == '1'));
  930. chr = scm_getc (port))
  931. {
  932. s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
  933. }
  934. if (chr != EOF)
  935. scm_ungetc (chr, port);
  936. return scm_bitvector (scm_reverse_x (s_bits, SCM_EOL));
  937. }
  938. static inline SCM
  939. scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
  940. {
  941. int bang_seen = 0;
  942. for (;;)
  943. {
  944. int c = scm_getc (port);
  945. if (c == EOF)
  946. scm_i_input_error ("skip_block_comment", port,
  947. "unterminated `#! ... !#' comment", SCM_EOL);
  948. if (c == '!')
  949. bang_seen = 1;
  950. else if (c == '#' && bang_seen)
  951. break;
  952. else
  953. bang_seen = 0;
  954. }
  955. return SCM_UNSPECIFIED;
  956. }
  957. static SCM
  958. scm_read_shebang (scm_t_wchar chr, SCM port)
  959. {
  960. int c = 0;
  961. if ((c = scm_get_byte_or_eof (port)) != 'r')
  962. {
  963. scm_ungetc (c, port);
  964. return scm_read_scsh_block_comment (chr, port);
  965. }
  966. if ((c = scm_get_byte_or_eof (port)) != '6')
  967. {
  968. scm_ungetc (c, port);
  969. scm_ungetc ('r', port);
  970. return scm_read_scsh_block_comment (chr, port);
  971. }
  972. if ((c = scm_get_byte_or_eof (port)) != 'r')
  973. {
  974. scm_ungetc (c, port);
  975. scm_ungetc ('6', port);
  976. scm_ungetc ('r', port);
  977. return scm_read_scsh_block_comment (chr, port);
  978. }
  979. if ((c = scm_get_byte_or_eof (port)) != 's')
  980. {
  981. scm_ungetc (c, port);
  982. scm_ungetc ('r', port);
  983. scm_ungetc ('6', port);
  984. scm_ungetc ('r', port);
  985. return scm_read_scsh_block_comment (chr, port);
  986. }
  987. return SCM_UNSPECIFIED;
  988. }
  989. static SCM
  990. scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
  991. {
  992. /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
  993. nested. So care must be taken. */
  994. int nesting_level = 1;
  995. int opening_seen = 0, closing_seen = 0;
  996. while (nesting_level > 0)
  997. {
  998. int c = scm_getc (port);
  999. if (c == EOF)
  1000. scm_i_input_error ("scm_read_r6rs_block_comment", port,
  1001. "unterminated `#| ... |#' comment", SCM_EOL);
  1002. if (opening_seen)
  1003. {
  1004. if (c == '|')
  1005. nesting_level++;
  1006. opening_seen = 0;
  1007. }
  1008. else if (closing_seen)
  1009. {
  1010. if (c == '#')
  1011. nesting_level--;
  1012. closing_seen = 0;
  1013. }
  1014. else if (c == '|')
  1015. closing_seen = 1;
  1016. else if (c == '#')
  1017. opening_seen = 1;
  1018. else
  1019. opening_seen = closing_seen = 0;
  1020. }
  1021. return SCM_UNSPECIFIED;
  1022. }
  1023. static SCM
  1024. scm_read_commented_expression (scm_t_wchar chr, SCM port)
  1025. {
  1026. scm_t_wchar c;
  1027. c = flush_ws (port, (char *) NULL);
  1028. if (EOF == c)
  1029. scm_i_input_error ("read_commented_expression", port,
  1030. "no expression after #; comment", SCM_EOL);
  1031. scm_ungetc (c, port);
  1032. scm_read_expression (port);
  1033. return SCM_UNSPECIFIED;
  1034. }
  1035. static SCM
  1036. scm_read_extended_symbol (scm_t_wchar chr, SCM port)
  1037. {
  1038. /* Guile's extended symbol read syntax looks like this:
  1039. #{This is all a symbol name}#
  1040. So here, CHR is expected to be `{'. */
  1041. int saw_brace = 0;
  1042. size_t len = 0;
  1043. SCM buf = scm_i_make_string (1024, NULL, 0);
  1044. buf = scm_i_string_start_writing (buf);
  1045. while ((chr = scm_getc (port)) != EOF)
  1046. {
  1047. if (saw_brace)
  1048. {
  1049. if (chr == '#')
  1050. {
  1051. break;
  1052. }
  1053. else
  1054. {
  1055. saw_brace = 0;
  1056. scm_i_string_set_x (buf, len++, '}');
  1057. }
  1058. }
  1059. if (chr == '}')
  1060. saw_brace = 1;
  1061. else if (chr == '\\')
  1062. {
  1063. /* It used to be that print.c would print extended-read-syntax
  1064. symbols with backslashes before "non-standard" chars, but
  1065. this routine wouldn't do anything with those escapes.
  1066. Bummer. What we've done is to change print.c to output
  1067. R6RS hex escapes for those characters, relying on the fact
  1068. that the extended read syntax would never put a `\' before
  1069. an `x'. For now, we just ignore other instances of
  1070. backslash in the string. */
  1071. switch ((chr = scm_getc (port)))
  1072. {
  1073. case EOF:
  1074. goto done;
  1075. case 'x':
  1076. {
  1077. scm_t_wchar c;
  1078. SCM_READ_HEX_ESCAPE (10, ';');
  1079. scm_i_string_set_x (buf, len++, c);
  1080. break;
  1081. str_eof:
  1082. chr = EOF;
  1083. goto done;
  1084. bad_escaped:
  1085. scm_i_string_stop_writing ();
  1086. scm_i_input_error ("scm_read_extended_symbol", port,
  1087. "illegal character in escape sequence: ~S",
  1088. scm_list_1 (SCM_MAKE_CHAR (c)));
  1089. break;
  1090. }
  1091. default:
  1092. scm_i_string_set_x (buf, len++, chr);
  1093. break;
  1094. }
  1095. }
  1096. else
  1097. scm_i_string_set_x (buf, len++, chr);
  1098. if (len >= scm_i_string_length (buf) - 2)
  1099. {
  1100. SCM addy;
  1101. scm_i_string_stop_writing ();
  1102. addy = scm_i_make_string (1024, NULL, 0);
  1103. buf = scm_string_append (scm_list_2 (buf, addy));
  1104. len = 0;
  1105. buf = scm_i_string_start_writing (buf);
  1106. }
  1107. }
  1108. done:
  1109. scm_i_string_stop_writing ();
  1110. if (chr == EOF)
  1111. scm_i_input_error ("scm_read_extended_symbol", port,
  1112. "end of file while reading symbol", SCM_EOL);
  1113. return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
  1114. }
  1115. /* Top-level token readers, i.e., dispatchers. */
  1116. static SCM
  1117. scm_read_sharp_extension (int chr, SCM port)
  1118. {
  1119. SCM proc;
  1120. proc = scm_get_hash_procedure (chr);
  1121. if (scm_is_true (scm_procedure_p (proc)))
  1122. {
  1123. long line = SCM_LINUM (port);
  1124. int column = SCM_COL (port) - 2;
  1125. SCM got;
  1126. got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
  1127. if (!scm_is_eq (got, SCM_UNSPECIFIED))
  1128. {
  1129. if (SCM_RECORD_POSITIONS_P)
  1130. return (recsexpr (got, line, column,
  1131. SCM_FILENAME (port)));
  1132. else
  1133. return got;
  1134. }
  1135. }
  1136. return SCM_UNSPECIFIED;
  1137. }
  1138. /* The reader for the sharp `#' character. It basically dispatches reads
  1139. among the above token readers. */
  1140. static SCM
  1141. scm_read_sharp (scm_t_wchar chr, SCM port)
  1142. #define FUNC_NAME "scm_lreadr"
  1143. {
  1144. SCM result;
  1145. chr = scm_getc (port);
  1146. result = scm_read_sharp_extension (chr, port);
  1147. if (!scm_is_eq (result, SCM_UNSPECIFIED))
  1148. return result;
  1149. switch (chr)
  1150. {
  1151. case '\\':
  1152. return (scm_read_character (chr, port));
  1153. case '(':
  1154. return (scm_read_vector (chr, port));
  1155. case 's':
  1156. case 'u':
  1157. case 'f':
  1158. case 'c':
  1159. /* This one may return either a boolean or an SRFI-4 vector. */
  1160. return (scm_read_srfi4_vector (chr, port));
  1161. case 'v':
  1162. return (scm_read_bytevector (chr, port));
  1163. case '*':
  1164. return (scm_read_guile_bit_vector (chr, port));
  1165. case 't':
  1166. case 'T':
  1167. case 'F':
  1168. /* This one may return either a boolean or an SRFI-4 vector. */
  1169. return (scm_read_boolean (chr, port));
  1170. case ':':
  1171. return (scm_read_keyword (chr, port));
  1172. case '0': case '1': case '2': case '3': case '4':
  1173. case '5': case '6': case '7': case '8': case '9':
  1174. case '@':
  1175. return (scm_i_read_array (port, chr));
  1176. case 'i':
  1177. case 'e':
  1178. case 'b':
  1179. case 'B':
  1180. case 'o':
  1181. case 'O':
  1182. case 'd':
  1183. case 'D':
  1184. case 'x':
  1185. case 'X':
  1186. case 'I':
  1187. case 'E':
  1188. return (scm_read_number_and_radix (chr, port));
  1189. case '{':
  1190. return (scm_read_extended_symbol (chr, port));
  1191. case '!':
  1192. return (scm_read_shebang (chr, port));
  1193. case ';':
  1194. return (scm_read_commented_expression (chr, port));
  1195. case '`':
  1196. case '\'':
  1197. case ',':
  1198. return (scm_read_syntax (chr, port));
  1199. case 'n':
  1200. return (scm_read_nil (chr, port));
  1201. default:
  1202. result = scm_read_sharp_extension (chr, port);
  1203. if (scm_is_eq (result, SCM_UNSPECIFIED))
  1204. {
  1205. /* To remain compatible with 1.8 and earlier, the following
  1206. characters have lower precedence than `read-hash-extend'
  1207. characters. */
  1208. switch (chr)
  1209. {
  1210. case '|':
  1211. return scm_read_r6rs_block_comment (chr, port);
  1212. default:
  1213. scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
  1214. scm_list_1 (SCM_MAKE_CHAR (chr)));
  1215. }
  1216. }
  1217. else
  1218. return result;
  1219. }
  1220. return SCM_UNSPECIFIED;
  1221. }
  1222. #undef FUNC_NAME
  1223. static SCM
  1224. scm_read_expression (SCM port)
  1225. #define FUNC_NAME "scm_read_expression"
  1226. {
  1227. while (1)
  1228. {
  1229. register scm_t_wchar chr;
  1230. chr = scm_getc (port);
  1231. switch (chr)
  1232. {
  1233. case SCM_WHITE_SPACES:
  1234. case SCM_LINE_INCREMENTORS:
  1235. break;
  1236. case ';':
  1237. (void) scm_read_semicolon_comment (chr, port);
  1238. break;
  1239. case '[':
  1240. if (!SCM_SQUARE_BRACKETS_P)
  1241. return (scm_read_mixed_case_symbol (chr, port));
  1242. /* otherwise fall through */
  1243. case '(':
  1244. return (scm_read_sexp (chr, port));
  1245. case '"':
  1246. return (scm_read_string (chr, port));
  1247. case '\'':
  1248. case '`':
  1249. case ',':
  1250. return (scm_read_quote (chr, port));
  1251. case '#':
  1252. {
  1253. SCM result;
  1254. result = scm_read_sharp (chr, port);
  1255. if (scm_is_eq (result, SCM_UNSPECIFIED))
  1256. /* We read a comment or some such. */
  1257. break;
  1258. else
  1259. return result;
  1260. }
  1261. case ')':
  1262. scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
  1263. break;
  1264. case ']':
  1265. if (SCM_SQUARE_BRACKETS_P)
  1266. scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
  1267. /* otherwise fall through */
  1268. case EOF:
  1269. return SCM_EOF_VAL;
  1270. case ':':
  1271. if (scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_prefix))
  1272. return scm_symbol_to_keyword (scm_read_expression (port));
  1273. /* Fall through. */
  1274. default:
  1275. {
  1276. if (((chr >= '0') && (chr <= '9'))
  1277. || (strchr ("+-.", chr)))
  1278. return (scm_read_number (chr, port));
  1279. else
  1280. return (scm_read_mixed_case_symbol (chr, port));
  1281. }
  1282. }
  1283. }
  1284. }
  1285. #undef FUNC_NAME
  1286. /* Actual reader. */
  1287. SCM_DEFINE (scm_read, "read", 0, 1, 0,
  1288. (SCM port),
  1289. "Read an s-expression from the input port @var{port}, or from\n"
  1290. "the current input port if @var{port} is not specified.\n"
  1291. "Any whitespace before the next token is discarded.")
  1292. #define FUNC_NAME s_scm_read
  1293. {
  1294. int c;
  1295. if (SCM_UNBNDP (port))
  1296. port = scm_current_input_port ();
  1297. SCM_VALIDATE_OPINPORT (1, port);
  1298. c = flush_ws (port, (char *) NULL);
  1299. if (EOF == c)
  1300. return SCM_EOF_VAL;
  1301. scm_ungetc (c, port);
  1302. return (scm_read_expression (port));
  1303. }
  1304. #undef FUNC_NAME
  1305. /* Used when recording expressions constructed by `scm_read_sharp ()'. */
  1306. static SCM
  1307. recsexpr (SCM obj, long line, int column, SCM filename)
  1308. {
  1309. if (!scm_is_pair(obj)) {
  1310. return obj;
  1311. } else {
  1312. SCM tmp, copy;
  1313. /* If this sexpr is visible in the read:sharp source, we want to
  1314. keep that information, so only record non-constant cons cells
  1315. which haven't previously been read by the reader. */
  1316. if (scm_is_false (scm_hashq_ref (scm_source_whash, obj, SCM_BOOL_F)))
  1317. {
  1318. if (SCM_COPY_SOURCE_P)
  1319. {
  1320. copy = scm_cons (recsexpr (SCM_CAR (obj), line, column, filename),
  1321. SCM_UNDEFINED);
  1322. for (tmp = obj; scm_is_pair (tmp); tmp = SCM_CDR (tmp))
  1323. {
  1324. SCM_SETCDR (copy, scm_cons (recsexpr (SCM_CAR (tmp),
  1325. line,
  1326. column,
  1327. filename),
  1328. SCM_UNDEFINED));
  1329. copy = SCM_CDR (copy);
  1330. }
  1331. SCM_SETCDR (copy, tmp);
  1332. }
  1333. else
  1334. {
  1335. recsexpr (SCM_CAR (obj), line, column, filename);
  1336. for (tmp = obj; scm_is_pair (tmp); tmp = SCM_CDR (tmp))
  1337. recsexpr (SCM_CAR (tmp), line, column, filename);
  1338. copy = SCM_UNDEFINED;
  1339. }
  1340. scm_hashq_set_x (scm_source_whash,
  1341. obj,
  1342. scm_make_srcprops (line,
  1343. column,
  1344. filename,
  1345. copy,
  1346. SCM_EOL));
  1347. }
  1348. return obj;
  1349. }
  1350. }
  1351. /* Manipulate the read-hash-procedures alist. This could be written in
  1352. Scheme, but maybe it will also be used by C code during initialisation. */
  1353. SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
  1354. (SCM chr, SCM proc),
  1355. "Install the procedure @var{proc} for reading expressions\n"
  1356. "starting with the character sequence @code{#} and @var{chr}.\n"
  1357. "@var{proc} will be called with two arguments: the character\n"
  1358. "@var{chr} and the port to read further data from. The object\n"
  1359. "returned will be the return value of @code{read}. \n"
  1360. "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
  1361. )
  1362. #define FUNC_NAME s_scm_read_hash_extend
  1363. {
  1364. SCM this;
  1365. SCM prev;
  1366. SCM_VALIDATE_CHAR (1, chr);
  1367. SCM_ASSERT (scm_is_false (proc)
  1368. || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
  1369. proc, SCM_ARG2, FUNC_NAME);
  1370. /* Check if chr is already in the alist. */
  1371. this = scm_i_read_hash_procedures_ref ();
  1372. prev = SCM_BOOL_F;
  1373. while (1)
  1374. {
  1375. if (scm_is_null (this))
  1376. {
  1377. /* not found, so add it to the beginning. */
  1378. if (scm_is_true (proc))
  1379. {
  1380. SCM new = scm_cons (scm_cons (chr, proc),
  1381. scm_i_read_hash_procedures_ref ());
  1382. scm_i_read_hash_procedures_set_x (new);
  1383. }
  1384. break;
  1385. }
  1386. if (scm_is_eq (chr, SCM_CAAR (this)))
  1387. {
  1388. /* already in the alist. */
  1389. if (scm_is_false (proc))
  1390. {
  1391. /* remove it. */
  1392. if (scm_is_false (prev))
  1393. {
  1394. SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
  1395. scm_i_read_hash_procedures_set_x (rest);
  1396. }
  1397. else
  1398. scm_set_cdr_x (prev, SCM_CDR (this));
  1399. }
  1400. else
  1401. {
  1402. /* replace it. */
  1403. scm_set_cdr_x (SCM_CAR (this), proc);
  1404. }
  1405. break;
  1406. }
  1407. prev = this;
  1408. this = SCM_CDR (this);
  1409. }
  1410. return SCM_UNSPECIFIED;
  1411. }
  1412. #undef FUNC_NAME
  1413. /* Recover the read-hash procedure corresponding to char c. */
  1414. static SCM
  1415. scm_get_hash_procedure (int c)
  1416. {
  1417. SCM rest = scm_i_read_hash_procedures_ref ();
  1418. while (1)
  1419. {
  1420. if (scm_is_null (rest))
  1421. return SCM_BOOL_F;
  1422. if (SCM_CHAR (SCM_CAAR (rest)) == c)
  1423. return SCM_CDAR (rest);
  1424. rest = SCM_CDR (rest);
  1425. }
  1426. }
  1427. #define SCM_ENCODING_SEARCH_SIZE (500)
  1428. /* Search the first few hundred characters of a file for an Emacs-like coding
  1429. declaration. Returns either NULL or a string whose storage has been
  1430. allocated with `scm_gc_malloc ()'. */
  1431. char *
  1432. scm_i_scan_for_encoding (SCM port)
  1433. {
  1434. scm_t_port *pt;
  1435. char header[SCM_ENCODING_SEARCH_SIZE+1];
  1436. size_t bytes_read, encoding_length, i;
  1437. char *encoding = NULL;
  1438. int utf8_bom = 0;
  1439. char *pos, *encoding_start;
  1440. int in_comment;
  1441. pt = SCM_PTAB_ENTRY (port);
  1442. if (pt->rw_active == SCM_PORT_WRITE)
  1443. scm_flush (port);
  1444. if (pt->rw_random)
  1445. pt->rw_active = SCM_PORT_READ;
  1446. if (pt->read_pos == pt->read_end)
  1447. {
  1448. /* We can use the read buffer, and thus avoid a seek. */
  1449. if (scm_fill_input (port) == EOF)
  1450. return NULL;
  1451. bytes_read = pt->read_end - pt->read_pos;
  1452. if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
  1453. bytes_read = SCM_ENCODING_SEARCH_SIZE;
  1454. if (bytes_read <= 1)
  1455. /* An unbuffered port -- don't scan. */
  1456. return NULL;
  1457. memcpy (header, pt->read_pos, bytes_read);
  1458. header[bytes_read] = '\0';
  1459. }
  1460. else
  1461. {
  1462. /* Try to read some bytes and then seek back. Not all ports
  1463. support seeking back; and indeed some file ports (like
  1464. /dev/urandom) will succeed on an lseek (fd, 0, SEEK_CUR)---the
  1465. check performed by SCM_FPORT_FDES---but fail to seek
  1466. backwards. Hence this block comes second. We prefer to use
  1467. the read buffer in-place. */
  1468. if (SCM_FPORTP (port) && !SCM_FDES_RANDOM_P (SCM_FPORT_FDES (port)))
  1469. return NULL;
  1470. bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
  1471. header[bytes_read] = '\0';
  1472. scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
  1473. }
  1474. if (bytes_read > 3
  1475. && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
  1476. utf8_bom = 1;
  1477. /* search past "coding[:=]" */
  1478. pos = header;
  1479. while (1)
  1480. {
  1481. if ((pos = strstr(pos, "coding")) == NULL)
  1482. return NULL;
  1483. pos += strlen("coding");
  1484. if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
  1485. (*pos == ':' || *pos == '='))
  1486. {
  1487. pos ++;
  1488. break;
  1489. }
  1490. }
  1491. /* skip spaces */
  1492. while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
  1493. (*pos == ' ' || *pos == '\t'))
  1494. pos ++;
  1495. /* grab the next token */
  1496. encoding_start = pos;
  1497. i = 0;
  1498. while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
  1499. && encoding_start + i - header < bytes_read
  1500. && (isalnum ((int) encoding_start[i])
  1501. || strchr ("_-.:/,+=()", encoding_start[i]) != NULL))
  1502. i++;
  1503. encoding_length = i;
  1504. if (encoding_length == 0)
  1505. return NULL;
  1506. encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
  1507. for (i = 0; i < encoding_length; i++)
  1508. encoding[i] = toupper ((int) encoding[i]);
  1509. /* push backwards to make sure we were in a comment */
  1510. in_comment = 0;
  1511. pos = encoding_start;
  1512. while (pos >= header)
  1513. {
  1514. if (*pos == ';')
  1515. {
  1516. in_comment = 1;
  1517. break;
  1518. }
  1519. else if (*pos == '\n' || pos == header)
  1520. {
  1521. /* This wasn't in a semicolon comment. Check for a
  1522. hash-bang comment. */
  1523. char *beg = strstr (header, "#!");
  1524. char *end = strstr (header, "!#");
  1525. if (beg < encoding_start && encoding_start + encoding_length <= end)
  1526. in_comment = 1;
  1527. break;
  1528. }
  1529. else
  1530. {
  1531. pos --;
  1532. continue;
  1533. }
  1534. }
  1535. if (!in_comment)
  1536. /* This wasn't in a comment */
  1537. return NULL;
  1538. if (utf8_bom && strcmp(encoding, "UTF-8"))
  1539. scm_misc_error (NULL,
  1540. "the port input declares the encoding ~s but is encoded as UTF-8",
  1541. scm_list_1 (scm_from_locale_string (encoding)));
  1542. return encoding;
  1543. }
  1544. SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
  1545. (SCM port),
  1546. "Scans the port for an Emacs-like character coding declaration\n"
  1547. "near the top of the contents of a port with random-accessible contents.\n"
  1548. "The coding declaration is of the form\n"
  1549. "@code{coding: XXXXX} and must appear in a scheme comment.\n"
  1550. "\n"
  1551. "Returns a string containing the character encoding of the file\n"
  1552. "if a declaration was found, or @code{#f} otherwise.\n")
  1553. #define FUNC_NAME s_scm_file_encoding
  1554. {
  1555. char *enc;
  1556. SCM s_enc;
  1557. SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
  1558. enc = scm_i_scan_for_encoding (port);
  1559. if (enc == NULL)
  1560. return SCM_BOOL_F;
  1561. else
  1562. {
  1563. s_enc = scm_from_locale_string (enc);
  1564. return s_enc;
  1565. }
  1566. return SCM_BOOL_F;
  1567. }
  1568. #undef FUNC_NAME
  1569. void
  1570. scm_init_read ()
  1571. {
  1572. SCM read_hash_procs;
  1573. read_hash_procs = scm_make_fluid ();
  1574. scm_fluid_set_x (read_hash_procs, SCM_EOL);
  1575. scm_i_read_hash_procedures =
  1576. SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
  1577. scm_init_opts (scm_read_options, scm_read_opts);
  1578. #include "libguile/read.x"
  1579. }
  1580. /*
  1581. Local Variables:
  1582. c-file-style: "gnu"
  1583. End:
  1584. */