asc2cld.py

#!/usr/bin/python3
#
# MSX BASIC tokenizer
#
# Copyright © 2020 Pedro Gimeno Fortea
#
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

import sys, re, io, struct

assert sys.hexversion >= 0x3050300, "Invalid Python version, must be 3.5.3+"
tokens1MSX = [
    "", "END", "FOR", "NEXT", "DATA", "INPUT", "DIM", "READ", "LET",
    "GOTO", "RUN", "IF", "RESTORE", "GOSUB", "RETURN", "REM", "STOP",
    "PRINT", "CLEAR", "LIST", "NEW", "ON", "WAIT", "DEF", "POKE", "CONT",
    "CSAVE", "CLOAD", "OUT", "LPRINT", "LLIST", "CLS", "WIDTH", "ELSE",
    "TRON", "TROFF", "SWAP", "ERASE", "ERROR", "RESUME", "DELETE",
    "AUTO", "RENUM", "DEFSTR", "DEFINT", "DEFSNG", "DEFDBL", "LINE",
    "OPEN", "FIELD", "GET", "PUT", "CLOSE", "LOAD", "MERGE", "FILES",
    "LSET", "RSET", "SAVE", "LFILES", "CIRCLE", "COLOR", "DRAW", "PAINT",
    "BEEP", "PLAY", "PSET", "PRESET", "SOUND", "SCREEN", "VPOKE",
    "SPRITE", "VDP", "BASE", "CALL", "TIME", "KEY", "MAX", "MOTOR",
    "BLOAD", "BSAVE", "DSKO$", "SET", "NAME", "KILL", "IPL", "COPY", "CMD",
    "LOCATE", "TO", "THEN", "TAB(", "STEP", "USR", "FN", "SPC(", "NOT",
    "ERL", "ERR", "STRING$", "USING", "INSTR", "'", "VARPTR", "CSRLIN",
    "ATTR$", "DSKI$", "OFF", "INKEY$", "POINT", ">", "=", "<", "+", "-", "*",
    "/", "^", "AND", "OR", "XOR", "EQV", "IMP", "MOD", "\\", "", "", "",
]
tokens2 = [
    "", "LEFT$", "RIGHT$", "MID$", "SGN", "INT", "ABS", "SQR", "RND", "SIN",
    "LOG", "EXP", "COS", "TAN", "ATN", "FRE", "INP", "POS", "LEN", "STR$",
    "VAL", "ASC", "CHR$", "PEEK", "VPEEK", "SPACE$", "OCT$", "HEX$",
    "LPOS", "BIN$", "CINT", "CSNG", "CDBL", "FIX", "STICK", "STRIG", "PDL",
    "PAD", "DSKF", "FPOS", "CVI", "CVS", "CVD", "EOF", "LOC", "LOF", "MKI$",
    "MKS$", "MKD$", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
]
trans = (
    '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F'
    '\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
    ' !"#$%&\'()*+,-./0123456789:;<=>?'
    '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_'
    '`abcdefghijklmnopqrstuvwxyz{|}~\x7F'
    'ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒ'
    'áíóúñѪº¿⌐¬½¼¡«»ÃãĨĩÕõŨũIJij¾∽◊‰¶§'
    '▂▚▆🮂▬🮅▎▞▊🮇🮊🮘🮙🭭🭯🭬🭮🮚🮛▘▗▝▖🮖Δ‡ω█▄▌▐▀'
    'αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²■█'
)

# Token parsing modes, affects how integers are parsed
ModeFloat = 0
ModeUint = 1
ModeLiteral = 2

# Create charset translation tables from the above
msx_s2b = {}
msx_b2s = {}
for k, v in enumerate(trans):
    b = bytes((k,))
    msx_s2b[v] = b
    msx_b2s[b] = v

def encode(s):
    return b''.join([msx_s2b[c] for c in s])
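
# Illustrative note (not in the original source): trans maps every byte value
# 0x00-0xFF to the glyph that byte displays in the MSX character set, so
# msx_s2b/msx_b2s form a 1:1 str<->bytes mapping. The ASCII range maps to
# itself:
#   encode('PRINT "HI"') == b'PRINT "HI"'
# while extended glyphs map back to their byte values, e.g.
#   encode('Ç') == b'\x80'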

# Exception for the BASIC errors
class BasicError(Exception):
    pass

def num2bytes(s, tok_mode):
    """Translate a decimal number string to floating point"""
    assert tok_mode != ModeLiteral
    assert b'e' not in s, "Lower-case exponent in number"
    # Remove embedded spaces and leading zeros
    s = s.replace(b' ', b'')
    assert s
    s = s.lstrip(b'0')
    if s == b'':
        s = b'0'  # don't strip the last character
    # Unsigned Int is simple
    if tok_mode == ModeUint:
        return struct.pack('<BH', 0x0E, int(s))
    # Find suffix
    suffix = s[-1:]
    if suffix in {b'%', b'!', b'#'}:
        s = s[:-1]
    else:
        suffix = b''
    # Remove point and exponent after taking note of their position and value
    point = s.find(b'.')
    s = s.replace(b'.', b'')
    ep = s.upper().find(b'E')  # exponent pointer
    exp = 0  # exponent
    has_exp = ep != -1
    if not has_exp:
        ep = len(s)
    else:
        tmp = s[ep + 1:]
        s = s[:ep]
        if tmp not in {b'', b'+', b'-'}:
            exp = int(tmp, 10)
    has_point = point != -1
    if point == -1:
        point = ep
    exp += point + 0x40  # apply bias
    # Remove zeros after the decimal point and subtract the count from exp
    orig_len = len(s)
    s = s.lstrip(b'0')
    exp -= orig_len - len(s)
    point -= orig_len - len(s)
    # Pad with zeros to the right
    s += b'0' * (14 - len(s))
    # Calculate the integer value for the case of % or no point/exp
    nint = 0
    if point > 0:
        # Perform rounding of the 14th decimal
        nint = int(s[:point])
        if point <= 14 and (s[point:14] == b'9' * (14 - point)
                            and s[14:15] >= b'5'):
            nint += 1
    if suffix == b'%' or (not has_point and not has_exp and suffix == b''
                          and 0 <= nint <= 32767):
        if not (0 <= nint <= 32767):
            raise BasicError("Overflow")
        if nint < 10:
            return struct.pack('<B', 0x11 + nint)
        if nint < 256:
            return struct.pack('<BB', 0x0F, nint)
        return struct.pack('<Bh', 0x1C, nint)
    n = int(s[:14])
    if suffix == b'!' or (n % 100000000 == 0 and suffix != b'#'):
        # Handle as single-precision
        n = (n + 50000000) // 100000000
        if n > 999999:
            # Replicate a bug in MS's tokenizer where 9.999995! is
            # tokenized as 1! (uncomment to fix the bug)
            #exp += 1
            n //= 10
        assert 0 <= n <= 999999
        if n == 0: exp = 0
        if exp > 127 or exp < 0:
            raise BasicError("Overflow")
        return struct.pack('>BL', 0x1D, int(str(n), 16) + exp * 0x1000000)
    if s[14:15] >= b'5':
        n += 1
        if n > 99999999999999:
            exp += 1
            n //= 10
    assert 0 <= n <= 99999999999999
    if n == 0: exp = 0
    if exp > 127 or exp < 0:
        raise BasicError("Overflow")
    return struct.pack('>BLL', 0x1F,
                       int(str(n // 100000000), 16) + exp * 0x1000000,
                       int(str(n % 100000000), 16))
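
# Worked examples (illustrative, not in the original source; the byte values
# follow directly from the packing above):
#   num2bytes(b'10', ModeUint)    -> 0E 0A 00        (line-number operand)
#   num2bytes(b'5', ModeFloat)    -> 16              (small int 0..9: 0x11+n)
#   num2bytes(b'200', ModeFloat)  -> 0F C8           (byte-sized int)
#   num2bytes(b'1000', ModeFloat) -> 1C E8 03        (16-bit int)
#   num2bytes(b'1!', ModeFloat)   -> 1D 41 10 00 00  (single precision:
#                                    biased exponent 0x41, BCD digits 100000)
#   num2bytes(b'1#', ModeFloat)   -> 1F 41 10 00 00 00 00 00 00
#                                    (double precision: 14 BCD digits)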

def tokenize(src, use_cr=False, type_mode=False, remove_spaces=False):
    """Options:
    type_mode: True if it should behave like typing in a program; False if
        it should behave like LOAD. When True:
        - Spaces at EOL are removed.
        - Entering line numbers >= 65530 raises Syntax Error.
    use_cr: True to split lines at CR and ignore LF (like the real thing),
        False to split lines at LF and ignore CR (multiplatform).
    remove_spaces: True to "minify" the lines by removing all spaces between
        tokens.
    """
    src = encode(src)
    LeadingASCII = re.compile(b'^[A-Z]+')
    tok_translate = {}
    for k, v in enumerate(tokens1MSX):
        v = encode(v)
        assert k < 128 and v not in tok_translate
        if v != b'':
            tok_translate[v] = bytes((k + 128,))
    for k, v in enumerate(tokens2):
        v = encode(v)
        assert k < 128 and v not in tok_translate
        if v != b'':
            tok_translate[v] = bytes((255, k + 128))
    # Special encoding of ' as :REM'
    tok_translate[b"'"] = b':' + tok_translate[b'REM'] + tok_translate[b"'"]
    # Special encoding of ELSE as :ELSE
    tok_translate[b'ELSE'] = b':' + tok_translate[b'ELSE']
    # ? is parsed as PRINT
    tok_translate[b'?'] = tok_translate[b'PRINT']
    # GOTO admits the spelling GO TO (single space), but not e.g. GO  TO.
    # GOSUB is not affected.
    tok_translate[b'GO TO'] = tok_translate[b'GOTO']
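    # Byte-level effect of the special cases above (illustrative note, not in
    # the original source): REM is token 0x8F and ' is token 0xE6, so a
    # comment written as 'foo tokenizes as 3A 8F E6 66 6F 6F (colon, REM, ',
    # then the literal text), and ELSE tokenizes as 3A A1 (colon, ELSE).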
    # Create list of keywords sorted by longest, escaped
    L = []
    for v in tokens1MSX + tokens2 + ['GO TO']:
        v = encode(v)
        if LeadingASCII.search(v):
            L.append(v)
    L.sort(key=len, reverse=True)  # longest match first
    # Escape for RE
    L = [re.escape(v) for v in L]
    # Tokenizer for decimal numbers (floats).
    decimal = (
        br'|(?P<dec>(?:'
        # Cautiously avoid matching 1 space alone or 1 trailing space.
        br'(?:[0-9][0-9\ ]*)?\.(?:[0-9\ ]*[0-9])?'  # at least a dot
        br'|(?:[0-9](?:[0-9\ ]*[0-9])?)'  # or at least a digit
        br')'  # not optional
        br'(?:'
        br'\ *[%!\#]'  # suffix
        br'|\ *E\ *[-+]?(?:[0-9\ ]*[0-9])?'  # or exponent, but not both
        br')?)'  # optional
    )
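    # Examples the decimal pattern accepts (illustrative, not in the original
    # source): '.5', '3.14', '1 000' (embedded spaces are allowed here and
    # stripped later by num2bytes), '2!', '10E+3'. Suffix and exponent are
    # alternatives in the pattern, so '1E5%' does not match as one number.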
    # Tokenizer for uint. Up to 6552, read up to 5 digits; from 6553 on, up
    # to 4. Used for line numbers, either leading or after GOTO/GOSUB/etc.
    uint = (br'|(?P<dec>'
            br'(?:0[0\ ]*)?'  # leading zeros prefix
            br'(?:0'  # zero
            br'|[1-5](?:\ *[0-9]){4}'  # prefix 1..5, 5 digits
            br'|6\ *[0-4](?:\ *[0-9]){3}'  # prefix 60..64, 5 digits
            br'|6\ *5\ *[0-4](?:\ *[0-9]){2}'  # prefix 650..654, 5 digits
            br'|6\ *5\ *5\ *[0-2](?:\ *[0-9])?'  # prefix 6550..6552, 5 digits
            br'|[1-9](?:\ *[0-9]){,3}'  # rest, 1 to 4 digits
            br'))'
            )
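    # Consequence of the branches above (illustrative, not in the original
    # source): the largest full match is '65529'; for '65530' only '6553'
    # matches, leaving the '0' behind. The first pass below uses that
    # leftover digit to raise "Syntax error" in typing mode.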
    tokenizer = (br'(?:'
                 br"(?P<rem>(?:REM|').*)"  # comment
                 br'|(?P<data>(?:DATA)(?:"[^"]*(?:"|$)|[^:])*)'  # data
                 br'|(?P<call>(?:CALL|_)[^:(]*)'  # call
                 br'|(?P<kw>' + br'|'.join(L) + br')'  # keywords
                 br'|[A-Z](?:\ *[0-9.])*'  # identifier
                 br'|&H(?P<hex>[0-9A-F]*)'  # hex number
                 br'|&O(?P<oct>[0-7]*)'  # octal number
                 #br'|&B(?P<bin>[01]+)'  # binary numbers don't have tokens
                 br'%s'  # decimal number
                 br'|(?P<str>"(?:[^"]*)(?:"|$))'  # string literal
                 b'|(?P<del>[\x80-\xFF])'  # remove those
                 br'|.'
                 br')'
                 )
    tokenizer_uint = re.compile(tokenizer % uint, re.I)
    tokenizer_float = re.compile(tokenizer % decimal, re.I)
    tokenizer_literal = re.compile(tokenizer % b'|(?P<dec>(?!))', re.I)
    del L, uint, decimal, LeadingASCII, tokenizer
    # Control Character Behaviour:
    # \x00: Terminates the line prematurely (ignores up to the next \x0D).
    # \x01-\x08: Skip it and next token.
    # \x09: Inserted verbatim, finish token.
    # \x0B-\x0C: Replaced by spaces.
    # \x0E-\x0F, \x11-\x19, \x1B-\x1F: Inserted verbatim.
    # \x0A: Ignored (skipped).
    # \x0D: EOL
    # \x10: Undefined effects. Two effects we got:
    #       - Hang the machine.
    #       - Delete last byte and insert current line's start address.
    # \x1A: EOF. Stops further processing.
    # \x7F: Unknown
    leadingdigit = re.compile(br'[\ \x01-\x1F\x80-\xFF]*(?:([0-9]?))')
    trunc_at_null = re.compile(br'\x00.*', re.S)
    call_strip = re.compile(br'[^0-\x7F\ (]*')
    # Truncate source at \x1A (^Z)
    src = re.sub(b'\x1A.*', b'', src, flags=re.S)
    if use_cr:
        # realistic
        src = src.split(b'\r')
        ignore = b'\n'
    else:
        # practical
        src = src.split(b'\n')
        ignore = b'\r'
    # First pass: Read the lines and tokenize them into a dict with the line
    # number as the key. Handle line deletion, overwriting, etc.
    PRGLines = {}
    for line in src:
        line = trunc_at_null.sub(b'', line.replace(ignore, b'').lstrip(b' '))
        if type_mode: line = line.rstrip(b' ')
        if line == b'':
            continue
        g = tokenizer_uint.match(line)
        # Error if not numeric
        if not g.group('dec'):
            # we can't execute BASIC statements even in typing mode
            raise BasicError("Direct statement in file")
        p = g.end()
        linenum = int(g.group().replace(b' ', b''))
        if linenum != 0:
            # The first space is not removed for line 0
            if line[p:p+1] == b' ':
                p += 1
        g = leadingdigit.match(line, p)
        if type_mode:
            # Entering e.g.
            #   65530
            # alone raises Syntax error. In LOAD mode, it enters 6553 0
            if g.group(1): raise BasicError("Syntax error")
        if not g.group(1) and g.end(0) == len(line):
            # Delete line
            if linenum not in PRGLines:
                raise BasicError("Undefined line number")
            del PRGLines[linenum]
            continue
        lbuf = io.BytesIO()  # Tokenized line buffer
        tok_mode = ModeFloat
        while True:
            nextch = line[p:p+1]
            if tok_mode == ModeUint and (nextch == b':'
                                         or b'A' <= nextch <= b'Z'
                                         or b'a' <= nextch <= b'z'):
                tok_mode = ModeFloat
            if p == len(line): break
            tokenizer = tokenizer_float
            if tok_mode == ModeUint:
                tokenizer = tokenizer_uint
            elif tok_mode == ModeLiteral:
                tokenizer = tokenizer_literal
            g = tokenizer.match(line, p)
            assert g, "No match in tokenizer for " + repr(line[p:p+1])
            match = g.group().upper()
            p = g.end()
            # Handle control chars... somewhat
            if match in {b'\x01', b'\x02', b'\x03', b'\x04', b'\x05', b'\x06',
                         b'\x07', b'\x08'}:
                # Eat a token
                g = tokenizer.match(line, p)
                p = g.end()
                continue
            if tok_mode != ModeUint and match in {b'\x0B', b'\x0C', b'\x0E',
                    b'\x0F', b'\x10', b'\x11', b'\x12', b'\x13', b'\x14',
                    b'\x15', b'\x16', b'\x17', b'\x18', b'\x19', b'\x1B',
                    b'\x1C', b'\x1D', b'\x1E', b'\x1F'}:
                lbuf.write(b' ')
                tok_mode = ModeFloat
                continue
            # Handle token types
            if g.group('rem'):
                if g.group().startswith(b"'"):
                    lbuf.write(tok_translate[b"'"])
                    lbuf.write(g.group()[1:])
                else:
                    lbuf.write(tok_translate[b'REM'])
                    lbuf.write(g.group()[3:])
            elif g.group('data'):
                lbuf.write(tok_translate[b'DATA'])
                lbuf.write(g.group()[4:])
            elif g.group('call'):
                match = call_strip.sub(b'', match)
                if match.startswith(b'_'):
                    lbuf.write(match)
                else:  # starts with 'CALL'
                    lbuf.write(tok_translate[b'CALL'])
                    lbuf.write(match[4:])
            elif g.group('kw'):
                # keyword
                if match in {b'GOTO', b'GO TO', b'GOSUB', b'THEN', b'ELSE',
                        b'LIST', b'DELETE', b'AUTO', b'LLIST', b'RENUM',
                        b'RESTORE', b'RESUME'}:
                    tok_mode = ModeUint
                else:
                    tok_mode = ModeFloat
                lbuf.write(tok_translate[match])
            elif g.group('hex') is not None:
                # hex literal
                lbuf.write(b'\x0C')
                num = int(g.group('hex'), 16) if g.group('hex') != b'' else 0
                if num > 65535:
                    raise BasicError("Overflow")
                lbuf.write(struct.pack('<H', num))
            elif g.group('oct') is not None:
                # oct literal
                lbuf.write(b'\x0B')
                num = int(g.group('oct'), 8) if g.group('oct') != b'' else 0
                if num > 65535:
                    raise BasicError("Overflow")
                lbuf.write(struct.pack('<H', num))
            elif g.group('dec'):
                # dec literal
                lbuf.write(num2bytes(match, tok_mode))
            elif g.group('str'):
                # string
                lbuf.write(g.group())
            elif g.group('del'):
                # characters > 127 aren't written
                pass
            else:
                if len(match) == 1 and (b'A' <= match <= b'Z'
                                        or b'0' <= match <= b'9'):
                    tok_mode = ModeLiteral  # for identifiers like A1
                elif tok_mode == ModeLiteral:
                    tok_mode = ModeFloat  # revert to float mode otherwise
                if remove_spaces and match == b' ':
                    continue
                if match in tok_translate:
                    # some symbols need to be translated to tokens for some reason
                    lbuf.write(tok_translate[match])
                else:
                    lbuf.write(match)
        lbuf.write(b'\0')
        PRGLines[linenum] = lbuf.getvalue()
    # Second pass - Write remaining lines in order
    addr = 0x8001
    buf = io.BytesIO()
    for linenum in sorted(PRGLines.keys()):
        line = PRGLines[linenum]
        addr += len(line) + 4
        buf.write(struct.pack('<HH', addr, linenum))
        buf.write(line)
    buf.write(b'\0\0')
    return buf.getvalue()
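
# Resulting layout (illustrative, not in the original source): each line is
# emitted as <next-line address, little endian> <line number> <body>
# <0x00 terminator appended in the first pass>, and the program ends with a
# 0x0000 link. With the load address 0x8001 used above, the source line
#   10 PRINT "HI"
# tokenizes to the body 91 20 22 48 49 22 00 (PRINT is token 0x91), so
# tokenize() returns:
#   0C 80 0A 00 91 20 22 48 49 22 00 00 00
# where 0x800C = 0x8001 + 4 + 7 points just past this line's terminator.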

def main():
    f = open(sys.argv[1], 'r', newline='')
    try:
        lines = f.read()
    finally:
        f.close()
    out = tokenize(lines, use_cr=len(sys.argv) > 3 and sys.argv[3] == '1')
    f = open(sys.argv[2], 'wb')
    try:
        f.write(b'\xFF' + out)
    finally:
        f.close()
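
# Example invocation (illustrative, not in the original source; arguments as
# read by main() above):
#   python3 asc2cld.py PROGRAM.ASC PROGRAM.BAS 1
# reads the ASCII listing PROGRAM.ASC, splits lines at CR because the
# optional third argument is '1' (use_cr), and writes the tokenized program
# prefixed with the 0xFF byte that marks a tokenized MSX BASIC file.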

if __name__ == '__main__':
    main()