lexer.go 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032
  1. // TOML lexer.
  2. //
  3. // Written using the principles developed by Rob Pike in
  4. // http://www.youtube.com/watch?v=HxaD_trXwRE
  5. package toml
  6. import (
  7. "bytes"
  8. "errors"
  9. "fmt"
  10. "strconv"
  11. "strings"
  12. )
// Define state functions

// tomlLexStateFn is one state of the lexer state machine: it consumes input
// and returns the next state to run, or nil to stop (see run).
type tomlLexStateFn func() tomlLexStateFn
// Define lexer
type tomlLexer struct {
	inputIdx          int    // index into input of the next rune to read
	input             []rune // Textual source
	currentTokenStart int    // input index where the token being built starts
	currentTokenStop  int    // input index one past the last rune consumed into the token
	tokens            []token // tokens produced so far
	brackets          []rune  // stack of currently open '[' / '{' delimiters
	line              int    // line where the current token starts (1-based)
	col               int    // column where the current token starts (1-based)
	endbufferLine     int    // line of the read cursor
	endbufferCol      int    // column of the read cursor
}
  28. // Basic read operations on input
  29. func (l *tomlLexer) read() rune {
  30. r := l.peek()
  31. if r == '\n' {
  32. l.endbufferLine++
  33. l.endbufferCol = 1
  34. } else {
  35. l.endbufferCol++
  36. }
  37. l.inputIdx++
  38. return r
  39. }
  40. func (l *tomlLexer) next() rune {
  41. r := l.read()
  42. if r != eof {
  43. l.currentTokenStop++
  44. }
  45. return r
  46. }
  47. func (l *tomlLexer) ignore() {
  48. l.currentTokenStart = l.currentTokenStop
  49. l.line = l.endbufferLine
  50. l.col = l.endbufferCol
  51. }
  52. func (l *tomlLexer) skip() {
  53. l.next()
  54. l.ignore()
  55. }
  56. func (l *tomlLexer) fastForward(n int) {
  57. for i := 0; i < n; i++ {
  58. l.next()
  59. }
  60. }
  61. func (l *tomlLexer) emitWithValue(t tokenType, value string) {
  62. l.tokens = append(l.tokens, token{
  63. Position: Position{l.line, l.col},
  64. typ: t,
  65. val: value,
  66. })
  67. l.ignore()
  68. }
  69. func (l *tomlLexer) emit(t tokenType) {
  70. l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop]))
  71. }
  72. func (l *tomlLexer) peek() rune {
  73. if l.inputIdx >= len(l.input) {
  74. return eof
  75. }
  76. return l.input[l.inputIdx]
  77. }
  78. func (l *tomlLexer) peekString(size int) string {
  79. maxIdx := len(l.input)
  80. upperIdx := l.inputIdx + size // FIXME: potential overflow
  81. if upperIdx > maxIdx {
  82. upperIdx = maxIdx
  83. }
  84. return string(l.input[l.inputIdx:upperIdx])
  85. }
  86. func (l *tomlLexer) follow(next string) bool {
  87. return next == l.peekString(len(next))
  88. }
  89. // Error management
  90. func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
  91. l.tokens = append(l.tokens, token{
  92. Position: Position{l.line, l.col},
  93. typ: tokenError,
  94. val: fmt.Sprintf(format, args...),
  95. })
  96. return nil
  97. }
// State functions

// lexVoid is the top-level state: between key/value pairs and table
// headers. It dispatches on the next rune, skipping whitespace and
// newlines, and emits tokenEOF when the input runs out.
func (l *tomlLexer) lexVoid() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '}': // after '{'
			return l.lexRightCurlyBrace
		case '[':
			return l.lexTableKey
		case '#':
			return l.lexComment(l.lexVoid)
		case '=':
			return l.lexEqual
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			continue
		}
		if isSpace(next) {
			l.skip()
		}
		// NOTE: no `continue` after the space skip above. That is safe
		// because a space is neither a key start char nor eof, so the two
		// checks below fail and the loop simply iterates again.
		if isKeyStartChar(next) {
			return l.lexKey
		}
		if next == eof {
			l.next()
			break
		}
	}
	l.emit(tokenEOF)
	return nil
}
// lexRvalue lexes the right-hand side of an assignment or an element of an
// array/inline table: strings, booleans, inf/nan, numbers, date-times,
// punctuation, and comments.
func (l *tomlLexer) lexRvalue() tomlLexStateFn {
	for {
		next := l.peek()
		switch next {
		case '.':
			return l.errorf("cannot start float with a dot")
		case '=':
			return l.lexEqual
		case '[':
			return l.lexLeftBracket
		case ']':
			return l.lexRightBracket
		case '{':
			return l.lexLeftCurlyBrace
		case '}':
			return l.lexRightCurlyBrace
		case '#':
			return l.lexComment(l.lexRvalue)
		case '"':
			return l.lexString
		case '\'':
			return l.lexLiteralString
		case ',':
			return l.lexComma
		case '\r':
			fallthrough
		case '\n':
			l.skip()
			// Inside an array a newline does not end the value list; in any
			// other context it hands control back to top-level lexing.
			if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '[' {
				return l.lexRvalue
			}
			return l.lexVoid
		}
		// Keyword literals must be checked before the generic cases below.
		if l.follow("true") {
			return l.lexTrue
		}
		if l.follow("false") {
			return l.lexFalse
		}
		if l.follow("inf") {
			return l.lexInf
		}
		if l.follow("nan") {
			return l.lexNan
		}
		if isSpace(next) {
			l.skip()
			continue
		}
		if next == eof {
			l.next()
			break
		}
		if next == '+' || next == '-' {
			return l.lexNumber
		}
		if isDigit(next) {
			return l.lexDateTimeOrNumber
		}
		return l.errorf("no value can start with %c", next)
	}
	l.emit(tokenEOF)
	return nil
}
  195. func (l *tomlLexer) lexDateTimeOrNumber() tomlLexStateFn {
  196. // Could be either a date/time, or a digit.
  197. // The options for date/times are:
  198. // YYYY-... => date or date-time
  199. // HH:... => time
  200. // Anything else should be a number.
  201. lookAhead := l.peekString(5)
  202. if len(lookAhead) < 3 {
  203. return l.lexNumber()
  204. }
  205. for idx, r := range lookAhead {
  206. if !isDigit(r) {
  207. if idx == 2 && r == ':' {
  208. return l.lexDateTimeOrTime()
  209. }
  210. if idx == 4 && r == '-' {
  211. return l.lexDateTimeOrTime()
  212. }
  213. return l.lexNumber()
  214. }
  215. }
  216. return l.lexNumber()
  217. }
  218. func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
  219. l.next()
  220. l.emit(tokenLeftCurlyBrace)
  221. l.brackets = append(l.brackets, '{')
  222. return l.lexVoid
  223. }
  224. func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
  225. l.next()
  226. l.emit(tokenRightCurlyBrace)
  227. if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '{' {
  228. return l.errorf("cannot have '}' here")
  229. }
  230. l.brackets = l.brackets[:len(l.brackets)-1]
  231. return l.lexRvalue
  232. }
  233. func (l *tomlLexer) lexDateTimeOrTime() tomlLexStateFn {
  234. // Example matches:
  235. // 1979-05-27T07:32:00Z
  236. // 1979-05-27T00:32:00-07:00
  237. // 1979-05-27T00:32:00.999999-07:00
  238. // 1979-05-27 07:32:00Z
  239. // 1979-05-27 00:32:00-07:00
  240. // 1979-05-27 00:32:00.999999-07:00
  241. // 1979-05-27T07:32:00
  242. // 1979-05-27T00:32:00.999999
  243. // 1979-05-27 07:32:00
  244. // 1979-05-27 00:32:00.999999
  245. // 1979-05-27
  246. // 07:32:00
  247. // 00:32:00.999999
  248. // we already know those two are digits
  249. l.next()
  250. l.next()
  251. // Got 2 digits. At that point it could be either a time or a date(-time).
  252. r := l.next()
  253. if r == ':' {
  254. return l.lexTime()
  255. }
  256. return l.lexDateTime()
  257. }
// lexDateTime lexes an offset date-time, a local date-time, or a local
// date, emitting tokenLocalDate and, when a time part follows, delegating
// its offset to lexTimeOffset.
func (l *tomlLexer) lexDateTime() tomlLexStateFn {
	// This state accepts an offset date-time, a local date-time, or a local date.
	//
	// v--- cursor
	// 1979-05-27T07:32:00Z
	// 1979-05-27T00:32:00-07:00
	// 1979-05-27T00:32:00.999999-07:00
	// 1979-05-27 07:32:00Z
	// 1979-05-27 00:32:00-07:00
	// 1979-05-27 00:32:00.999999-07:00
	// 1979-05-27T07:32:00
	// 1979-05-27T00:32:00.999999
	// 1979-05-27 07:32:00
	// 1979-05-27 00:32:00.999999
	// 1979-05-27
	// date
	// already checked by lexRvalue
	l.next() // digit
	l.next() // -
	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid month digit in date: %c", r)
		}
	}
	r := l.next()
	if r != '-' {
		return l.errorf("expected - to separate month of a date, not %c", r)
	}
	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid day digit in date: %c", r)
		}
	}
	l.emit(tokenLocalDate)
	r = l.peek()
	if r == eof {
		return l.lexRvalue
	}
	if r != ' ' && r != 'T' {
		return l.errorf("incorrect date/time separation character: %c", r)
	}
	if r == ' ' {
		// A space only joins date and time when two digits follow it;
		// otherwise the date stands alone and normal value lexing resumes.
		lookAhead := l.peekString(3)[1:]
		if len(lookAhead) < 2 {
			return l.lexRvalue
		}
		for _, r := range lookAhead {
			if !isDigit(r) {
				return l.lexRvalue
			}
		}
	}
	l.skip() // skip the T or ' '
	// time
	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid hour digit in time: %c", r)
		}
	}
	r = l.next()
	if r != ':' {
		return l.errorf("time hour/minute separator should be :, not %c", r)
	}
	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid minute digit in time: %c", r)
		}
	}
	r = l.next()
	if r != ':' {
		return l.errorf("time minute/second separator should be :, not %c", r)
	}
	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid second digit in time: %c", r)
		}
	}
	r = l.peek()
	if r == '.' {
		// Fractional seconds: at least one digit is required after the dot.
		l.next()
		r := l.next()
		if !isDigit(r) {
			return l.errorf("expected at least one digit in time's fraction, not %c", r)
		}
		for {
			r := l.peek()
			if !isDigit(r) {
				break
			}
			l.next()
		}
	}
	l.emit(tokenLocalTime)
	return l.lexTimeOffset
}
  358. func (l *tomlLexer) lexTimeOffset() tomlLexStateFn {
  359. // potential offset
  360. // Z
  361. // -07:00
  362. // +07:00
  363. // nothing
  364. r := l.peek()
  365. if r == 'Z' {
  366. l.next()
  367. l.emit(tokenTimeOffset)
  368. } else if r == '+' || r == '-' {
  369. l.next()
  370. for i := 0; i < 2; i++ {
  371. r := l.next()
  372. if !isDigit(r) {
  373. return l.errorf("invalid hour digit in time offset: %c", r)
  374. }
  375. }
  376. r = l.next()
  377. if r != ':' {
  378. return l.errorf("time offset hour/minute separator should be :, not %c", r)
  379. }
  380. for i := 0; i < 2; i++ {
  381. r := l.next()
  382. if !isDigit(r) {
  383. return l.errorf("invalid minute digit in time offset: %c", r)
  384. }
  385. }
  386. l.emit(tokenTimeOffset)
  387. }
  388. return l.lexRvalue
  389. }
// lexTime lexes a local time whose leading "HH:" was already consumed by
// lexDateTimeOrTime: minutes, seconds, and an optional fractional part.
func (l *tomlLexer) lexTime() tomlLexStateFn {
	// v--- cursor
	// 07:32:00
	// 00:32:00.999999
	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid minute digit in time: %c", r)
		}
	}
	r := l.next()
	if r != ':' {
		return l.errorf("time minute/second separator should be :, not %c", r)
	}
	for i := 0; i < 2; i++ {
		r := l.next()
		if !isDigit(r) {
			return l.errorf("invalid second digit in time: %c", r)
		}
	}
	r = l.peek()
	if r == '.' {
		// Fractional seconds: at least one digit is required after the dot.
		l.next()
		r := l.next()
		if !isDigit(r) {
			return l.errorf("expected at least one digit in time's fraction, not %c", r)
		}
		for {
			r := l.peek()
			if !isDigit(r) {
				break
			}
			l.next()
		}
	}
	l.emit(tokenLocalTime)
	return l.lexRvalue
}
  428. func (l *tomlLexer) lexTrue() tomlLexStateFn {
  429. l.fastForward(4)
  430. l.emit(tokenTrue)
  431. return l.lexRvalue
  432. }
  433. func (l *tomlLexer) lexFalse() tomlLexStateFn {
  434. l.fastForward(5)
  435. l.emit(tokenFalse)
  436. return l.lexRvalue
  437. }
  438. func (l *tomlLexer) lexInf() tomlLexStateFn {
  439. l.fastForward(3)
  440. l.emit(tokenInf)
  441. return l.lexRvalue
  442. }
  443. func (l *tomlLexer) lexNan() tomlLexStateFn {
  444. l.fastForward(3)
  445. l.emit(tokenNan)
  446. return l.lexRvalue
  447. }
  448. func (l *tomlLexer) lexEqual() tomlLexStateFn {
  449. l.next()
  450. l.emit(tokenEqual)
  451. return l.lexRvalue
  452. }
  453. func (l *tomlLexer) lexComma() tomlLexStateFn {
  454. l.next()
  455. l.emit(tokenComma)
  456. if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '{' {
  457. return l.lexVoid
  458. }
  459. return l.lexRvalue
  460. }
  461. // Parse the key and emits its value without escape sequences.
  462. // bare keys, basic string keys and literal string keys are supported.
  463. func (l *tomlLexer) lexKey() tomlLexStateFn {
  464. var sb strings.Builder
  465. for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
  466. if r == '"' {
  467. l.next()
  468. str, err := l.lexStringAsString(`"`, false, true)
  469. if err != nil {
  470. return l.errorf(err.Error())
  471. }
  472. sb.WriteString("\"")
  473. sb.WriteString(str)
  474. sb.WriteString("\"")
  475. l.next()
  476. continue
  477. } else if r == '\'' {
  478. l.next()
  479. str, err := l.lexLiteralStringAsString(`'`, false)
  480. if err != nil {
  481. return l.errorf(err.Error())
  482. }
  483. sb.WriteString("'")
  484. sb.WriteString(str)
  485. sb.WriteString("'")
  486. l.next()
  487. continue
  488. } else if r == '\n' {
  489. return l.errorf("keys cannot contain new lines")
  490. } else if isSpace(r) {
  491. var str strings.Builder
  492. str.WriteString(" ")
  493. // skip trailing whitespace
  494. l.next()
  495. for r = l.peek(); isSpace(r); r = l.peek() {
  496. str.WriteRune(r)
  497. l.next()
  498. }
  499. // break loop if not a dot
  500. if r != '.' {
  501. break
  502. }
  503. str.WriteString(".")
  504. // skip trailing whitespace after dot
  505. l.next()
  506. for r = l.peek(); isSpace(r); r = l.peek() {
  507. str.WriteRune(r)
  508. l.next()
  509. }
  510. sb.WriteString(str.String())
  511. continue
  512. } else if r == '.' {
  513. // skip
  514. } else if !isValidBareChar(r) {
  515. return l.errorf("keys cannot contain %c character", r)
  516. }
  517. sb.WriteRune(r)
  518. l.next()
  519. }
  520. l.emitWithValue(tokenKey, sb.String())
  521. return l.lexVoid
  522. }
  523. func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
  524. return func() tomlLexStateFn {
  525. for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
  526. if next == '\r' && l.follow("\r\n") {
  527. break
  528. }
  529. l.next()
  530. }
  531. l.ignore()
  532. return previousState
  533. }
  534. }
  535. func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
  536. l.next()
  537. l.emit(tokenLeftBracket)
  538. l.brackets = append(l.brackets, '[')
  539. return l.lexRvalue
  540. }
  541. func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
  542. var sb strings.Builder
  543. if discardLeadingNewLine {
  544. if l.follow("\r\n") {
  545. l.skip()
  546. l.skip()
  547. } else if l.peek() == '\n' {
  548. l.skip()
  549. }
  550. }
  551. // find end of string
  552. for {
  553. if l.follow(terminator) {
  554. return sb.String(), nil
  555. }
  556. next := l.peek()
  557. if next == eof {
  558. break
  559. }
  560. sb.WriteRune(l.next())
  561. }
  562. return "", errors.New("unclosed string")
  563. }
  564. func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
  565. l.skip()
  566. // handle special case for triple-quote
  567. terminator := "'"
  568. discardLeadingNewLine := false
  569. if l.follow("''") {
  570. l.skip()
  571. l.skip()
  572. terminator = "'''"
  573. discardLeadingNewLine = true
  574. }
  575. str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
  576. if err != nil {
  577. return l.errorf(err.Error())
  578. }
  579. l.emitWithValue(tokenString, str)
  580. l.fastForward(len(terminator))
  581. l.ignore()
  582. return l.lexRvalue
  583. }
// Lex a string and return the results as a string.
// Terminator is the substring indicating the end of the token.
// The resulting string does not include the terminator.
func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
	var sb strings.Builder
	if discardLeadingNewLine {
		// A newline (LF or CRLF) directly after the opening delimiter of a
		// multi-line string is trimmed.
		if l.follow("\r\n") {
			l.skip()
			l.skip()
		} else if l.peek() == '\n' {
			l.skip()
		}
	}
	for {
		if l.follow(terminator) {
			return sb.String(), nil
		}
		if l.follow("\\") {
			l.next()
			switch l.peek() {
			case '\r':
				fallthrough
			case '\n':
				fallthrough
			case '\t':
				fallthrough
			case ' ':
				// skip all whitespace chars following backslash
				// (line-ending backslash of multi-line strings)
				for strings.ContainsRune("\r\n\t ", l.peek()) {
					l.next()
				}
			case '"':
				sb.WriteString("\"")
				l.next()
			case 'n':
				sb.WriteString("\n")
				l.next()
			case 'b':
				sb.WriteString("\b")
				l.next()
			case 'f':
				sb.WriteString("\f")
				l.next()
			case '/':
				sb.WriteString("/")
				l.next()
			case 't':
				sb.WriteString("\t")
				l.next()
			case 'r':
				sb.WriteString("\r")
				l.next()
			case '\\':
				sb.WriteString("\\")
				l.next()
			case 'u':
				// \uXXXX — exactly four hex digits.
				l.next()
				var code strings.Builder
				for i := 0; i < 4; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code.WriteRune(c)
				}
				intcode, err := strconv.ParseInt(code.String(), 16, 32)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\u" + code.String())
				}
				sb.WriteRune(rune(intcode))
			case 'U':
				// \UXXXXXXXX — exactly eight hex digits.
				l.next()
				var code strings.Builder
				for i := 0; i < 8; i++ {
					c := l.peek()
					if !isHexDigit(c) {
						return "", errors.New("unfinished unicode escape")
					}
					l.next()
					code.WriteRune(c)
				}
				intcode, err := strconv.ParseInt(code.String(), 16, 64)
				if err != nil {
					return "", errors.New("invalid unicode escape: \\U" + code.String())
				}
				sb.WriteRune(rune(intcode))
			default:
				return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
			}
		} else {
			r := l.peek()
			// Bare control characters (except tab, and newlines when the
			// string form accepts them) must be escaped.
			if 0x00 <= r && r <= 0x1F && r != '\t' && !(acceptNewLines && (r == '\n' || r == '\r')) {
				return "", fmt.Errorf("unescaped control character %U", r)
			}
			l.next()
			sb.WriteRune(r)
		}
		if l.peek() == eof {
			break
		}
	}
	return "", errors.New("unclosed string")
}
  688. func (l *tomlLexer) lexString() tomlLexStateFn {
  689. l.skip()
  690. // handle special case for triple-quote
  691. terminator := `"`
  692. discardLeadingNewLine := false
  693. acceptNewLines := false
  694. if l.follow(`""`) {
  695. l.skip()
  696. l.skip()
  697. terminator = `"""`
  698. discardLeadingNewLine = true
  699. acceptNewLines = true
  700. }
  701. str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
  702. if err != nil {
  703. return l.errorf(err.Error())
  704. }
  705. l.emitWithValue(tokenString, str)
  706. l.fastForward(len(terminator))
  707. l.ignore()
  708. return l.lexRvalue
  709. }
  710. func (l *tomlLexer) lexTableKey() tomlLexStateFn {
  711. l.next()
  712. if l.peek() == '[' {
  713. // token '[[' signifies an array of tables
  714. l.next()
  715. l.emit(tokenDoubleLeftBracket)
  716. return l.lexInsideTableArrayKey
  717. }
  718. // vanilla table key
  719. l.emit(tokenLeftBracket)
  720. return l.lexInsideTableKey
  721. }
  722. // Parse the key till "]]", but only bare keys are supported
  723. func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
  724. for r := l.peek(); r != eof; r = l.peek() {
  725. switch r {
  726. case ']':
  727. if l.currentTokenStop > l.currentTokenStart {
  728. l.emit(tokenKeyGroupArray)
  729. }
  730. l.next()
  731. if l.peek() != ']' {
  732. break
  733. }
  734. l.next()
  735. l.emit(tokenDoubleRightBracket)
  736. return l.lexVoid
  737. case '[':
  738. return l.errorf("table array key cannot contain ']'")
  739. default:
  740. l.next()
  741. }
  742. }
  743. return l.errorf("unclosed table array key")
  744. }
  745. // Parse the key till "]" but only bare keys are supported
  746. func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
  747. for r := l.peek(); r != eof; r = l.peek() {
  748. switch r {
  749. case ']':
  750. if l.currentTokenStop > l.currentTokenStart {
  751. l.emit(tokenKeyGroup)
  752. }
  753. l.next()
  754. l.emit(tokenRightBracket)
  755. return l.lexVoid
  756. case '[':
  757. return l.errorf("table key cannot contain ']'")
  758. default:
  759. l.next()
  760. }
  761. }
  762. return l.errorf("unclosed table key")
  763. }
  764. func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
  765. l.next()
  766. l.emit(tokenRightBracket)
  767. if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '[' {
  768. return l.errorf("cannot have ']' here")
  769. }
  770. l.brackets = l.brackets[:len(l.brackets)-1]
  771. return l.lexRvalue
  772. }
  773. type validRuneFn func(r rune) bool
  774. func isValidHexRune(r rune) bool {
  775. return r >= 'a' && r <= 'f' ||
  776. r >= 'A' && r <= 'F' ||
  777. r >= '0' && r <= '9' ||
  778. r == '_'
  779. }
  780. func isValidOctalRune(r rune) bool {
  781. return r >= '0' && r <= '7' || r == '_'
  782. }
  783. func isValidBinaryRune(r rune) bool {
  784. return r == '0' || r == '1' || r == '_'
  785. }
// lexNumber lexes an integer or float: 0x/0o/0b prefixed integers, an
// optional leading sign (which may also introduce inf/nan), underscore
// separators, a decimal point, and an exponent. Emits tokenInteger or
// tokenFloat accordingly.
func (l *tomlLexer) lexNumber() tomlLexStateFn {
	r := l.peek()
	if r == '0' {
		follow := l.peekString(2)
		if len(follow) == 2 {
			// Pick a digit validator based on the base prefix, if any.
			var isValidRune validRuneFn
			switch follow[1] {
			case 'x':
				isValidRune = isValidHexRune
			case 'o':
				isValidRune = isValidOctalRune
			case 'b':
				isValidRune = isValidBinaryRune
			default:
				if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' {
					return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1]))
				}
			}
			if isValidRune != nil {
				// Consume the two-rune base prefix, then the digits.
				l.next()
				l.next()
				digitSeen := false
				for {
					next := l.peek()
					if !isValidRune(next) {
						break
					}
					digitSeen = true
					l.next()
				}
				if !digitSeen {
					return l.errorf("number needs at least one digit")
				}
				l.emit(tokenInteger)
				return l.lexRvalue
			}
		}
	}
	if r == '+' || r == '-' {
		// A sign may also start the special float literals inf/nan.
		l.next()
		if l.follow("inf") {
			return l.lexInf
		}
		if l.follow("nan") {
			return l.lexNan
		}
	}
	pointSeen := false
	expSeen := false
	digitSeen := false
	for {
		next := l.peek()
		if next == '.' {
			if pointSeen {
				return l.errorf("cannot have two dots in one float")
			}
			l.next()
			if !isDigit(l.peek()) {
				return l.errorf("float cannot end with a dot")
			}
			pointSeen = true
		} else if next == 'e' || next == 'E' {
			expSeen = true
			l.next()
			r := l.peek()
			if r == '+' || r == '-' {
				l.next()
			}
		} else if isDigit(next) {
			digitSeen = true
			l.next()
		} else if next == '_' {
			// Underscore separators are consumed here; placement rules are
			// enforced elsewhere.
			l.next()
		} else {
			break
		}
		// A dot seen before any digit means the number started with '.'.
		if pointSeen && !digitSeen {
			return l.errorf("cannot start float with a dot")
		}
	}
	if !digitSeen {
		return l.errorf("no digit in that number")
	}
	if pointSeen || expSeen {
		l.emit(tokenFloat)
	} else {
		l.emit(tokenInteger)
	}
	return l.lexRvalue
}
  876. func (l *tomlLexer) run() {
  877. for state := l.lexVoid; state != nil; {
  878. state = state()
  879. }
  880. }
  881. // Entry point
  882. func lexToml(inputBytes []byte) []token {
  883. runes := bytes.Runes(inputBytes)
  884. l := &tomlLexer{
  885. input: runes,
  886. tokens: make([]token, 0, 256),
  887. line: 1,
  888. col: 1,
  889. endbufferLine: 1,
  890. endbufferCol: 1,
  891. }
  892. l.run()
  893. return l.tokens
  894. }