#
#
#            Nim's Runtime Library
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module implements a simple high performance `CSV`:idx:
## (`comma separated value`:idx:) parser.
##
## Example: How to use the parser
## ==============================
##
## .. code-block:: nim
##   import os, parsecsv, streams
##   var s = newFileStream(paramStr(1), fmRead)
##   if s == nil: quit("cannot open the file: " & paramStr(1))
##   var x: CsvParser
##   open(x, s, paramStr(1))
##   while readRow(x):
##     echo "new row: "
##     for val in items(x.row):
##       echo "##", val, "##"
##   close(x)
##
## For CSV files with a header row, the header can be read and then used as a
## reference for item access with `rowEntry <#rowEntry.CsvParser.string>`_:
##
## .. code-block:: nim
##   import parsecsv
##   import os
##   # Prepare a file
##   let content = """One,Two,Three,Four
##   1,2,3,4
##   10,20,30,40
##   100,200,300,400
##   """
##   writeFile("temp.csv", content)
##
##   var p: CsvParser
##   p.open("temp.csv")
##   p.readHeaderRow()
##   while p.readRow():
##     echo "new row: "
##     for col in items(p.headers):
##       echo "##", col, ":", p.rowEntry(col), "##"
##   p.close()

  49. import
  50. lexbase, streams
  51. type
  52. CsvRow* = seq[string] ## a row in a CSV file
  53. CsvParser* = object of BaseLexer ## the parser object.
  54. row*: CsvRow ## the current row
  55. filename: string
  56. sep, quote, esc: char
  57. skipWhite: bool
  58. currRow: int
  59. headers*: seq[string] ## The columns that are defined in the csv file
  60. ## (read using `readHeaderRow <#readHeaderRow.CsvParser>`_).
  61. ## Used with `rowEntry <#rowEntry.CsvParser.string>`_).
  62. CsvError* = object of IOError ## exception that is raised if
  63. ## a parsing error occurs
  64. proc raiseEInvalidCsv(filename: string, line, col: int,
  65. msg: string) {.noreturn.} =
  66. var e: ref CsvError
  67. new(e)
  68. if filename.len == 0:
  69. e.msg = "Error: " & msg
  70. else:
  71. e.msg = filename & "(" & $line & ", " & $col & ") Error: " & msg
  72. raise e
  73. proc error(my: CsvParser, pos: int, msg: string) =
  74. raiseEInvalidCsv(my.filename, my.lineNumber, getColNumber(my, pos), msg)
  75. proc open*(my: var CsvParser, input: Stream, filename: string,
  76. separator = ',', quote = '"', escape = '\0',
  77. skipInitialSpace = false) =
  78. ## initializes the parser with an input stream. `Filename` is only used
  79. ## for nice error messages. The parser's behaviour can be controlled by
  80. ## the diverse optional parameters:
  81. ## - `separator`: character used to separate fields
  82. ## - `quote`: Used to quote fields containing special characters like
  83. ## `separator`, `quote` or new-line characters. '\0' disables the parsing
  84. ## of quotes.
  85. ## - `escape`: removes any special meaning from the following character;
  86. ## '\0' disables escaping; if escaping is disabled and `quote` is not '\0',
  87. ## two `quote` characters are parsed one literal `quote` character.
  88. ## - `skipInitialSpace`: If true, whitespace immediately following the
  89. ## `separator` is ignored.
  90. lexbase.open(my, input)
  91. my.filename = filename
  92. my.sep = separator
  93. my.quote = quote
  94. my.esc = escape
  95. my.skipWhite = skipInitialSpace
  96. my.row = @[]
  97. my.currRow = 0
  98. proc open*(my: var CsvParser, filename: string,
  99. separator = ',', quote = '"', escape = '\0',
  100. skipInitialSpace = false) =
  101. ## same as the other `open` but creates the file stream for you.
  102. var s = newFileStream(filename, fmRead)
  103. if s == nil: my.error(0, "cannot open: " & filename)
  104. open(my, s, filename, separator,
  105. quote, escape, skipInitialSpace)
  106. proc parseField(my: var CsvParser, a: var string) =
  107. var pos = my.bufpos
  108. var buf = my.buf
  109. if my.skipWhite:
  110. while buf[pos] in {' ', '\t'}: inc(pos)
  111. setLen(a, 0) # reuse memory
  112. if buf[pos] == my.quote and my.quote != '\0':
  113. inc(pos)
  114. while true:
  115. let c = buf[pos]
  116. if c == '\0':
  117. my.bufpos = pos # can continue after exception?
  118. error(my, pos, my.quote & " expected")
  119. break
  120. elif c == my.quote:
  121. if my.esc == '\0' and buf[pos+1] == my.quote:
  122. add(a, my.quote)
  123. inc(pos, 2)
  124. else:
  125. inc(pos)
  126. break
  127. elif c == my.esc:
  128. add(a, buf[pos+1])
  129. inc(pos, 2)
  130. else:
  131. case c
  132. of '\c':
  133. pos = handleCR(my, pos)
  134. buf = my.buf
  135. add(a, "\n")
  136. of '\l':
  137. pos = handleLF(my, pos)
  138. buf = my.buf
  139. add(a, "\n")
  140. else:
  141. add(a, c)
  142. inc(pos)
  143. else:
  144. while true:
  145. let c = buf[pos]
  146. if c == my.sep: break
  147. if c in {'\c', '\l', '\0'}: break
  148. add(a, c)
  149. inc(pos)
  150. my.bufpos = pos
  151. proc processedRows*(my: var CsvParser): int =
  152. ## returns number of the processed rows
  153. return my.currRow
  154. proc readRow*(my: var CsvParser, columns = 0): bool =
  155. ## reads the next row; if `columns` > 0, it expects the row to have
  156. ## exactly this many columns. Returns false if the end of the file
  157. ## has been encountered else true.
  158. ##
  159. ## Blank lines are skipped.
  160. var col = 0 # current column
  161. let oldpos = my.bufpos
  162. while my.buf[my.bufpos] != '\0':
  163. let oldlen = my.row.len
  164. if oldlen < col+1:
  165. setLen(my.row, col+1)
  166. my.row[col] = ""
  167. parseField(my, my.row[col])
  168. inc(col)
  169. if my.buf[my.bufpos] == my.sep:
  170. inc(my.bufpos)
  171. else:
  172. case my.buf[my.bufpos]
  173. of '\c', '\l':
  174. # skip empty lines:
  175. while true:
  176. case my.buf[my.bufpos]
  177. of '\c': my.bufpos = handleCR(my, my.bufpos)
  178. of '\l': my.bufpos = handleLF(my, my.bufpos)
  179. else: break
  180. of '\0': discard
  181. else: error(my, my.bufpos, my.sep & " expected")
  182. break
  183. setLen(my.row, col)
  184. result = col > 0
  185. if result and col != columns and columns > 0:
  186. error(my, oldpos+1, $columns & " columns expected, but found " &
  187. $col & " columns")
  188. inc(my.currRow)
  189. proc close*(my: var CsvParser) {.inline.} =
  190. ## closes the parser `my` and its associated input stream.
  191. lexbase.close(my)
  192. proc readHeaderRow*(my: var CsvParser) =
  193. ## Reads the first row and creates a look-up table for column numbers
  194. ## See also `rowEntry <#rowEntry.CsvParser.string>`_.
  195. let present = my.readRow()
  196. if present:
  197. my.headers = my.row
  198. proc rowEntry*(my: var CsvParser, entry: string): var string =
  199. ## Acceses a specified `entry` from the current row.
  200. ##
  201. ## Assumes that `readHeaderRow <#readHeaderRow.CsvParser>`_ has already been
  202. ## called.
  203. let index = my.headers.find(entry)
  204. if index >= 0:
  205. result = my.row[index]
  206. when not defined(testing) and isMainModule:
  207. import os
  208. var s = newFileStream(paramStr(1), fmRead)
  209. if s == nil: quit("cannot open the file" & paramStr(1))
  210. var x: CsvParser
  211. open(x, s, paramStr(1))
  212. while readRow(x):
  213. echo "new row: "
  214. for val in items(x.row):
  215. echo "##", val, "##"
  216. close(x)
  217. when isMainModule:
  218. import os
  219. import strutils
  220. block: # Tests for reading the header row
  221. let content = "One,Two,Three,Four\n1,2,3,4\n10,20,30,40,\n100,200,300,400\n"
  222. writeFile("temp.csv", content)
  223. var p: CsvParser
  224. p.open("temp.csv")
  225. p.readHeaderRow()
  226. while p.readRow():
  227. let zeros = repeat('0', p.currRow-2)
  228. doAssert p.rowEntry("One") == "1" & zeros
  229. doAssert p.rowEntry("Two") == "2" & zeros
  230. doAssert p.rowEntry("Three") == "3" & zeros
  231. doAssert p.rowEntry("Four") == "4" & zeros
  232. p.close()
  233. when not defined(testing):
  234. var parser: CsvParser
  235. parser.open("temp.csv")
  236. parser.readHeaderRow()
  237. while parser.readRow():
  238. echo "new row: "
  239. for col in items(parser.headers):
  240. echo "##", col, ":", parser.rowEntry(col), "##"
  241. parser.close()
  242. removeFile("temp.csv")
  243. # Tidy up
  244. removeFile("temp.csv")