parsecsv.nim 11 KB


  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2009 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a simple high performance `CSV`:idx:
  10. ## (`comma separated value`:idx:) parser.
  11. ##
  12. ## Basic usage
  13. ## ===========
  14. ##
  15. ## ```nim
  16. ## import std/parsecsv
  17. ## from std/os import paramStr
  18. ## from std/streams import newFileStream
  19. ##
  20. ## var s = newFileStream(paramStr(1), fmRead)
  21. ## if s == nil:
  22. ##   quit("cannot open the file " & paramStr(1))
  23. ##
  24. ## var x: CsvParser
  25. ## open(x, s, paramStr(1))
  26. ## while readRow(x):
  27. ##   echo "new row: "
  28. ##   for val in items(x.row):
  29. ##     echo "##", val, "##"
  30. ## close(x)
  31. ## ```
  32. ##
  33. ## For CSV files with a header row, the header can be read and then used as a
  34. ## reference for item access with `rowEntry <#rowEntry,CsvParser,string>`_:
  35. ##
  36. ## ```nim
  37. ## import std/parsecsv
  38. ##
  39. ## # Prepare a file
  40. ## let content = """One,Two,Three,Four
  41. ## 1,2,3,4
  42. ## 10,20,30,40
  43. ## 100,200,300,400
  44. ## """
  45. ## writeFile("temp.csv", content)
  46. ##
  47. ## var p: CsvParser
  48. ## p.open("temp.csv")
  49. ## p.readHeaderRow()
  50. ## while p.readRow():
  51. ##   echo "new row: "
  52. ##   for col in items(p.headers):
  53. ##     echo "##", col, ":", p.rowEntry(col), "##"
  54. ## p.close()
  55. ## ```
  56. ##
  57. ## See also
  58. ## ========
  59. ##
  60. ## * `streams module <streams.html>`_ for using
  61. ## `open proc <#open,CsvParser,Stream,string,char,char,char>`_
  62. ## and other stream processing (like `close proc <streams.html#close,Stream>`_)
  63. ## * `parseopt module <parseopt.html>`_ for a command line parser
  64. ## * `parsecfg module <parsecfg.html>`_ for a configuration file parser
  65. ## * `parsexml module <parsexml.html>`_ for a XML / HTML parser
  66. ## * `parsesql module <parsesql.html>`_ for a SQL parser
  67. ## * `other parsers <lib.html#pure-libraries-parsers>`_ for other parsers
  68. import std/[lexbase, streams]
  69. when defined(nimPreviewSlimSystem):
  70. import std/syncio
  type
    CsvRow* = seq[string] ## The fields of a single row of a CSV file.
    CsvParser* = object of BaseLexer ## The parser state.
      ##
      ## Two fields are exported:
      ## * `row` holds the fields of the row read most recently
      ## * `headers` holds the column names captured by
      ##   `readHeaderRow <#readHeaderRow,CsvParser>`_; they are what
      ##   `rowEntry <#rowEntry,CsvParser,string>`_ searches.
      row*: CsvRow
      filename: string      # only used as a prefix of error messages
      sep, quote, esc: char # separator, quote and escape characters
      skipWhite: bool       # skip blanks/tabs at the start of each field
      currRow: int          # bumped at the end of every `readRow` call
      headers*: seq[string]
    CsvError* = object of IOError ## Raised when a parsing error occurs.
  proc raiseEInvalidCsv(filename: string, line, col: int,
                        msg: string) {.noreturn.} =
    ## Raises a `CsvError` carrying `msg`. When `filename` is not empty the
    ## message is prefixed with `filename(line, col)`; otherwise `line` and
    ## `col` are ignored.
    let text =
      if filename.len > 0:
        filename & "(" & $line & ", " & $col & ") Error: " & msg
      else:
        "Error: " & msg
    raise newException(CsvError, text)
  proc error(self: CsvParser, pos: int, msg: string) =
    ## Raises a `CsvError` for buffer position `pos`, located by the parser's
    ## file name, its current line number and the column computed from `pos`.
    let column = self.getColNumber(pos)
    raiseEInvalidCsv(self.filename, self.lineNumber, column, msg)
  proc open*(self: var CsvParser, input: Stream, filename: string,
             separator = ',', quote = '"', escape = '\0',
             skipInitialSpace = false) =
    ## Prepares the parser to read CSV data from the stream `input`.
    ## `filename` is not opened; it only appears in error messages.
    ##
    ## The optional parameters tune the dialect:
    ## - `separator`: the character that separates fields
    ## - `quote`: the character that encloses fields containing special
    ##   characters such as `separator`, `quote` or line breaks. '\\0'
    ##   turns quote handling off.
    ## - `escape`: inside a quoted field, the character following `escape`
    ##   is taken literally; '\\0' turns escaping off. With escaping off and
    ##   `quote` not '\\0', two consecutive `quote` characters are parsed as
    ##   one literal `quote` character.
    ## - `skipInitialSpace`: if true, spaces and tabs at the start of a
    ##   field (i.e. right after the `separator`) are skipped.
    ##
    ## See also:
    ## * `open proc <#open,CsvParser,string,char,char,char>`_ which creates
    ##   the file stream for you
    runnableExamples:
      import std/streams
      var strm = newStringStream("One,Two,Three\n1,2,3\n10,20,30")
      var parser: CsvParser
      parser.open(strm, "tmp.csv")
      parser.close()
      strm.close()

    # Initialise the underlying lexer first, then record the dialect.
    lexbase.open(self, input)
    self.filename = filename
    self.skipWhite = skipInitialSpace
    self.sep = separator
    self.quote = quote
    self.esc = escape
  proc open*(self: var CsvParser, filename: string,
             separator = ',', quote = '"', escape = '\0',
             skipInitialSpace = false) =
    ## Similar to the
    ## `other open proc<#open,CsvParser,Stream,string,char,char,char>`_,
    ## but creates the file stream for you.
    ##
    ## Raises `CsvError` if the file `filename` cannot be opened for reading.
    runnableExamples:
      from std/os import removeFile
      writeFile("tmp.csv", "One,Two,Three\n1,2,3\n10,20,300")
      var parser: CsvParser
      parser.open("tmp.csv")
      parser.close()
      removeFile("tmp.csv")

    let s = newFileStream(filename, fmRead)
    if s.isNil:
      # The lexer has not been (re)initialised yet, so the parser's file
      # name, line number and column would describe a previously opened
      # input (or nothing at all). Report the failure without a location;
      # for a fresh parser this yields exactly the same message as before.
      raiseEInvalidCsv("", 0, 0, "cannot open: " & filename)
    open(self, s, filename, separator, quote, escape, skipInitialSpace)
  proc parseField(self: var CsvParser, a: var string) =
    ## Parses one field starting at `self.bufpos` into `a` (whose previous
    ## content is discarded) and leaves `self.bufpos` on the first character
    ## after the field. Raises `CsvError` via `error` if a quoted field is
    ## still open when a '\0' character is reached.
    var cursor = self.bufpos
    if self.skipWhite:
      while self.buf[cursor] in {' ', '\t'}:
        inc cursor
    a.setLen(0) # keep the allocation, drop the old content
    let quoted = self.quote != '\0' and self.buf[cursor] == self.quote
    if not quoted:
      # Plain field: runs up to the separator, a line break or '\0'.
      while true:
        let ch = self.buf[cursor]
        if ch == self.sep or ch in {'\c', '\l', '\0'}: break
        a.add ch
        inc cursor
    else:
      inc cursor # step over the opening quote
      while true:
        let ch = self.buf[cursor]
        if ch == '\0':
          self.bufpos = cursor # can continue after exception?
          self.error(cursor, self.quote & " expected")
          break
        if ch == self.quote:
          if self.esc != '\0' or self.buf[cursor + 1] != self.quote:
            # closing quote
            inc cursor
            break
          # escaping is off and the quote is doubled: one literal quote
          a.add self.quote
          inc(cursor, 2)
        elif ch == self.esc:
          # the character after the escape character is taken literally
          a.add self.buf[cursor + 1]
          inc(cursor, 2)
        elif ch == '\c':
          # line breaks inside quotes are stored as "\n"
          cursor = handleCR(self, cursor)
          a.add "\n"
        elif ch == '\l':
          cursor = handleLF(self, cursor)
          a.add "\n"
        else:
          a.add ch
          inc cursor
    self.bufpos = cursor
  proc processedRows*(self: var CsvParser): int {.inline.} =
    ## Returns the value of the processed rows counter.
    ##
    ## Note that the counter is also incremented by calls to
    ## `readRow <#readRow,CsvParser,int>`_ that arrive at EOF and
    ## return false.
    runnableExamples:
      import std/streams
      var strm = newStringStream("One,Two,Three\n1,2,3")
      var parser: CsvParser
      parser.open(strm, "tmp.csv")
      doAssert parser.readRow()
      doAssert parser.processedRows() == 1
      doAssert parser.readRow()
      doAssert parser.processedRows() == 2
      ## Even if `readRow` arrived at EOF then `processedRows` is incremented.
      doAssert parser.readRow() == false
      doAssert parser.processedRows() == 3
      doAssert parser.readRow() == false
      doAssert parser.processedRows() == 4
      parser.close()
      strm.close()

    result = self.currRow
  proc skipLineBreaks(self: var CsvParser) =
    ## Advances `bufpos` past a run of consecutive CR / LF line terminators.
    while true:
      case self.buf[self.bufpos]
      of '\c': self.bufpos = handleCR(self, self.bufpos)
      of '\l': self.bufpos = handleLF(self, self.bufpos)
      else: break

  proc readRow*(self: var CsvParser, columns = 0): bool =
    ## Reads the next row into `self.row`; if `columns` > 0, it expects the
    ## row to have exactly this many columns and raises `CsvError`
    ## otherwise. Returns false if the end of the file has been encountered
    ## else true.
    ##
    ## Blank lines are skipped. A separator that is directly followed by the
    ## end of the input still produces a final empty field, so a last line
    ## parses the same with and without a terminating line break.
    runnableExamples:
      import std/streams
      var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
      var parser: CsvParser
      parser.open(strm, "tmp.csv")
      doAssert parser.readRow()
      doAssert parser.row == @["One", "Two", "Three"]
      doAssert parser.readRow()
      doAssert parser.row == @["1", "2", "3"]
      ## Blank lines are skipped.
      doAssert parser.readRow()
      doAssert parser.row == @["10", "20", "30"]

      var emptySeq: seq[string]
      doAssert parser.readRow() == false
      doAssert parser.row == emptySeq
      doAssert parser.readRow() == false
      doAssert parser.row == emptySeq

      parser.close()
      strm.close()

      ## A trailing separator yields a final empty field, even at EOF.
      var strm2 = newStringStream("a,b,")
      var parser2: CsvParser
      parser2.open(strm2, "tmp2.csv")
      doAssert parser2.readRow()
      doAssert parser2.row == @["a", "b", ""]
      parser2.close()
      strm2.close()

    var col = 0 # current column
    let oldpos = self.bufpos
    # skip initial empty lines #8365
    skipLineBreaks(self)
    while self.buf[self.bufpos] != '\0':
      # Reuse the strings of the previous row; grow only when needed.
      if self.row.len < col + 1:
        setLen(self.row, col + 1)
      parseField(self, self.row[col])
      inc(col)
      if self.buf[self.bufpos] == self.sep:
        inc(self.bufpos)
        if self.buf[self.bufpos] == '\0':
          # The separator was the last character of the input: the row ends
          # with one more, empty, field that the loop would never visit.
          if self.row.len < col + 1:
            setLen(self.row, col + 1)
          setLen(self.row[col], 0)
          inc(col)
      else:
        case self.buf[self.bufpos]
        of '\c', '\l':
          # end of row; also skip the empty lines that follow
          skipLineBreaks(self)
        of '\0': discard
        else: error(self, self.bufpos, self.sep & " expected")
        break

    setLen(self.row, col)
    result = col > 0
    if result and col != columns and columns > 0:
      error(self, oldpos + 1, $columns & " columns expected, but found " &
            $col & " columns")
    inc(self.currRow)
  proc close*(self: var CsvParser) {.inline.} =
    ## Closes the parser `self` together with the input stream it reads
    ## from.
    # Qualified call: an unqualified `close(self)` would resolve to this
    # very proc.
    lexbase.close(self)
  proc readHeaderRow*(self: var CsvParser) =
    ## Reads the next row and, if there is one, stores a copy of it in
    ## `headers` so that columns can be looked up by name.
    ##
    ## See also:
    ## * `rowEntry proc <#rowEntry,CsvParser,string>`_
    runnableExamples:
      import std/streams

      var strm = newStringStream("One,Two,Three\n1,2,3")
      var parser: CsvParser
      parser.open(strm, "tmp.csv")

      parser.readHeaderRow()
      doAssert parser.headers == @["One", "Two", "Three"]
      doAssert parser.row == @["One", "Two", "Three"]

      doAssert parser.readRow()
      doAssert parser.headers == @["One", "Two", "Three"]
      doAssert parser.row == @["1", "2", "3"]

      parser.close()
      strm.close()

    if self.readRow():
      self.headers = self.row
  proc rowEntry*(self: var CsvParser, entry: string): var string =
    ## Returns the field of the current row that belongs to the column
    ## named `entry`.
    ##
    ## The column is located by searching `headers`, so
    ## `readHeaderRow <#readHeaderRow,CsvParser>`_ is expected to have been
    ## called beforehand.
    ##
    ## Raises `KeyError` if `entry` is not among the headers.
    runnableExamples:
      import std/streams
      var strm = newStringStream("One,Two,Three\n1,2,3\n\n10,20,30")
      var parser: CsvParser
      parser.open(strm, "tmp.csv")
      ## Requires calling `readHeaderRow`.
      parser.readHeaderRow()
      doAssert parser.readRow()
      doAssert parser.rowEntry("One") == "1"
      doAssert parser.rowEntry("Two") == "2"
      doAssert parser.rowEntry("Three") == "3"
      doAssertRaises(KeyError):
        discard parser.rowEntry("NonexistentEntry")
      parser.close()
      strm.close()

    let idx = find(self.headers, entry)
    if idx < 0:
      raise newException(KeyError, "Entry `" & entry & "` doesn't exist")
    result = self.row[idx]
  when not defined(testing) and isMainModule:
    # Demo driver: prints every field of the CSV file named by the first
    # command line argument.
    import std/os
    if paramCount() < 1:
      # Without this check `paramStr(1)` would fail on the missing argument.
      quit("usage: parsecsv <file.csv>")
    let path = paramStr(1)
    let s = newFileStream(path, fmRead)
    if s == nil: quit("cannot open the file " & path)
    var x: CsvParser
    open(x, s, path)
    while readRow(x):
      echo "new row: "
      for val in items(x.row):
        echo "##", val, "##"
    close(x)