parallel_counts.nim 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. discard """
  2. action: compile
  3. """
  4. import os, parseutils, threadpool, strutils
  5. type
  6. Stats = ref object
  7. domainCode, pageTitle: string
  8. countViews, totalSize: int
  9. proc newStats(): Stats =
  10. Stats(domainCode: "", pageTitle: "", countViews: 0, totalSize: 0)
  11. proc `$`(stats: Stats): string =
  12. "(domainCode: $#, pageTitle: $#, countViews: $#, totalSize: $#)" % [
  13. stats.domainCode, stats.pageTitle, $stats.countViews, $stats.totalSize
  14. ]
  15. proc parse(line: string, domainCode, pageTitle: var string,
  16. countViews, totalSize: var int) =
  17. if line.len == 0: return
  18. var i = 0
  19. domainCode.setLen(0)
  20. i.inc parseUntil(line, domainCode, {' '}, i)
  21. i.inc
  22. pageTitle.setLen(0)
  23. i.inc parseUntil(line, pageTitle, {' '}, i)
  24. i.inc
  25. countViews = 0
  26. i.inc parseInt(line, countViews, i)
  27. i.inc
  28. totalSize = 0
  29. i.inc parseInt(line, totalSize, i)
  30. proc parseChunk(chunk: string): Stats =
  31. result = newStats()
  32. var domainCode = ""
  33. var pageTitle = ""
  34. var countViews = 0
  35. var totalSize = 0
  36. for line in splitLines(chunk):
  37. parse(line, domainCode, pageTitle, countViews, totalSize)
  38. if domainCode == "en" and countViews > result.countViews:
  39. result = Stats(domainCode: domainCode, pageTitle: pageTitle,
  40. countViews: countViews, totalSize: totalSize)
  41. proc readPageCounts(filename: string, chunkSize = 1_000_000) =
  42. var file = open(filename)
  43. var responses = newSeq[FlowVar[Stats]]()
  44. var buffer = newString(chunksize)
  45. var oldBufferLen = 0
  46. while not endOfFile(file):
  47. let reqSize = chunksize - oldBufferLen
  48. let readSize = file.readChars(buffer, oldBufferLen, reqSize) + oldBufferLen
  49. var chunkLen = readSize
  50. while chunkLen >= 0 and buffer[chunkLen - 1] notin NewLines:
  51. chunkLen.dec
  52. responses.add(spawn parseChunk(buffer[0 ..< chunkLen]))
  53. oldBufferLen = readSize - chunkLen
  54. buffer[0 ..< oldBufferLen] = buffer[readSize - oldBufferLen .. ^1]
  55. var mostPopular = newStats()
  56. for resp in responses:
  57. let statistic = ^resp
  58. if statistic.countViews > mostPopular.countViews:
  59. mostPopular = statistic
  60. echo("Most popular is: ", mostPopular)
  61. when true:
  62. const file = "pagecounts-20160101-050000"
  63. let filename = getCurrentDir() / file
  64. readPageCounts(filename)