concurrency_regex.nim 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. discard """
  2. action: compile
  3. """
  4. # See this page for info about the format https://wikitech.wikimedia.org/wiki/Analytics/Data/Pagecounts-all-sites
  5. import tables, parseutils, strutils, threadpool, re
# Name of the page-counts dump file; it is the argument handed to
# `readChunks` by the entry statement at the end of this file.
const filename = "pagecounts-20160101-050000"
  7. type
  8. Stats = ref object
  9. projectName, pageTitle: string
  10. requests, contentSize: int
  11. proc `$`(stats: Stats): string =
  12. "(projectName: $#, pageTitle: $#, requests: $#, contentSize: $#)" % [
  13. stats.projectName, stats.pageTitle, $stats.requests, $stats.contentSize
  14. ]
  15. proc parse(chunk: string): Stats =
  16. # Each line looks like: en Main_Page 242332 4737756101
  17. result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  18. var matches: array[4, string]
  19. var reg = re"([^\s]+)\s([^\s]+)\s(\d+)\s(\d+)"
  20. for line in chunk.splitLines:
  21. let start = find(line, reg, matches)
  22. if start == -1: continue
  23. let requestsInt = matches[2].parseInt
  24. if requestsInt > result.requests and matches[0] == "en":
  25. result = Stats(
  26. projectName: matches[0],
  27. pageTitle: matches[1],
  28. requests: requestsInt,
  29. contentSize: matches[3].parseInt
  30. )
  31. proc readChunks(filename: string, chunksize = 1000000): Stats =
  32. result = Stats(projectName: "", pageTitle: "", requests: 0, contentSize: 0)
  33. var file = open(filename)
  34. var responses = newSeq[FlowVar[Stats]]()
  35. var buffer = newString(chunksize)
  36. var oldBufferLen = 0
  37. while not endOfFile(file):
  38. let readSize = file.readChars(buffer, oldBufferLen, chunksize - oldBufferLen) + oldBufferLen
  39. var chunkLen = readSize
  40. while chunkLen >= 0 and buffer[chunkLen - 1] notin NewLines:
  41. # Find where the last line ends
  42. chunkLen.dec
  43. responses.add(spawn parse(buffer[0 ..< chunkLen]))
  44. oldBufferLen = readSize - chunkLen
  45. buffer[0 ..< oldBufferLen] = buffer[readSize - oldBufferLen .. ^1]
  46. echo("Spawns: ", responses.len)
  47. for resp in responses:
  48. let statistic = ^resp
  49. if statistic.requests > result.requests:
  50. result = statistic
  51. file.close()
  52. when true:
  53. echo readChunks(filename)