stats.nim 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2015 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Statistical analysis framework for performing
  10. ## basic statistical analysis of data.
  11. ## The data is analysed in a single pass, when it
  12. ## is pushed to a `RunningStat` or `RunningRegress` object.
  13. ##
  14. ## `RunningStat` calculates for a single data set
  15. ## - n (data count)
  16. ## - min (smallest value)
  17. ## - max (largest value)
  18. ## - sum
  19. ## - mean
  20. ## - variance
  21. ## - varianceS (sample variance)
  22. ## - standardDeviation
  23. ## - standardDeviationS (sample standard deviation)
  24. ## - skewness (the third statistical moment)
  25. ## - kurtosis (the fourth statistical moment)
  26. ##
  27. ## `RunningRegress` calculates for two sets of data
  28. ## - n (data count)
  29. ## - slope
  30. ## - intercept
  31. ## - correlation
  32. ##
  33. ## Procs are provided to calculate statistics on `openArray`s.
  34. ##
  35. ## However, if more than a single statistical calculation is required, it is more
  36. ## efficient to push the data once to a `RunningStat` object and then
  37. ## call the numerous statistical procs for the `RunningStat` object:
  38. runnableExamples:
  39. from std/math import almostEqual
  40. template `~=`(a, b: float): bool = almostEqual(a, b)
  41. var statistics: RunningStat # must be var
  42. statistics.push(@[1.0, 2.0, 1.0, 4.0, 1.0, 4.0, 1.0, 2.0])
  43. doAssert statistics.n == 8
  44. doAssert statistics.mean() ~= 2.0
  45. doAssert statistics.variance() ~= 1.5
  46. doAssert statistics.varianceS() ~= 1.714285714285715
  47. doAssert statistics.skewness() ~= 0.8164965809277261
  48. doAssert statistics.skewnessS() ~= 1.018350154434631
  49. doAssert statistics.kurtosis() ~= -1.0
  50. doAssert statistics.kurtosisS() ~= -0.7000000000000008
  51. from std/math import FloatClass, sqrt, pow, round
  52. when defined(nimPreviewSlimSystem):
  53. import std/[assertions, formatfloat]
  54. {.push debugger: off.} # the user does not want to trace a part
  55. # of the standard library!
  56. {.push checks: off, line_dir: off, stack_trace: off.}
  57. type
  58. RunningStat* = object ## An accumulator for statistical data.
  59. n*: int ## amount of pushed data
  60. min*, max*, sum*: float ## self-explaining
  61. mom1, mom2, mom3, mom4: float ## statistical moments, mom1 is mean
  62. RunningRegress* = object ## An accumulator for regression calculations.
  63. n*: int ## amount of pushed data
  64. x_stats*: RunningStat ## stats for the first set of data
  65. y_stats*: RunningStat ## stats for the second set of data
  66. s_xy: float ## accumulated data for combined xy
  67. # ----------- RunningStat --------------------------
  68. proc clear*(s: var RunningStat) =
  69. ## Resets `s`.
  70. s.n = 0
  71. s.min = 0.0
  72. s.max = 0.0
  73. s.sum = 0.0
  74. s.mom1 = 0.0
  75. s.mom2 = 0.0
  76. s.mom3 = 0.0
  77. s.mom4 = 0.0
  78. proc push*(s: var RunningStat, x: float) =
  79. ## Pushes a value `x` for processing.
  80. if s.n == 0:
  81. s.min = x
  82. s.max = x
  83. else:
  84. if s.min > x: s.min = x
  85. if s.max < x: s.max = x
  86. inc(s.n)
  87. # See Knuth TAOCP vol 2, 3rd edition, page 232
  88. s.sum += x
  89. let n = toFloat(s.n)
  90. let delta = x - s.mom1
  91. let delta_n = delta / toFloat(s.n)
  92. let delta_n2 = delta_n * delta_n
  93. let term1 = delta * delta_n * toFloat(s.n - 1)
  94. s.mom4 += term1 * delta_n2 * (n*n - 3*n + 3) +
  95. 6*delta_n2*s.mom2 - 4*delta_n*s.mom3
  96. s.mom3 += term1 * delta_n * (n - 2) - 3*delta_n*s.mom2
  97. s.mom2 += term1
  98. s.mom1 += delta_n
  99. proc push*(s: var RunningStat, x: int) =
  100. ## Pushes a value `x` for processing.
  101. ##
  102. ## `x` is simply converted to `float`
  103. ## and the other push operation is called.
  104. s.push(toFloat(x))
  105. proc push*(s: var RunningStat, x: openArray[float|int]) =
  106. ## Pushes all values of `x` for processing.
  107. ##
  108. ## Int values of `x` are simply converted to `float` and
  109. ## the other push operation is called.
  110. for val in x:
  111. s.push(val)
  112. proc mean*(s: RunningStat): float =
  113. ## Computes the current mean of `s`.
  114. result = s.mom1
  115. proc variance*(s: RunningStat): float =
  116. ## Computes the current population variance of `s`.
  117. result = s.mom2 / toFloat(s.n)
  118. proc varianceS*(s: RunningStat): float =
  119. ## Computes the current sample variance of `s`.
  120. if s.n > 1: result = s.mom2 / toFloat(s.n - 1)
  121. else: result = 0.0
  122. proc standardDeviation*(s: RunningStat): float =
  123. ## Computes the current population standard deviation of `s`.
  124. result = sqrt(variance(s))
  125. proc standardDeviationS*(s: RunningStat): float =
  126. ## Computes the current sample standard deviation of `s`.
  127. result = sqrt(varianceS(s))
  128. proc skewness*(s: RunningStat): float =
  129. ## Computes the current population skewness of `s`.
  130. result = sqrt(toFloat(s.n)) * s.mom3 / pow(s.mom2, 1.5)
  131. proc skewnessS*(s: RunningStat): float =
  132. ## Computes the current sample skewness of `s`.
  133. let s2 = skewness(s)
  134. result = sqrt(toFloat(s.n*(s.n-1)))*s2 / toFloat(s.n-2)
  135. proc kurtosis*(s: RunningStat): float =
  136. ## Computes the current population kurtosis of `s`.
  137. result = toFloat(s.n) * s.mom4 / (s.mom2 * s.mom2) - 3.0
  138. proc kurtosisS*(s: RunningStat): float =
  139. ## Computes the current sample kurtosis of `s`.
  140. result = toFloat(s.n-1) / toFloat((s.n-2)*(s.n-3)) *
  141. (toFloat(s.n+1)*kurtosis(s) + 6)
  142. proc `+`*(a, b: RunningStat): RunningStat =
  143. ## Combines two `RunningStat`s.
  144. ##
  145. ## Useful when performing parallel analysis of data series
  146. ## and needing to re-combine parallel result sets.
  147. result = default(RunningStat)
  148. result.clear()
  149. result.n = a.n + b.n
  150. let delta = b.mom1 - a.mom1
  151. let delta2 = delta*delta
  152. let delta3 = delta*delta2
  153. let delta4 = delta2*delta2
  154. let n = toFloat(result.n)
  155. result.mom1 = (a.n.float*a.mom1 + b.n.float*b.mom1) / n
  156. result.mom2 = a.mom2 + b.mom2 + delta2 * a.n.float * b.n.float / n
  157. result.mom3 = a.mom3 + b.mom3 +
  158. delta3 * a.n.float * b.n.float * (a.n.float - b.n.float)/(n*n);
  159. result.mom3 += 3.0*delta * (a.n.float*b.mom2 - b.n.float*a.mom2) / n
  160. result.mom4 = a.mom4 + b.mom4 +
  161. delta4*a.n.float*b.n.float * toFloat(a.n*a.n - a.n*b.n + b.n*b.n) /
  162. (n*n*n)
  163. result.mom4 += 6.0*delta2 * (a.n.float*a.n.float*b.mom2 + b.n.float*b.n.float*a.mom2) /
  164. (n*n) +
  165. 4.0*delta*(a.n.float*b.mom3 - b.n.float*a.mom3) / n
  166. result.max = max(a.max, b.max)
  167. result.min = min(a.min, b.min)
  168. proc `+=`*(a: var RunningStat, b: RunningStat) {.inline.} =
  169. ## Adds the `RunningStat` `b` to `a`.
  170. a = a + b
  171. proc `$`*(a: RunningStat): string =
  172. ## Produces a string representation of the `RunningStat`. The exact
  173. ## format is currently unspecified and subject to change. Currently
  174. ## it contains:
  175. ##
  176. ## - the number of probes
  177. ## - min, max values
  178. ## - sum, mean and standard deviation.
  179. result = "RunningStat(\n"
  180. result.add " number of probes: " & $a.n & "\n"
  181. result.add " max: " & $a.max & "\n"
  182. result.add " min: " & $a.min & "\n"
  183. result.add " sum: " & $a.sum & "\n"
  184. result.add " mean: " & $a.mean & "\n"
  185. result.add " std deviation: " & $a.standardDeviation & "\n"
  186. result.add ")"
  187. # ---------------------- standalone array/seq stats ---------------------
  188. proc mean*[T](x: openArray[T]): float =
  189. ## Computes the mean of `x`.
  190. var rs: RunningStat
  191. rs.push(x)
  192. result = rs.mean()
  193. proc variance*[T](x: openArray[T]): float =
  194. ## Computes the population variance of `x`.
  195. var rs: RunningStat
  196. rs.push(x)
  197. result = rs.variance()
  198. proc varianceS*[T](x: openArray[T]): float =
  199. ## Computes the sample variance of `x`.
  200. var rs: RunningStat
  201. rs.push(x)
  202. result = rs.varianceS()
  203. proc standardDeviation*[T](x: openArray[T]): float =
  204. ## Computes the population standard deviation of `x`.
  205. var rs: RunningStat
  206. rs.push(x)
  207. result = rs.standardDeviation()
  208. proc standardDeviationS*[T](x: openArray[T]): float =
  209. ## Computes the sample standard deviation of `x`.
  210. var rs: RunningStat
  211. rs.push(x)
  212. result = rs.standardDeviationS()
  213. proc skewness*[T](x: openArray[T]): float =
  214. ## Computes the population skewness of `x`.
  215. var rs: RunningStat
  216. rs.push(x)
  217. result = rs.skewness()
  218. proc skewnessS*[T](x: openArray[T]): float =
  219. ## Computes the sample skewness of `x`.
  220. var rs: RunningStat
  221. rs.push(x)
  222. result = rs.skewnessS()
  223. proc kurtosis*[T](x: openArray[T]): float =
  224. ## Computes the population kurtosis of `x`.
  225. var rs: RunningStat
  226. rs.push(x)
  227. result = rs.kurtosis()
  228. proc kurtosisS*[T](x: openArray[T]): float =
  229. ## Computes the sample kurtosis of `x`.
  230. var rs: RunningStat
  231. rs.push(x)
  232. result = rs.kurtosisS()
  233. # ---------------------- Running Regression -----------------------------
  234. proc clear*(r: var RunningRegress) =
  235. ## Resets `r`.
  236. r.x_stats.clear()
  237. r.y_stats.clear()
  238. r.s_xy = 0.0
  239. r.n = 0
  240. proc push*(r: var RunningRegress, x, y: float) =
  241. ## Pushes two values `x` and `y` for processing.
  242. r.s_xy += (r.x_stats.mean() - x)*(r.y_stats.mean() - y) *
  243. toFloat(r.n) / toFloat(r.n + 1)
  244. r.x_stats.push(x)
  245. r.y_stats.push(y)
  246. inc(r.n)
  247. proc push*(r: var RunningRegress, x, y: int) {.inline.} =
  248. ## Pushes two values `x` and `y` for processing.
  249. ##
  250. ## `x` and `y` are converted to `float`
  251. ## and the other push operation is called.
  252. r.push(toFloat(x), toFloat(y))
  253. proc push*(r: var RunningRegress, x, y: openArray[float|int]) =
  254. ## Pushes two sets of values `x` and `y` for processing.
  255. assert(x.len == y.len)
  256. for i in 0..<x.len:
  257. r.push(x[i], y[i])
  258. proc slope*(r: RunningRegress): float =
  259. ## Computes the current slope of `r`.
  260. let s_xx = r.x_stats.varianceS()*toFloat(r.n - 1)
  261. result = r.s_xy / s_xx
  262. proc intercept*(r: RunningRegress): float =
  263. ## Computes the current intercept of `r`.
  264. result = r.y_stats.mean() - r.slope()*r.x_stats.mean()
  265. proc correlation*(r: RunningRegress): float =
  266. ## Computes the current correlation of the two data
  267. ## sets pushed into `r`.
  268. let t = r.x_stats.standardDeviation() * r.y_stats.standardDeviation()
  269. result = r.s_xy / (toFloat(r.n) * t)
  270. proc `+`*(a, b: RunningRegress): RunningRegress =
  271. ## Combines two `RunningRegress` objects.
  272. ##
  273. ## Useful when performing parallel analysis of data series
  274. ## and needing to re-combine parallel result sets
  275. result = default(RunningRegress)
  276. result.clear()
  277. result.x_stats = a.x_stats + b.x_stats
  278. result.y_stats = a.y_stats + b.y_stats
  279. result.n = a.n + b.n
  280. let delta_x = b.x_stats.mean() - a.x_stats.mean()
  281. let delta_y = b.y_stats.mean() - a.y_stats.mean()
  282. result.s_xy = a.s_xy + b.s_xy +
  283. toFloat(a.n*b.n)*delta_x*delta_y/toFloat(result.n)
  284. proc `+=`*(a: var RunningRegress, b: RunningRegress) =
  285. ## Adds the `RunningRegress` `b` to `a`.
  286. a = a + b
  287. {.pop.}
  288. {.pop.}