stats.nim 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2015 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Statistical analysis framework for performing
  10. ## basic statistical analysis of data.
  11. ## The data is analysed in a single pass, when it
  12. ## is pushed to a `RunningStat` or `RunningRegress` object.
  13. ##
  14. ## `RunningStat` calculates for a single data set
  15. ## - n (data count)
  16. ## - min (smallest value)
  17. ## - max (largest value)
  18. ## - sum
  19. ## - mean
  20. ## - variance
  21. ## - varianceS (sample variance)
  22. ## - standardDeviation
  23. ## - standardDeviationS (sample standard deviation)
  24. ## - skewness (the third statistical moment)
  25. ## - kurtosis (the fourth statistical moment)
  26. ##
  27. ## `RunningRegress` calculates for two sets of data
  28. ## - n (data count)
  29. ## - slope
  30. ## - intercept
  31. ## - correlation
  32. ##
  33. ## Procs are provided to calculate statistics on `openArray`s.
  34. ##
  35. ## However, if more than a single statistical calculation is required, it is more
  36. ## efficient to push the data once to a `RunningStat` object and then
  37. ## call the numerous statistical procs for the `RunningStat` object:
  38. runnableExamples:
  39. from std/math import almostEqual
  40. template `~=`(a, b: float): bool = almostEqual(a, b)
  41. var statistics: RunningStat # must be var
  42. statistics.push(@[1.0, 2.0, 1.0, 4.0, 1.0, 4.0, 1.0, 2.0])
  43. doAssert statistics.n == 8
  44. doAssert statistics.mean() ~= 2.0
  45. doAssert statistics.variance() ~= 1.5
  46. doAssert statistics.varianceS() ~= 1.714285714285715
  47. doAssert statistics.skewness() ~= 0.8164965809277261
  48. doAssert statistics.skewnessS() ~= 1.018350154434631
  49. doAssert statistics.kurtosis() ~= -1.0
  50. doAssert statistics.kurtosisS() ~= -0.7000000000000008
  51. from std/math import FloatClass, sqrt, pow, round
  52. when defined(nimPreviewSlimSystem):
  53. import std/[assertions, formatfloat]
  54. {.push debugger: off.} # the user does not want to trace a part
  55. # of the standard library!
  56. {.push checks: off, line_dir: off, stack_trace: off.}
  57. type
  58. RunningStat* = object ## An accumulator for statistical data.
  59. n*: int ## amount of pushed data
  60. min*, max*, sum*: float ## self-explaining
  61. mom1, mom2, mom3, mom4: float ## statistical moments, mom1 is mean
  62. RunningRegress* = object ## An accumulator for regression calculations.
  63. n*: int ## amount of pushed data
  64. x_stats*: RunningStat ## stats for the first set of data
  65. y_stats*: RunningStat ## stats for the second set of data
  66. s_xy: float ## accumulated data for combined xy
  67. # ----------- RunningStat --------------------------
  68. proc clear*(s: var RunningStat) =
  69. ## Resets `s`.
  70. s.n = 0
  71. s.min = 0.0
  72. s.max = 0.0
  73. s.sum = 0.0
  74. s.mom1 = 0.0
  75. s.mom2 = 0.0
  76. s.mom3 = 0.0
  77. s.mom4 = 0.0
  78. proc push*(s: var RunningStat, x: float) =
  79. ## Pushes a value `x` for processing.
  80. if s.n == 0:
  81. s.min = x
  82. s.max = x
  83. else:
  84. if s.min > x: s.min = x
  85. if s.max < x: s.max = x
  86. inc(s.n)
  87. # See Knuth TAOCP vol 2, 3rd edition, page 232
  88. s.sum += x
  89. let n = toFloat(s.n)
  90. let delta = x - s.mom1
  91. let delta_n = delta / toFloat(s.n)
  92. let delta_n2 = delta_n * delta_n
  93. let term1 = delta * delta_n * toFloat(s.n - 1)
  94. s.mom4 += term1 * delta_n2 * (n*n - 3*n + 3) +
  95. 6*delta_n2*s.mom2 - 4*delta_n*s.mom3
  96. s.mom3 += term1 * delta_n * (n - 2) - 3*delta_n*s.mom2
  97. s.mom2 += term1
  98. s.mom1 += delta_n
  99. proc push*(s: var RunningStat, x: int) =
  100. ## Pushes a value `x` for processing.
  101. ##
  102. ## `x` is simply converted to `float`
  103. ## and the other push operation is called.
  104. s.push(toFloat(x))
  105. proc push*(s: var RunningStat, x: openArray[float|int]) =
  106. ## Pushes all values of `x` for processing.
  107. ##
  108. ## Int values of `x` are simply converted to `float` and
  109. ## the other push operation is called.
  110. for val in x:
  111. s.push(val)
  112. proc mean*(s: RunningStat): float =
  113. ## Computes the current mean of `s`.
  114. result = s.mom1
  115. proc variance*(s: RunningStat): float =
  116. ## Computes the current population variance of `s`.
  117. result = s.mom2 / toFloat(s.n)
  118. proc varianceS*(s: RunningStat): float =
  119. ## Computes the current sample variance of `s`.
  120. if s.n > 1: result = s.mom2 / toFloat(s.n - 1)
  121. proc standardDeviation*(s: RunningStat): float =
  122. ## Computes the current population standard deviation of `s`.
  123. result = sqrt(variance(s))
  124. proc standardDeviationS*(s: RunningStat): float =
  125. ## Computes the current sample standard deviation of `s`.
  126. result = sqrt(varianceS(s))
  127. proc skewness*(s: RunningStat): float =
  128. ## Computes the current population skewness of `s`.
  129. result = sqrt(toFloat(s.n)) * s.mom3 / pow(s.mom2, 1.5)
  130. proc skewnessS*(s: RunningStat): float =
  131. ## Computes the current sample skewness of `s`.
  132. let s2 = skewness(s)
  133. result = sqrt(toFloat(s.n*(s.n-1)))*s2 / toFloat(s.n-2)
  134. proc kurtosis*(s: RunningStat): float =
  135. ## Computes the current population kurtosis of `s`.
  136. result = toFloat(s.n) * s.mom4 / (s.mom2 * s.mom2) - 3.0
  137. proc kurtosisS*(s: RunningStat): float =
  138. ## Computes the current sample kurtosis of `s`.
  139. result = toFloat(s.n-1) / toFloat((s.n-2)*(s.n-3)) *
  140. (toFloat(s.n+1)*kurtosis(s) + 6)
  141. proc `+`*(a, b: RunningStat): RunningStat =
  142. ## Combines two `RunningStat`s.
  143. ##
  144. ## Useful when performing parallel analysis of data series
  145. ## and needing to re-combine parallel result sets.
  146. result.clear()
  147. result.n = a.n + b.n
  148. let delta = b.mom1 - a.mom1
  149. let delta2 = delta*delta
  150. let delta3 = delta*delta2
  151. let delta4 = delta2*delta2
  152. let n = toFloat(result.n)
  153. result.mom1 = (a.n.float*a.mom1 + b.n.float*b.mom1) / n
  154. result.mom2 = a.mom2 + b.mom2 + delta2 * a.n.float * b.n.float / n
  155. result.mom3 = a.mom3 + b.mom3 +
  156. delta3 * a.n.float * b.n.float * (a.n.float - b.n.float)/(n*n);
  157. result.mom3 += 3.0*delta * (a.n.float*b.mom2 - b.n.float*a.mom2) / n
  158. result.mom4 = a.mom4 + b.mom4 +
  159. delta4*a.n.float*b.n.float * toFloat(a.n*a.n - a.n*b.n + b.n*b.n) /
  160. (n*n*n)
  161. result.mom4 += 6.0*delta2 * (a.n.float*a.n.float*b.mom2 + b.n.float*b.n.float*a.mom2) /
  162. (n*n) +
  163. 4.0*delta*(a.n.float*b.mom3 - b.n.float*a.mom3) / n
  164. result.max = max(a.max, b.max)
  165. result.min = min(a.min, b.min)
  166. proc `+=`*(a: var RunningStat, b: RunningStat) {.inline.} =
  167. ## Adds the `RunningStat` `b` to `a`.
  168. a = a + b
  169. proc `$`*(a: RunningStat): string =
  170. ## Produces a string representation of the `RunningStat`. The exact
  171. ## format is currently unspecified and subject to change. Currently
  172. ## it contains:
  173. ##
  174. ## - the number of probes
  175. ## - min, max values
  176. ## - sum, mean and standard deviation.
  177. result = "RunningStat(\n"
  178. result.add " number of probes: " & $a.n & "\n"
  179. result.add " max: " & $a.max & "\n"
  180. result.add " min: " & $a.min & "\n"
  181. result.add " sum: " & $a.sum & "\n"
  182. result.add " mean: " & $a.mean & "\n"
  183. result.add " std deviation: " & $a.standardDeviation & "\n"
  184. result.add ")"
  185. # ---------------------- standalone array/seq stats ---------------------
  186. proc mean*[T](x: openArray[T]): float =
  187. ## Computes the mean of `x`.
  188. var rs: RunningStat
  189. rs.push(x)
  190. result = rs.mean()
  191. proc variance*[T](x: openArray[T]): float =
  192. ## Computes the population variance of `x`.
  193. var rs: RunningStat
  194. rs.push(x)
  195. result = rs.variance()
  196. proc varianceS*[T](x: openArray[T]): float =
  197. ## Computes the sample variance of `x`.
  198. var rs: RunningStat
  199. rs.push(x)
  200. result = rs.varianceS()
  201. proc standardDeviation*[T](x: openArray[T]): float =
  202. ## Computes the population standard deviation of `x`.
  203. var rs: RunningStat
  204. rs.push(x)
  205. result = rs.standardDeviation()
  206. proc standardDeviationS*[T](x: openArray[T]): float =
  207. ## Computes the sample standard deviation of `x`.
  208. var rs: RunningStat
  209. rs.push(x)
  210. result = rs.standardDeviationS()
  211. proc skewness*[T](x: openArray[T]): float =
  212. ## Computes the population skewness of `x`.
  213. var rs: RunningStat
  214. rs.push(x)
  215. result = rs.skewness()
  216. proc skewnessS*[T](x: openArray[T]): float =
  217. ## Computes the sample skewness of `x`.
  218. var rs: RunningStat
  219. rs.push(x)
  220. result = rs.skewnessS()
  221. proc kurtosis*[T](x: openArray[T]): float =
  222. ## Computes the population kurtosis of `x`.
  223. var rs: RunningStat
  224. rs.push(x)
  225. result = rs.kurtosis()
  226. proc kurtosisS*[T](x: openArray[T]): float =
  227. ## Computes the sample kurtosis of `x`.
  228. var rs: RunningStat
  229. rs.push(x)
  230. result = rs.kurtosisS()
  231. # ---------------------- Running Regression -----------------------------
  232. proc clear*(r: var RunningRegress) =
  233. ## Resets `r`.
  234. r.x_stats.clear()
  235. r.y_stats.clear()
  236. r.s_xy = 0.0
  237. r.n = 0
  238. proc push*(r: var RunningRegress, x, y: float) =
  239. ## Pushes two values `x` and `y` for processing.
  240. r.s_xy += (r.x_stats.mean() - x)*(r.y_stats.mean() - y) *
  241. toFloat(r.n) / toFloat(r.n + 1)
  242. r.x_stats.push(x)
  243. r.y_stats.push(y)
  244. inc(r.n)
  245. proc push*(r: var RunningRegress, x, y: int) {.inline.} =
  246. ## Pushes two values `x` and `y` for processing.
  247. ##
  248. ## `x` and `y` are converted to `float`
  249. ## and the other push operation is called.
  250. r.push(toFloat(x), toFloat(y))
  251. proc push*(r: var RunningRegress, x, y: openArray[float|int]) =
  252. ## Pushes two sets of values `x` and `y` for processing.
  253. assert(x.len == y.len)
  254. for i in 0..<x.len:
  255. r.push(x[i], y[i])
  256. proc slope*(r: RunningRegress): float =
  257. ## Computes the current slope of `r`.
  258. let s_xx = r.x_stats.varianceS()*toFloat(r.n - 1)
  259. result = r.s_xy / s_xx
  260. proc intercept*(r: RunningRegress): float =
  261. ## Computes the current intercept of `r`.
  262. result = r.y_stats.mean() - r.slope()*r.x_stats.mean()
  263. proc correlation*(r: RunningRegress): float =
  264. ## Computes the current correlation of the two data
  265. ## sets pushed into `r`.
  266. let t = r.x_stats.standardDeviation() * r.y_stats.standardDeviation()
  267. result = r.s_xy / (toFloat(r.n) * t)
  268. proc `+`*(a, b: RunningRegress): RunningRegress =
  269. ## Combines two `RunningRegress` objects.
  270. ##
  271. ## Useful when performing parallel analysis of data series
  272. ## and needing to re-combine parallel result sets
  273. result.clear()
  274. result.x_stats = a.x_stats + b.x_stats
  275. result.y_stats = a.y_stats + b.y_stats
  276. result.n = a.n + b.n
  277. let delta_x = b.x_stats.mean() - a.x_stats.mean()
  278. let delta_y = b.y_stats.mean() - a.y_stats.mean()
  279. result.s_xy = a.s_xy + b.s_xy +
  280. toFloat(a.n*b.n)*delta_x*delta_y/toFloat(result.n)
  281. proc `+=`*(a: var RunningRegress, b: RunningRegress) =
  282. ## Adds the `RunningRegress` `b` to `a`.
  283. a = a + b
  284. {.pop.}
  285. {.pop.}