stats.nim 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2015 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## Statistical analysis framework for performing
  10. ## basic statistical analysis of data.
  11. ## The data is analysed in a single pass, when a data value
  12. ## is pushed to the ``RunningStat`` or ``RunningRegress`` objects
  13. ##
  14. ## ``RunningStat`` calculates for a single data set
  15. ## - n (data count)
  16. ## - min (smallest value)
  17. ## - max (largest value)
  18. ## - sum
  19. ## - mean
  20. ## - variance
  21. ## - varianceS (sample var)
  22. ## - standardDeviation
  23. ## - standardDeviationS (sample stddev)
  24. ## - skewness (the third statistical moment)
  25. ## - kurtosis (the fourth statistical moment)
  26. ##
  27. ## ``RunningRegress`` calculates for two sets of data
  28. ## - n
  29. ## - slope
  30. ## - intercept
  31. ## - correlation
  32. ##
  33. ## Procs have been provided to calculate statistics on arrays and sequences.
  34. ##
  35. ## However, if more than a single statistical calculation is required, it is more
  36. ## efficient to push the data once to the RunningStat object, and
  37. ## call the numerous statistical procs for the RunningStat object.
  38. ##
  39. ## .. code-block:: Nim
  40. ##
  41. ## var rs: RunningStat
  42. ## rs.push(MySeqOfData)
  43. ## rs.mean()
  44. ## rs.variance()
  45. ## rs.skewness()
  46. ## rs.kurtosis()
  47. from math import FloatClass, sqrt, pow, round
  48. {.push debugger:off .} # the user does not want to trace a part
  49. # of the standard library!
  50. {.push checks:off, line_dir:off, stack_trace:off.}
  51. type
  52. RunningStat* = object ## an accumulator for statistical data
  53. n*: int ## number of pushed data
  54. min*, max*, sum*: float ## self-explaining
  55. mom1, mom2, mom3, mom4: float ## statistical moments, mom1 is mean
  56. RunningRegress* = object ## an accumulator for regression calculations
  57. n*: int ## number of pushed data
  58. x_stats*: RunningStat ## stats for first set of data
  59. y_stats*: RunningStat ## stats for second set of data
  60. s_xy: float ## accumulated data for combined xy
  61. # ----------- RunningStat --------------------------
  62. proc clear*(s: var RunningStat) =
  63. ## reset `s`
  64. s.n = 0
  65. s.min = toBiggestFloat(int.high)
  66. s.max = 0.0
  67. s.sum = 0.0
  68. s.mom1 = 0.0
  69. s.mom2 = 0.0
  70. s.mom3 = 0.0
  71. s.mom4 = 0.0
  72. proc push*(s: var RunningStat, x: float) =
  73. ## pushes a value `x` for processing
  74. if s.n == 0: s.min = x
  75. inc(s.n)
  76. # See Knuth TAOCP vol 2, 3rd edition, page 232
  77. if s.min > x: s.min = x
  78. if s.max < x: s.max = x
  79. s.sum += x
  80. let n = toFloat(s.n)
  81. let delta = x - s.mom1
  82. let delta_n = delta / toFloat(s.n)
  83. let delta_n2 = delta_n * delta_n
  84. let term1 = delta * delta_n * toFloat(s.n - 1)
  85. s.mom4 += term1 * delta_n2 * (n*n - 3*n + 3) +
  86. 6*delta_n2*s.mom2 - 4*delta_n*s.mom3
  87. s.mom3 += term1 * delta_n * (n - 2) - 3*delta_n*s.mom2
  88. s.mom2 += term1
  89. s.mom1 += delta_n
  90. proc push*(s: var RunningStat, x: int) =
  91. ## pushes a value `x` for processing.
  92. ##
  93. ## `x` is simply converted to ``float``
  94. ## and the other push operation is called.
  95. s.push(toFloat(x))
  96. proc push*(s: var RunningStat, x: openarray[float|int]) =
  97. ## pushes all values of `x` for processing.
  98. ##
  99. ## Int values of `x` are simply converted to ``float`` and
  100. ## the other push operation is called.
  101. for val in x:
  102. s.push(val)
  103. proc mean*(s: RunningStat): float =
  104. ## computes the current mean of `s`
  105. result = s.mom1
  106. proc variance*(s: RunningStat): float =
  107. ## computes the current population variance of `s`
  108. result = s.mom2 / toFloat(s.n)
  109. proc varianceS*(s: RunningStat): float =
  110. ## computes the current sample variance of `s`
  111. if s.n > 1: result = s.mom2 / toFloat(s.n - 1)
  112. proc standardDeviation*(s: RunningStat): float =
  113. ## computes the current population standard deviation of `s`
  114. result = sqrt(variance(s))
  115. proc standardDeviationS*(s: RunningStat): float =
  116. ## computes the current sample standard deviation of `s`
  117. result = sqrt(varianceS(s))
  118. proc skewness*(s: RunningStat): float =
  119. ## computes the current population skewness of `s`
  120. result = sqrt(toFloat(s.n)) * s.mom3 / pow(s.mom2, 1.5)
  121. proc skewnessS*(s: RunningStat): float =
  122. ## computes the current sample skewness of `s`
  123. let s2 = skewness(s)
  124. result = sqrt(toFloat(s.n*(s.n-1)))*s2 / toFloat(s.n-2)
  125. proc kurtosis*(s: RunningStat): float =
  126. ## computes the current population kurtosis of `s`
  127. result = toFloat(s.n) * s.mom4 / (s.mom2 * s.mom2) - 3.0
  128. proc kurtosisS*(s: RunningStat): float =
  129. ## computes the current sample kurtosis of `s`
  130. result = toFloat(s.n-1) / toFloat((s.n-2)*(s.n-3)) *
  131. (toFloat(s.n+1)*kurtosis(s) + 6)
  132. proc `+`*(a, b: RunningStat): RunningStat =
  133. ## combine two RunningStats.
  134. ##
  135. ## Useful if performing parallel analysis of data series
  136. ## and need to re-combine parallel result sets
  137. result.clear()
  138. result.n = a.n + b.n
  139. let delta = b.mom1 - a.mom1
  140. let delta2 = delta*delta
  141. let delta3 = delta*delta2
  142. let delta4 = delta2*delta2
  143. let n = toFloat(result.n)
  144. result.mom1 = (a.n.float*a.mom1 + b.n.float*b.mom1) / n
  145. result.mom2 = a.mom2 + b.mom2 + delta2 * a.n.float * b.n.float / n
  146. result.mom3 = a.mom3 + b.mom3 +
  147. delta3 * a.n.float * b.n.float * (a.n.float - b.n.float)/(n*n);
  148. result.mom3 += 3.0*delta * (a.n.float*b.mom2 - b.n.float*a.mom2) / n
  149. result.mom4 = a.mom4 + b.mom4 +
  150. delta4*a.n.float*b.n.float * toFloat(a.n*a.n - a.n*b.n + b.n*b.n) /
  151. (n*n*n)
  152. result.mom4 += 6.0*delta2 * (a.n.float*a.n.float*b.mom2 + b.n.float*b.n.float*a.mom2) /
  153. (n*n) +
  154. 4.0*delta*(a.n.float*b.mom3 - b.n.float*a.mom3) / n
  155. result.max = max(a.max, b.max)
  156. result.min = max(a.min, b.min)
  157. proc `+=`*(a: var RunningStat, b: RunningStat) {.inline.} =
  158. ## add a second RunningStats `b` to `a`
  159. a = a + b
  160. proc `$`*(a: RunningStat): string =
  161. ## produces a string representation of the ``RunningStat``. The exact
  162. ## format is currently unspecified and subject to change. Currently
  163. ## it contains:
  164. ##
  165. ## - the number of probes
  166. ## - min, max values
  167. ## - sum, mean and standard deviation.
  168. result = "RunningStat(\n"
  169. result.add " number of probes: " & $a.n & "\n"
  170. result.add " max: " & $a.max & "\n"
  171. result.add " min: " & $a.min & "\n"
  172. result.add " sum: " & $a.sum & "\n"
  173. result.add " mean: " & $a.mean & "\n"
  174. result.add " std deviation: " & $a.standardDeviation & "\n"
  175. result.add ")"
  176. # ---------------------- standalone array/seq stats ---------------------
  177. proc mean*[T](x: openArray[T]): float =
  178. ## computes the mean of `x`
  179. var rs: RunningStat
  180. rs.push(x)
  181. result = rs.mean()
  182. proc variance*[T](x: openArray[T]): float =
  183. ## computes the population variance of `x`
  184. var rs: RunningStat
  185. rs.push(x)
  186. result = rs.variance()
  187. proc varianceS*[T](x: openArray[T]): float =
  188. ## computes the sample variance of `x`
  189. var rs: RunningStat
  190. rs.push(x)
  191. result = rs.varianceS()
  192. proc standardDeviation*[T](x: openArray[T]): float =
  193. ## computes the population standardDeviation of `x`
  194. var rs: RunningStat
  195. rs.push(x)
  196. result = rs.standardDeviation()
  197. proc standardDeviationS*[T](x: openArray[T]): float =
  198. ## computes the sanple standardDeviation of `x`
  199. var rs: RunningStat
  200. rs.push(x)
  201. result = rs.standardDeviationS()
  202. proc skewness*[T](x: openArray[T]): float =
  203. ## computes the population skewness of `x`
  204. var rs: RunningStat
  205. rs.push(x)
  206. result = rs.skewness()
  207. proc skewnessS*[T](x: openArray[T]): float =
  208. ## computes the sample skewness of `x`
  209. var rs: RunningStat
  210. rs.push(x)
  211. result = rs.skewnessS()
  212. proc kurtosis*[T](x: openArray[T]): float =
  213. ## computes the population kurtosis of `x`
  214. var rs: RunningStat
  215. rs.push(x)
  216. result = rs.kurtosis()
  217. proc kurtosisS*[T](x: openArray[T]): float =
  218. ## computes the sample kurtosis of `x`
  219. var rs: RunningStat
  220. rs.push(x)
  221. result = rs.kurtosisS()
  222. # ---------------------- Running Regression -----------------------------
  223. proc clear*(r: var RunningRegress) =
  224. ## reset `r`
  225. r.x_stats.clear()
  226. r.y_stats.clear()
  227. r.s_xy = 0.0
  228. r.n = 0
  229. proc push*(r: var RunningRegress, x, y: float) =
  230. ## pushes two values `x` and `y` for processing
  231. r.s_xy += (r.x_stats.mean() - x)*(r.y_stats.mean() - y)*
  232. toFloat(r.n) / toFloat(r.n + 1)
  233. r.x_stats.push(x)
  234. r.y_stats.push(y)
  235. inc(r.n)
  236. proc push*(r: var RunningRegress, x, y: int) {.inline.} =
  237. ## pushes two values `x` and `y` for processing.
  238. ##
  239. ## `x` and `y` are converted to ``float``
  240. ## and the other push operation is called.
  241. r.push(toFloat(x), toFloat(y))
  242. proc push*(r: var RunningRegress, x, y: openarray[float|int]) =
  243. ## pushes two sets of values `x` and `y` for processing.
  244. assert(x.len == y.len)
  245. for i in 0..<x.len:
  246. r.push(x[i], y[i])
  247. proc slope*(r: RunningRegress): float =
  248. ## computes the current slope of `r`
  249. let s_xx = r.x_stats.varianceS()*toFloat(r.n - 1)
  250. result = r.s_xy / s_xx
  251. proc intercept*(r: RunningRegress): float =
  252. ## computes the current intercept of `r`
  253. result = r.y_stats.mean() - r.slope()*r.x_stats.mean()
  254. proc correlation*(r: RunningRegress): float =
  255. ## computes the current correlation of the two data
  256. ## sets pushed into `r`
  257. let t = r.x_stats.standardDeviation() * r.y_stats.standardDeviation()
  258. result = r.s_xy / ( toFloat(r.n) * t )
  259. proc `+`*(a, b: RunningRegress): RunningRegress =
  260. ## combine two `RunningRegress` objects.
  261. ##
  262. ## Useful if performing parallel analysis of data series
  263. ## and need to re-combine parallel result sets
  264. result.clear()
  265. result.x_stats = a.x_stats + b.x_stats
  266. result.y_stats = a.y_stats + b.y_stats
  267. result.n = a.n + b.n
  268. let delta_x = b.x_stats.mean() - a.x_stats.mean()
  269. let delta_y = b.y_stats.mean() - a.y_stats.mean()
  270. result.s_xy = a.s_xy + b.s_xy +
  271. toFloat(a.n*b.n)*delta_x*delta_y/toFloat(result.n)
  272. proc `+=`*(a: var RunningRegress, b: RunningRegress) =
  273. ## add RunningRegress `b` to `a`
  274. a = a + b
  275. {.pop.}
  276. {.pop.}
  277. when isMainModule:
  278. proc clean(x: float): float =
  279. result = round(1.0e8*x).float * 1.0e-8
  280. var rs: RunningStat
  281. rs.push(@[1.0, 2.0, 1.0, 4.0, 1.0, 4.0, 1.0, 2.0])
  282. doAssert(rs.n == 8)
  283. doAssert(clean(rs.mean) == 2.0)
  284. doAssert(clean(rs.variance()) == 1.5)
  285. doAssert(clean(rs.varianceS()) == 1.71428571)
  286. doAssert(clean(rs.skewness()) == 0.81649658)
  287. doAssert(clean(rs.skewnessS()) == 1.01835015)
  288. doAssert(clean(rs.kurtosis()) == -1.0)
  289. doAssert(clean(rs.kurtosisS()) == -0.7000000000000001)
  290. var rs1, rs2: RunningStat
  291. rs1.push(@[1.0, 2.0, 1.0, 4.0])
  292. rs2.push(@[1.0, 4.0, 1.0, 2.0])
  293. let rs3 = rs1 + rs2
  294. doAssert(clean(rs3.mom2) == clean(rs.mom2))
  295. doAssert(clean(rs3.mom3) == clean(rs.mom3))
  296. doAssert(clean(rs3.mom4) == clean(rs.mom4))
  297. rs1 += rs2
  298. doAssert(clean(rs1.mom2) == clean(rs.mom2))
  299. doAssert(clean(rs1.mom3) == clean(rs.mom3))
  300. doAssert(clean(rs1.mom4) == clean(rs.mom4))
  301. rs1.clear()
  302. rs1.push(@[1.0, 2.2, 1.4, 4.9])
  303. doAssert(rs1.sum == 9.5)
  304. doAssert(rs1.mean() == 2.375)
  305. when not defined(cpu32):
  306. # XXX For some reason on 32bit CPUs these results differ
  307. var rr: RunningRegress
  308. rr.push(@[0.0,1.0,2.8,3.0,4.0], @[0.0,1.0,2.3,3.0,4.0])
  309. doAssert(rr.slope() == 0.9695585996955861)
  310. doAssert(rr.intercept() == -0.03424657534246611)
  311. doAssert(rr.correlation() == 0.9905100362239381)
  312. var rr1, rr2: RunningRegress
  313. rr1.push(@[0.0,1.0], @[0.0,1.0])
  314. rr2.push(@[2.8,3.0,4.0], @[2.3,3.0,4.0])
  315. let rr3 = rr1 + rr2
  316. doAssert(rr3.correlation() == rr.correlation())
  317. doAssert(clean(rr3.slope()) == clean(rr.slope()))
  318. doAssert(clean(rr3.intercept()) == clean(rr.intercept()))