12345678910111213141516171819202122232425262728293031323334353637383940414243444546 |
- (define-module (split-quality-measure))
- (use-modules
- ;; SRFI-1 for additional list procedures
- (srfi srfi-1)
- (tree)
- (dataset)
- (data-point))
(define-public calc-proportion
  (lambda (subset class-label label-column-index)
    "Return the term p * (1 - p) for CLASS-LABEL in SUBSET, where p is the
fraction of data points in SUBSET whose value in column LABEL-COLUMN-INDEX is
numerically equal (=) to CLASS-LABEL.

Note that this is not the bare proportion p itself, but the per-label
contribution that gini-index sums up.  An empty SUBSET yields the exact
integer 0; otherwise the result is an inexact number."
    (if (dataset-empty? subset)
        ;; No data points: avoid dividing by zero and contribute nothing.
        0
        (let* ([total (dataset-length subset)]
               ;; Predicate: does this data point carry the wanted label?
               [has-label?
                (lambda (point)
                  (= (data-point-get-col point label-column-index)
                     class-label))]
               ;; SRFI-1 count walks SUBSET as a list -- presumably a dataset
               ;; is represented as a list of data points; confirm in (dataset).
               [matching (count has-label? subset)]
               ;; Exact rational as long as both counts are exact integers.
               [fraction (/ matching total)])
          ;; The 1.0 literal makes the final result inexact.
          (* fraction (- 1.0 fraction))))))
;; The procedure gini-index is used to evaluate the quality of a split. It is a
;; cost function for a split, and we want to keep the cost of splits low (a
;; greedy strategy). There are other ways of calculating the quality of a
;; split, but for now only the Gini index is implemented.
(define-public gini-index
  (lambda* (subsets label-column-index #:key (labels '(0 1)))
    "Calculate the gini index quality measure, based on the result of a split.

SUBSETS is the list of subsets produced by the split.  LABEL-COLUMN-INDEX is
the index of the column that holds the class label of a data point.  The
optional keyword argument #:labels is the list of class labels to take into
account; it defaults to (0 1), i.e. binary classification.  Labels are
compared numerically (with =) by calc-proportion, so they must be numbers.

The result is the sum, over every subset and every label, of the value
returned by calc-proportion.  Each subset contributes equally; no weighting by
subset size is applied.  An empty list of SUBSETS gives 0."
    ;; Cost contributed by a single subset: the sum of the per-label terms.
    (define (subset-cost subset)
      (apply +
             (map (lambda (label)
                    (calc-proportion subset label label-column-index))
                  labels)))
    ;; Total cost of the split: add up the cost of every subset.
    (apply + (map subset-cost subsets))))
|