//! Empirical cumulative distribution function.
pub struct Ecdf<T: Ord> {
samples: Vec<T>,
length: usize,
}
impl<T: Ord + Clone> Ecdf<T> {
/// Construct a new representation of a cumulative distribution function for
/// a given sample.
///
/// Construction sorts a clone of the given sample, an O(n log n) cost that
/// may be prohibitive for very large samples. This cost is amortized
/// significantly if the value function is used heavily.
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let ecdf = ks::Ecdf::new(&samples);
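/// // The sorted copy is reused by every later query, so repeated calls to
/// // value are cheap once the Ecdf has been constructed.
/// assert_eq!(ecdf.value(4), 0.5);
/// assert_eq!(ecdf.value(9), 1.0);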
/// ```
pub fn new(samples: &[T]) -> Ecdf<T> {
let length = samples.len();
assert!(length > 0);
// Sort a copied sample for binary searching.
let mut sorted = samples.to_vec();
sorted.sort();
Ecdf {
samples: sorted,
length: length,
}
}
/// Calculate the value of the empirical cumulative distribution function at
/// the given point t for this sample.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let ecdf = ks::Ecdf::new(&samples);
/// assert_eq!(ecdf.value(4), 0.5);
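/// // Further illustrative checks: values below the sample minimum give 0.0
/// // and values at or above the maximum give 1.0.
/// assert_eq!(ecdf.value(-1), 0.0);
/// assert_eq!(ecdf.value(10), 1.0);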
/// ```
pub fn value(&self, t: T) -> f64 {
let num_samples_leq_t = match self.samples.binary_search(&t) {
Ok(mut index) => {
// At least one sample equals t and we have the index of one such
// sample. Walk forward through the sorted samples to find the last
// index whose sample still equals t.
while index + 1 < self.length && self.samples[index + 1] == t {
index += 1;
}
// Compensate for 0-based indexing.
index + 1
}
Err(index) => {
// No sample equals t; binary_search returns the index at which t
// would be inserted to keep the samples sorted. Every sample before
// that index is < t, so exactly `index` samples are <= t and count
// towards the CDF proportion.
index
}
};
num_samples_leq_t as f64 / self.length as f64
}
/// Calculate a percentile for the sample using the Nearest Rank method.
///
/// # Panics
///
/// The percentile requested must be between 1 and 100 inclusive. In
/// particular, there is no 0-percentile.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let ecdf = ks::Ecdf::new(&samples);
/// assert_eq!(ecdf.percentile(50), 4);
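/// // For this ten-element sample the Nearest Rank method puts the 1st
/// // percentile at the minimum and the 100th percentile at the maximum.
/// assert_eq!(ecdf.percentile(1), 0);
/// assert_eq!(ecdf.percentile(100), 9);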
/// ```
pub fn percentile(&self, p: u8) -> T {
assert!(0 < p && p <= 100);
let rank = (p as f64 * self.length as f64 / 100.0).ceil() as usize;
self.samples[rank - 1].clone()
}
/// Return the minimal element of the samples.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let ecdf = ks::Ecdf::new(&samples);
/// assert_eq!(ecdf.min(), 0);
/// ```
pub fn min(&self) -> T {
self.samples[0].clone()
}
/// Return the maximal element of the samples.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let ecdf = ks::Ecdf::new(&samples);
/// assert_eq!(ecdf.max(), 9);
/// ```
pub fn max(&self) -> T {
self.samples[self.samples.len() - 1].clone()
}
}
/// Calculate a one-time value of the empirical cumulative distribution function
/// for a given sample.
///
/// This function runs in O(n) time and, unlike Ecdf<T>::value, its cost is
/// not amortized across multiple calls. It should only be used when a small
/// number of ECDF values are required for the sample. Otherwise, use
/// Ecdf::new to build a structure that pays the upfront O(n log n) sort cost
/// but calculates values in O(log n).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let value = ks::ecdf(&samples, 4);
/// assert_eq!(value, 0.5);
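/// // Further illustrative checks at the extremes of the sample: values below
/// // the minimum give 0.0 and values at or above the maximum give 1.0.
/// assert_eq!(ks::ecdf(&samples, -1), 0.0);
/// assert_eq!(ks::ecdf(&samples, 10), 1.0);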
/// ```
pub fn ecdf<T: Ord>(samples: &[T], t: T) -> f64 {
let mut num_samples_leq_t = 0;
let mut length = 0;
for sample in samples.iter() {
length += 1;
if *sample <= t {
num_samples_leq_t += 1;
}
}
assert!(length > 0);
num_samples_leq_t as f64 / length as f64
}
/// Calculate a one-time percentile for a given sample using the Nearest Rank
/// method and Quick Select.
///
/// This function runs in O(n) time and, unlike Ecdf<T>::percentile, its cost
/// is not amortized across multiple calls. It should only be used when a
/// small number of percentiles are required for the sample. Otherwise, use
/// Ecdf::new to build a structure that pays the upfront O(n log n) sort cost
/// but calculates percentiles in O(1).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// The percentile requested must be between 1 and 100 inclusive. In particular,
/// there is no 0-percentile.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let percentile = ks::percentile(&samples, 50);
/// assert_eq!(percentile, 4);
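/// // Further illustrative checks: for this sample the Nearest Rank method
/// // puts the 1st percentile at the minimum and the 100th at the maximum.
/// assert_eq!(ks::percentile(&samples, 1), 0);
/// assert_eq!(ks::percentile(&samples, 100), 9);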
/// ```
pub fn percentile<T: Ord + Clone>(samples: &[T], p: u8) -> T {
assert!(0 < p && p <= 100);
let length = samples.len();
assert!(length > 0);
let rank = (p as f64 * length as f64 / 100.0).ceil() as usize;
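// rank is 1-based: p >= 1 and length >= 1 guarantee 1 <= rank <= length,
// and the target is the rank-th smallest sample.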
// Quick Select the element at rank.
let mut samples: Vec<T> = samples.to_vec();
let mut low = 0;
let mut high = length;
loop {
assert!(low < high);
let pivot = samples[low].clone();
if low >= high - 1 {
return pivot;
}
// First determine if the rank item is less than the pivot.
// Organise samples so that all items less than pivot are to the left,
// `bottom` is the number of items less than pivot.
let mut bottom = low;
let mut top = high - 1;
while bottom < top {
while bottom < top && samples[bottom] < pivot {
bottom += 1;
}
while bottom < top && samples[top] >= pivot {
top -= 1;
}
if bottom < top {
samples.swap(bottom, top);
}
}
if rank <= bottom {
// Rank item is less than pivot. Exclude pivot and larger items.
high = bottom;
} else {
// Rank item is pivot or in the larger set. Exclude smaller items.
low = bottom;
// Next, determine if the pivot is the rank item.
// Organise samples so that all items less than or equal to pivot
// are to the left, `bottom` is the number of items less than or
// equal to pivot. Since the left is already less than the pivot,
// this just requires moving the pivots left also.
let mut bottom = low;
let mut top = high - 1;
while bottom < top {
while bottom < top && samples[bottom] == pivot {
bottom += 1;
}
while bottom < top && samples[top] != pivot {
top -= 1;
}
if bottom < top {
samples.swap(bottom, top);
}
}
// Is pivot the rank item?
if rank <= bottom {
return pivot;
}
// Rank item is greater than pivot. Exclude pivot and smaller items.
low = bottom;
}
}
}
#[cfg(test)]
mod tests {
extern crate quickcheck;
extern crate rand;
use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, TestResult, StdGen};
use std::cmp;
use std::usize;
use super::{Ecdf, ecdf, percentile};
fn check<A: Testable>(f: A) {
let g = StdGen::new(rand::thread_rng(), usize::MAX);
QuickCheck::new().gen(g).quickcheck(f);
}
/// Wrapper for generating sample data with QuickCheck.
///
/// Samples must be non-empty sequences of u64 values.
#[derive(Debug, Clone)]
struct Samples {
vec: Vec<u64>,
}
impl Arbitrary for Samples {
fn arbitrary<G: Gen>(g: &mut G) -> Samples {
// Limit size of generated sample set to 1024
let max = cmp::min(g.size(), 1024);
let size = g.gen_range(1, max);
let vec = (0..size).map(|_| u64::arbitrary(g)).collect();
Samples { vec: vec }
}
fn shrink(&self) -> Box<Iterator<Item = Samples>> {
let vec: Vec<u64> = self.vec.clone();
let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();
Box::new(shrunk.filter(|v| v.len() > 0).map(|v| Samples { vec: v }))
}
}
/// Wrapper for generating percentile query value data with QuickCheck.
///
/// Percentile must be u8 between 1 and 100 inclusive.
#[derive(Debug, Clone)]
struct Percentile {
val: u8,
}
impl Arbitrary for Percentile {
fn arbitrary<G: Gen>(g: &mut G) -> Percentile {
let val = g.gen_range(1, 101) as u8;
Percentile { val: val }
}
fn shrink(&self) -> Box<Iterator<Item = Percentile>> {
let shrunk: Box<Iterator<Item = u8>> = self.val.shrink();
Box::new(shrunk.filter(|&v| 0u8 < v && v <= 100u8).map(|v| Percentile { val: v }))
}
}
#[test]
#[should_panic(expected="assertion failed: length > 0")]
fn single_use_ecdf_panics_on_empty_samples_set() {
let xs: Vec<u64> = vec![];
ecdf(&xs, 0);
}
#[test]
#[should_panic(expected="assertion failed: length > 0")]
fn multiple_use_ecdf_panics_on_empty_samples_set() {
let xs: Vec<u64> = vec![];
Ecdf::new(&xs);
}
#[test]
fn single_use_ecdf_between_zero_and_one() {
fn prop(xs: Samples, val: u64) -> bool {
let actual = ecdf(&xs.vec, val);
0.0 <= actual && actual <= 1.0
}
check(prop as fn(Samples, u64) -> bool);
}
#[test]
fn multiple_use_ecdf_between_zero_and_one() {
fn prop(xs: Samples, val: u64) -> bool {
let ecdf = Ecdf::new(&xs.vec);
let actual = ecdf.value(val);
0.0 <= actual && actual <= 1.0
}
check(prop as fn(Samples, u64) -> bool);
}
#[test]
fn single_use_ecdf_is_an_increasing_function() {
fn prop(xs: Samples, val: u64) -> bool {
let actual = ecdf(&xs.vec, val);
// Saturating arithmetic avoids underflow/overflow at the extremes of u64.
ecdf(&xs.vec, val.saturating_sub(1)) <= actual && actual <= ecdf(&xs.vec, val.saturating_add(1))
}
check(prop as fn(Samples, u64) -> bool);
}
#[test]
fn multiple_use_ecdf_is_an_increasing_function() {
fn prop(xs: Samples, val: u64) -> bool {
let ecdf = Ecdf::new(&xs.vec);
let actual = ecdf.value(val);
// Saturating arithmetic avoids underflow/overflow at the extremes of u64.
ecdf.value(val.saturating_sub(1)) <= actual && actual <= ecdf.value(val.saturating_add(1))
}
check(prop as fn(Samples, u64) -> bool);
}
#[test]
fn single_use_ecdf_sample_min_minus_one_is_zero() {
fn prop(xs: Samples) -> bool {
let &min = xs.vec.iter().min().unwrap();
// min - 1 would underflow when min == 0; treat that case as trivially true.
match min.checked_sub(1) {
    Some(below_min) => ecdf(&xs.vec, below_min) == 0.0,
    None => true,
}
}
check(prop as fn(Samples) -> bool);
}
#[test]
fn multiple_use_ecdf_sample_min_minus_one_is_zero() {
fn prop(xs: Samples) -> bool {
let &min = xs.vec.iter().min().unwrap();
let ecdf = Ecdf::new(&xs.vec);
// min - 1 would underflow when min == 0; treat that case as trivially true.
match min.checked_sub(1) {
    Some(below_min) => ecdf.value(below_min) == 0.0,
    None => true,
}
}
check(prop as fn(Samples) -> bool);
}
#[test]
fn single_use_ecdf_sample_max_is_one() {
fn prop(xs: Samples) -> bool {
let &max = xs.vec.iter().max().unwrap();
ecdf(&xs.vec, max) == 1.0
}
check(prop as fn(Samples) -> bool);
}
#[test]
fn multiple_use_ecdf_sample_max_is_one() {
fn prop(xs: Samples) -> bool {
let &max = xs.vec.iter().max().unwrap();
let ecdf = Ecdf::new(&xs.vec);
ecdf.value(max) == 1.0
}
check(prop as fn(Samples) -> bool);
}
#[test]
fn single_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
fn prop(xs: Samples) -> bool {
let &val = xs.vec.first().unwrap();
let num_samples = xs.vec
.iter()
.filter(|&&x| x <= val)
.count();
let expected = num_samples as f64 / xs.vec.len() as f64;
ecdf(&xs.vec, val) == expected
}
check(prop as fn(Samples) -> bool);
}
#[test]
fn multiple_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
fn prop(xs: Samples) -> bool {
let &val = xs.vec.first().unwrap();
let num_samples = xs.vec
.iter()
.filter(|&&x| x <= val)
.count();
let expected = num_samples as f64 / xs.vec.len() as f64;
let ecdf = Ecdf::new(&xs.vec);
ecdf.value(val) == expected
}
check(prop as fn(Samples) -> bool);
}
#[test]
fn single_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
fn prop(xs: Samples, val: u64) -> TestResult {
let length = xs.vec.len();
if xs.vec.iter().any(|&x| x == val) {
// Discard Vec containing val.
return TestResult::discard();
}
let num_samples = xs.vec
.iter()
.filter(|&&x| x <= val)
.count();
let expected = num_samples as f64 / length as f64;
let actual = ecdf(&xs.vec, val);
TestResult::from_bool(actual == expected)
}
check(prop as fn(Samples, u64) -> TestResult);
}
#[test]
fn multiple_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
fn prop(xs: Samples, val: u64) -> TestResult {
let length = xs.vec.len();
if xs.vec.iter().any(|&x| x == val) {
// Discard Vec containing val.
return TestResult::discard();
}
let num_samples = xs.vec
.iter()
.filter(|&&x| x <= val)
.count();
let expected = num_samples as f64 / length as f64;
let ecdf = Ecdf::new(&xs.vec);
TestResult::from_bool(ecdf.value(val) == expected)
}
check(prop as fn(Samples, u64) -> TestResult);
}
#[test]
fn single_and_multiple_use_ecdf_agree() {
fn prop(xs: Samples, val: u64) -> bool {
let multiple_use = Ecdf::new(&xs.vec);
multiple_use.value(val) == ecdf(&xs.vec, val)
}
check(prop as fn(Samples, u64) -> bool);
}
#[test]
#[should_panic(expected="assertion failed: 0 < p && p <= 100")]
fn single_use_percentiles_panics_on_zero_percentile() {
let xs: Vec<u64> = vec![0];
percentile(&xs, 0);
}
#[test]
#[should_panic(expected="assertion failed: 0 < p && p <= 100")]
fn single_use_percentiles_panics_on_101_percentile() {
let xs: Vec<u64> = vec![0];
percentile(&xs, 101);
}
#[test]
#[should_panic(expected="assertion failed: 0 < p && p <= 100")]
fn multiple_use_percentiles_panics_on_zero_percentile() {
let xs: Vec<u64> = vec![0];
let ecdf = Ecdf::new(&xs);
ecdf.percentile(0);
}
#[test]
#[should_panic(expected="assertion failed: 0 < p && p <= 100")]
fn multiple_use_percentiles_panics_on_101_percentile() {
let xs: Vec<u64> = vec![0];
let ecdf = Ecdf::new(&xs);
ecdf.percentile(101);
}
#[test]
fn single_use_percentile_between_samples_min_and_max() {
fn prop(xs: Samples, p: Percentile) -> bool {
let &min = xs.vec.iter().min().unwrap();
let &max = xs.vec.iter().max().unwrap();
let actual = percentile(&xs.vec, p.val);
min <= actual && actual <= max
}
check(prop as fn(Samples, Percentile) -> bool);
}
#[test]
fn single_use_percentile_is_an_increasing_function() {
fn prop(xs: Samples, p: Percentile) -> bool {
let smaller = cmp::max(p.val - 1, 1);
let larger = cmp::min(p.val + 1, 100);
let actual = percentile(&xs.vec, p.val);
percentile(&xs.vec, smaller) <= actual && actual <= percentile(&xs.vec, larger)
}
check(prop as fn(Samples, Percentile) -> bool);
}
#[test]
fn single_use_percentile_100_is_sample_max() {
fn prop(xs: Samples) -> bool {
let &max = xs.vec.iter().max().unwrap();
percentile(&xs.vec, 100) == max
}
check(prop as fn(Samples) -> bool);
}
#[test]
fn multiple_use_percentile_between_samples_min_and_max() {
fn prop(xs: Samples, p: Percentile) -> bool {
let &min = xs.vec.iter().min().unwrap();
let &max = xs.vec.iter().max().unwrap();
let ecdf = Ecdf::new(&xs.vec);
let actual = ecdf.percentile(p.val);
min <= actual && actual <= max
}
check(prop as fn(Samples, Percentile) -> bool);
}
#[test]
fn multiple_use_percentile_is_an_increasing_function() {
fn prop(xs: Samples, p: Percentile) -> bool {
let smaller = cmp::max(p.val - 1, 1);
let larger = cmp::min(p.val + 1, 100);
let ecdf = Ecdf::new(&xs.vec);
let actual = ecdf.percentile(p.val);
ecdf.percentile(smaller) <= actual && actual <= ecdf.percentile(larger)
}
check(prop as fn(Samples, Percentile) -> bool);
}
#[test]
fn multiple_use_percentile_100_is_sample_max() {
fn prop(xs: Samples) -> bool {
let &max = xs.vec.iter().max().unwrap();
let ecdf = Ecdf::new(&xs.vec);
ecdf.percentile(100) == max
}
check(prop as fn(Samples) -> bool);
}
#[test]
fn single_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
fn prop(xs: Samples, val: u64) -> TestResult {
let actual = ecdf(&xs.vec, val);
let p = (actual * 100.0).floor() as u8;
match p {
0 => {
// val is below the first percentile threshold. Can't
// calculate 0-percentile value so discard.
TestResult::discard()
}
_ => {
// Not equal because e.g. all percentiles of [0] are 0. So
// value of 1 gives ecdf == 1.0 and percentile(100) == 0.
let single_use = percentile(&xs.vec, p);
TestResult::from_bool(single_use <= val)
}
}
}
check(prop as fn(Samples, u64) -> TestResult);
}
#[test]
fn single_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
fn prop(xs: Samples, val: u64) -> TestResult {
let actual = ecdf(&xs.vec, val);
let p = (actual * 100.0).floor() as u8;
match p {
0 => {
// val is below the first percentile threshold. Can't
// calculate 0-percentile value so discard.
TestResult::discard()
}
_ => {
// Not equal because e.g. all percentiles of [0] are 0. So
// value of 1 gives ecdf == 1.0 and percentile(100) == 0.
let multiple_use = Ecdf::new(&xs.vec);
TestResult::from_bool(multiple_use.percentile(p) <= val)
}
}
}
check(prop as fn(Samples, u64) -> TestResult);
}
#[test]
fn multiple_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
fn prop(xs: Samples, val: u64) -> TestResult {
let ecdf = Ecdf::new(&xs.vec);
let actual = ecdf.value(val);
let p = (actual * 100.0).floor() as u8;
match p {
0 => {
// val is below the first percentile threshold. Can't
// calculate 0-percentile value so discard.
TestResult::discard()
}
_ => {
// Not equal because e.g. all percentiles of [0] are 0. So
// value of 1 gives ecdf == 1.0 and percentile(100) == 0.
let single_use = percentile(&xs.vec, p);
TestResult::from_bool(single_use <= val)
}
}
}
check(prop as fn(Samples, u64) -> TestResult);
}
#[test]
fn multiple_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
fn prop(xs: Samples, val: u64) -> TestResult {
let ecdf = Ecdf::new(&xs.vec);
let actual = ecdf.value(val);
let p = (actual * 100.0).floor() as u8;
match p {
0 => {
// val is below the first percentile threshold. Can't
// calculate 0-percentile value so discard.
TestResult::discard()
}
_ => {
// Not equal because e.g. all percentiles of [0] are 0. So
// value of 1 gives ecdf == 1.0 and percentile(100) == 0.
TestResult::from_bool(ecdf.percentile(p) <= val)
}
}
}
check(prop as fn(Samples, u64) -> TestResult);
}
#[test]
fn single_use_percentile_followed_by_single_use_ecdf_is_geq_original_value() {
fn prop(xs: Samples, p: Percentile) -> bool {
let actual = percentile(&xs.vec, p.val);
// Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
// So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
}
check(prop as fn(Samples, Percentile) -> bool);
}
#[test]
fn single_use_percentile_followed_by_multiple_use_ecdf_is_geq_original_value() {
fn prop(xs: Samples, p: Percentile) -> bool {
let actual = percentile(&xs.vec, p.val);
let ecdf = Ecdf::new(&xs.vec);
// Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
// So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
p.val as f64 / 100.0 <= ecdf.value(actual)
}
check(prop as fn(Samples, Percentile) -> bool);
}
#[test]
fn multiple_use_percentile_followed_by_single_use_ecdf_is_geq_original_value() {
fn prop(xs: Samples, p: Percentile) -> bool {
let multiple_use = Ecdf::new(&xs.vec);
let actual = multiple_use.percentile(p.val);
// Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
// So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
}
check(prop as fn(Samples, Percentile) -> bool);
}
#[test]
fn multiple_use_percentile_followed_by_multiple_use_ecdf_is_geq_original_value() {
fn prop(xs: Samples, p: Percentile) -> bool {
let ecdf = Ecdf::new(&xs.vec);
let actual = ecdf.percentile(p.val);
// Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
// So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
p.val as f64 / 100.0 <= ecdf.value(actual)
}
check(prop as fn(Samples, Percentile) -> bool);
}
#[test]
fn single_and_multiple_use_percentile_agree() {
fn prop(xs: Samples, p: Percentile) -> bool {
let multiple_use = Ecdf::new(&xs.vec);
multiple_use.percentile(p.val) == percentile(&xs.vec, p.val)
}
check(prop as fn(Samples, Percentile) -> bool);
}
#[test]
fn min_is_leq_all_samples() {
fn prop(xs: Samples) -> bool {
let multiple_use = Ecdf::new(&xs.vec);
let actual = multiple_use.min();
xs.vec.iter().all(|&x| actual <= x)
}
check(prop as fn(Samples) -> bool);
}
#[test]
fn max_is_geq_all_samples() {
fn prop(xs: Samples) -> bool {
let multiple_use = Ecdf::new(&xs.vec);
let actual = multiple_use.max();
xs.vec.iter().all(|&x| actual >= x)
}
check(prop as fn(Samples) -> bool);
}
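// A small deterministic check of tie handling, included as an illustrative
// sketch alongside the property tests above. The sample [1, 2, 2, 3] and the
// expected values below simply restate the documented semantics: value counts
// the proportion of samples <= t, and percentile uses the Nearest Rank method.
#[test]
fn single_and_multiple_use_agree_on_duplicate_samples() {
    let xs: Vec<u64> = vec![1, 2, 2, 3];
    let multiple_use = Ecdf::new(&xs);
    // Three of the four samples are <= 2.
    assert_eq!(multiple_use.value(2), 0.75);
    assert_eq!(ecdf(&xs, 2), 0.75);
    // The Nearest Rank 50th percentile is the 2nd smallest sample.
    assert_eq!(multiple_use.percentile(50), 2);
    assert_eq!(percentile(&xs, 50), 2);
}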
}