Kolmogorov-Smirnov-Test

This commit is contained in:
eugen.betke 2020-09-01 12:49:54 +02:00
parent 88e994f997
commit 8d2ebf0e96
11 changed files with 1995 additions and 0 deletions

36
tools/kslib/Cargo.toml Normal file
View File

@ -0,0 +1,36 @@
# Crate metadata for the kolmogorov_smirnov library and its helper binaries.
[package]
name = "kolmogorov_smirnov"
description = "Implementation of the Kolmogorov-Smirnov statistical test as a Rust library."
version = "1.1.0"
homepage = "http://daithiocrualaoich.github.io/kolmogorov_smirnov"
repository = "https://github.com/daithiocrualaoich/kolmogorov_smirnov"
authors = ["Daithi O Crualaoich <daithi.ocrualaoich@gmail.com>"]
license = "Apache-2.0"
keywords = ["statistics"]
# Keep development, CI, and data files out of the published package.
exclude = [
".dockerignore",
".gitignore",
".travis.yml",
"Cargo.lock",
"Dockerfile",
"dat/*",
"doc/*",
]

# Helper binaries. No explicit `path` is given, so sources are resolved by
# Cargo's default src/bin/<name>.rs convention.
[[bin]]
name = "critical_values"

[[bin]]
name = "normal"

[[bin]]
name = "ks_f64"

[[bin]]
name = "ks_i64"

[dependencies]
rand = "0.3.12"

# quickcheck is used only by the property tests.
[dev-dependencies]
quickcheck = "0.2"

View File

@ -0,0 +1,39 @@
extern crate kolmogorov_smirnov as ks;
use ks::calculate_critical_value;
use std::env;
/// Calculate critical values dataset.
///
/// # Examples
///
/// ```bash
/// cargo run --bin critical_values <confidence> <num_samples> <limit>
/// ```
///
/// Prints the critical values of the Kolmogorov-Smirnov two sample test for
/// samples of size `<num_samples>` against samples of sizes 16 through
/// `<limit>` inclusive at the specified confidence level.
///
/// `<num_samples>` and `<limit>` must be positive integers, `<confidence>` must
/// be a floating point number strictly between zero and one.
fn main() {
    let args: Vec<String> = env::args().collect();

    // Parse the three positional arguments, aborting with a usage hint on
    // malformed input.
    let confidence: f64 = args[1].parse().expect("<confidence> must be a floating point number.");
    let n1: usize = args[2].parse().expect("<num_samples> must be an integer.");
    let limit: usize = args[3].parse().expect("<limit> must be an integer.");

    assert!(n1 > 0 && limit > 0);
    assert!(0.0 < confidence && confidence < 1.0);

    // Header row, then one row per comparison sample size.
    println!("n1\tn2\tconfidence\tcritical_value");
    for n2 in 16..(limit + 1) {
        let critical_value = calculate_critical_value(n1, n2, confidence).unwrap();
        println!("{}\t{}\t{}\t{}", n1, n2, confidence, critical_value);
    }
}

View File

@ -0,0 +1,52 @@
extern crate kolmogorov_smirnov as ks;
use ks::test_f64;
use std::env;
use std::io::{BufReader, BufRead};
use std::fs::File;
use std::path::Path;
/// Parse one line of input as an f64, aborting the process on malformed data.
fn parse_float(s: String) -> f64 {
    let parsed: f64 = s.parse().expect("Not a floating point number.");
    parsed
}
/// Runs a Kolmogorov-Smirnov test on floating point data files.
///
/// Input files must be single-column headerless data files. The data samples
/// are tested against each other at the 0.95 confidence level.
///
/// # Examples
///
/// ```bash
/// cargo run --bin ks_f64 <file1> <file2>
/// ```
///
/// This will print the test result to standard output.
fn main() {
let args: Vec<String> = env::args().collect();
let path1 = Path::new(&args[1]);
let path2 = Path::new(&args[2]);
let file1 = BufReader::new(File::open(&path1).unwrap());
let file2 = BufReader::new(File::open(&path2).unwrap());
let lines1 = file1.lines().map(|line| line.unwrap());
let lines2 = file2.lines().map(|line| line.unwrap());
let xs: Vec<f64> = lines1.map(parse_float).collect();
let ys: Vec<f64> = lines2.map(parse_float).collect();
let result = ks::test_f64(&xs, &ys, 0.95).unwrap();
if result.is_rejected {
println!("Samples are from different distributions.");
} else {
println!("Samples are from the same distribution.");
}
println!("test statistic = {}", result.statistic);
println!("critical value = {}", result.critical_value);
println!("reject probability = {}", result.reject_probability);
}

View File

@ -0,0 +1,52 @@
extern crate kolmogorov_smirnov as ks;
use ks::test;
use std::env;
use std::io::{BufReader, BufRead};
use std::fs::File;
use std::path::Path;
/// Parse one line of input as an i64, aborting the process on malformed data.
fn parse_int(s: String) -> i64 {
    let parsed: i64 = s.parse().expect("Not an integer.");
    parsed
}
/// Runs a Kolmogorov-Smirnov test on integer data files.
///
/// Input files must be single-column headerless data files. The data samples
/// are tested against each other at the 0.95 confidence level.
///
/// # Examples
///
/// ```bash
/// cargo run --bin ks_i64 <file1> <file2>
/// ```
///
/// This will print the test result to standard output.
fn main() {
let args: Vec<String> = env::args().collect();
let path1 = Path::new(&args[1]);
let path2 = Path::new(&args[2]);
let file1 = BufReader::new(File::open(&path1).unwrap());
let file2 = BufReader::new(File::open(&path2).unwrap());
let lines1 = file1.lines().map(|line| line.unwrap());
let lines2 = file2.lines().map(|line| line.unwrap());
let xs: Vec<i64> = lines1.map(parse_int).collect();
let ys: Vec<i64> = lines2.map(parse_int).collect();
let result = ks::test(&xs, &ys, 0.95).unwrap();
if result.is_rejected {
println!("Samples are from different distributions.");
} else {
println!("Samples are from the same distribution.");
}
println!("test statistic = {}", result.statistic);
println!("critical value = {}", result.critical_value);
println!("reject probability = {}", result.reject_probability);
}

View File

@ -0,0 +1,35 @@
extern crate rand;
use std::env;
use rand::distributions::{Normal, IndependentSample};
/// Prints a sequence of Normal deviates.
///
/// # Examples
///
/// ```bash
/// cargo run --bin normal <num_deviates> <mean> <variance>
/// ```
/// This will print `<num_deviates>` float point numbers, one per line, to
/// standard output. These numbers will have a Normal distribution with the
/// specified mean and variance.
///
/// `<num_deviates>` must be a positive integer, `<mean>` and `<variance>` may
/// be integers or floating point numbers but `<variance>` must be positive.
fn main() {
    let args: Vec<String> = env::args().collect();

    let n: u32 = args[1].parse().expect("<num_deviates> must be an integer.");
    let mean: f64 = args[2].parse().expect("<mean> must be a floating point number.");
    let variance: f64 = args[3].parse().expect("<variance> must be a floating point number.");

    // `is_sign_positive()` returns true for +0.0 and NaN, so it does not
    // enforce the documented strictly-positive variance; compare against zero
    // explicitly instead (this also rejects NaN).
    assert!(n > 0 && variance > 0.0);

    let mut rng = rand::thread_rng();
    // Normal is parameterized by standard deviation, not variance.
    let normal = Normal::new(mean, variance.sqrt());

    for _ in 0..n {
        let x = normal.ind_sample(&mut rng);
        println!("{}", x)
    }
}

853
tools/kslib/src/ecdf.rs Normal file
View File

@ -0,0 +1,853 @@
//! Empirical cumulative distribution function.
/// Multiple-use representation of an empirical cumulative distribution
/// function, backed by a sorted copy of the sample.
pub struct Ecdf<T: Ord> {
    samples: Vec<T>,
    length: usize,
}

impl<T: Ord + Clone> Ecdf<T> {
    /// Construct a new representation of a cumulative distribution function for
    /// a given sample.
    ///
    /// The construction sorts a clone of the given sample, an O(n log n) cost
    /// that may be prohibitive for large samples but is amortized significantly
    /// by repeated use of the value function.
    ///
    /// # Panics
    ///
    /// The sample set must be non-empty.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// ```
    pub fn new(samples: &[T]) -> Ecdf<T> {
        let length = samples.len();
        assert!(length > 0);

        // Keep a sorted copy so value() can locate elements in O(log n).
        let mut sorted = samples.to_vec();
        sorted.sort();

        Ecdf {
            samples: sorted,
            length: length,
        }
    }

    /// Calculate a value of the empirical cumulative distribution function for
    /// a given sample, i.e. the proportion of samples less than or equal to `t`.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.value(4), 0.5);
    /// ```
    pub fn value(&self, t: T) -> f64 {
        // The sorted samples form a prefix of elements <= t followed by
        // elements > t; the partition point is exactly the count of samples
        // less than or equal to t.
        let num_samples_leq_t = self.samples.partition_point(|sample| *sample <= t);
        num_samples_leq_t as f64 / self.length as f64
    }

    /// Calculate a percentile for the sample using the Nearest Rank method.
    ///
    /// # Panics
    ///
    /// The percentile requested must be between 1 and 100 inclusive. In
    /// particular, there is no 0-percentile.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.percentile(50), 4);
    /// ```
    pub fn percentile(&self, p: u8) -> T {
        assert!(0 < p && p <= 100);

        // Nearest Rank: the percentile is the sorted sample at 1-based rank
        // ceil(p/100 * n).
        let rank = (p as f64 * self.length as f64 / 100.0).ceil() as usize;
        self.samples[rank - 1].clone()
    }

    /// Return the minimal element of the samples.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.min(), 0);
    /// ```
    pub fn min(&self) -> T {
        // Samples are sorted ascending and guaranteed non-empty by new().
        self.samples.first().unwrap().clone()
    }

    /// Return the maximal element of the samples.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.max(), 9);
    /// ```
    pub fn max(&self) -> T {
        self.samples.last().unwrap().clone()
    }
}
/// Calculate a one-time value of the empirical cumulative distribution function
/// for a given sample.
///
/// Runs in O(n) per call and does not amortize across multiple calls the way
/// Ecdf<T>::value does. Prefer Ecdf::new when many ECDF values are needed for
/// the same sample: it pays an O(n log n) sort once and then answers queries in
/// O(log n).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let value = ks::ecdf(&samples, 4);
/// assert_eq!(value, 0.5);
/// ```
pub fn ecdf<T: Ord>(samples: &[T], t: T) -> f64 {
    let length = samples.len();
    assert!(length > 0);

    // Proportion of samples less than or equal to t.
    let num_samples_leq_t = samples.iter().filter(|sample| **sample <= t).count();
    num_samples_leq_t as f64 / length as f64
}
/// Calculate a one-time percentile for a given sample using the Nearest Rank
/// method and Quick Select.
///
/// Computational running time of this function is O(n) but does not amortize
/// across multiple calls like Ecdf<T>::percentile. This function should only be
/// used in the case that a small number of percentiles are required for the
/// sample. Otherwise, Ecdf::new should be used to create a structure that
/// takes the upfront O(n log n) sort cost but calculates percentiles in O(1).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// The percentile requested must be between 1 and 100 inclusive. In particular,
/// there is no 0-percentile.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let percentile = ks::percentile(&samples, 50);
/// assert_eq!(percentile, 4);
/// ```
pub fn percentile<T: Ord + Clone>(samples: &[T], p: u8) -> T {
    assert!(0 < p && p <= 100);

    let length = samples.len();
    assert!(length > 0);

    // Nearest Rank: the percentile is the element at 1-based rank
    // ceil(p/100 * n) of the sorted sample; Quick Select finds it without a
    // full sort.
    let rank = (p as f64 * length as f64 / 100.0).ceil() as usize;

    // Quick Select the element at rank. `low..high` is the half-open window
    // of candidate positions still under consideration.
    let mut samples: Vec<T> = samples.to_vec();
    let mut low = 0;
    let mut high = length;

    loop {
        assert!(low < high);

        let pivot = samples[low].clone();

        // A single-element window: that element is the rank item.
        if low >= high - 1 {
            return pivot;
        }

        // First determine if the rank item is less than the pivot.
        // Organise samples so that all items less than pivot are to the left,
        // `bottom` is the number of items less than pivot.
        let mut bottom = low;
        let mut top = high - 1;

        while bottom < top {
            while bottom < top && samples[bottom] < pivot {
                bottom += 1;
            }
            while bottom < top && samples[top] >= pivot {
                top -= 1;
            }

            if bottom < top {
                samples.swap(bottom, top);
            }
        }

        if rank <= bottom {
            // Rank item is less than pivot. Exclude pivot and larger items.
            high = bottom;
        } else {
            // Rank item is pivot or in the larger set. Exclude smaller items.
            low = bottom;

            // Next, determine if the pivot is the rank item.
            // Organise samples so that all items less than or equal to pivot
            // are to the left, `bottom` is the number of items less than or
            // equal to pivot. Since the left is already less than the pivot,
            // this just requires moving the pivots left also.
            let mut bottom = low;
            let mut top = high - 1;

            while bottom < top {
                while bottom < top && samples[bottom] == pivot {
                    bottom += 1;
                }
                while bottom < top && samples[top] != pivot {
                    top -= 1;
                }

                if bottom < top {
                    samples.swap(bottom, top);
                }
            }

            // Is pivot the rank item?
            if rank <= bottom {
                return pivot;
            }

            // Rank item is greater than pivot. Exclude pivot and smaller items.
            low = bottom;
        }
    }
}
#[cfg(test)]
mod tests {
    extern crate quickcheck;
    extern crate rand;

    use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, TestResult, StdGen};
    use std::cmp;
    use std::usize;
    use super::{Ecdf, ecdf, percentile};

    /// Run a QuickCheck property with an explicitly sized generator.
    fn check<A: Testable>(f: A) {
        // Generator sized with usize::MAX — presumably so arbitrary u64
        // samples span the full value range; verify against quickcheck 0.2
        // size semantics.
        let g = StdGen::new(rand::thread_rng(), usize::MAX);
        QuickCheck::new().gen(g).quickcheck(f);
    }

    /// Wrapper for generating sample data with QuickCheck.
    ///
    /// Samples must be non-empty sequences of u64 values.
    #[derive(Debug, Clone)]
    struct Samples {
        vec: Vec<u64>,
    }

    impl Arbitrary for Samples {
        fn arbitrary<G: Gen>(g: &mut G) -> Samples {
            // Limit size of generated sample set to 1024
            let max = cmp::min(g.size(), 1024);

            let size = g.gen_range(1, max);
            let vec = (0..size).map(|_| u64::arbitrary(g)).collect();

            Samples { vec: vec }
        }

        fn shrink(&self) -> Box<Iterator<Item = Samples>> {
            let vec: Vec<u64> = self.vec.clone();
            let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();

            // Keep only non-empty shrunk sample sets.
            Box::new(shrunk.filter(|v| v.len() > 0).map(|v| Samples { vec: v }))
        }
    }

    /// Wrapper for generating percentile query value data with QuickCheck.
    ///
    /// Percentile must be u8 between 1 and 100 inclusive.
    #[derive(Debug, Clone)]
    struct Percentile {
        val: u8,
    }

    impl Arbitrary for Percentile {
        fn arbitrary<G: Gen>(g: &mut G) -> Percentile {
            let val = g.gen_range(1, 101) as u8;

            Percentile { val: val }
        }

        fn shrink(&self) -> Box<Iterator<Item = Percentile>> {
            let shrunk: Box<Iterator<Item = u8>> = self.val.shrink();

            // Keep shrunk values inside the valid 1..=100 percentile range.
            Box::new(shrunk.filter(|&v| 0u8 < v && v <= 100u8).map(|v| Percentile { val: v }))
        }
    }

    #[test]
    #[should_panic(expected="assertion failed: length > 0")]
    fn single_use_ecdf_panics_on_empty_samples_set() {
        let xs: Vec<u64> = vec![];
        ecdf(&xs, 0);
    }

    #[test]
    #[should_panic(expected="assertion failed: length > 0")]
    fn multiple_use_ecdf_panics_on_empty_samples_set() {
        let xs: Vec<u64> = vec![];
        Ecdf::new(&xs);
    }

    #[test]
    fn single_use_ecdf_between_zero_and_one() {
        fn prop(xs: Samples, val: u64) -> bool {
            let actual = ecdf(&xs.vec, val);

            0.0 <= actual && actual <= 1.0
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_between_zero_and_one() {
        fn prop(xs: Samples, val: u64) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            0.0 <= actual && actual <= 1.0
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn single_use_ecdf_is_an_increasing_function() {
        fn prop(xs: Samples, val: u64) -> bool {
            let actual = ecdf(&xs.vec, val);

            // NOTE(review): `val - 1` / `val + 1` wrap (panic in debug builds)
            // at the u64 extremes — presumably the generator rarely produces
            // them; verify.
            ecdf(&xs.vec, val - 1) <= actual && actual <= ecdf(&xs.vec, val + 1)
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_is_an_increasing_function() {
        fn prop(xs: Samples, val: u64) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            ecdf.value(val - 1) <= actual && actual <= ecdf.value(val + 1)
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn single_use_ecdf_sample_min_minus_one_is_zero() {
        fn prop(xs: Samples) -> bool {
            let &min = xs.vec.iter().min().unwrap();

            // NOTE(review): `min - 1` underflows (panics in debug) when the
            // sample contains 0 — verify the generator makes this negligible.
            ecdf(&xs.vec, min - 1) == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_sample_min_minus_one_is_zero() {
        fn prop(xs: Samples) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let ecdf = Ecdf::new(&xs.vec);

            ecdf.value(min - 1) == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_sample_max_is_one() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();

            ecdf(&xs.vec, max) == 1.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_sample_max_is_one() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();
            let ecdf = Ecdf::new(&xs.vec);

            ecdf.value(max) == 1.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples) -> bool {
            let &val = xs.vec.first().unwrap();
            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / xs.vec.len() as f64;

            ecdf(&xs.vec, val) == expected
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples) -> bool {
            let &val = xs.vec.first().unwrap();
            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / xs.vec.len() as f64;

            let ecdf = Ecdf::new(&xs.vec);

            ecdf.value(val) == expected
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let length = xs.vec.len();

            if xs.vec.iter().any(|&x| x == val) {
                // Discard Vec containing val.
                return TestResult::discard();
            }

            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / length as f64;

            let actual = ecdf(&xs.vec, val);

            TestResult::from_bool(actual == expected)
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn multiple_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let length = xs.vec.len();

            if xs.vec.iter().any(|&x| x == val) {
                // Discard Vec containing val.
                return TestResult::discard();
            }

            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / length as f64;

            let ecdf = Ecdf::new(&xs.vec);

            TestResult::from_bool(ecdf.value(val) == expected)
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn single_and_multiple_use_ecdf_agree() {
        fn prop(xs: Samples, val: u64) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);

            multiple_use.value(val) == ecdf(&xs.vec, val)
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn single_use_percentiles_panics_on_zero_percentile() {
        let xs: Vec<u64> = vec![0];
        percentile(&xs, 0);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn single_use_percentiles_panics_on_101_percentile() {
        let xs: Vec<u64> = vec![0];
        percentile(&xs, 101);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn multiple_use_percentiles_panics_on_zero_percentile() {
        let xs: Vec<u64> = vec![0];
        let ecdf = Ecdf::new(&xs);
        ecdf.percentile(0);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn multiple_use_percentiles_panics_on_101_percentile() {
        let xs: Vec<u64> = vec![0];
        let ecdf = Ecdf::new(&xs);
        ecdf.percentile(101);
    }

    #[test]
    fn single_use_percentile_between_samples_min_and_max() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let &max = xs.vec.iter().max().unwrap();

            let actual = percentile(&xs.vec, p.val);

            min <= actual && actual <= max
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_use_percentile_is_an_increasing_function() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            // Clamp the neighbouring percentiles to the valid 1..=100 range.
            let smaller = cmp::max(p.val - 1, 1);
            let larger = cmp::min(p.val + 1, 100);

            let actual = percentile(&xs.vec, p.val);

            percentile(&xs.vec, smaller) <= actual && actual <= percentile(&xs.vec, larger)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_use_percentile_100_is_sample_max() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();

            percentile(&xs.vec, 100) == max
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_percentile_between_samples_min_and_max() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let &max = xs.vec.iter().max().unwrap();

            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            min <= actual && actual <= max
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_is_an_increasing_function() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let smaller = cmp::max(p.val - 1, 1);
            let larger = cmp::min(p.val + 1, 100);

            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            ecdf.percentile(smaller) <= actual && actual <= ecdf.percentile(larger)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_100_is_sample_max() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();
            let ecdf = Ecdf::new(&xs.vec);

            ecdf.percentile(100) == max
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let actual = ecdf(&xs.vec, val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let single_use = percentile(&xs.vec, p);
                    TestResult::from_bool(single_use <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn single_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let actual = ecdf(&xs.vec, val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let multiple_use = Ecdf::new(&xs.vec);
                    TestResult::from_bool(multiple_use.percentile(p) <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn multiple_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let single_use = percentile(&xs.vec, p);
                    TestResult::from_bool(single_use <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn multiple_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    TestResult::from_bool(ecdf.percentile(p) <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn single_use_percentile_followed_by_single_use_edf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let actual = percentile(&xs.vec, p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_use_percentile_followed_by_multiple_use_edf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let actual = percentile(&xs.vec, p.val);
            let ecdf = Ecdf::new(&xs.vec);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf.value(actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_followed_by_single_use_edf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.percentile(p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_followed_by_multiple_use_edf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf.value(actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_and_multiple_use_percentile_agree() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);

            multiple_use.percentile(p.val) == percentile(&xs.vec, p.val)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn min_is_leq_all_samples() {
        fn prop(xs: Samples) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.min();

            xs.vec.iter().all(|&x| actual <= x)
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn max_is_geq_all_samples() {
        fn prop(xs: Samples) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.max();

            xs.vec.iter().all(|&x| actual >= x)
        }

        check(prop as fn(Samples) -> bool);
    }
}

5
tools/kslib/src/lib.rs Normal file
View File

@ -0,0 +1,5 @@
//! Kolmogorov-Smirnov statistical test.
//!
//! Re-exports the two sample test entry points and the empirical cumulative
//! distribution function helpers at the crate root.
pub mod ecdf;
pub mod test;

pub use test::{test, test_f64, calculate_critical_value};
pub use ecdf::{Ecdf, ecdf, percentile};

699
tools/kslib/src/test.rs Normal file
View File

@ -0,0 +1,699 @@
//! Two Sample Kolmogorov-Smirnov Test
use std::cmp::{min, Ord, Ordering};
/// Two sample test result.
pub struct TestResult {
    /// Whether the null hypothesis (samples drawn from the same distribution)
    /// is rejected, i.e. `reject_probability > confidence`.
    pub is_rejected: bool,
    /// The Kolmogorov-Smirnov test statistic computed from the two samples.
    pub statistic: f64,
    /// Probability that the null hypothesis is false.
    pub reject_probability: f64,
    /// Critical value of the statistic at the requested confidence level.
    pub critical_value: f64,
    /// Confidence level the test was run at.
    pub confidence: f64,
}
/// Perform a two sample Kolmogorov-Smirnov test on given samples.
///
/// The samples must have length > 7 elements for the test to be valid.
///
/// # Panics
///
/// There are assertion panics if either sequence has <= 7 elements.
///
/// # Errors
///
/// Returns `Err` if the underlying critical value or reject probability
/// calculations fail to converge.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let xs = vec!(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
/// let ys = vec!(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let confidence = 0.95;
///
/// let result = ks::test(&xs, &ys, confidence).unwrap();
///
/// if result.is_rejected {
///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
///              xs, ys, result.reject_probability);
/// }
/// ```
pub fn test<T: Ord + Clone>(xs: &[T], ys: &[T], confidence: f64) -> Result<TestResult, String> {
    assert!(xs.len() > 0 && ys.len() > 0);
    assert!(0.0 < confidence && confidence < 1.0);

    // Only supports samples of size > 7.
    assert!(xs.len() > 7 && ys.len() > 7);

    let statistic = calculate_statistic(xs, ys);
    let critical_value = calculate_critical_value(xs.len(), ys.len(), confidence)?;

    let reject_probability = calculate_reject_probability(statistic, xs.len(), ys.len())?;
    // Reject only when the evidence exceeds the requested confidence level.
    let is_rejected = reject_probability > confidence;

    Ok(TestResult {
        is_rejected: is_rejected,
        statistic: statistic,
        reject_probability: reject_probability,
        critical_value: critical_value,
        confidence: confidence,
    })
}
/// Wrapper type for f64 to implement Ord and make usable with test.
#[derive(PartialEq, Clone)]
struct OrderableF64 {
    val: f64,
}

impl OrderableF64 {
    /// Wrap a raw f64 value.
    fn new(val: f64) -> OrderableF64 {
        OrderableF64 { val }
    }
}

impl Eq for OrderableF64 {}

impl PartialOrd for OrderableF64 {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        self.val.partial_cmp(&other.val)
    }
}

impl Ord for OrderableF64 {
    fn cmp(&self, other: &Self) -> Ordering {
        // Panics (unwrap on None) for incomparable values such as NaN.
        self.partial_cmp(other).unwrap()
    }
}
/// Perform a two sample Kolmogorov-Smirnov test on given f64 samples.
///
/// This is necessary because f64 does not implement Ord in Rust as some
/// elements are incomparable, e.g. NaN. This function wraps the f64s in
/// implementation of Ord which panics on incomparable elements.
///
/// The samples must have length > 7 elements for the test to be valid.
///
/// # Panics
///
/// There are assertion panics if either sequence has <= 7 elements.
///
/// If any of the f64 elements in the input samples are unorderable, e.g. NaN.
///
/// # Errors
///
/// Returns `Err` if the underlying critical value or reject probability
/// calculations fail to converge.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let xs = vec!(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
/// let ys = vec!(12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
/// let confidence = 0.95;
///
/// let result = ks::test_f64(&xs, &ys, confidence).unwrap();
///
/// if result.is_rejected {
///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
///              xs, ys, result.reject_probability);
/// }
/// ```
pub fn test_f64(xs: &[f64], ys: &[f64], confidence: f64) -> Result<TestResult, String> {
    // Wrap every sample so the generic Ord-based test can run; comparisons of
    // NaN values will panic inside OrderableF64::cmp.
    let xs: Vec<OrderableF64> = xs.iter().map(|&f| OrderableF64::new(f)).collect();
    let ys: Vec<OrderableF64> = ys.iter().map(|&f| OrderableF64::new(f)).collect();

    test(&xs, &ys, confidence)
}
/// Calculate the test statistic for the two sample Kolmogorov-Smirnov test.
///
/// The test statistic is the maximum vertical distance between the ECDFs of
/// the two samples.
fn calculate_statistic<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 {
    let n = xs.len();
    let m = ys.len();

    assert!(n > 0 && m > 0);

    // Work on sorted clones so the caller's slices stay untouched.
    let mut xs = xs.to_vec();
    let mut ys = ys.to_vec();

    // xs and ys must be sorted for the stepwise ECDF calculations to work.
    xs.sort();
    ys.sort();

    // The current value testing for ECDF difference. Sweeps up through elements
    // present in xs and ys.
    let mut current: &T;

    // i, j index the first values in xs and ys that are greater than current.
    let mut i = 0;
    let mut j = 0;

    // ecdf_xs, ecdf_ys always hold the ECDF(current) of xs and ys.
    let mut ecdf_xs = 0.0;
    let mut ecdf_ys = 0.0;

    // The test statistic value computed over values <= current.
    let mut statistic = 0.0;

    while i < n && j < m {
        // Advance i through duplicate samples in xs.
        let x_i = &xs[i];
        while i + 1 < n && *x_i == xs[i + 1] {
            i += 1;
        }

        // Advance j through duplicate samples in ys.
        let y_j = &ys[j];
        while j + 1 < m && *y_j == ys[j + 1] {
            j += 1;
        }

        // Step to the next sample value in the ECDF sweep from low to high.
        current = min(x_i, y_j);

        // Update invariant conditions for i, j, ecdf_xs, and ecdf_ys.
        if current == x_i {
            ecdf_xs = (i + 1) as f64 / n as f64;
            i += 1;
        }
        if current == y_j {
            ecdf_ys = (j + 1) as f64 / m as f64;
            j += 1;
        }

        // Update invariant conditions for the test statistic.
        let diff = (ecdf_xs - ecdf_ys).abs();
        if diff > statistic {
            statistic = diff;
        }
    }

    // Don't need to walk the rest of the samples because one of the ecdfs is
    // already one and the other will be increasing up to one. This means the
    // difference will be monotonically decreasing, so we have our test
    // statistic value already.
    statistic
}
/// Calculate the probability that the null hypothesis is false for a two sample
/// Kolmogorov-Smirnov test. Can only reject the null hypothesis if this
/// evidence exceeds the confidence level required.
fn calculate_reject_probability(statistic: f64, n1: usize, n2: usize) -> Result<f64, String> {
    // Only supports samples of size > 7.
    assert!(n1 > 7 && n2 > 7);

    let (n1, n2) = (n1 as f64, n2 as f64);

    // Effective sample size, then the (finite-sample corrected) argument to
    // the Kolmogorov-Smirnov probability function.
    let factor = ((n1 * n2) / (n1 + n2)).sqrt();
    let term = (factor + 0.12 + 0.11 / factor) * statistic;

    let reject_probability = 1.0 - probability_kolmogorov_smirnov(term)?;
    assert!(0.0 <= reject_probability && reject_probability <= 1.0);

    Ok(reject_probability)
}
/// Calculate the critical value for the two sample Kolmogorov-Smirnov test.
///
/// Returns the smallest test statistic value that would be rejected at the
/// given confidence level, found by binary search over the interval [0, 1].
///
/// # Panics
///
/// Panics if either sample size is 7 or smaller, or if `confidence` is not
/// strictly between zero and one.
///
/// # Errors
///
/// Returns an `Err` if the binary search does not locate the critical value
/// within 200 iterations, or if the underlying probability series fails to
/// converge.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let critical_value = ks::calculate_critical_value(256, 256, 0.95).unwrap();
/// println!("Critical value at 95% confidence for samples of size 256 is {}",
///          critical_value);
/// ```
pub fn calculate_critical_value(n1: usize, n2: usize, confidence: f64) -> Result<f64, String> {
    assert!(0.0 < confidence && confidence < 1.0);

    // The asymptotic approximation used by calculate_reject_probability only
    // supports samples of size > 7.
    assert!(n1 > 7 && n2 > 7);

    // The test statistic is between zero and one so can binary search quickly
    // for the critical value.
    let mut low = 0.0;
    let mut high = 1.0;

    for _ in 1..200 {
        // Converged to within tolerance; high maintains the invariant that
        // reject_probability(high) > confidence, so return it.
        if low + 1e-8 >= high {
            return Ok(high);
        }

        let mid = low + (high - low) / 2.0;
        let reject_probability = calculate_reject_probability(mid, n1, n2)?;

        if reject_probability > confidence {
            // Maintain invariant that reject_probability(high) > confidence.
            high = mid;
        } else {
            // Maintain invariant that reject_probability(low) <= confidence.
            low = mid;
        }
    }

    Err(format!("No convergence in calculate_critical_value({}, {}, {}).",
                n1,
                n2,
                confidence))
}
/// Calculate the Kolmogorov-Smirnov probability function Q_KS(lambda).
///
/// Evaluates the alternating series
/// `Q_KS(lambda) = 2 * sum_{j >= 1} (-1)^(j-1) * exp(-2 * j^2 * lambda^2)`,
/// truncated once a term's magnitude falls below 1e-8.
///
/// # Errors
///
/// Returns an `Err` if the series has not converged after 199 terms.
fn probability_kolmogorov_smirnov(lambda: f64) -> Result<f64, String> {
    // The series is degenerate at zero; the limiting value is one.
    if lambda == 0.0 {
        return Ok(1.0);
    }

    let minus_two_lambda_squared = -2.0 * lambda * lambda;
    let mut q_ks = 0.0;

    for j in 1..200 {
        // Terms alternate in sign: +, -, +, ...
        let sign = if j % 2 == 1 {
            1.0
        } else {
            -1.0
        };

        let j = j as f64;
        let term = sign * 2.0 * (minus_two_lambda_squared * j * j).exp();

        q_ks += term;

        if term.abs() < 1e-8 {
            // Converged. Trim accumulated rounding that may exceed 1.
            return Ok(q_ks.min(1.0));
        }
    }

    Err(format!("No convergence in probability_kolmogorov_smirnov({}).",
                lambda))
}
#[cfg(test)]
mod tests {
    extern crate quickcheck;
    extern crate rand;

    use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, StdGen};
    use self::rand::Rng;
    use std::cmp;
    use std::usize;
    use super::test;
    use ecdf::Ecdf;

    // Tolerance for floating point comparisons of test statistics.
    const EPSILON: f64 = 1e-10;

    // Run a QuickCheck property with a generator sized for Samples values.
    fn check<A: Testable>(f: A) {
        // Need - 1 to ensure space for creating non-overlapping samples.
        let g = StdGen::new(rand::thread_rng(), usize::MAX - 1);
        QuickCheck::new().gen(g).quickcheck(f);
    }

    /// Wrapper for generating sample data with QuickCheck.
    ///
    /// Samples must be sequences of u64 values with more than 7 elements.
    #[derive(Debug, Clone)]
    struct Samples {
        vec: Vec<u64>,
    }

    impl Samples {
        // Smallest sample value. Safe to unwrap: generated samples are
        // never empty (always > 7 elements).
        fn min(&self) -> u64 {
            let &min = self.vec.iter().min().unwrap();
            min
        }

        // Largest sample value; same non-empty invariant as min().
        fn max(&self) -> u64 {
            let &max = self.vec.iter().max().unwrap();
            max
        }

        // Randomly permute the sample values in place.
        fn shuffle(&mut self) {
            let mut rng = rand::thread_rng();
            rng.shuffle(&mut self.vec);
        }
    }

    impl Arbitrary for Samples {
        fn arbitrary<G: Gen>(g: &mut G) -> Samples {
            // Limit size of generated sample set to 1024
            let max = cmp::min(g.size(), 1024);

            let size = g.gen_range(8, max);
            let vec = (0..size).map(|_| u64::arbitrary(g)).collect();

            Samples { vec: vec }
        }

        fn shrink(&self) -> Box<Iterator<Item = Samples>> {
            let vec: Vec<u64> = self.vec.clone();
            let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();

            // Only keep shrunk vectors that still satisfy the more-than-7
            // elements invariant the tests rely on.
            Box::new(shrunk.filter(|v| v.len() > 7).map(|v| Samples { vec: v }))
        }
    }

    #[test]
    #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")]
    fn test_panics_on_empty_samples_set() {
        let xs: Vec<u64> = vec![];
        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        test(&xs, &ys, 0.95);
    }

    #[test]
    #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")]
    fn test_panics_on_empty_other_samples_set() {
        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let ys: Vec<u64> = vec![];
        test(&xs, &ys, 0.95);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")]
    fn test_panics_on_confidence_leq_zero() {
        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        test(&xs, &ys, 0.0);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")]
    fn test_panics_on_confidence_geq_one() {
        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        test(&xs, &ys, 1.0);
    }

    /// Alternative calculation for the test statistic for the two sample
    /// Kolmogorov-Smirnov test. This simple implementation is used as a
    /// verification check against actual calculation used.
    fn calculate_statistic_alt<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 {
        assert!(xs.len() > 0 && ys.len() > 0);

        let ecdf_xs = Ecdf::new(xs);
        let ecdf_ys = Ecdf::new(ys);

        let mut statistic = 0.0;

        // The supremum of |ECDF_xs - ECDF_ys| is attained at a sample point,
        // so checking every point of both samples suffices.
        for x in xs.iter() {
            let diff = (ecdf_xs.value(x.clone()) - ecdf_ys.value(x.clone())).abs();
            if diff > statistic {
                statistic = diff;
            }
        }

        for y in ys.iter() {
            let diff = (ecdf_xs.value(y.clone()) - ecdf_ys.value(y.clone())).abs();
            if diff > statistic {
                statistic = diff;
            }
        }

        statistic
    }

    #[test]
    fn test_calculate_statistic() {
        fn prop(xs: Samples, ys: Samples) -> bool {
            let result = test(&xs.vec, &ys.vec, 0.95);
            let actual = result.statistic;
            let expected = calculate_statistic_alt(&xs.vec, &ys.vec);

            actual == expected
        }

        check(prop as fn(Samples, Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_between_zero_and_one() {
        fn prop(xs: Samples, ys: Samples) -> bool {
            let result = test(&xs.vec, &ys.vec, 0.95);
            let actual = result.statistic;

            0.0 <= actual && actual <= 1.0
        }

        check(prop as fn(Samples, Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_zero_for_identical_samples() {
        fn prop(xs: Samples) -> bool {
            let ys = xs.clone();

            let result = test(&xs.vec, &ys.vec, 0.95);

            result.statistic == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_zero_for_permuted_sample() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();
            ys.shuffle();

            let result = test(&xs.vec, &ys.vec, 0.95);

            result.statistic == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_for_samples_with_no_overlap_in_support() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();

            // Shift ys so that ys.min > xs.max.
            let ys_min = xs.max() + 1;
            ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect();

            let result = test(&xs.vec, &ys.vec, 0.95);

            result.statistic == 1.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_half_for_sample_with_non_overlapping_in_support_replicate_added() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();

            // Shift ys so that ys.min > xs.max.
            let ys_min = xs.max() + 1;
            ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect();

            // Add all the original items back too.
            for &x in xs.vec.iter() {
                ys.vec.push(x);
            }

            let result = test(&xs.vec, &ys.vec, 0.95);

            result.statistic == 0.5
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_div_length_for_sample_with_additional_low_value() {
        fn prop(xs: Samples) -> bool {
            // Add a extra sample of early weight to ys.
            let min = xs.min();
            let mut ys = xs.clone();
            ys.vec.push(min - 1);

            let result = test(&xs.vec, &ys.vec, 0.95);
            let expected = 1.0 / ys.vec.len() as f64;

            result.statistic == expected
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_div_length_for_sample_with_additional_high_value() {
        fn prop(xs: Samples) -> bool {
            // Add a extra sample of late weight to ys.
            let max = xs.max();
            let mut ys = xs.clone();
            ys.vec.push(max + 1);

            let result = test(&xs.vec, &ys.vec, 0.95);
            let expected = 1.0 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_div_length_for_sample_with_additional_low_and_high_values() {
        fn prop(xs: Samples) -> bool {
            // Add a extra sample of late weight to ys.
            let min = xs.min();
            let max = xs.max();

            let mut ys = xs.clone();

            ys.vec.push(min - 1);
            ys.vec.push(max + 1);

            let result = test(&xs.vec, &ys.vec, 0.95);
            let expected = 1.0 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_values() {
        fn prop(xs: Samples, n: u8) -> bool {
            // Add extra sample of early weight to ys.
            let min = xs.min();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(min - (j as u64) - 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95);
            let expected = n as f64 / ys.vec.len() as f64;

            result.statistic == expected
        }

        check(prop as fn(Samples, u8) -> bool);
    }

    #[test]
    fn test_statistic_is_n_div_length_for_sample_with_additional_n_high_values() {
        fn prop(xs: Samples, n: u8) -> bool {
            // Add extra sample of early weight to ys.
            let max = xs.max();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(max + (j as u64) + 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95);
            let expected = n as f64 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples, u8) -> bool);
    }

    #[test]
    fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_and_high_values() {
        fn prop(xs: Samples, n: u8) -> bool {
            // Add extra sample of early weight to ys.
            let min = xs.min();
            let max = xs.max();

            let mut ys = xs.clone();

            for j in 0..n {
                ys.vec.push(min - (j as u64) - 1);
                ys.vec.push(max + (j as u64) + 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95);
            let expected = n as f64 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples, u8) -> bool);
    }

    #[test]
    fn test_statistic_is_n_or_m_div_length_for_sample_with_additional_n_low_and_m_high_values() {
        fn prop(xs: Samples, n: u8, m: u8) -> bool {
            // Add extra sample of early weight to ys.
            let min = xs.min();
            let max = xs.max();

            let mut ys = xs.clone();

            for j in 0..n {
                ys.vec.push(min - (j as u64) - 1);
            }

            for j in 0..m {
                ys.vec.push(max + (j as u64) + 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95);
            let expected = cmp::max(n, m) as f64 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples, u8, u8) -> bool);
    }

    #[test]
    fn test_is_rejected_if_reject_probability_greater_than_confidence() {
        fn prop(xs: Samples, ys: Samples) -> bool {
            let result = test(&xs.vec, &ys.vec, 0.95);

            if result.is_rejected {
                result.reject_probability > 0.95
            } else {
                result.reject_probability <= 0.95
            }
        }

        check(prop as fn(Samples, Samples) -> bool);
    }

    #[test]
    fn test_reject_probability_is_zero_for_identical_samples() {
        fn prop(xs: Samples) -> bool {
            let ys = xs.clone();

            let result = test(&xs.vec, &ys.vec, 0.95);

            result.reject_probability == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_reject_probability_is_zero_for_permuted_sample() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();
            ys.shuffle();

            let result = test(&xs.vec, &ys.vec, 0.95);

            result.reject_probability == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }
}

14
tools/kstest/Cargo.toml Normal file
View File

@ -0,0 +1,14 @@
[package]
name = "kstest"
version = "0.1.0"
authors = ["eugen.betke <betke@dkrz.de>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
kolmogorov_smirnov = {path = "../kslib"}
csv = "1.1"
glob = "0.3.0"
serde = { version = "1.0", features = ["derive"] }
chrono = "*"

27
tools/kstest/run.sh Executable file
View File

@ -0,0 +1,27 @@
#!/bin/bash
# Launch one background KS-test run per probe job id against the codings
# dataset. Each run writes its own similarity, progress, and failure-log file.

# Alternative (confidential) dataset and job ids, kept for reference:
#7488914 19865984
#4296426 18672376
#5024292 17944118
#dataset_fn="../../datasets/job_codings_v4_confidential.csv"
#jobids=( )
#jobids=( ${jobids[@]} 19865984 )
#jobids=( ${jobids[@]} 18672376 )
#jobids=( ${jobids[@]} 17944118 )

dataset_fn="../../datasets/job_codings_v4.csv"

# Probe job ids to process.
jobids=( )
jobids=( ${jobids[@]} 7488914 )
jobids=( ${jobids[@]} 4296426 )
jobids=( ${jobids[@]} 5024292 )

# Echo each command as it runs.
set -x

for jobid in ${jobids[@]}; do
    sim_fn="./ks_similarities_$jobid.csv"
    progress_fn="./ks_progress_$jobid.csv"
    log_fn="./ks_fail_$jobid.log"
    # NOTE(review): all runs start concurrently in the background and there is
    # no `wait`, so the script exits while the cargo processes keep running.
    cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn &
done

183
tools/kstest/src/main.rs Normal file
View File

@ -0,0 +1,183 @@
extern crate kolmogorov_smirnov as ks;
extern crate csv;
extern crate serde;
extern crate chrono;
use std::fs::File;
use std::io::prelude::*;
//use std::fs::File;
use serde::Deserialize;
use serde::Serialize;
use std::collections::HashMap;
use std::env;
use std::io::LineWriter;
// Score of a single coding segment.
pub type Score = u32;
// A job's full coding: one score per segment.
pub type JobCoding = Vec<Score>;
// Similarity derived from the KS test as 1.0 - reject_probability (see run()).
pub type Similarity = f32;
// Numeric job identifier as used in the dataset CSV.
pub type Jobid = u32;
// All retained job codings, keyed by job id.
pub type QCodings = HashMap<Jobid, JobCoding>;

/// One input row of the codings CSV.
#[derive(Debug, Deserialize)]
pub struct Record {
    jobid: u32,
    //q16_coding: String,
    // Colon-separated scores; parsed by convert_to_coding.
    ks_coding: String,
}

/// One output row of the similarities CSV.
#[derive(Debug, Serialize)]
pub struct SimilarityRow {
    pub jobid: u32,
    pub alg_id: u32,
    pub alg_name: String,
    pub similarity: f32
}

/// One output row of the timing-progress CSV.
#[derive(Debug, Serialize)]
pub struct ProgressRow {
    jobid: u32,
    alg_id: u32,
    alg_name: String,
    // Nanoseconds elapsed since the previous progress row (see run()).
    delta: i64,
}
pub fn convert_to_coding(coding: String) -> Vec<Score> {
let split = coding.split(":");
let vec: Vec<Score> = split
.filter(|s| !s.is_empty())
//.map(|s| s.parse::<F>().unwrap())
.map(|s| s.parse().unwrap())
.collect();
vec
}
//fn ks_similarity(xs: &Vec<u32>, ys: &Vec<u32>) -> Result<Similarity, String> {
// let confidence = 0.95;
// ks::test(xs, ys, confidence)?
// let reject_probability = match result {
// Ok(v) => v.reject_probability,
// Err(_) => 1.0,
// };
// //println!("is_rejected: {:?}\nstatistic: {:?}\nreject_probability: {:?}\ncritical_value: {:?}\nconfidence: {:?}", result.is_rejected, result.statistic, result.reject_probability, result.critical_value, result.confidence);
// (1.0 - reject_probability) as Similarity
//}
// Compute the KS-test similarity of every job coding in `dataset_fn` against
// the probe job `jobid`. Writes one similarity row per job to
// `similarities_fn`, a timing row every 10,000 jobs to `progress_fn`, and a
// line per non-converging KS test to `log_fn`.
fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: String, log_fn: String) {
    let mut q_codings: QCodings = HashMap::new();
    let file = File::open(&dataset_fn).expect("Unable to open dataset.");
    let mut rdr = csv::Reader::from_reader(file);

    //for result in rdr.deserialize().take(10000) {
    for result in rdr.deserialize() {
        let record: Record = result.expect("bla bla");
        //let q_coding = convert_to_coding(record.q16_coding);
        let q_coding = convert_to_coding(record.ks_coding);
        // Insert Non-Zero jobs only
        if q_coding.iter().sum::<Score>() > (0 as Score) {
            q_codings.insert(record.jobid, q_coding);
        }
    }

    // The probe coding every other job is compared against.
    // NOTE(review): indexing panics if `jobid` is absent from the dataset or
    // was filtered out above as all-zero — confirm this is intended.
    let probe = q_codings[&jobid].clone();

    let similarities_file = File::create(&similarities_fn).expect("Unable to open");
    let mut wtr_similarities = csv::Writer::from_writer(&similarities_file);

    // Fixed identifier for this similarity algorithm in the output rows.
    let alg_name = "ks";
    let alg_id = 6;

    let progress_file = File::create(&progress_fn).expect("Unable to open");
    let mut wtr_progress = csv::Writer::from_writer(&progress_file);
    let mut start = chrono::Utc::now();
    let mut counter = 1;

    let mut avail_codings: Vec<(u32, &JobCoding)>;
    avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect();

    let mut similarities: Vec<(Jobid, Similarity)> = Vec::new();

    let log_file = File::create(&log_fn).expect("Unable to open");
    let mut log_file = LineWriter::new(log_file);

    while let Some((jobid, q_coding)) = avail_codings.pop() {
        // Emit a timing row every 10,000 processed jobs.
        if (counter % 10_000) == 0 {
            let stop = chrono::Utc::now();
            let progress_row = ProgressRow {
                jobid: jobid,
                alg_id: alg_id,
                alg_name: String::from(alg_name),
                delta: ((stop - start).num_nanoseconds().unwrap())
            };
            wtr_progress.serialize(progress_row).unwrap();
            start = stop;
        }

        //println!("Processing {:?}", jobid);
        //let similarity = ks_similarity(q_coding, &probe);
        let confidence = 0.95;
        // Similarity is 1 - reject_probability; a failed (non-converging) KS
        // test is logged and treated as maximally similar (1.0).
        let similarity = match ks::test(q_coding, &probe, confidence) {
            Ok(sim) => (1.0 - sim.reject_probability) as Similarity,
            Err(e) => {
                let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
                log_file.write_all(message.as_bytes()).unwrap();
                1.0
            }
        };

        similarities.push((jobid, similarity));
        counter += 1;
    }

    // Write all similarity rows at the end, after the main loop.
    for (jobid, similarity) in similarities.iter() {
        let similarity_row = SimilarityRow {
            jobid: *jobid,
            alg_id: alg_id,
            alg_name: String::from(alg_name),
            similarity: *similarity,
        };
        wtr_similarities.serialize(similarity_row).unwrap();
    }

    log_file.flush().unwrap();
}
/// Command line entry point.
///
/// Usage: kstest <dataset.csv> <jobid> <similarities.csv> <progress.csv> <fail.log>
fn main() {
    let args: Vec<String> = env::args().collect();

    // Fail early with a usage message instead of an opaque index panic.
    assert!(args.len() >= 6,
            "Usage: kstest <dataset.csv> <jobid> <similarities.csv> <progress.csv> <fail.log>");

    let dataset_fn = args[1].clone();
    let jobid = args[2].parse::<u32>().expect("<jobid> must be an unsigned integer");
    let sim_fn = args[3].clone();
    let progress_fn = args[4].clone();
    let log_fn = args[5].clone();

    // Echo the invocation for traceability in batch logs.
    println!("{:?}", args);
    run(dataset_fn, jobid, sim_fn, progress_fn, log_fn);
}
// NOTE(review): this module is not gated with #[cfg(test)], so it is compiled
// into every build, and `ks_test` is not defined anywhere in this file — as
// written this cannot compile. Presumably a small helper wrapping ks::test
// was intended (and `use super::*;` re-enabled); confirm and restore it.
mod tests {
    //use super::*;

    #[test]
    fn test_ks_test() {
        // The same sample in ascending and descending order — order must not
        // affect the test.
        let xs = vec!(0.0 , 1.0 , 2.0 , 3.0 , 4.0 , 5.0 , 6.0 , 7.0 , 8.0 , 9.0 , 10.0 , 11.0 , 12.0);
        let ys = vec!(12.0 , 11.0 , 10.0 , 9.0 , 8.0 , 7.0 , 6.0 , 5.0 , 4.0 , 3.0 , 2.0 , 1.0 , 0.0);
        ks_test(xs, ys);

        // Two real job codings of different lengths.
        let c1 = vec![141.0,143.0,142.0,238.0,132.0,486.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0];
        let c2 = vec![239.0,239.0,255.0,255.0,239.0,239.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,502.0,511.0,503.0];
        ks_test(c1, c2);

        // Two short codings with overlapping support.
        let c1 = vec![2.0,2.0,2.0,9.0,3.0,0.0,0.0,0.0];
        let c2 = vec![2.0,2.0,2.0,2.0,8.0,3.0,0.0,10.0];
        ks_test(c1, c2);
    }
}