diff --git a/tools/kslib/Cargo.toml b/tools/kslib/Cargo.toml
new file mode 100644
index 0000000..143447d
--- /dev/null
+++ b/tools/kslib/Cargo.toml
@@ -0,0 +1,36 @@
+[package]
+name = "kolmogorov_smirnov"
+description = "Implementation of the Kolmogorov-Smirnov statistical test as a Rust library."
+version = "1.1.0"
+homepage = "http://daithiocrualaoich.github.io/kolmogorov_smirnov"
+repository = "https://github.com/daithiocrualaoich/kolmogorov_smirnov"
+authors = ["Daithi O Crualaoich"]
+license = "Apache-2.0"
+keywords = ["statistics"]
+exclude = [
+    ".dockerignore",
+    ".gitignore",
+    ".travis.yml",
+    "Cargo.lock",
+    "Dockerfile",
+    "dat/*",
+    "doc/*",
+]
+
+[[bin]]
+name = "critical_values"
+
+[[bin]]
+name = "normal"
+
+[[bin]]
+name = "ks_f64"
+
+[[bin]]
+name = "ks_i64"
+
+[dependencies]
+rand = "0.3.12"
+
+[dev-dependencies]
+quickcheck = "0.2"
diff --git a/tools/kslib/src/bin/critical_values.rs b/tools/kslib/src/bin/critical_values.rs
new file mode 100644
index 0000000..61c1eb5
--- /dev/null
+++ b/tools/kslib/src/bin/critical_values.rs
@@ -0,0 +1,39 @@
+extern crate kolmogorov_smirnov as ks;
+
+use ks::calculate_critical_value;
+
+use std::env;
+
+/// Calculate critical values dataset.
+///
+/// # Examples
+///
+/// ```bash
+/// cargo run --bin critical_values <confidence> <n1> <limit>
+/// ```
+///
+/// This will print the critical values of the Kolmogorov-Smirnov two sample
+/// test for samples of size `<n1>` against samples of sizes 16
+/// through `<limit>` inclusive at the specified confidence level.
+///
+/// `<n1>` and `<limit>` must be positive integers, `<confidence>` must
+/// be a floating point number strictly between zero and one.
+fn main() {
+    let args: Vec<String> = env::args().collect();
+
+    let confidence: f64 = args[1].parse().expect("<confidence> must be a floating point number.");
+    let n1: usize = args[2].parse().expect("<n1> must be an integer.");
+    let limit: usize = args[3].parse().expect("<limit> must be an integer.");
+
+    assert!(n1 > 0 && limit > 0);
+    assert!(0.0 < confidence && confidence < 1.0);
+
+    println!("n1\tn2\tconfidence\tcritical_value");
+    for n2 in 16..(limit + 1) {
+        println!("{}\t{}\t{}\t{}",
+                 n1,
+                 n2,
+                 confidence,
+                 calculate_critical_value(n1, n2, confidence).unwrap());
+    }
+}
diff --git a/tools/kslib/src/bin/ks_f64.rs b/tools/kslib/src/bin/ks_f64.rs
new file mode 100644
index 0000000..f8d29e0
--- /dev/null
+++ b/tools/kslib/src/bin/ks_f64.rs
@@ -0,0 +1,52 @@
+extern crate kolmogorov_smirnov as ks;
+
+use ks::test_f64;
+
+use std::env;
+use std::io::{BufReader, BufRead};
+use std::fs::File;
+use std::path::Path;
+
+fn parse_float(s: String) -> f64 {
+    s.parse::<f64>().expect("Not a floating point number.")
+}
+
+/// Runs a Kolmogorov-Smirnov test on floating point data files.
+///
+/// Input files must be single-column headerless data files. The data samples
+/// are tested against each other at the 0.95 confidence level.
+///
+/// # Examples
+///
+/// ```bash
+/// cargo run --bin ks_f64 <file1> <file2>
+/// ```
+///
+/// This will print the test result to standard output.
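+///
+/// As an illustration (the file name and values are hypothetical), a valid
+/// input file is just one sample per line:
+///
+/// ```text
+/// 0.12
+/// 3.40
+/// 1.25
+/// ```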
+fn main() {
+    let args: Vec<String> = env::args().collect();
+
+    let path1 = Path::new(&args[1]);
+    let path2 = Path::new(&args[2]);
+
+    let file1 = BufReader::new(File::open(&path1).unwrap());
+    let file2 = BufReader::new(File::open(&path2).unwrap());
+
+    let lines1 = file1.lines().map(|line| line.unwrap());
+    let lines2 = file2.lines().map(|line| line.unwrap());
+
+    let xs: Vec<f64> = lines1.map(parse_float).collect();
+    let ys: Vec<f64> = lines2.map(parse_float).collect();
+
+    let result = test_f64(&xs, &ys, 0.95).unwrap();
+
+    if result.is_rejected {
+        println!("Samples are from different distributions.");
+    } else {
+        println!("Samples are from the same distribution.");
+    }
+
+    println!("test statistic = {}", result.statistic);
+    println!("critical value = {}", result.critical_value);
+    println!("reject probability = {}", result.reject_probability);
+}
diff --git a/tools/kslib/src/bin/ks_i64.rs b/tools/kslib/src/bin/ks_i64.rs
new file mode 100644
index 0000000..fa24fce
--- /dev/null
+++ b/tools/kslib/src/bin/ks_i64.rs
@@ -0,0 +1,52 @@
+extern crate kolmogorov_smirnov as ks;
+
+use ks::test;
+
+use std::env;
+use std::io::{BufReader, BufRead};
+use std::fs::File;
+use std::path::Path;
+
+fn parse_int(s: String) -> i64 {
+    s.parse::<i64>().expect("Not an integer.")
+}
+
+/// Runs a Kolmogorov-Smirnov test on integer data files.
+///
+/// Input files must be single-column headerless data files. The data samples
+/// are tested against each other at the 0.95 confidence level.
+///
+/// # Examples
+///
+/// ```bash
+/// cargo run --bin ks_i64 <file1> <file2>
+/// ```
+///
+/// This will print the test result to standard output.
+fn main() {
+    let args: Vec<String> = env::args().collect();
+
+    let path1 = Path::new(&args[1]);
+    let path2 = Path::new(&args[2]);
+
+    let file1 = BufReader::new(File::open(&path1).unwrap());
+    let file2 = BufReader::new(File::open(&path2).unwrap());
+
+    let lines1 = file1.lines().map(|line| line.unwrap());
+    let lines2 = file2.lines().map(|line| line.unwrap());
+
+    let xs: Vec<i64> = lines1.map(parse_int).collect();
+    let ys: Vec<i64> = lines2.map(parse_int).collect();
+
+    let result = test(&xs, &ys, 0.95).unwrap();
+
+    if result.is_rejected {
+        println!("Samples are from different distributions.");
+    } else {
+        println!("Samples are from the same distribution.");
+    }
+
+    println!("test statistic = {}", result.statistic);
+    println!("critical value = {}", result.critical_value);
+    println!("reject probability = {}", result.reject_probability);
+}
diff --git a/tools/kslib/src/bin/normal.rs b/tools/kslib/src/bin/normal.rs
new file mode 100644
index 0000000..2f6f318
--- /dev/null
+++ b/tools/kslib/src/bin/normal.rs
@@ -0,0 +1,35 @@
+extern crate rand;
+
+use std::env;
+use rand::distributions::{Normal, IndependentSample};
+
+/// Prints a sequence of Normal deviates.
+///
+/// # Examples
+///
+/// ```bash
+/// cargo run --bin normal <n> <mean> <variance>
+/// ```
+///
+/// This will print `<n>` floating point numbers, one per line, to
+/// standard output. These numbers will have a Normal distribution with the
+/// specified mean and variance.
+///
+/// `<n>` must be a positive integer; `<mean>` and `<variance>` may
+/// be integers or floating point numbers but `<variance>` must be positive.
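+///
+/// As an illustration (the output below is made up; the deviates are
+/// random):
+///
+/// ```bash
+/// cargo run --bin normal 3 0.0 1.0
+/// # -0.2714
+/// # 1.0350
+/// # 0.4422
+/// ```
+///
+/// Note that `rand::distributions::Normal::new` takes a standard deviation,
+/// not a variance, which is why the code below passes `variance.sqrt()`.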
+fn main() {
+    let args: Vec<String> = env::args().collect();
+
+    let n: u32 = args[1].parse().expect("<n> must be an integer.");
+    let mean: f64 = args[2].parse().expect("<mean> must be a floating point number.");
+    let variance: f64 = args[3].parse().expect("<variance> must be a floating point number.");
+
+    assert!(n > 0 && variance.is_sign_positive());
+
+    let mut rng = rand::thread_rng();
+    let normal = Normal::new(mean, variance.sqrt());
+
+    for _ in 0..n {
+        let x = normal.ind_sample(&mut rng);
+        println!("{}", x);
+    }
+}
diff --git a/tools/kslib/src/ecdf.rs b/tools/kslib/src/ecdf.rs
new file mode 100644
index 0000000..790f50b
--- /dev/null
+++ b/tools/kslib/src/ecdf.rs
@@ -0,0 +1,853 @@
+//! Empirical cumulative distribution function.
+
+pub struct Ecdf<T: Ord> {
+    samples: Vec<T>,
+    length: usize,
+}
+
+impl<T: Ord + Clone> Ecdf<T> {
+    /// Construct a new representation of a cumulative distribution function
+    /// for a given sample.
+    ///
+    /// The construction will involve computing a sorted clone of the given
+    /// sample and may be inefficient or completely prohibitive for large
+    /// samples. This computation is amortized significantly if there is
+    /// heavy use of the value function.
+    ///
+    /// # Panics
+    ///
+    /// The sample set must be non-empty.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// extern crate kolmogorov_smirnov as ks;
+    ///
+    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    /// let ecdf = ks::Ecdf::new(&samples);
+    /// ```
+    pub fn new(samples: &[T]) -> Ecdf<T> {
+        let length = samples.len();
+        assert!(length > 0);
+
+        // Sort a copied sample for binary searching.
+        let mut sorted = samples.to_vec();
+        sorted.sort();
+
+        Ecdf {
+            samples: sorted,
+            length: length,
+        }
+    }
+
+    /// Calculate a value of the empirical cumulative distribution function
+    /// for a given sample.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// extern crate kolmogorov_smirnov as ks;
+    ///
+    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    /// let ecdf = ks::Ecdf::new(&samples);
+    /// assert_eq!(ecdf.value(4), 0.5);
+    /// ```
+    pub fn value(&self, t: T) -> f64 {
+        let num_samples_leq_t = match self.samples.binary_search(&t) {
+            Ok(mut index) => {
+                // At least one sample is a t and we have the index of it. Need
+                // to walk down the sorted samples until the last one that
+                // == t.
+                while index + 1 < self.length && self.samples[index + 1] == t {
+                    index += 1;
+                }
+
+                // Compensate for 0-based indexing.
+                index + 1
+            }
+            Err(index) => {
+                // No sample is a t but if we had to put one in it would go at
+                // index. This means all indices to the left have samples < t
+                // and should be counted in the cdf proportion. We must take
+                // one from index to get the last included sample but then we
+                // just have to add one again to account for 0-based indexing.
+                index
+            }
+        };
+
+        num_samples_leq_t as f64 / self.length as f64
+    }
+
+    /// Calculate a percentile for the sample using the Nearest Rank method.
+    ///
+    /// # Panics
+    ///
+    /// The percentile requested must be between 1 and 100 inclusive. In
+    /// particular, there is no 0-percentile.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// extern crate kolmogorov_smirnov as ks;
+    ///
+    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    /// let ecdf = ks::Ecdf::new(&samples);
+    /// assert_eq!(ecdf.percentile(50), 4);
+    /// ```
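+    ///
+    /// As a worked example of the Nearest Rank arithmetic: for the ten
+    /// samples above and `p = 50`, the rank is `ceil(50 * 10 / 100) = 5`,
+    /// so the 50-percentile is the 5th smallest sample, which is `4` in the
+    /// sorted samples 0 through 9.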
+    pub fn percentile(&self, p: u8) -> T {
+        assert!(0 < p && p <= 100);
+
+        let rank = (p as f64 * self.length as f64 / 100.0).ceil() as usize;
+        self.samples[rank - 1].clone()
+    }
+
+    /// Return the minimal element of the samples.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// extern crate kolmogorov_smirnov as ks;
+    ///
+    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    /// let ecdf = ks::Ecdf::new(&samples);
+    /// assert_eq!(ecdf.min(), 0);
+    /// ```
+    pub fn min(&self) -> T {
+        self.samples[0].clone()
+    }
+
+    /// Return the maximal element of the samples.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// extern crate kolmogorov_smirnov as ks;
+    ///
+    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    /// let ecdf = ks::Ecdf::new(&samples);
+    /// assert_eq!(ecdf.max(), 9);
+    /// ```
+    pub fn max(&self) -> T {
+        self.samples[self.samples.len() - 1].clone()
+    }
+}
+
+/// Calculate a one-time value of the empirical cumulative distribution
+/// function for a given sample.
+///
+/// Computational running time of this function is O(n) but does not amortize
+/// across multiple calls like Ecdf::value. This function should only be
+/// used in the case that a small number of ECDF values are required for the
+/// sample. Otherwise, Ecdf::new should be used to create a structure that
+/// takes the upfront O(n log n) sort cost but calculates values in O(log n).
+///
+/// # Panics
+///
+/// The sample set must be non-empty.
+///
+/// # Examples
+///
+/// ```
+/// extern crate kolmogorov_smirnov as ks;
+///
+/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+/// let value = ks::ecdf(&samples, 4);
+/// assert_eq!(value, 0.5);
+/// ```
+pub fn ecdf<T: Ord>(samples: &[T], t: T) -> f64 {
+    let mut num_samples_leq_t = 0;
+    let mut length = 0;
+
+    for sample in samples.iter() {
+        length += 1;
+        if *sample <= t {
+            num_samples_leq_t += 1;
+        }
+    }
+
+    assert!(length > 0);
+
+    num_samples_leq_t as f64 / length as f64
+}
+
+/// Calculate a one-time percentile for a given sample using the Nearest Rank
+/// method and Quick Select.
+///
+/// Computational running time of this function is O(n) but does not amortize
+/// across multiple calls like Ecdf::percentile. This function should only be
+/// used in the case that a small number of percentiles are required for the
+/// sample. Otherwise, Ecdf::new should be used to create a structure that
+/// takes the upfront O(n log n) sort cost but calculates percentiles in O(1).
+///
+/// # Panics
+///
+/// The sample set must be non-empty.
+///
+/// The percentile requested must be between 1 and 100 inclusive. In
+/// particular, there is no 0-percentile.
+///
+/// # Examples
+///
+/// ```
+/// extern crate kolmogorov_smirnov as ks;
+///
+/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+/// let percentile = ks::percentile(&samples, 50);
+/// assert_eq!(percentile, 4);
+/// ```
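+///
+/// A caveat on the O(n) claim: the Quick Select below always pivots on the
+/// first element of the current range, so the linear running time holds in
+/// expectation on typical input, while unfavourable (e.g. already sorted)
+/// input can degrade to the quadratic worst case common to simple pivot
+/// rules.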
+pub fn percentile<T: Ord + Clone>(samples: &[T], p: u8) -> T {
+    assert!(0 < p && p <= 100);
+
+    let length = samples.len();
+    assert!(length > 0);
+
+    let rank = (p as f64 * length as f64 / 100.0).ceil() as usize;
+
+    // Quick Select the element at rank.
+
+    let mut samples: Vec<T> = samples.to_vec();
+    let mut low = 0;
+    let mut high = length;
+
+    loop {
+        assert!(low < high);
+
+        let pivot = samples[low].clone();
+
+        if low >= high - 1 {
+            return pivot;
+        }
+
+        // First determine if the rank item is less than the pivot.
+
+        // Organise samples so that all items less than pivot are to the
+        // left, `bottom` is the number of items less than pivot.
+
+        let mut bottom = low;
+        let mut top = high - 1;
+
+        while bottom < top {
+            while bottom < top && samples[bottom] < pivot {
+                bottom += 1;
+            }
+            while bottom < top && samples[top] >= pivot {
+                top -= 1;
+            }
+
+            if bottom < top {
+                samples.swap(bottom, top);
+            }
+        }
+
+        if rank <= bottom {
+            // Rank item is less than pivot. Exclude pivot and larger items.
+            high = bottom;
+        } else {
+            // Rank item is pivot or in the larger set. Exclude smaller items.
+            low = bottom;
+
+            // Next, determine if the pivot is the rank item.
+
+            // Organise samples so that all items less than or equal to pivot
+            // are to the left, `bottom` is the number of items less than or
+            // equal to pivot. Since the left is already less than the pivot,
+            // this just requires moving the pivots left also.
+
+            let mut bottom = low;
+            let mut top = high - 1;
+
+            while bottom < top {
+                while bottom < top && samples[bottom] == pivot {
+                    bottom += 1;
+                }
+                while bottom < top && samples[top] != pivot {
+                    top -= 1;
+                }
+
+                if bottom < top {
+                    samples.swap(bottom, top);
+                }
+            }
+
+            // Is pivot the rank item?
+
+            if rank <= bottom {
+                return pivot;
+            }
+
+            // Rank item is greater than pivot. Exclude pivot and smaller
+            // items.
+            low = bottom;
+        }
+    }
+}
+
+
+#[cfg(test)]
+mod tests {
+    extern crate quickcheck;
+    extern crate rand;
+
+    use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, TestResult, StdGen};
+    use std::cmp;
+    use std::usize;
+    use super::{Ecdf, ecdf, percentile};
+
+    fn check<A: Testable>(f: A) {
+        let g = StdGen::new(rand::thread_rng(), usize::MAX);
+        QuickCheck::new().gen(g).quickcheck(f);
+    }
+
+    /// Wrapper for generating sample data with QuickCheck.
+    ///
+    /// Samples must be non-empty sequences of u64 values.
+    #[derive(Debug, Clone)]
+    struct Samples {
+        vec: Vec<u64>,
+    }
+
+    impl Arbitrary for Samples {
+        fn arbitrary<G: Gen>(g: &mut G) -> Samples {
+            // Limit size of generated sample set to 1024.
+            let max = cmp::min(g.size(), 1024);
+
+            let size = g.gen_range(1, max);
+            let vec = (0..size).map(|_| u64::arbitrary(g)).collect();
+
+            Samples { vec: vec }
+        }
+
+        fn shrink(&self) -> Box<Iterator<Item = Samples>> {
+            let vec: Vec<u64> = self.vec.clone();
+            let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();
+
+            Box::new(shrunk.filter(|v| v.len() > 0).map(|v| Samples { vec: v }))
+        }
+    }
+
+    /// Wrapper for generating percentile query value data with QuickCheck.
+    ///
+    /// Percentile must be a u8 between 1 and 100 inclusive.
+    #[derive(Debug, Clone)]
+    struct Percentile {
+        val: u8,
+    }
+
+    impl Arbitrary for Percentile {
+        fn arbitrary<G: Gen>(g: &mut G) -> Percentile {
+            let val = g.gen_range(1, 101) as u8;
+
+            Percentile { val: val }
+        }
+
+        fn shrink(&self) -> Box<Iterator<Item = Percentile>> {
+            let shrunk: Box<Iterator<Item = u8>> = self.val.shrink();
+
+            Box::new(shrunk.filter(|&v| 0u8 < v && v <= 100u8).map(|v| Percentile { val: v }))
+        }
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: length > 0")]
+    fn single_use_ecdf_panics_on_empty_samples_set() {
+        let xs: Vec<u64> = vec![];
+        ecdf(&xs, 0);
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: length > 0")]
+    fn multiple_use_ecdf_panics_on_empty_samples_set() {
+        let xs: Vec<u64> = vec![];
+        Ecdf::new(&xs);
+    }
+
+    #[test]
+    fn single_use_ecdf_between_zero_and_one() {
+        fn prop(xs: Samples, val: u64) -> bool {
+            let actual = ecdf(&xs.vec, val);
+
+            0.0 <= actual && actual <= 1.0
+        }
+
+        check(prop as fn(Samples, u64) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_ecdf_between_zero_and_one() {
+        fn prop(xs: Samples, val: u64) -> bool {
+            let ecdf = Ecdf::new(&xs.vec);
+            let actual = ecdf.value(val);
+
+            0.0 <= actual && actual <= 1.0
+        }
+
+        check(prop as fn(Samples, u64) -> bool);
+    }
+
+    #[test]
+    fn single_use_ecdf_is_an_increasing_function() {
+        fn prop(xs: Samples, val: u64) -> bool {
+            let actual = ecdf(&xs.vec, val);
+
+            ecdf(&xs.vec, val - 1) <= actual && actual <= ecdf(&xs.vec, val + 1)
+        }
+
+        check(prop as fn(Samples, u64) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_ecdf_is_an_increasing_function() {
+        fn prop(xs: Samples, val: u64) -> bool {
+            let ecdf = Ecdf::new(&xs.vec);
+            let actual = ecdf.value(val);
+
+            ecdf.value(val - 1) <= actual && actual <= ecdf.value(val + 1)
+        }
+
+        check(prop as fn(Samples, u64) -> bool);
+    }
+
+    #[test]
+    fn single_use_ecdf_sample_min_minus_one_is_zero() {
+        fn prop(xs: Samples) -> bool {
+            let &min = xs.vec.iter().min().unwrap();
+
+            ecdf(&xs.vec, min - 1) == 0.0
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_ecdf_sample_min_minus_one_is_zero() {
+        fn prop(xs: Samples) -> bool {
+            let &min = xs.vec.iter().min().unwrap();
+            let ecdf = Ecdf::new(&xs.vec);
+
+            ecdf.value(min - 1) == 0.0
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn single_use_ecdf_sample_max_is_one() {
+        fn prop(xs: Samples) -> bool {
+            let &max = xs.vec.iter().max().unwrap();
+
+            ecdf(&xs.vec, max) == 1.0
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_ecdf_sample_max_is_one() {
+        fn prop(xs: Samples) -> bool {
+            let &max = xs.vec.iter().max().unwrap();
+            let ecdf = Ecdf::new(&xs.vec);
+
+            ecdf.value(max) == 1.0
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn single_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
+        fn prop(xs: Samples) -> bool {
+            let &val = xs.vec.first().unwrap();
+            let num_samples = xs.vec
+                                .iter()
+                                .filter(|&&x| x <= val)
+                                .count();
+            let expected = num_samples as f64 / xs.vec.len() as f64;
+
+            ecdf(&xs.vec, val) == expected
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
+        fn prop(xs: Samples) -> bool {
+            let &val = xs.vec.first().unwrap();
+            let num_samples = xs.vec
+                                .iter()
+                                .filter(|&&x| x <= val)
+                                .count();
+            let expected = num_samples as f64 / xs.vec.len() as f64;
+
+            let ecdf = Ecdf::new(&xs.vec);
+
+            ecdf.value(val) == expected
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn single_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
+        fn prop(xs: Samples, val: u64) -> TestResult {
+            let length = xs.vec.len();
+
+            if xs.vec.iter().any(|&x| x == val) {
+                // Discard Vec containing val.
+                return TestResult::discard();
+            }
+
+            let num_samples = xs.vec
+                                .iter()
+                                .filter(|&&x| x <= val)
+                                .count();
+            let expected = num_samples as f64 / length as f64;
+
+            let actual = ecdf(&xs.vec, val);
+
+            TestResult::from_bool(actual == expected)
+        }
+
+        check(prop as fn(Samples, u64) -> TestResult);
+    }
+
+    #[test]
+    fn multiple_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
+        fn prop(xs: Samples, val: u64) -> TestResult {
+            let length = xs.vec.len();
+
+            if xs.vec.iter().any(|&x| x == val) {
+                // Discard Vec containing val.
+                return TestResult::discard();
+            }
+
+            let num_samples = xs.vec
+                                .iter()
+                                .filter(|&&x| x <= val)
+                                .count();
+            let expected = num_samples as f64 / length as f64;
+
+            let ecdf = Ecdf::new(&xs.vec);
+
+            TestResult::from_bool(ecdf.value(val) == expected)
+        }
+
+        check(prop as fn(Samples, u64) -> TestResult);
+    }
+
+    #[test]
+    fn single_and_multiple_use_ecdf_agree() {
+        fn prop(xs: Samples, val: u64) -> bool {
+            let multiple_use = Ecdf::new(&xs.vec);
+
+            multiple_use.value(val) == ecdf(&xs.vec, val)
+        }
+
+        check(prop as fn(Samples, u64) -> bool);
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
+    fn single_use_percentiles_panics_on_zero_percentile() {
+        let xs: Vec<u64> = vec![0];
+
+        percentile(&xs, 0);
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
+    fn single_use_percentiles_panics_on_101_percentile() {
+        let xs: Vec<u64> = vec![0];
+
+        percentile(&xs, 101);
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
+    fn multiple_use_percentiles_panics_on_zero_percentile() {
+        let xs: Vec<u64> = vec![0];
+        let ecdf = Ecdf::new(&xs);
+
+        ecdf.percentile(0);
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
+    fn multiple_use_percentiles_panics_on_101_percentile() {
+        let xs: Vec<u64> = vec![0];
+        let ecdf = Ecdf::new(&xs);
+
+        ecdf.percentile(101);
+    }
+
+    #[test]
+    fn single_use_percentile_between_samples_min_and_max() {
+        fn prop(xs: Samples, p: Percentile) -> bool {
+            let &min = xs.vec.iter().min().unwrap();
+            let &max = xs.vec.iter().max().unwrap();
+
+            let actual = percentile(&xs.vec, p.val);
+
+            min <= actual && actual <= max
+        }
+
+        check(prop as fn(Samples, Percentile) -> bool);
+    }
+
+    #[test]
+    fn single_use_percentile_is_an_increasing_function() {
+        fn prop(xs: Samples, p: Percentile) -> bool {
+            let smaller = cmp::max(p.val - 1, 1);
+            let larger = cmp::min(p.val + 1, 100);
+
+            let actual = percentile(&xs.vec, p.val);
+
+            percentile(&xs.vec, smaller) <= actual && actual <= percentile(&xs.vec, larger)
+        }
+
+        check(prop as fn(Samples, Percentile) -> bool);
+    }
+
+    #[test]
+    fn single_use_percentile_100_is_sample_max() {
+        fn prop(xs: Samples) -> bool {
+            let &max = xs.vec.iter().max().unwrap();
+
+            percentile(&xs.vec, 100) == max
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_percentile_between_samples_min_and_max() {
+        fn prop(xs: Samples, p: Percentile) -> bool {
+            let &min = xs.vec.iter().min().unwrap();
+            let &max = xs.vec.iter().max().unwrap();
+
+            let ecdf = Ecdf::new(&xs.vec);
+            let actual = ecdf.percentile(p.val);
+
+            min <= actual && actual <= max
+        }
+
+        check(prop as fn(Samples, Percentile) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_percentile_is_an_increasing_function() {
+        fn prop(xs: Samples, p: Percentile) -> bool {
+            let smaller = cmp::max(p.val - 1, 1);
+            let larger = cmp::min(p.val + 1, 100);
+
+            let ecdf = Ecdf::new(&xs.vec);
+            let actual = ecdf.percentile(p.val);
+
+            ecdf.percentile(smaller) <= actual && actual <= ecdf.percentile(larger)
+        }
+
+        check(prop as fn(Samples, Percentile) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_percentile_100_is_sample_max() {
+        fn prop(xs: Samples) -> bool {
+            let &max = xs.vec.iter().max().unwrap();
+            let ecdf = Ecdf::new(&xs.vec);
+
+            ecdf.percentile(100) == max
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn single_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
+        fn prop(xs: Samples, val: u64) -> TestResult {
+            let actual = ecdf(&xs.vec, val);
+
+            let p = (actual * 100.0).floor() as u8;
+
+            match p {
+                0 => {
+                    // val is below the first percentile threshold. Can't
+                    // calculate 0-percentile value so discard.
+                    TestResult::discard()
+                }
+                _ => {
+                    // Not equal because e.g. all percentiles of [0] are 0. So
+                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
+                    let single_use = percentile(&xs.vec, p);
+                    TestResult::from_bool(single_use <= val)
+                }
+            }
+        }
+
+        check(prop as fn(Samples, u64) -> TestResult);
+    }
+
+    #[test]
+    fn single_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
+        fn prop(xs: Samples, val: u64) -> TestResult {
+            let actual = ecdf(&xs.vec, val);
+
+            let p = (actual * 100.0).floor() as u8;
+
+            match p {
+                0 => {
+                    // val is below the first percentile threshold. Can't
+                    // calculate 0-percentile value so discard.
+                    TestResult::discard()
+                }
+                _ => {
+                    // Not equal because e.g. all percentiles of [0] are 0. So
+                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
+                    let multiple_use = Ecdf::new(&xs.vec);
+                    TestResult::from_bool(multiple_use.percentile(p) <= val)
+                }
+            }
+        }
+
+        check(prop as fn(Samples, u64) -> TestResult);
+    }
+
+    #[test]
+    fn multiple_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
+        fn prop(xs: Samples, val: u64) -> TestResult {
+            let ecdf = Ecdf::new(&xs.vec);
+            let actual = ecdf.value(val);
+
+            let p = (actual * 100.0).floor() as u8;
+
+            match p {
+                0 => {
+                    // val is below the first percentile threshold. Can't
+                    // calculate 0-percentile value so discard.
+                    TestResult::discard()
+                }
+                _ => {
+                    // Not equal because e.g. all percentiles of [0] are 0. So
+                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
+                    let single_use = percentile(&xs.vec, p);
+                    TestResult::from_bool(single_use <= val)
+                }
+            }
+        }
+
+        check(prop as fn(Samples, u64) -> TestResult);
+    }
+
+    #[test]
+    fn multiple_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
+        fn prop(xs: Samples, val: u64) -> TestResult {
+            let ecdf = Ecdf::new(&xs.vec);
+            let actual = ecdf.value(val);
+
+            let p = (actual * 100.0).floor() as u8;
+
+            match p {
+                0 => {
+                    // val is below the first percentile threshold. Can't
+                    // calculate 0-percentile value so discard.
+                    TestResult::discard()
+                }
+                _ => {
+                    // Not equal because e.g. all percentiles of [0] are 0. So
+                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
+                    TestResult::from_bool(ecdf.percentile(p) <= val)
+                }
+            }
+        }
+
+        check(prop as fn(Samples, u64) -> TestResult);
+    }
+
+    #[test]
+    fn single_use_percentile_followed_by_single_use_ecdf_is_geq_original_value() {
+        fn prop(xs: Samples, p: Percentile) -> bool {
+            let actual = percentile(&xs.vec, p.val);
+
+            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are
+            // 0. So original value of 1 gives percentile == 0 and
+            // ecdf(0) == 0.5.
+            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
+        }
+
+        check(prop as fn(Samples, Percentile) -> bool);
+    }
+
+    #[test]
+    fn single_use_percentile_followed_by_multiple_use_ecdf_is_geq_original_value() {
+        fn prop(xs: Samples, p: Percentile) -> bool {
+            let actual = percentile(&xs.vec, p.val);
+
+            let ecdf = Ecdf::new(&xs.vec);
+
+            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are
+            // 0. So original value of 1 gives percentile == 0 and
+            // ecdf(0) == 0.5.
+            p.val as f64 / 100.0 <= ecdf.value(actual)
+        }
+
+        check(prop as fn(Samples, Percentile) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_percentile_followed_by_single_use_ecdf_is_geq_original_value() {
+        fn prop(xs: Samples, p: Percentile) -> bool {
+            let multiple_use = Ecdf::new(&xs.vec);
+            let actual = multiple_use.percentile(p.val);
+
+            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are
+            // 0. So original value of 1 gives percentile == 0 and
+            // ecdf(0) == 0.5.
+            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
+        }
+
+        check(prop as fn(Samples, Percentile) -> bool);
+    }
+
+    #[test]
+    fn multiple_use_percentile_followed_by_multiple_use_ecdf_is_geq_original_value() {
+        fn prop(xs: Samples, p: Percentile) -> bool {
+            let ecdf = Ecdf::new(&xs.vec);
+            let actual = ecdf.percentile(p.val);
+
+            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are
+            // 0. So original value of 1 gives percentile == 0 and
+            // ecdf(0) == 0.5.
+            p.val as f64 / 100.0 <= ecdf.value(actual)
+        }
+
+        check(prop as fn(Samples, Percentile) -> bool);
+    }
+
+    #[test]
+    fn single_and_multiple_use_percentile_agree() {
+        fn prop(xs: Samples, p: Percentile) -> bool {
+            let multiple_use = Ecdf::new(&xs.vec);
+
+            multiple_use.percentile(p.val) == percentile(&xs.vec, p.val)
+        }
+
+        check(prop as fn(Samples, Percentile) -> bool);
+    }
+
+    #[test]
+    fn min_is_leq_all_samples() {
+        fn prop(xs: Samples) -> bool {
+            let multiple_use = Ecdf::new(&xs.vec);
+            let actual = multiple_use.min();
+
+            xs.vec.iter().all(|&x| actual <= x)
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn max_is_geq_all_samples() {
+        fn prop(xs: Samples) -> bool {
+            let multiple_use = Ecdf::new(&xs.vec);
+            let actual = multiple_use.max();
+
+            xs.vec.iter().all(|&x| actual >= x)
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+}
diff --git a/tools/kslib/src/lib.rs b/tools/kslib/src/lib.rs
new file mode 100644
index 0000000..9bca841
--- /dev/null
+++ b/tools/kslib/src/lib.rs
@@ -0,0 +1,5 @@
+pub mod ecdf;
+pub mod test;
+
+pub use test::{test, test_f64, calculate_critical_value};
+pub use ecdf::{Ecdf, ecdf, percentile};
diff --git a/tools/kslib/src/test.rs b/tools/kslib/src/test.rs
new file mode 100644
index 0000000..1beedc9
--- /dev/null
+++ b/tools/kslib/src/test.rs
@@ -0,0 +1,699 @@
+//! Two Sample Kolmogorov-Smirnov Test
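+//!
+//! For orientation, a sketch of the underlying mathematics: given samples
+//! xs and ys with empirical CDFs F_xs and F_ys, the test statistic is
+//! D = max |F_xs(t) - F_ys(t)| over all t, and the hypothesis that both
+//! samples come from the same distribution is rejected when D exceeds the
+//! critical value for the requested confidence level.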
+
+use std::cmp::{min, Ord, Ordering};
+
+/// Two sample test result.
+pub struct TestResult {
+    pub is_rejected: bool,
+    pub statistic: f64,
+    pub reject_probability: f64,
+    pub critical_value: f64,
+    pub confidence: f64,
+}
+
+/// Perform a two sample Kolmogorov-Smirnov test on given samples.
+///
+/// Both samples must have more than 7 elements for the test to be valid.
+///
+/// # Panics
+///
+/// There are assertion panics if either sequence has <= 7 elements or if the
+/// confidence is not strictly between zero and one.
+///
+/// # Errors
+///
+/// Returns an error if the internal numerical routines fail to converge.
+///
+/// # Examples
+///
+/// ```
+/// extern crate kolmogorov_smirnov as ks;
+///
+/// let xs = vec!(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
+/// let ys = vec!(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+/// let confidence = 0.95;
+///
+/// let result = ks::test(&xs, &ys, confidence).unwrap();
+///
+/// if result.is_rejected {
+///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
+///              xs, ys, result.reject_probability);
+/// }
+/// ```
+pub fn test<T: Ord + Clone>(xs: &[T], ys: &[T], confidence: f64) -> Result<TestResult, String> {
+    assert!(xs.len() > 0 && ys.len() > 0);
+    assert!(0.0 < confidence && confidence < 1.0);
+
+    // Only supports samples of size > 7.
+    assert!(xs.len() > 7 && ys.len() > 7);
+
+    let statistic = calculate_statistic(xs, ys);
+    let critical_value = calculate_critical_value(xs.len(), ys.len(), confidence)?;
+
+    let reject_probability = calculate_reject_probability(statistic, xs.len(), ys.len())?;
+    let is_rejected = reject_probability > confidence;
+
+    Ok(TestResult {
+        is_rejected: is_rejected,
+        statistic: statistic,
+        reject_probability: reject_probability,
+        critical_value: critical_value,
+        confidence: confidence,
+    })
+}
+
+/// Wrapper type for f64 to implement Ord and make usable with test.
+#[derive(PartialEq, Clone)]
+struct OrderableF64 {
+    val: f64,
+}
+
+impl OrderableF64 {
+    fn new(val: f64) -> OrderableF64 {
+        OrderableF64 { val: val }
+    }
+}
+
+impl Eq for OrderableF64 {}
+
+impl PartialOrd for OrderableF64 {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        self.val.partial_cmp(&other.val)
+    }
+}
+
+impl Ord for OrderableF64 {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.val.partial_cmp(&other.val).unwrap()
+    }
+}
+
+/// Perform a two sample Kolmogorov-Smirnov test on given f64 samples.
+///
+/// This is necessary because f64 does not implement Ord in Rust as some
+/// elements are incomparable, e.g. NaN. This function wraps the f64s in an
+/// implementation of Ord which panics on incomparable elements.
+///
+/// Both samples must have more than 7 elements for the test to be valid.
+///
+/// # Panics
+///
+/// There are assertion panics if either sequence has <= 7 elements.
+///
+/// If any of the f64 elements in the input samples are unorderable, e.g. NaN.
+///
+/// # Examples
+///
+/// ```
+/// extern crate kolmogorov_smirnov as ks;
+///
+/// let xs = vec!(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
+/// let ys = vec!(12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
+/// let confidence = 0.95;
+///
+/// let result = ks::test_f64(&xs, &ys, confidence).unwrap();
+///
+/// if result.is_rejected {
+///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
+///              xs, ys, result.reject_probability);
+/// }
+/// ```
+pub fn test_f64(xs: &[f64], ys: &[f64], confidence: f64) -> Result<TestResult, String> {
+    let xs: Vec<OrderableF64> = xs.iter().map(|&f| OrderableF64::new(f)).collect();
+    let ys: Vec<OrderableF64> = ys.iter().map(|&f| OrderableF64::new(f)).collect();
+
+    test(&xs, &ys, confidence)
+}
+
+/// Calculate the test statistic for the two sample Kolmogorov-Smirnov test.
+///
+/// The test statistic is the maximum vertical distance between the ECDFs of
+/// the two samples.
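+///
+/// As a small worked example (too small for `test` itself, which asserts
+/// more than 7 elements, but enough to illustrate the sweep): for
+/// xs = [1, 2] and ys = [2, 3], the ECDF differences at the distinct sample
+/// values are |0.5 - 0.0| = 0.5 at 1, |1.0 - 0.5| = 0.5 at 2 and
+/// |1.0 - 1.0| = 0.0 at 3, so the statistic is 0.5.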
+fn calculate_statistic<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 {
+    let n = xs.len();
+    let m = ys.len();
+
+    assert!(n > 0 && m > 0);
+
+    let mut xs = xs.to_vec();
+    let mut ys = ys.to_vec();
+
+    // xs and ys must be sorted for the stepwise ECDF calculations to work.
+    xs.sort();
+    ys.sort();
+
+    // The current value testing for ECDF difference. Sweeps up through
+    // elements present in xs and ys.
+    let mut current: &T;
+
+    // i, j index the first values in xs and ys that are greater than current.
+    let mut i = 0;
+    let mut j = 0;
+
+    // ecdf_xs, ecdf_ys always hold the ECDF(current) of xs and ys.
+    let mut ecdf_xs = 0.0;
+    let mut ecdf_ys = 0.0;
+
+    // The test statistic value computed over values <= current.
+    let mut statistic = 0.0;
+
+    while i < n && j < m {
+        // Advance i through duplicate samples in xs.
+        let x_i = &xs[i];
+        while i + 1 < n && *x_i == xs[i + 1] {
+            i += 1;
+        }
+
+        // Advance j through duplicate samples in ys.
+        let y_j = &ys[j];
+        while j + 1 < m && *y_j == ys[j + 1] {
+            j += 1;
+        }
+
+        // Step to the next sample value in the ECDF sweep from low to high.
+        current = min(x_i, y_j);
+
+        // Update invariant conditions for i, j, ecdf_xs, and ecdf_ys.
+        if current == x_i {
+            ecdf_xs = (i + 1) as f64 / n as f64;
+            i += 1;
+        }
+        if current == y_j {
+            ecdf_ys = (j + 1) as f64 / m as f64;
+            j += 1;
+        }
+
+        // Update invariant conditions for the test statistic.
+        let diff = (ecdf_xs - ecdf_ys).abs();
+        if diff > statistic {
+            statistic = diff;
+        }
+    }
+
+    // Don't need to walk the rest of the samples because one of the ECDFs is
+    // already one and the other will be increasing up to one. This means the
+    // difference will be monotonically decreasing, so we have our test
+    // statistic value already.
+
+    statistic
+}
+
+/// Calculate the probability that the null hypothesis is false for a two
+/// sample Kolmogorov-Smirnov test. The null hypothesis can only be rejected
+/// if this evidence exceeds the confidence level required.
+fn calculate_reject_probability(statistic: f64, n1: usize, n2: usize) -> Result<f64, String> {
+    // Only supports samples of size > 7.
+    assert!(n1 > 7 && n2 > 7);
+
+    let n1 = n1 as f64;
+    let n2 = n2 as f64;
+
+    let factor = ((n1 * n2) / (n1 + n2)).sqrt();
+    let term = (factor + 0.12 + 0.11 / factor) * statistic;
+
+    let probability = probability_kolmogorov_smirnov(term)?;
+    let reject_probability = 1.0 - probability;
+
+    assert!(0.0 <= reject_probability && reject_probability <= 1.0);
+
+    Ok(reject_probability)
+}
+
+/// Calculate the critical value for the two sample Kolmogorov-Smirnov test.
+///
+/// # Panics
+///
+/// There are assertion panics if the sample sizes are not greater than 7 or
+/// the confidence is not strictly between zero and one.
+///
+/// # Errors
+///
+/// Returns an error if the binary search does not locate the critical value
+/// in less than 200 iterations.
+///
+/// # Examples
+///
+/// ```
+/// extern crate kolmogorov_smirnov as ks;
+///
+/// let critical_value = ks::calculate_critical_value(256, 256, 0.95).unwrap();
+/// println!("Critical value at 95% confidence for samples of size 256 is {}",
+///          critical_value);
+/// ```
+pub fn calculate_critical_value(n1: usize, n2: usize, confidence: f64) -> Result<f64, String> {
+    assert!(0.0 < confidence && confidence < 1.0);
+
+    // Only supports samples of size > 7.
+    assert!(n1 > 7 && n2 > 7);
+
+    // The test statistic is between zero and one so can binary search quickly
+    // for the critical value.
+    let mut low = 0.0;
+    let mut high = 1.0;
+
+    for _ in 1..200 {
+        if low + 1e-8 >= high {
+            return Ok(high);
+        }
+
+        let mid = low + (high - low) / 2.0;
+        let reject_probability = calculate_reject_probability(mid, n1, n2)?;
+
+        if reject_probability > confidence {
+            // Maintain invariant that reject_probability(high) > confidence.
+            high = mid;
+        } else {
+            // Maintain invariant that reject_probability(low) <= confidence.
+            low = mid;
+        }
+    }
+
+    Err(format!("No convergence in calculate_critical_value({}, {}, {}).",
+                n1,
+                n2,
+                confidence))
+}
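+
+// A note on the approximation used above: calculate_reject_probability
+// computes 1 - Q_KS((sqrt(ne) + 0.12 + 0.11 / sqrt(ne)) * D) where
+// ne = n1 * n2 / (n1 + n2) and Q_KS is the asymptotic Kolmogorov
+// distribution tail implemented below,
+//
+//     Q_KS(lambda) = 2 * sum_{j >= 1} (-1)^(j - 1) * exp(-2 * j^2 * lambda^2).
+//
+// This is the standard asymptotic formula (see e.g. Numerical Recipes); it
+// is approximate, and the size > 7 assertions in this module guard the
+// small-sample regime where it is weakest.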
+
+/// Calculate the Kolmogorov-Smirnov probability function.
+fn probability_kolmogorov_smirnov(lambda: f64) -> Result<f64, String> {
+    if lambda == 0.0 {
+        return Ok(1.0);
+    }
+
+    let minus_two_lambda_squared = -2.0 * lambda * lambda;
+    let mut q_ks = 0.0;
+
+    for j in 1..200 {
+        let sign = if j % 2 == 1 {
+            1.0
+        } else {
+            -1.0
+        };
+
+        let j = j as f64;
+        let term = sign * 2.0 * (minus_two_lambda_squared * j * j).exp();
+
+        q_ks += term;
+
+        if term.abs() < 1e-8 {
+            // Trim results that exceed 1.
+            return Ok(q_ks.min(1.0));
+        }
+    }
+
+    Err(format!("No convergence in probability_kolmogorov_smirnov({}).",
+                lambda))
+}
+
+#[cfg(test)]
+mod tests {
+    extern crate quickcheck;
+    extern crate rand;
+
+    use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, StdGen};
+    use self::rand::Rng;
+    use std::cmp;
+    use std::usize;
+
+    use super::test;
+    use ecdf::Ecdf;
+
+    // Expected values below are ratios k / n computed in two different ways,
+    // so they can differ by f64 rounding; comparisons use this tolerance.
+    const EPSILON: f64 = 1e-10;
+
+    fn check<A: Testable>(f: A) {
+        // Need - 1 to ensure space for creating non-overlapping samples.
+        let g = StdGen::new(rand::thread_rng(), usize::MAX - 1);
+        QuickCheck::new().gen(g).quickcheck(f);
+    }
+
+    /// Wrapper for generating sample data with QuickCheck.
+    ///
+    /// Samples must be sequences of u64 values with more than 7 elements.
+    #[derive(Debug, Clone)]
+    struct Samples {
+        vec: Vec<u64>,
+    }
+
+    impl Samples {
+        fn min(&self) -> u64 {
+            let &min = self.vec.iter().min().unwrap();
+            min
+        }
+
+        fn max(&self) -> u64 {
+            let &max = self.vec.iter().max().unwrap();
+            max
+        }
+
+        fn shuffle(&mut self) {
+            let mut rng = rand::thread_rng();
+            rng.shuffle(&mut self.vec);
+        }
+    }
+
+    impl Arbitrary for Samples {
+        fn arbitrary<G: Gen>(g: &mut G) -> Samples {
+            // Limit size of generated sample set to 1024.
+            let max = cmp::min(g.size(), 1024);
+
+            let size = g.gen_range(8, max);
+            let vec = (0..size).map(|_| u64::arbitrary(g)).collect();
+
+            Samples { vec: vec }
+        }
+
+        fn shrink(&self) -> Box<Iterator<Item = Samples>> {
+            let vec: Vec<u64> = self.vec.clone();
+            let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();
+
+            Box::new(shrunk.filter(|v| v.len() > 7).map(|v| Samples { vec: v }))
+        }
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")]
+    fn test_panics_on_empty_samples_set() {
+        let xs: Vec<u64> = vec![];
+        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let _ = test(&xs, &ys, 0.95);
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")]
+    fn test_panics_on_empty_other_samples_set() {
+        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let ys: Vec<u64> = vec![];
+        let _ = test(&xs, &ys, 0.95);
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")]
+    fn test_panics_on_confidence_leq_zero() {
+        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let _ = test(&xs, &ys, 0.0);
+    }
+
+    #[test]
+    #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")]
+    fn test_panics_on_confidence_geq_one() {
+        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
+        let _ = test(&xs, &ys, 1.0);
+    }
+
+    /// Alternative calculation for the test statistic for the two sample
+    /// Kolmogorov-Smirnov test.
+    /// This simple implementation is used as a verification check against
+    /// the optimised calculation used by the library.
+    fn calculate_statistic_alt<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 {
+        assert!(xs.len() > 0 && ys.len() > 0);
+
+        let ecdf_xs = Ecdf::new(xs);
+        let ecdf_ys = Ecdf::new(ys);
+
+        let mut statistic = 0.0;
+
+        for x in xs.iter() {
+            let diff = (ecdf_xs.value(x.clone()) - ecdf_ys.value(x.clone())).abs();
+            if diff > statistic {
+                statistic = diff;
+            }
+        }
+
+        for y in ys.iter() {
+            let diff = (ecdf_xs.value(y.clone()) - ecdf_ys.value(y.clone())).abs();
+            if diff > statistic {
+                statistic = diff;
+            }
+        }
+
+        statistic
+    }
+
+    #[test]
+    fn test_calculate_statistic() {
+        fn prop(xs: Samples, ys: Samples) -> bool {
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+            let actual = result.statistic;
+            let expected = calculate_statistic_alt(&xs.vec, &ys.vec);
+
+            actual == expected
+        }
+
+        check(prop as fn(Samples, Samples) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_between_zero_and_one() {
+        fn prop(xs: Samples, ys: Samples) -> bool {
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+            let actual = result.statistic;
+
+            0.0 <= actual && actual <= 1.0
+        }
+
+        check(prop as fn(Samples, Samples) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_zero_for_identical_samples() {
+        fn prop(xs: Samples) -> bool {
+            let ys = xs.clone();
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+
+            result.statistic == 0.0
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_zero_for_permuted_sample() {
+        fn prop(xs: Samples) -> bool {
+            let mut ys = xs.clone();
+            ys.shuffle();
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+
+            result.statistic == 0.0
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_one_for_samples_with_no_overlap_in_support() {
+        fn prop(xs: Samples) -> bool {
+            let mut ys = xs.clone();
+
+            // Clamp ys so that ys.min > xs.max.
+            let ys_min = xs.max() + 1;
+            ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect();
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+
+            result.statistic == 1.0
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_one_half_for_sample_with_non_overlapping_in_support_replicate_added() {
+        fn prop(xs: Samples) -> bool {
+            let mut ys = xs.clone();
+
+            // Clamp ys so that ys.min > xs.max.
+            let ys_min = xs.max() + 1;
+            ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect();
+
+            // Add all the original items back too.
+            for &x in xs.vec.iter() {
+                ys.vec.push(x);
+            }
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+
+            result.statistic == 0.5
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_one_div_length_for_sample_with_additional_low_value() {
+        fn prop(xs: Samples) -> bool {
+            // Add an extra sample of early weight to ys.
+            let min = xs.min();
+            let mut ys = xs.clone();
+            ys.vec.push(min - 1);
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+            let expected = 1.0 / ys.vec.len() as f64;
+
+            result.statistic == expected
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_one_div_length_for_sample_with_additional_high_value() {
+        fn prop(xs: Samples) -> bool {
+            // Add an extra sample of late weight to ys.
+            let max = xs.max();
+            let mut ys = xs.clone();
+            ys.vec.push(max + 1);
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+            let expected = 1.0 / ys.vec.len() as f64;
+
+            (result.statistic - expected).abs() < EPSILON
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_one_div_length_for_sample_with_additional_low_and_high_values() {
+        fn prop(xs: Samples) -> bool {
+            // Add extra samples of early and late weight to ys.
+            let min = xs.min();
+            let max = xs.max();
+
+            let mut ys = xs.clone();
+
+            ys.vec.push(min - 1);
+            ys.vec.push(max + 1);
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+            let expected = 1.0 / ys.vec.len() as f64;
+
+            (result.statistic - expected).abs() < EPSILON
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_values() {
+        fn prop(xs: Samples, n: u8) -> bool {
+            // Add n extra samples of early weight to ys.
+            let min = xs.min();
+            let mut ys = xs.clone();
+            for j in 0..n {
+                ys.vec.push(min - (j as u64) - 1);
+            }
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+            let expected = n as f64 / ys.vec.len() as f64;
+
+            result.statistic == expected
+        }
+
+        check(prop as fn(Samples, u8) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_n_div_length_for_sample_with_additional_n_high_values() {
+        fn prop(xs: Samples, n: u8) -> bool {
+            // Add n extra samples of late weight to ys.
+            let max = xs.max();
+            let mut ys = xs.clone();
+            for j in 0..n {
+                ys.vec.push(max + (j as u64) + 1);
+            }
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+            let expected = n as f64 / ys.vec.len() as f64;
+
+            (result.statistic - expected).abs() < EPSILON
+        }
+
+        check(prop as fn(Samples, u8) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_and_high_values() {
+        fn prop(xs: Samples, n: u8) -> bool {
+            // Add n extra samples of early and late weight to ys.
+            let min = xs.min();
+            let max = xs.max();
+            let mut ys = xs.clone();
+            for j in 0..n {
+                ys.vec.push(min - (j as u64) - 1);
+                ys.vec.push(max + (j as u64) + 1);
+            }
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+            let expected = n as f64 / ys.vec.len() as f64;
+
+            (result.statistic - expected).abs() < EPSILON
+        }
+
+        check(prop as fn(Samples, u8) -> bool);
+    }
+
+    #[test]
+    fn test_statistic_is_n_or_m_div_length_for_sample_with_additional_n_low_and_m_high_values() {
+        fn prop(xs: Samples, n: u8, m: u8) -> bool {
+            // Add n extra samples of early weight and m of late weight to ys.
+            let min = xs.min();
+            let max = xs.max();
+            let mut ys = xs.clone();
+            for j in 0..n {
+                ys.vec.push(min - (j as u64) - 1);
+            }
+
+            for j in 0..m {
+                ys.vec.push(max + (j as u64) + 1);
+            }
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+            let expected = cmp::max(n, m) as f64 / ys.vec.len() as f64;
+
+            (result.statistic - expected).abs() < EPSILON
+        }
+
+        check(prop as fn(Samples, u8, u8) -> bool);
+    }
+
+    #[test]
+    fn test_is_rejected_if_reject_probability_greater_than_confidence() {
+        fn prop(xs: Samples, ys: Samples) -> bool {
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+
+            if result.is_rejected {
+                result.reject_probability > 0.95
+            } else {
+                result.reject_probability <= 0.95
+            }
+        }
+
+        check(prop as fn(Samples, Samples) -> bool);
+    }
+
+    #[test]
+    fn test_reject_probability_is_zero_for_identical_samples() {
+        fn prop(xs: Samples) -> bool {
+            let ys = xs.clone();
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+
+            result.reject_probability == 0.0
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+
+    #[test]
+    fn test_reject_probability_is_zero_for_permuted_sample() {
+        fn prop(xs: Samples) -> bool {
+            let mut ys = xs.clone();
+            ys.shuffle();
+
+            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
+
+            result.reject_probability == 0.0
+        }
+
+        check(prop as fn(Samples) -> bool);
+    }
+}
diff --git a/tools/kstest/Cargo.toml b/tools/kstest/Cargo.toml
new file mode 100644
index 0000000..02af633
--- /dev/null
+++ b/tools/kstest/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "kstest"
+version = "0.1.0"
+authors = ["eugen.betke"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+kolmogorov_smirnov = {path = "../kslib"}
+csv = "1.1"
+glob = "0.3.0"
+serde = { version = "1.0", features = ["derive"] }
+chrono = "*"
diff --git a/tools/kstest/run.sh b/tools/kstest/run.sh
new file mode 100755
index 0000000..6d5b5da
--- /dev/null
+++ b/tools/kstest/run.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+
+#7488914 19865984
+#4296426 18672376
+#5024292 17944118
+
+#dataset_fn="../../datasets/job_codings_v4_confidential.csv"
+#jobids=( )
+#jobids=( ${jobids[@]} 19865984 )
+#jobids=( ${jobids[@]} 18672376 )
+#jobids=( ${jobids[@]} 17944118 )
+
+
+dataset_fn="../../datasets/job_codings_v4.csv"
+jobids=( )
+jobids=( ${jobids[@]} 7488914 )
+jobids=( ${jobids[@]} 4296426 )
+jobids=( ${jobids[@]} 5024292 )
+
+set -x
+for jobid in ${jobids[@]}; do
+  sim_fn="./ks_similarities_$jobid.csv"
+  progress_fn="./ks_progress_$jobid.csv"
+  log_fn="./ks_fail_$jobid.log"
+  cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn &
+done
diff --git a/tools/kstest/src/main.rs b/tools/kstest/src/main.rs
new file mode 100644
index 0000000..980e49e
--- /dev/null
+++ b/tools/kstest/src/main.rs
@@ -0,0 +1,183 @@
+extern crate kolmogorov_smirnov as ks;
+extern crate csv;
+extern crate serde;
+extern crate chrono;
+
+use std::fs::File;
+use std::io::prelude::*;
+
+use serde::Deserialize;
+use serde::Serialize;
+use std::collections::HashMap;
+use std::env;
+use std::io::LineWriter;
+
+pub type Score = u32;
+pub type JobCoding = Vec<Score>;
+pub type Similarity = f32;
+
+pub type Jobid = u32;
+pub type QCodings = HashMap<Jobid, JobCoding>;
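+
+// The dataset maps each jobid to a "coding": the ks_coding CSV column holds
+// a colon-separated list of integer scores that convert_to_coding (below)
+// parses into a Vec<Score>. Jobs whose coding sums to zero carry no signal
+// and are skipped when the dataset is loaded in run.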
+
+#[derive(Debug, Deserialize)]
+pub struct Record {
+    jobid: u32,
+    //q16_coding: String,
+    ks_coding: String,
+}
+
+#[derive(Debug, Serialize)]
+pub struct SimilarityRow {
+    pub jobid: u32,
+    pub alg_id: u32,
+    pub alg_name: String,
+    pub similarity: f32,
+}
+
+#[derive(Debug, Serialize)]
+pub struct ProgressRow {
+    jobid: u32,
+    alg_id: u32,
+    alg_name: String,
+    // Nanoseconds elapsed over the preceding batch of 10,000 comparisons.
+    delta: i64,
+}
+
+pub fn convert_to_coding(coding: String) -> Vec<Score> {
+    let split = coding.split(":");
+    let vec: Vec<Score> = split
+        .filter(|s| !s.is_empty())
+        .map(|s| s.parse().unwrap())
+        .collect();
+    vec
+}
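+
+// For example (illustrative input): convert_to_coding("256:0:3".to_string())
+// returns vec![256, 0, 3]. Empty segments from leading, trailing, or doubled
+// colons are dropped by the is_empty filter instead of failing to parse.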
+
+fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: String, log_fn: String) {
+    let mut q_codings: QCodings = HashMap::new();
+    let file = File::open(&dataset_fn).expect("Unable to open dataset.");
+    let mut rdr = csv::Reader::from_reader(file);
+
+    for result in rdr.deserialize() {
+        let record: Record = result.expect("Unable to parse record.");
+        let q_coding = convert_to_coding(record.ks_coding);
+        // Insert non-zero jobs only.
+        if q_coding.iter().sum::<Score>() > (0 as Score) {
+            q_codings.insert(record.jobid, q_coding);
+        }
+    }
+
+    let probe = q_codings[&jobid].clone();
+    let similarities_file = File::create(&similarities_fn).expect("Unable to create similarities file.");
+    let mut wtr_similarities = csv::Writer::from_writer(&similarities_file);
+    let alg_name = "ks";
+    let alg_id = 6;
+
+    let progress_file = File::create(&progress_fn).expect("Unable to create progress file.");
+    let mut wtr_progress = csv::Writer::from_writer(&progress_file);
+    let mut start = chrono::Utc::now();
+    let mut counter = 1;
+
+    let mut avail_codings: Vec<(u32, &JobCoding)> =
+        q_codings.iter().map(|(k, v)| (*k, v)).collect();
+    let mut similarities: Vec<(Jobid, Similarity)> = Vec::new();
+
+    let log_file = File::create(&log_fn).expect("Unable to create log file.");
+    let mut log_file = LineWriter::new(log_file);
+
+    while let Some((jobid, q_coding)) = avail_codings.pop() {
+        if (counter % 10_000) == 0 {
+            let stop = chrono::Utc::now();
+            let progress_row = ProgressRow {
+                jobid: jobid,
+                alg_id: alg_id,
+                alg_name: String::from(alg_name),
+                delta: ((stop - start).num_nanoseconds().unwrap()),
+            };
+            wtr_progress.serialize(progress_row).unwrap();
+            start = stop;
+        }
+
+        let confidence = 0.95;
+        // A failed test (e.g. a sample too small for the library) is logged
+        // and scored as similarity 1.0.
+        let similarity = match ks::test(q_coding, &probe, confidence) {
+            Ok(sim) => (1.0 - sim.reject_probability) as Similarity,
+            Err(e) => {
+                let message = format!("jobid failed {:?}, because \"{:?}\"\n", jobid, e);
+                log_file.write_all(message.as_bytes()).unwrap();
+                1.0
+            }
+        };
+
+        similarities.push((jobid, similarity));
+        counter += 1;
+    }
+
+    for (jobid, similarity) in similarities.iter() {
+        let similarity_row = SimilarityRow {
+            jobid: *jobid,
+            alg_id: alg_id,
+            alg_name: String::from(alg_name),
+            similarity: *similarity,
+        };
+        wtr_similarities.serialize(similarity_row).unwrap();
+    }
+    log_file.flush().unwrap();
+}
+
+fn main() {
+    let args: Vec<String> = env::args().collect();
+    let dataset_fn = args[1].clone();
+    let jobid = args[2].parse::<u32>().unwrap();
+    let sim_fn = args[3].clone();
+    let progress_fn = args[4].clone();
+    let log_fn = args[5].clone();
+
+    println!("{:?}", args);
+
+    run(dataset_fn, jobid, sim_fn, progress_fn, log_fn);
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ks_test() {
+        let xs = vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0];
+        let ys = vec![12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0];
+        ks::test_f64(&xs, &ys, 0.95).unwrap();
+
+        let c1 = vec![141.0,143.0,142.0,238.0,132.0,486.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0];
+        let c2 = vec![239.0,239.0,255.0,255.0,239.0,239.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,502.0,511.0,503.0];
+        ks::test_f64(&c1, &c2, 0.95).unwrap();
+
+        let c1 = vec![2.0,2.0,2.0,9.0,3.0,0.0,0.0,0.0];
+        let c2 = vec![2.0,2.0,2.0,2.0,8.0,3.0,0.0,10.0];
+        ks::test_f64(&c1, &c2, 0.95).unwrap();
+    }
+}