Kolmogorov-Smirnov-Test

This commit is contained in:
parent 88e994f997
commit 8d2ebf0e96

@@ -0,0 +1,36 @@
[package]
name = "kolmogorov_smirnov"
description = "Implementation of the Kolmogorov-Smirnov statistical test as a Rust library."
version = "1.1.0"
homepage = "http://daithiocrualaoich.github.io/kolmogorov_smirnov"
repository = "https://github.com/daithiocrualaoich/kolmogorov_smirnov"
authors = ["Daithi O Crualaoich <daithi.ocrualaoich@gmail.com>"]
license = "Apache-2.0"
keywords = ["statistics"]
exclude = [
    ".dockerignore",
    ".gitignore",
    ".travis.yml",
    "Cargo.lock",
    "Dockerfile",
    "dat/*",
    "doc/*",
]

[[bin]]
name = "critical_values"

[[bin]]
name = "normal"

[[bin]]
name = "ks_f64"

[[bin]]
name = "ks_i64"

[dependencies]
rand = "0.3.12"

[dev-dependencies]
quickcheck = "0.2"

@@ -0,0 +1,39 @@
extern crate kolmogorov_smirnov as ks;

use ks::calculate_critical_value;

use std::env;

/// Calculate critical values dataset.
///
/// # Examples
///
/// ```bash
/// cargo run --bin critical_values <confidence> <num_samples> <limit>
/// ```
///
/// This will print the critical values of the Kolmogorov-Smirnov two sample
/// test for samples of size `<num_samples>` against samples of sizes 16
/// through `<limit>` inclusive at the specified confidence level.
///
/// `<num_samples>` and `<limit>` must be positive integers, `<confidence>` must
/// be a floating point number strictly between zero and one.
fn main() {
    let args: Vec<String> = env::args().collect();

    let confidence: f64 = args[1].parse().expect("<confidence> must be a floating point number.");
    let n1: usize = args[2].parse().expect("<num_samples> must be an integer.");
    let limit: usize = args[3].parse().expect("<limit> must be an integer.");

    assert!(n1 > 0 && limit > 0);
    assert!(0.0 < confidence && confidence < 1.0);

    println!("n1\tn2\tconfidence\tcritical_value");
    for n2 in 16..(limit + 1) {
        println!("{}\t{}\t{}\t{}",
                 n1,
                 n2,
                 confidence,
                 calculate_critical_value(n1, n2, confidence).unwrap());
    }
}

@@ -0,0 +1,52 @@
extern crate kolmogorov_smirnov as ks;

use ks::test_f64;

use std::env;
use std::io::{BufReader, BufRead};
use std::fs::File;
use std::path::Path;

fn parse_float(s: String) -> f64 {
    s.parse::<f64>().expect("Not a floating point number.")
}

/// Runs a Kolmogorov-Smirnov test on floating point data files.
///
/// Input files must be single-column headerless data files. The data samples
/// are tested against each other at the 0.95 confidence level.
///
/// # Examples
///
/// ```bash
/// cargo run --bin ks_f64 <file1> <file2>
/// ```
///
/// This will print the test result to standard output.
fn main() {
    let args: Vec<String> = env::args().collect();

    let path1 = Path::new(&args[1]);
    let path2 = Path::new(&args[2]);

    let file1 = BufReader::new(File::open(&path1).unwrap());
    let file2 = BufReader::new(File::open(&path2).unwrap());

    let lines1 = file1.lines().map(|line| line.unwrap());
    let lines2 = file2.lines().map(|line| line.unwrap());

    let xs: Vec<f64> = lines1.map(parse_float).collect();
    let ys: Vec<f64> = lines2.map(parse_float).collect();

    let result = test_f64(&xs, &ys, 0.95).unwrap();

    if result.is_rejected {
        println!("Samples are from different distributions.");
    } else {
        println!("Samples are from the same distribution.");
    }

    println!("test statistic = {}", result.statistic);
    println!("critical value = {}", result.critical_value);
    println!("reject probability = {}", result.reject_probability);
}
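
// Illustrative sketch, not part of the original commit: with two hypothetical
// single-column data files
//
//     a.dat          b.dat
//     0.71           5.91
//     1.32           4.20
//     ...            ...
//
// the test is run as `cargo run --bin ks_f64 a.dat b.dat` and prints the
// verdict followed by the test statistic, critical value, and reject
// probability.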

@@ -0,0 +1,52 @@
extern crate kolmogorov_smirnov as ks;

use ks::test;

use std::env;
use std::io::{BufReader, BufRead};
use std::fs::File;
use std::path::Path;

fn parse_int(s: String) -> i64 {
    s.parse::<i64>().expect("Not an integer.")
}

/// Runs a Kolmogorov-Smirnov test on integer data files.
///
/// Input files must be single-column headerless data files. The data samples
/// are tested against each other at the 0.95 confidence level.
///
/// # Examples
///
/// ```bash
/// cargo run --bin ks_i64 <file1> <file2>
/// ```
///
/// This will print the test result to standard output.
fn main() {
    let args: Vec<String> = env::args().collect();

    let path1 = Path::new(&args[1]);
    let path2 = Path::new(&args[2]);

    let file1 = BufReader::new(File::open(&path1).unwrap());
    let file2 = BufReader::new(File::open(&path2).unwrap());

    let lines1 = file1.lines().map(|line| line.unwrap());
    let lines2 = file2.lines().map(|line| line.unwrap());

    let xs: Vec<i64> = lines1.map(parse_int).collect();
    let ys: Vec<i64> = lines2.map(parse_int).collect();

    let result = test(&xs, &ys, 0.95).unwrap();

    if result.is_rejected {
        println!("Samples are from different distributions.");
    } else {
        println!("Samples are from the same distribution.");
    }

    println!("test statistic = {}", result.statistic);
    println!("critical value = {}", result.critical_value);
    println!("reject probability = {}", result.reject_probability);
}

@@ -0,0 +1,35 @@
extern crate rand;

use std::env;
use rand::distributions::{Normal, IndependentSample};

/// Prints a sequence of Normal deviates.
///
/// # Examples
///
/// ```bash
/// cargo run --bin normal <num_deviates> <mean> <variance>
/// ```
///
/// This will print `<num_deviates>` floating point numbers, one per line, to
/// standard output. These numbers will have a Normal distribution with the
/// specified mean and variance.
///
/// `<num_deviates>` must be a positive integer, `<mean>` and `<variance>` may
/// be integers or floating point numbers but `<variance>` must be positive.
fn main() {
    let args: Vec<String> = env::args().collect();

    let n: u32 = args[1].parse().expect("<num_deviates> must be an integer.");
    let mean: f64 = args[2].parse().expect("<mean> must be a floating point number.");
    let variance: f64 = args[3].parse().expect("<variance> must be a floating point number.");

    assert!(n > 0 && variance > 0.0);

    let mut rng = rand::thread_rng();

    // Normal::new takes a standard deviation, so convert the variance.
    let normal = Normal::new(mean, variance.sqrt());

    for _ in 0..n {
        let x = normal.ind_sample(&mut rng);
        println!("{}", x)
    }
}

@@ -0,0 +1,853 @@
//! Empirical cumulative distribution function.

pub struct Ecdf<T: Ord> {
    samples: Vec<T>,
    length: usize,
}

impl<T: Ord + Clone> Ecdf<T> {
    /// Construct a new representation of a cumulative distribution function
    /// for a given sample.
    ///
    /// The construction will involve computing a sorted clone of the given
    /// sample and may be inefficient or completely prohibitive for large
    /// samples. This computation is amortized significantly if there is heavy
    /// use of the value function.
    ///
    /// # Panics
    ///
    /// The sample set must be non-empty.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// ```
    pub fn new(samples: &[T]) -> Ecdf<T> {
        let length = samples.len();
        assert!(length > 0);

        // Sort a copied sample for binary searching.
        let mut sorted = samples.to_vec();
        sorted.sort();

        Ecdf {
            samples: sorted,
            length: length,
        }
    }

    /// Calculate a value of the empirical cumulative distribution function
    /// for a given sample.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.value(4), 0.5);
    /// ```
    pub fn value(&self, t: T) -> f64 {
        let num_samples_leq_t = match self.samples.binary_search(&t) {
            Ok(mut index) => {
                // At least one sample equals t and we have the index of one
                // such sample. Walk up the sorted samples to the last sample
                // equal to t.
                while index + 1 < self.length && self.samples[index + 1] == t {
                    index += 1;
                }

                // Compensate for 0-based indexing.
                index + 1
            }
            Err(index) => {
                // No sample equals t but if we had to insert one it would go
                // at index. This means all indices to the left have samples
                // < t and should be counted in the cdf proportion. We must
                // take one from index to get the last included sample but
                // then we just have to add one again to account for 0-based
                // indexing.
                index
            }
        };

        num_samples_leq_t as f64 / self.length as f64
    }
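
    // A worked trace of the Ok branch above, illustrative only: for the
    // sorted samples [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] and t = 4,
    // binary_search returns Ok(4). There are no duplicates, so the walk
    // stops immediately and num_samples_leq_t = 4 + 1 = 5, giving
    // value(4) = 5 / 10 = 0.5 as in the doc example. With samples
    // [0, 4, 4, 4, 9] and t = 4, the walk advances index to the last 4
    // (index 3), so value(4) = 4 / 5 = 0.8.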

    /// Calculate a percentile for the sample using the Nearest Rank method.
    ///
    /// # Panics
    ///
    /// The percentile requested must be between 1 and 100 inclusive. In
    /// particular, there is no 0-percentile.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.percentile(50), 4);
    /// ```
    pub fn percentile(&self, p: u8) -> T {
        assert!(0 < p && p <= 100);

        let rank = (p as f64 * self.length as f64 / 100.0).ceil() as usize;
        self.samples[rank - 1].clone()
    }
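
    // Nearest Rank worked example, illustrative only: for the ten sorted
    // samples [0, 1, ..., 9] and p = 50, rank = ceil(50 * 10 / 100) =
    // ceil(5.0) = 5, so the percentile is samples[4] = 4, matching the doc
    // example above. For p = 51, rank = ceil(5.1) = 6 and the percentile
    // jumps to samples[5] = 5.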

    /// Return the minimal element of the samples.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.min(), 0);
    /// ```
    pub fn min(&self) -> T {
        self.samples[0].clone()
    }

    /// Return the maximal element of the samples.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.max(), 9);
    /// ```
    pub fn max(&self) -> T {
        self.samples[self.samples.len() - 1].clone()
    }
}

/// Calculate a one-time value of the empirical cumulative distribution
/// function for a given sample.
///
/// Computational running time of this function is O(n) but does not amortize
/// across multiple calls like Ecdf<T>::value. This function should only be
/// used in the case that a small number of ECDF values are required for the
/// sample. Otherwise, Ecdf::new should be used to create a structure that
/// takes the upfront O(n log n) sort cost but calculates values in O(log n).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let value = ks::ecdf(&samples, 4);
/// assert_eq!(value, 0.5);
/// ```
pub fn ecdf<T: Ord>(samples: &[T], t: T) -> f64 {
    let mut num_samples_leq_t = 0;
    let mut length = 0;

    for sample in samples.iter() {
        length += 1;
        if *sample <= t {
            num_samples_leq_t += 1;
        }
    }

    assert!(length > 0);

    num_samples_leq_t as f64 / length as f64
}

/// Calculate a one-time percentile for a given sample using the Nearest Rank
/// method and Quick Select.
///
/// Computational running time of this function is O(n) but does not amortize
/// across multiple calls like Ecdf<T>::percentile. This function should only
/// be used in the case that a small number of percentiles are required for
/// the sample. Otherwise, Ecdf::new should be used to create a structure that
/// takes the upfront O(n log n) sort cost but calculates percentiles in O(1).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// The percentile requested must be between 1 and 100 inclusive. In
/// particular, there is no 0-percentile.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let percentile = ks::percentile(&samples, 50);
/// assert_eq!(percentile, 4);
/// ```
pub fn percentile<T: Ord + Clone>(samples: &[T], p: u8) -> T {
    assert!(0 < p && p <= 100);

    let length = samples.len();
    assert!(length > 0);

    let rank = (p as f64 * length as f64 / 100.0).ceil() as usize;

    // Quick Select the element at rank.

    let mut samples: Vec<T> = samples.to_vec();
    let mut low = 0;
    let mut high = length;

    loop {
        assert!(low < high);

        let pivot = samples[low].clone();

        if low >= high - 1 {
            return pivot;
        }

        // First determine if the rank item is less than the pivot.

        // Organise samples so that all items less than pivot are to the left,
        // `bottom` is the number of items less than pivot.

        let mut bottom = low;
        let mut top = high - 1;

        while bottom < top {
            while bottom < top && samples[bottom] < pivot {
                bottom += 1;
            }
            while bottom < top && samples[top] >= pivot {
                top -= 1;
            }

            if bottom < top {
                samples.swap(bottom, top);
            }
        }

        if rank <= bottom {
            // Rank item is less than pivot. Exclude pivot and larger items.
            high = bottom;
        } else {
            // Rank item is pivot or in the larger set. Exclude smaller items.
            low = bottom;

            // Next, determine if the pivot is the rank item.

            // Organise samples so that all items less than or equal to pivot
            // are to the left, `bottom` is the number of items less than or
            // equal to pivot. Since the left is already less than the pivot,
            // this just requires moving the pivots left also.

            let mut bottom = low;
            let mut top = high - 1;

            while bottom < top {
                while bottom < top && samples[bottom] == pivot {
                    bottom += 1;
                }
                while bottom < top && samples[top] != pivot {
                    top -= 1;
                }

                if bottom < top {
                    samples.swap(bottom, top);
                }
            }

            // Is pivot the rank item?

            if rank <= bottom {
                return pivot;
            }

            // Rank item is greater than pivot. Exclude pivot and smaller
            // items.
            low = bottom;
        }
    }
}
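
// A quick sanity sketch of the Quick Select percentile, illustrative only:
// percentile(&[9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 50) selects the element of
// rank ceil(50 * 10 / 100) = 5, i.e. the 5th smallest value 4, without ever
// fully sorting the input. Each partition pass discards the part of the
// remaining range that cannot contain the rank item, giving expected O(n)
// running time.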

#[cfg(test)]
mod tests {
    extern crate quickcheck;
    extern crate rand;

    use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, TestResult, StdGen};
    use std::cmp;
    use std::usize;
    use super::{Ecdf, ecdf, percentile};

    fn check<A: Testable>(f: A) {
        let g = StdGen::new(rand::thread_rng(), usize::MAX);
        QuickCheck::new().gen(g).quickcheck(f);
    }

    /// Wrapper for generating sample data with QuickCheck.
    ///
    /// Samples must be non-empty sequences of u64 values.
    #[derive(Debug, Clone)]
    struct Samples {
        vec: Vec<u64>,
    }

    impl Arbitrary for Samples {
        fn arbitrary<G: Gen>(g: &mut G) -> Samples {
            // Limit size of generated sample set to 1024.
            let max = cmp::min(g.size(), 1024);

            let size = g.gen_range(1, max);
            let vec = (0..size).map(|_| u64::arbitrary(g)).collect();

            Samples { vec: vec }
        }

        fn shrink(&self) -> Box<Iterator<Item = Samples>> {
            let vec: Vec<u64> = self.vec.clone();
            let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();

            Box::new(shrunk.filter(|v| v.len() > 0).map(|v| Samples { vec: v }))
        }
    }

    /// Wrapper for generating percentile query value data with QuickCheck.
    ///
    /// Percentile must be a u8 between 1 and 100 inclusive.
    #[derive(Debug, Clone)]
    struct Percentile {
        val: u8,
    }

    impl Arbitrary for Percentile {
        fn arbitrary<G: Gen>(g: &mut G) -> Percentile {
            let val = g.gen_range(1, 101) as u8;

            Percentile { val: val }
        }

        fn shrink(&self) -> Box<Iterator<Item = Percentile>> {
            let shrunk: Box<Iterator<Item = u8>> = self.val.shrink();

            Box::new(shrunk.filter(|&v| 0u8 < v && v <= 100u8).map(|v| Percentile { val: v }))
        }
    }

    #[test]
    #[should_panic(expected="assertion failed: length > 0")]
    fn single_use_ecdf_panics_on_empty_samples_set() {
        let xs: Vec<u64> = vec![];
        ecdf(&xs, 0);
    }

    #[test]
    #[should_panic(expected="assertion failed: length > 0")]
    fn multiple_use_ecdf_panics_on_empty_samples_set() {
        let xs: Vec<u64> = vec![];
        Ecdf::new(&xs);
    }

    #[test]
    fn single_use_ecdf_between_zero_and_one() {
        fn prop(xs: Samples, val: u64) -> bool {
            let actual = ecdf(&xs.vec, val);

            0.0 <= actual && actual <= 1.0
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_between_zero_and_one() {
        fn prop(xs: Samples, val: u64) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            0.0 <= actual && actual <= 1.0
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn single_use_ecdf_is_an_increasing_function() {
        fn prop(xs: Samples, val: u64) -> bool {
            let actual = ecdf(&xs.vec, val);

            ecdf(&xs.vec, val - 1) <= actual && actual <= ecdf(&xs.vec, val + 1)
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_is_an_increasing_function() {
        fn prop(xs: Samples, val: u64) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            ecdf.value(val - 1) <= actual && actual <= ecdf.value(val + 1)
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn single_use_ecdf_sample_min_minus_one_is_zero() {
        fn prop(xs: Samples) -> bool {
            let &min = xs.vec.iter().min().unwrap();

            ecdf(&xs.vec, min - 1) == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_sample_min_minus_one_is_zero() {
        fn prop(xs: Samples) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let ecdf = Ecdf::new(&xs.vec);

            ecdf.value(min - 1) == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_sample_max_is_one() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();

            ecdf(&xs.vec, max) == 1.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_sample_max_is_one() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();
            let ecdf = Ecdf::new(&xs.vec);

            ecdf.value(max) == 1.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples) -> bool {
            let &val = xs.vec.first().unwrap();
            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / xs.vec.len() as f64;

            ecdf(&xs.vec, val) == expected
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples) -> bool {
            let &val = xs.vec.first().unwrap();
            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / xs.vec.len() as f64;

            let ecdf = Ecdf::new(&xs.vec);

            ecdf.value(val) == expected
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let length = xs.vec.len();

            if xs.vec.iter().any(|&x| x == val) {
                // Discard Vec containing val.
                return TestResult::discard();
            }

            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / length as f64;

            let actual = ecdf(&xs.vec, val);

            TestResult::from_bool(actual == expected)
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn multiple_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let length = xs.vec.len();

            if xs.vec.iter().any(|&x| x == val) {
                // Discard Vec containing val.
                return TestResult::discard();
            }

            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / length as f64;

            let ecdf = Ecdf::new(&xs.vec);

            TestResult::from_bool(ecdf.value(val) == expected)
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn single_and_multiple_use_ecdf_agree() {
        fn prop(xs: Samples, val: u64) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);

            multiple_use.value(val) == ecdf(&xs.vec, val)
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn single_use_percentiles_panics_on_zero_percentile() {
        let xs: Vec<u64> = vec![0];

        percentile(&xs, 0);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn single_use_percentiles_panics_on_101_percentile() {
        let xs: Vec<u64> = vec![0];

        percentile(&xs, 101);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn multiple_use_percentiles_panics_on_zero_percentile() {
        let xs: Vec<u64> = vec![0];
        let ecdf = Ecdf::new(&xs);

        ecdf.percentile(0);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn multiple_use_percentiles_panics_on_101_percentile() {
        let xs: Vec<u64> = vec![0];
        let ecdf = Ecdf::new(&xs);

        ecdf.percentile(101);
    }

    #[test]
    fn single_use_percentile_between_samples_min_and_max() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let &max = xs.vec.iter().max().unwrap();

            let actual = percentile(&xs.vec, p.val);

            min <= actual && actual <= max
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_use_percentile_is_an_increasing_function() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let smaller = cmp::max(p.val - 1, 1);
            let larger = cmp::min(p.val + 1, 100);

            let actual = percentile(&xs.vec, p.val);

            percentile(&xs.vec, smaller) <= actual && actual <= percentile(&xs.vec, larger)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_use_percentile_100_is_sample_max() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();

            percentile(&xs.vec, 100) == max
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_percentile_between_samples_min_and_max() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let &max = xs.vec.iter().max().unwrap();

            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            min <= actual && actual <= max
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_is_an_increasing_function() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let smaller = cmp::max(p.val - 1, 1);
            let larger = cmp::min(p.val + 1, 100);

            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            ecdf.percentile(smaller) <= actual && actual <= ecdf.percentile(larger)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_100_is_sample_max() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();
            let ecdf = Ecdf::new(&xs.vec);

            ecdf.percentile(100) == max
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let actual = ecdf(&xs.vec, val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let single_use = percentile(&xs.vec, p);
                    TestResult::from_bool(single_use <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn single_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let actual = ecdf(&xs.vec, val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let multiple_use = Ecdf::new(&xs.vec);
                    TestResult::from_bool(multiple_use.percentile(p) <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn multiple_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let single_use = percentile(&xs.vec, p);
                    TestResult::from_bool(single_use <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn multiple_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    TestResult::from_bool(ecdf.percentile(p) <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn single_use_percentile_followed_by_single_use_ecdf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let actual = percentile(&xs.vec, p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_use_percentile_followed_by_multiple_use_ecdf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let actual = percentile(&xs.vec, p.val);

            let ecdf = Ecdf::new(&xs.vec);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf.value(actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_followed_by_single_use_ecdf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.percentile(p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_followed_by_multiple_use_ecdf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf.value(actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_and_multiple_use_percentile_agree() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);

            multiple_use.percentile(p.val) == percentile(&xs.vec, p.val)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn min_is_leq_all_samples() {
        fn prop(xs: Samples) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.min();

            xs.vec.iter().all(|&x| actual <= x)
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn max_is_geq_all_samples() {
        fn prop(xs: Samples) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.max();

            xs.vec.iter().all(|&x| actual >= x)
        }

        check(prop as fn(Samples) -> bool);
    }
}

@@ -0,0 +1,5 @@
pub mod ecdf;
pub mod test;

pub use test::{test, test_f64, calculate_critical_value};
pub use ecdf::{Ecdf, ecdf, percentile};

@@ -0,0 +1,699 @@
//! Two Sample Kolmogorov-Smirnov Test

use std::cmp::{min, Ord, Ordering};

/// Two sample test result.
pub struct TestResult {
    pub is_rejected: bool,
    pub statistic: f64,
    pub reject_probability: f64,
    pub critical_value: f64,
    pub confidence: f64,
}

/// Perform a two sample Kolmogorov-Smirnov test on given samples.
///
/// The samples must have more than 7 elements for the test to be valid.
///
/// # Panics
///
/// There are assertion panics if either sequence has <= 7 elements.
///
/// # Errors
///
/// Returns an error if the underlying probability calculation fails to
/// converge.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let xs = vec!(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
/// let ys = vec!(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let confidence = 0.95;
///
/// let result = ks::test(&xs, &ys, confidence).unwrap();
///
/// if result.is_rejected {
///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
///              xs, ys, result.reject_probability);
/// }
/// ```
pub fn test<T: Ord + Clone>(xs: &[T], ys: &[T], confidence: f64) -> Result<TestResult, String> {
    assert!(xs.len() > 0 && ys.len() > 0);
    assert!(0.0 < confidence && confidence < 1.0);

    // Only supports samples of size > 7.
    assert!(xs.len() > 7 && ys.len() > 7);

    let statistic = calculate_statistic(xs, ys);
    let critical_value = calculate_critical_value(xs.len(), ys.len(), confidence)?;

    let reject_probability = calculate_reject_probability(statistic, xs.len(), ys.len())?;
    let is_rejected = reject_probability > confidence;

    Ok(TestResult {
        is_rejected: is_rejected,
        statistic: statistic,
        reject_probability: reject_probability,
        critical_value: critical_value,
        confidence: confidence,
    })
}

/// Wrapper type for f64 to implement Ord and make it usable with test.
#[derive(PartialEq, Clone)]
struct OrderableF64 {
    val: f64,
}

impl OrderableF64 {
    fn new(val: f64) -> OrderableF64 {
        OrderableF64 { val: val }
    }
}

impl Eq for OrderableF64 {}

impl PartialOrd for OrderableF64 {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        self.val.partial_cmp(&other.val)
    }
}

impl Ord for OrderableF64 {
    fn cmp(&self, other: &Self) -> Ordering {
        self.val.partial_cmp(&other.val).unwrap()
    }
}

/// Perform a two sample Kolmogorov-Smirnov test on given f64 samples.
///
/// This is necessary because f64 does not implement Ord in Rust as some
/// elements are incomparable, e.g. NaN. This function wraps the f64s in an
/// implementation of Ord which panics on incomparable elements.
///
/// The samples must have more than 7 elements for the test to be valid.
///
/// # Panics
///
/// There are assertion panics if either sequence has <= 7 elements.
///
/// There are also panics if any of the f64 elements in the input samples are
/// unorderable, e.g. NaN.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let xs = vec!(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
/// let ys = vec!(12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
/// let confidence = 0.95;
///
/// let result = ks::test_f64(&xs, &ys, confidence).unwrap();
///
/// if result.is_rejected {
///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
///              xs, ys, result.reject_probability);
/// }
/// ```
pub fn test_f64(xs: &[f64], ys: &[f64], confidence: f64) -> Result<TestResult, String> {
    let xs: Vec<OrderableF64> = xs.iter().map(|&f| OrderableF64::new(f)).collect();
    let ys: Vec<OrderableF64> = ys.iter().map(|&f| OrderableF64::new(f)).collect();

    test(&xs, &ys, confidence)
}
/// Calculate the test statistic for the two sample Kolmogorov-Smirnov test.
///
/// The test statistic is the maximum vertical distance between the ECDFs of
/// the two samples.
fn calculate_statistic<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 {
    let n = xs.len();
    let m = ys.len();

    assert!(n > 0 && m > 0);

    let mut xs = xs.to_vec();
    let mut ys = ys.to_vec();

    // xs and ys must be sorted for the stepwise ECDF calculations to work.
    xs.sort();
    ys.sort();

    // The current value testing for ECDF difference. Sweeps up through
    // elements present in xs and ys.
    let mut current: &T;

    // i, j index the first values in xs and ys that are greater than current.
    let mut i = 0;
    let mut j = 0;

    // ecdf_xs, ecdf_ys always hold the ECDF(current) of xs and ys.
    let mut ecdf_xs = 0.0;
    let mut ecdf_ys = 0.0;

    // The test statistic value computed over values <= current.
    let mut statistic = 0.0;

    while i < n && j < m {
        // Advance i through duplicate samples in xs.
        let x_i = &xs[i];
        while i + 1 < n && *x_i == xs[i + 1] {
            i += 1;
        }

        // Advance j through duplicate samples in ys.
        let y_j = &ys[j];
        while j + 1 < m && *y_j == ys[j + 1] {
            j += 1;
        }

        // Step to the next sample value in the ECDF sweep from low to high.
        current = min(x_i, y_j);

        // Update invariant conditions for i, j, ecdf_xs, and ecdf_ys.
        if current == x_i {
            ecdf_xs = (i + 1) as f64 / n as f64;
            i += 1;
        }
        if current == y_j {
            ecdf_ys = (j + 1) as f64 / m as f64;
            j += 1;
        }

        // Update invariant conditions for the test statistic.
        let diff = (ecdf_xs - ecdf_ys).abs();
        if diff > statistic {
            statistic = diff;
        }
    }

    // Don't need to walk the rest of the samples because one of the ecdfs is
    // already one and the other will be increasing up to one. This means the
    // difference will be monotonically decreasing, so we have our test
    // statistic value already.

    statistic
}
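
// A worked trace of the sweep above, illustrative only: for xs = [1, 2, 3]
// and ys = [2, 3, 4], the distinct values are visited in order 1, 2, 3.
// After value 1: ecdf_xs = 1/3, ecdf_ys = 0,   |diff| = 1/3.
// After value 2: ecdf_xs = 2/3, ecdf_ys = 1/3, |diff| = 1/3.
// After value 3: ecdf_xs = 1,   ecdf_ys = 2/3, |diff| = 1/3.
// The loop then ends (xs is exhausted) and the statistic is 1/3; the
// remaining value 4 could only shrink the difference since ecdf_xs is
// already 1.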

/// Calculate the probability that the null hypothesis is false for a two
/// sample Kolmogorov-Smirnov test. Can only reject the null hypothesis if
/// this evidence exceeds the confidence level required.
fn calculate_reject_probability(statistic: f64, n1: usize, n2: usize) -> Result<f64, String> {
    // Only supports samples of size > 7.
    assert!(n1 > 7 && n2 > 7);

    let n1 = n1 as f64;
    let n2 = n2 as f64;

    let factor = ((n1 * n2) / (n1 + n2)).sqrt();
    let term = (factor + 0.12 + 0.11 / factor) * statistic;

    let probability = probability_kolmogorov_smirnov(term)?;
    let reject_probability = 1.0 - probability;

    assert!(0.0 <= reject_probability && reject_probability <= 1.0);

    Ok(reject_probability)
}
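
// A numeric sketch of the asymptotic approximation above, illustrative only:
// for n1 = n2 = 100 and statistic = 0.2, factor = sqrt(10000 / 200) ≈ 7.071,
// term = (7.071 + 0.12 + 0.11 / 7.071) * 0.2 ≈ 1.441, and
// probability_kolmogorov_smirnov(1.441) ≈ 0.031, so the reject probability
// is roughly 0.969 and the null hypothesis is rejected at 0.95 confidence.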

/// Calculate the critical value for the two sample Kolmogorov-Smirnov test.
///
/// # Errors
///
/// Returns a no convergence error if the binary search does not locate the
/// critical value in less than 200 iterations.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let critical_value = ks::calculate_critical_value(256, 256, 0.95).unwrap();
/// println!("Critical value at 95% confidence for samples of size 256 is {}",
///          critical_value);
/// ```
pub fn calculate_critical_value(n1: usize, n2: usize, confidence: f64) -> Result<f64, String> {
    assert!(0.0 < confidence && confidence < 1.0);

    // Only supports samples of size > 7.
    assert!(n1 > 7 && n2 > 7);

    // The test statistic is between zero and one so can binary search quickly
    // for the critical value.
    let mut low = 0.0;
    let mut high = 1.0;

    for _ in 1..200 {
        if low + 1e-8 >= high {
            return Ok(high);
        }

        let mid = low + (high - low) / 2.0;
        let reject_probability = calculate_reject_probability(mid, n1, n2)?;

        if reject_probability > confidence {
            // Maintain invariant that reject_probability(high) > confidence.
            high = mid;
        } else {
            // Maintain invariant that reject_probability(low) <= confidence.
            low = mid;
        }
    }

    Err(format!("No convergence in calculate_critical_value({}, {}, {}).",
                n1,
                n2,
                confidence))
}

/// Calculate the Kolmogorov-Smirnov probability function.
fn probability_kolmogorov_smirnov(lambda: f64) -> Result<f64, String> {
    if lambda == 0.0 {
        return Ok(1.0);
    }

    let minus_two_lambda_squared = -2.0 * lambda * lambda;
    let mut q_ks = 0.0;

    for j in 1..200 {
        let sign = if j % 2 == 1 {
            1.0
        } else {
            -1.0
        };

        let j = j as f64;
        let term = sign * 2.0 * (minus_two_lambda_squared * j * j).exp();

        q_ks += term;

        if term.abs() < 1e-8 {
            // Trim results that exceed 1.
            return Ok(q_ks.min(1.0));
        }
    }

    Err(format!("No convergence in probability_kolmogorov_smirnov({}).",
                lambda))
}
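
// The series being summed above is the standard Kolmogorov-Smirnov tail
// probability:
//
//     Q_KS(lambda) = 2 * sum_{j=1}^{inf} (-1)^(j-1) * exp(-2 * j^2 * lambda^2)
//
// with Q_KS(0) = 1 and Q_KS decreasing towards 0 as lambda grows. The terms
// decay extremely fast for lambda of moderate size, which is why the loop
// typically returns after only a handful of iterations.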

#[cfg(test)]
mod tests {
    extern crate quickcheck;
    extern crate rand;

    use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, StdGen};
    use self::rand::Rng;
    use std::cmp;
    use std::usize;

    use super::test;
    use ecdf::Ecdf;

    const EPSILON: f64 = 1e-10;

    fn check<A: Testable>(f: A) {
        // Need - 1 to ensure space for creating non-overlapping samples.
        let g = StdGen::new(rand::thread_rng(), usize::MAX - 1);
        QuickCheck::new().gen(g).quickcheck(f);
    }

    /// Wrapper for generating sample data with QuickCheck.
    ///
    /// Samples must be sequences of u64 values with more than 7 elements.
    #[derive(Debug, Clone)]
    struct Samples {
        vec: Vec<u64>,
    }

    impl Samples {
        fn min(&self) -> u64 {
            let &min = self.vec.iter().min().unwrap();
            min
        }

        fn max(&self) -> u64 {
            let &max = self.vec.iter().max().unwrap();
            max
        }

        fn shuffle(&mut self) {
            let mut rng = rand::thread_rng();
            rng.shuffle(&mut self.vec);
        }
    }

    impl Arbitrary for Samples {
        fn arbitrary<G: Gen>(g: &mut G) -> Samples {
            // Limit size of generated sample set to 1024.
            let max = cmp::min(g.size(), 1024);

            let size = g.gen_range(8, max);
            let vec = (0..size).map(|_| u64::arbitrary(g)).collect();

            Samples { vec: vec }
        }

        fn shrink(&self) -> Box<Iterator<Item = Samples>> {
            let vec: Vec<u64> = self.vec.clone();
            let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();

            Box::new(shrunk.filter(|v| v.len() > 7).map(|v| Samples { vec: v }))
        }
    }

    #[test]
    #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")]
    fn test_panics_on_empty_samples_set() {
        let xs: Vec<u64> = vec![];
        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let _ = test(&xs, &ys, 0.95);
    }

    #[test]
    #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")]
    fn test_panics_on_empty_other_samples_set() {
        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let ys: Vec<u64> = vec![];
        let _ = test(&xs, &ys, 0.95);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")]
    fn test_panics_on_confidence_leq_zero() {
        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let _ = test(&xs, &ys, 0.0);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")]
    fn test_panics_on_confidence_geq_one() {
        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let _ = test(&xs, &ys, 1.0);
    }

    /// Alternative calculation for the test statistic for the two sample
    /// Kolmogorov-Smirnov test. This simple implementation is used as a
    /// verification check against the actual calculation used.
    fn calculate_statistic_alt<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 {
        assert!(xs.len() > 0 && ys.len() > 0);

        let ecdf_xs = Ecdf::new(xs);
        let ecdf_ys = Ecdf::new(ys);

        let mut statistic = 0.0;

        for x in xs.iter() {
            let diff = (ecdf_xs.value(x.clone()) - ecdf_ys.value(x.clone())).abs();
            if diff > statistic {
                statistic = diff;
            }
        }

        for y in ys.iter() {
            let diff = (ecdf_xs.value(y.clone()) - ecdf_ys.value(y.clone())).abs();
            if diff > statistic {
                statistic = diff;
            }
        }

        statistic
    }

    #[test]
    fn test_calculate_statistic() {
        fn prop(xs: Samples, ys: Samples) -> bool {
            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let actual = result.statistic;
            let expected = calculate_statistic_alt(&xs.vec, &ys.vec);

            actual == expected
        }

        check(prop as fn(Samples, Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_between_zero_and_one() {
        fn prop(xs: Samples, ys: Samples) -> bool {
            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let actual = result.statistic;

            0.0 <= actual && actual <= 1.0
        }

        check(prop as fn(Samples, Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_zero_for_identical_samples() {
        fn prop(xs: Samples) -> bool {
            let ys = xs.clone();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.statistic == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_zero_for_permuted_sample() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();
            ys.shuffle();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.statistic == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_for_samples_with_no_overlap_in_support() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();

            // Shift ys so that ys.min > xs.max.
            let ys_min = xs.max() + 1;
            ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.statistic == 1.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_half_for_sample_with_non_overlapping_in_support_replicate_added() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();

            // Shift ys so that ys.min > xs.max.
            let ys_min = xs.max() + 1;
            ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect();

            // Add all the original items back too.
            for &x in xs.vec.iter() {
                ys.vec.push(x);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.statistic == 0.5
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_div_length_for_sample_with_additional_low_value() {
        fn prop(xs: Samples) -> bool {
            // Add an extra sample of early weight to ys.
            let min = xs.min();
            let mut ys = xs.clone();
            ys.vec.push(min - 1);

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = 1.0 / ys.vec.len() as f64;

            result.statistic == expected
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_div_length_for_sample_with_additional_high_value() {
        fn prop(xs: Samples) -> bool {
            // Add an extra sample of late weight to ys.
            let max = xs.max();
            let mut ys = xs.clone();
            ys.vec.push(max + 1);

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = 1.0 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_div_length_for_sample_with_additional_low_and_high_values() {
        fn prop(xs: Samples) -> bool {
            // Add extra samples of early and late weight to ys.
            let min = xs.min();
            let max = xs.max();

            let mut ys = xs.clone();

            ys.vec.push(min - 1);
            ys.vec.push(max + 1);

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = 1.0 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_values() {
        fn prop(xs: Samples, n: u8) -> bool {
            // Add n extra samples of early weight to ys.
            let min = xs.min();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(min - (j as u64) - 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = n as f64 / ys.vec.len() as f64;

            result.statistic == expected
        }

        check(prop as fn(Samples, u8) -> bool);
    }

    #[test]
    fn test_statistic_is_n_div_length_for_sample_with_additional_n_high_values() {
        fn prop(xs: Samples, n: u8) -> bool {
            // Add n extra samples of late weight to ys.
            let max = xs.max();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(max + (j as u64) + 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = n as f64 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples, u8) -> bool);
    }

    #[test]
    fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_and_high_values() {
        fn prop(xs: Samples, n: u8) -> bool {
            // Add n extra samples of early and late weight to ys.
            let min = xs.min();
            let max = xs.max();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(min - (j as u64) - 1);
                ys.vec.push(max + (j as u64) + 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = n as f64 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples, u8) -> bool);
    }

    #[test]
    fn test_statistic_is_n_or_m_div_length_for_sample_with_additional_n_low_and_m_high_values() {
        fn prop(xs: Samples, n: u8, m: u8) -> bool {
            // Add n extra samples of early weight and m extra samples of
            // late weight to ys.
            let min = xs.min();
            let max = xs.max();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(min - (j as u64) - 1);
            }

            for j in 0..m {
                ys.vec.push(max + (j as u64) + 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = cmp::max(n, m) as f64 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples, u8, u8) -> bool);
    }

    #[test]
    fn test_is_rejected_if_reject_probability_greater_than_confidence() {
        fn prop(xs: Samples, ys: Samples) -> bool {
            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            if result.is_rejected {
                result.reject_probability > 0.95
            } else {
                result.reject_probability <= 0.95
            }
        }

        check(prop as fn(Samples, Samples) -> bool);
    }

    #[test]
    fn test_reject_probability_is_zero_for_identical_samples() {
        fn prop(xs: Samples) -> bool {
            let ys = xs.clone();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.reject_probability == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_reject_probability_is_zero_for_permuted_sample() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();
            ys.shuffle();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.reject_probability == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }
}

@@ -0,0 +1,14 @@
[package]
name = "kstest"
version = "0.1.0"
authors = ["eugen.betke <betke@dkrz.de>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
kolmogorov_smirnov = {path = "../kslib"}
csv = "1.1"
glob = "0.3.0"
serde = { version = "1.0", features = ["derive"] }
chrono = "*"

@@ -0,0 +1,27 @@
#!/bin/bash

#7488914 19865984
#4296426 18672376
#5024292 17944118

#dataset_fn="../../datasets/job_codings_v4_confidential.csv"
#jobids=( )
#jobids=( ${jobids[@]} 19865984 )
#jobids=( ${jobids[@]} 18672376 )
#jobids=( ${jobids[@]} 17944118 )

dataset_fn="../../datasets/job_codings_v4.csv"
jobids=( )
jobids=( ${jobids[@]} 7488914 )
jobids=( ${jobids[@]} 4296426 )
jobids=( ${jobids[@]} 5024292 )

set -x
for jobid in ${jobids[@]}; do
  sim_fn="./ks_similarities_$jobid.csv"
  progress_fn="./ks_progress_$jobid.csv"
  log_fn="./ks_fail_$jobid.log"
  cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn &
done

@@ -0,0 +1,183 @@
extern crate kolmogorov_smirnov as ks;
extern crate csv;
extern crate serde;
extern crate chrono;

use std::fs::File;
use std::io::prelude::*;

use serde::Deserialize;
use serde::Serialize;
use std::collections::HashMap;
use std::env;
use std::io::LineWriter;

pub type Score = u32;
pub type JobCoding = Vec<Score>;
pub type Similarity = f32;

pub type Jobid = u32;
pub type QCodings = HashMap<Jobid, JobCoding>;

#[derive(Debug, Deserialize)]
pub struct Record {
    jobid: u32,
    //q16_coding: String,
    ks_coding: String,
}

#[derive(Debug, Serialize)]
pub struct SimilarityRow {
    pub jobid: u32,
    pub alg_id: u32,
    pub alg_name: String,
    pub similarity: f32,
}

#[derive(Debug, Serialize)]
pub struct ProgressRow {
    jobid: u32,
    alg_id: u32,
    alg_name: String,
    delta: i64,
}

/// Parse a colon-separated coding string into a vector of scores.
pub fn convert_to_coding(coding: String) -> Vec<Score> {
    let split = coding.split(":");
    let vec: Vec<Score> = split
        .filter(|s| !s.is_empty())
        .map(|s| s.parse().unwrap())
        .collect();
    vec
}
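
// Illustrative example of the parsing, not part of the original file:
// convert_to_coding("141:143:142".to_string()) yields vec![141, 143, 142],
// and empty segments are skipped, so "141::142".to_string() yields
// vec![141, 142].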

fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: String, log_fn: String) {
    let mut q_codings: QCodings = HashMap::new();
    let file = File::open(&dataset_fn).expect("Unable to open dataset.");
    let mut rdr = csv::Reader::from_reader(file);

    for result in rdr.deserialize() {
        let record: Record = result.expect("Malformed record in dataset.");
        let q_coding = convert_to_coding(record.ks_coding);
        // Insert non-zero jobs only.
        if q_coding.iter().sum::<Score>() > (0 as Score) {
            q_codings.insert(record.jobid, q_coding);
        }
    }

    let probe = q_codings[&jobid].clone();
    let similarities_file = File::create(&similarities_fn).expect("Unable to open");
    let mut wtr_similarities = csv::Writer::from_writer(&similarities_file);
    let alg_name = "ks";
    let alg_id = 6;

    let progress_file = File::create(&progress_fn).expect("Unable to open");
    let mut wtr_progress = csv::Writer::from_writer(&progress_file);
    let mut start = chrono::Utc::now();
    let mut counter = 1;

    let mut avail_codings: Vec<(u32, &JobCoding)>;

    avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect();
    let mut similarities: Vec<(Jobid, Similarity)> = Vec::new();

    let log_file = File::create(&log_fn).expect("Unable to open");
    let mut log_file = LineWriter::new(log_file);

    while let Some((jobid, q_coding)) = avail_codings.pop() {
        // Periodically record how long the last 10_000 comparisons took.
        if (counter % 10_000) == 0 {
            let stop = chrono::Utc::now();
            let progress_row = ProgressRow {
                jobid: jobid,
                alg_id: alg_id,
                alg_name: String::from(alg_name),
                delta: ((stop - start).num_nanoseconds().unwrap()),
            };
            wtr_progress.serialize(progress_row).unwrap();
            start = stop;
        }

        // A failed test (e.g. no convergence) is logged and treated as
        // maximal similarity.
        let confidence = 0.95;
        let similarity = match ks::test(q_coding, &probe, confidence) {
            Ok(sim) => (1.0 - sim.reject_probability) as Similarity,
            Err(e) => {
                let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
                log_file.write_all(message.as_bytes()).unwrap();
                1.0
            }
        };

        similarities.push((jobid, similarity));
        counter += 1;
    }

    for (jobid, similarity) in similarities.iter() {
        let similarity_row = SimilarityRow {
            jobid: *jobid,
            alg_id: alg_id,
            alg_name: String::from(alg_name),
            similarity: *similarity,
        };
        wtr_similarities.serialize(similarity_row).unwrap();
    }
    log_file.flush().unwrap();
}

fn main() {
    let args: Vec<String> = env::args().collect();
    let dataset_fn = args[1].clone();
    let jobid = args[2].parse::<u32>().unwrap();
    let sim_fn = args[3].clone();
    let progress_fn = args[4].clone();
    let log_fn = args[5].clone();
    println!("{:?}", args);

    run(dataset_fn, jobid, sim_fn, progress_fn, log_fn);
}

#[cfg(test)]
mod tests {
    use kolmogorov_smirnov as ks;

    // Small helper for the smoke test below (the original called an
    // undefined ks_test; this sketch assumes the library's test_f64 at the
    // 0.95 confidence level was intended): runs the two sample test and
    // panics if the test itself errors out.
    fn ks_test(xs: Vec<f64>, ys: Vec<f64>) {
        let result = ks::test_f64(&xs, &ys, 0.95).unwrap();
        println!("statistic = {}, reject probability = {}",
                 result.statistic,
                 result.reject_probability);
    }

    #[test]
    fn test_ks_test() {
        let xs = vec!(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
        let ys = vec!(12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);

        ks_test(xs, ys);

        let c1 = vec![141.0,143.0,142.0,238.0,132.0,486.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0];
        let c2 = vec![239.0,239.0,255.0,255.0,239.0,239.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,502.0,511.0,503.0];
        ks_test(c1, c2);

        let c1 = vec![2.0,2.0,2.0,9.0,3.0,0.0,0.0,0.0];
        let c2 = vec![2.0,2.0,2.0,2.0,8.0,3.0,0.0,10.0];
        ks_test(c1, c2);
    }
}