Kolmogorov-Smirnov-Test

This commit is contained in:
parent 88e994f997
commit 8d2ebf0e96

@@ -0,0 +1,36 @@
[package]
name = "kolmogorov_smirnov"
description = "Implementation of the Kolmogorov-Smirnov statistical test as a Rust library."
version = "1.1.0"
homepage = "http://daithiocrualaoich.github.io/kolmogorov_smirnov"
repository = "https://github.com/daithiocrualaoich/kolmogorov_smirnov"
authors = ["Daithi O Crualaoich <daithi.ocrualaoich@gmail.com>"]
license = "Apache-2.0"
keywords = ["statistics"]
exclude = [
    ".dockerignore",
    ".gitignore",
    ".travis.yml",
    "Cargo.lock",
    "Dockerfile",
    "dat/*",
    "doc/*",
]

[[bin]]
name = "critical_values"

[[bin]]
name = "normal"

[[bin]]
name = "ks_f64"

[[bin]]
name = "ks_i64"

[dependencies]
rand = "0.3.12"

[dev-dependencies]
quickcheck = "0.2"

@@ -0,0 +1,39 @@
extern crate kolmogorov_smirnov as ks;

use ks::calculate_critical_value;

use std::env;

/// Calculate critical values dataset.
///
/// # Examples
///
/// ```bash
/// cargo run --bin critical_values <confidence> <num_samples> <limit>
/// ```
///
/// This will print the critical values of the Kolmogorov-Smirnov two sample
/// test for samples of size `<num_samples>` against samples of sizes 16
/// through `<limit>` inclusive at the specified confidence level.
///
/// `<num_samples>` and `<limit>` must be positive integers, `<confidence>` must
/// be a floating point number strictly between zero and one.
fn main() {
    let args: Vec<String> = env::args().collect();

    let confidence: f64 = args[1].parse().expect("<confidence> must be a floating point number.");
    let n1: usize = args[2].parse().expect("<num_samples> must be an integer.");
    let limit: usize = args[3].parse().expect("<limit> must be an integer.");

    assert!(n1 > 0 && limit > 0);
    assert!(0.0 < confidence && confidence < 1.0);

    println!("n1\tn2\tconfidence\tcritical_value");
    for n2 in 16..(limit + 1) {
        println!("{}\t{}\t{}\t{}",
                 n1,
                 n2,
                 confidence,
                 calculate_critical_value(n1, n2, confidence).unwrap());
    }
}

@@ -0,0 +1,52 @@
extern crate kolmogorov_smirnov as ks;

use ks::test_f64;

use std::env;
use std::io::{BufReader, BufRead};
use std::fs::File;
use std::path::Path;

fn parse_float(s: String) -> f64 {
    s.parse::<f64>().expect("Not a floating point number.")
}

/// Runs a Kolmogorov-Smirnov test on floating point data files.
///
/// Input files must be single-column headerless data files. The data samples
/// are tested against each other at the 0.95 confidence level.
///
/// # Examples
///
/// ```bash
/// cargo run --bin ks_f64 <file1> <file2>
/// ```
///
/// This will print the test result to standard output.
fn main() {
    let args: Vec<String> = env::args().collect();

    let path1 = Path::new(&args[1]);
    let path2 = Path::new(&args[2]);

    let file1 = BufReader::new(File::open(&path1).unwrap());
    let file2 = BufReader::new(File::open(&path2).unwrap());

    let lines1 = file1.lines().map(|line| line.unwrap());
    let lines2 = file2.lines().map(|line| line.unwrap());

    let xs: Vec<f64> = lines1.map(parse_float).collect();
    let ys: Vec<f64> = lines2.map(parse_float).collect();

    let result = test_f64(&xs, &ys, 0.95).unwrap();

    if result.is_rejected {
        println!("Samples are from different distributions.");
    } else {
        println!("Samples are from the same distribution.");
    }

    println!("test statistic = {}", result.statistic);
    println!("critical value = {}", result.critical_value);
    println!("reject probability = {}", result.reject_probability);
}
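
// Illustrative sketch, not part of the original commit: with two hypothetical
// single-column data files
//
//     a.dat          b.dat
//     0.71           5.91
//     1.32           4.20
//     ...            ...
//
// the test is run as `cargo run --bin ks_f64 a.dat b.dat` and prints the
// verdict followed by the test statistic, critical value, and reject
// probability.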

@@ -0,0 +1,52 @@
extern crate kolmogorov_smirnov as ks;

use ks::test;

use std::env;
use std::io::{BufReader, BufRead};
use std::fs::File;
use std::path::Path;

fn parse_int(s: String) -> i64 {
    s.parse::<i64>().expect("Not an integer.")
}

/// Runs a Kolmogorov-Smirnov test on integer data files.
///
/// Input files must be single-column headerless data files. The data samples
/// are tested against each other at the 0.95 confidence level.
///
/// # Examples
///
/// ```bash
/// cargo run --bin ks_i64 <file1> <file2>
/// ```
///
/// This will print the test result to standard output.
fn main() {
    let args: Vec<String> = env::args().collect();

    let path1 = Path::new(&args[1]);
    let path2 = Path::new(&args[2]);

    let file1 = BufReader::new(File::open(&path1).unwrap());
    let file2 = BufReader::new(File::open(&path2).unwrap());

    let lines1 = file1.lines().map(|line| line.unwrap());
    let lines2 = file2.lines().map(|line| line.unwrap());

    let xs: Vec<i64> = lines1.map(parse_int).collect();
    let ys: Vec<i64> = lines2.map(parse_int).collect();

    let result = test(&xs, &ys, 0.95).unwrap();

    if result.is_rejected {
        println!("Samples are from different distributions.");
    } else {
        println!("Samples are from the same distribution.");
    }

    println!("test statistic = {}", result.statistic);
    println!("critical value = {}", result.critical_value);
    println!("reject probability = {}", result.reject_probability);
}

@@ -0,0 +1,35 @@
extern crate rand;

use std::env;
use rand::distributions::{Normal, IndependentSample};

/// Prints a sequence of Normal deviates.
///
/// # Examples
///
/// ```bash
/// cargo run --bin normal <num_deviates> <mean> <variance>
/// ```
///
/// This will print `<num_deviates>` floating point numbers, one per line, to
/// standard output. These numbers will have a Normal distribution with the
/// specified mean and variance.
///
/// `<num_deviates>` must be a positive integer, `<mean>` and `<variance>` may
/// be integers or floating point numbers but `<variance>` must be positive.
fn main() {
    let args: Vec<String> = env::args().collect();

    let n: u32 = args[1].parse().expect("<num_deviates> must be an integer.");
    let mean: f64 = args[2].parse().expect("<mean> must be a floating point number.");
    let variance: f64 = args[3].parse().expect("<variance> must be a floating point number.");

    assert!(n > 0 && variance > 0.0);

    let mut rng = rand::thread_rng();

    // Normal::new takes a standard deviation, so convert the variance.
    let normal = Normal::new(mean, variance.sqrt());

    for _ in 0..n {
        let x = normal.ind_sample(&mut rng);
        println!("{}", x)
    }
}

@@ -0,0 +1,853 @@
//! Empirical cumulative distribution function.

pub struct Ecdf<T: Ord> {
    samples: Vec<T>,
    length: usize,
}

impl<T: Ord + Clone> Ecdf<T> {
    /// Construct a new representation of a cumulative distribution function
    /// for a given sample.
    ///
    /// The construction will involve computing a sorted clone of the given
    /// sample and may be inefficient or completely prohibitive for large
    /// samples. This computation is amortized significantly if there is heavy
    /// use of the value function.
    ///
    /// # Panics
    ///
    /// The sample set must be non-empty.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// ```
    pub fn new(samples: &[T]) -> Ecdf<T> {
        let length = samples.len();
        assert!(length > 0);

        // Sort a copied sample for binary searching.
        let mut sorted = samples.to_vec();
        sorted.sort();

        Ecdf {
            samples: sorted,
            length: length,
        }
    }

    /// Calculate a value of the empirical cumulative distribution function
    /// for a given sample.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.value(4), 0.5);
    /// ```
    pub fn value(&self, t: T) -> f64 {
        let num_samples_leq_t = match self.samples.binary_search(&t) {
            Ok(mut index) => {
                // At least one sample equals t and we have the index of one
                // such sample. Walk up the sorted samples to the last sample
                // equal to t.
                while index + 1 < self.length && self.samples[index + 1] == t {
                    index += 1;
                }

                // Compensate for 0-based indexing.
                index + 1
            }
            Err(index) => {
                // No sample equals t but if we had to insert one it would go
                // at index. This means all indices to the left have samples
                // < t and should be counted in the cdf proportion. We must
                // take one from index to get the last included sample but
                // then we just have to add one again to account for 0-based
                // indexing.
                index
            }
        };

        num_samples_leq_t as f64 / self.length as f64
    }
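
    // A worked trace of the Ok branch above, illustrative only: for the
    // sorted samples [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] and t = 4,
    // binary_search returns Ok(4). There are no duplicates, so the walk
    // stops immediately and num_samples_leq_t = 4 + 1 = 5, giving
    // value(4) = 5 / 10 = 0.5 as in the doc example. With samples
    // [0, 4, 4, 4, 9] and t = 4, the walk advances index to the last 4
    // (index 3), so value(4) = 4 / 5 = 0.8.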

    /// Calculate a percentile for the sample using the Nearest Rank method.
    ///
    /// # Panics
    ///
    /// The percentile requested must be between 1 and 100 inclusive. In
    /// particular, there is no 0-percentile.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.percentile(50), 4);
    /// ```
    pub fn percentile(&self, p: u8) -> T {
        assert!(0 < p && p <= 100);

        let rank = (p as f64 * self.length as f64 / 100.0).ceil() as usize;
        self.samples[rank - 1].clone()
    }
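
    // Nearest Rank worked example, illustrative only: for the ten sorted
    // samples [0, 1, ..., 9] and p = 50, rank = ceil(50 * 10 / 100) =
    // ceil(5.0) = 5, so the percentile is samples[4] = 4, matching the doc
    // example above. For p = 51, rank = ceil(5.1) = 6 and the percentile
    // jumps to samples[5] = 5.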

    /// Return the minimal element of the samples.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.min(), 0);
    /// ```
    pub fn min(&self) -> T {
        self.samples[0].clone()
    }

    /// Return the maximal element of the samples.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.max(), 9);
    /// ```
    pub fn max(&self) -> T {
        self.samples[self.samples.len() - 1].clone()
    }
}

/// Calculate a one-time value of the empirical cumulative distribution
/// function for a given sample.
///
/// Computational running time of this function is O(n) but does not amortize
/// across multiple calls like Ecdf<T>::value. This function should only be
/// used in the case that a small number of ECDF values are required for the
/// sample. Otherwise, Ecdf::new should be used to create a structure that
/// takes the upfront O(n log n) sort cost but calculates values in O(log n).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let value = ks::ecdf(&samples, 4);
/// assert_eq!(value, 0.5);
/// ```
pub fn ecdf<T: Ord>(samples: &[T], t: T) -> f64 {
    let mut num_samples_leq_t = 0;
    let mut length = 0;

    for sample in samples.iter() {
        length += 1;
        if *sample <= t {
            num_samples_leq_t += 1;
        }
    }

    assert!(length > 0);

    num_samples_leq_t as f64 / length as f64
}

/// Calculate a one-time percentile for a given sample using the Nearest Rank
/// method and Quick Select.
///
/// Computational running time of this function is O(n) but does not amortize
/// across multiple calls like Ecdf<T>::percentile. This function should only
/// be used in the case that a small number of percentiles are required for
/// the sample. Otherwise, Ecdf::new should be used to create a structure that
/// takes the upfront O(n log n) sort cost but calculates percentiles in O(1).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// The percentile requested must be between 1 and 100 inclusive. In
/// particular, there is no 0-percentile.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let percentile = ks::percentile(&samples, 50);
/// assert_eq!(percentile, 4);
/// ```
pub fn percentile<T: Ord + Clone>(samples: &[T], p: u8) -> T {
    assert!(0 < p && p <= 100);

    let length = samples.len();
    assert!(length > 0);

    let rank = (p as f64 * length as f64 / 100.0).ceil() as usize;

    // Quick Select the element at rank.

    let mut samples: Vec<T> = samples.to_vec();
    let mut low = 0;
    let mut high = length;

    loop {
        assert!(low < high);

        let pivot = samples[low].clone();

        if low >= high - 1 {
            return pivot;
        }

        // First determine if the rank item is less than the pivot.

        // Organise samples so that all items less than pivot are to the left,
        // `bottom` is the number of items less than pivot.

        let mut bottom = low;
        let mut top = high - 1;

        while bottom < top {
            while bottom < top && samples[bottom] < pivot {
                bottom += 1;
            }
            while bottom < top && samples[top] >= pivot {
                top -= 1;
            }

            if bottom < top {
                samples.swap(bottom, top);
            }
        }

        if rank <= bottom {
            // Rank item is less than pivot. Exclude pivot and larger items.
            high = bottom;
        } else {
            // Rank item is pivot or in the larger set. Exclude smaller items.
            low = bottom;

            // Next, determine if the pivot is the rank item.

            // Organise samples so that all items less than or equal to pivot
            // are to the left, `bottom` is the number of items less than or
            // equal to pivot. Since the left is already less than the pivot,
            // this just requires moving the pivots left also.

            let mut bottom = low;
            let mut top = high - 1;

            while bottom < top {
                while bottom < top && samples[bottom] == pivot {
                    bottom += 1;
                }
                while bottom < top && samples[top] != pivot {
                    top -= 1;
                }

                if bottom < top {
                    samples.swap(bottom, top);
                }
            }

            // Is pivot the rank item?

            if rank <= bottom {
                return pivot;
            }

            // Rank item is greater than pivot. Exclude pivot and smaller
            // items.
            low = bottom;
        }
    }
}
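
// A quick sanity sketch of the Quick Select percentile, illustrative only:
// percentile(&[9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 50) selects the element of
// rank ceil(50 * 10 / 100) = 5, i.e. the 5th smallest value 4, without ever
// fully sorting the input. Each partition pass discards the part of the
// remaining range that cannot contain the rank item, giving expected O(n)
// running time.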

#[cfg(test)]
mod tests {
    extern crate quickcheck;
    extern crate rand;

    use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, TestResult, StdGen};
    use std::cmp;
    use std::usize;
    use super::{Ecdf, ecdf, percentile};

    fn check<A: Testable>(f: A) {
        let g = StdGen::new(rand::thread_rng(), usize::MAX);
        QuickCheck::new().gen(g).quickcheck(f);
    }

    /// Wrapper for generating sample data with QuickCheck.
    ///
    /// Samples must be non-empty sequences of u64 values.
    #[derive(Debug, Clone)]
    struct Samples {
        vec: Vec<u64>,
    }

    impl Arbitrary for Samples {
        fn arbitrary<G: Gen>(g: &mut G) -> Samples {
            // Limit size of generated sample set to 1024.
            let max = cmp::min(g.size(), 1024);

            let size = g.gen_range(1, max);
            let vec = (0..size).map(|_| u64::arbitrary(g)).collect();

            Samples { vec: vec }
        }

        fn shrink(&self) -> Box<Iterator<Item = Samples>> {
            let vec: Vec<u64> = self.vec.clone();
            let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();

            Box::new(shrunk.filter(|v| v.len() > 0).map(|v| Samples { vec: v }))
        }
    }

    /// Wrapper for generating percentile query value data with QuickCheck.
    ///
    /// Percentile must be a u8 between 1 and 100 inclusive.
    #[derive(Debug, Clone)]
    struct Percentile {
        val: u8,
    }

    impl Arbitrary for Percentile {
        fn arbitrary<G: Gen>(g: &mut G) -> Percentile {
            let val = g.gen_range(1, 101) as u8;

            Percentile { val: val }
        }

        fn shrink(&self) -> Box<Iterator<Item = Percentile>> {
            let shrunk: Box<Iterator<Item = u8>> = self.val.shrink();

            Box::new(shrunk.filter(|&v| 0u8 < v && v <= 100u8).map(|v| Percentile { val: v }))
        }
    }

    #[test]
    #[should_panic(expected="assertion failed: length > 0")]
    fn single_use_ecdf_panics_on_empty_samples_set() {
        let xs: Vec<u64> = vec![];
        ecdf(&xs, 0);
    }

    #[test]
    #[should_panic(expected="assertion failed: length > 0")]
    fn multiple_use_ecdf_panics_on_empty_samples_set() {
        let xs: Vec<u64> = vec![];
        Ecdf::new(&xs);
    }

    #[test]
    fn single_use_ecdf_between_zero_and_one() {
        fn prop(xs: Samples, val: u64) -> bool {
            let actual = ecdf(&xs.vec, val);

            0.0 <= actual && actual <= 1.0
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_between_zero_and_one() {
        fn prop(xs: Samples, val: u64) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            0.0 <= actual && actual <= 1.0
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn single_use_ecdf_is_an_increasing_function() {
        fn prop(xs: Samples, val: u64) -> bool {
            let actual = ecdf(&xs.vec, val);

            ecdf(&xs.vec, val - 1) <= actual && actual <= ecdf(&xs.vec, val + 1)
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_is_an_increasing_function() {
        fn prop(xs: Samples, val: u64) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            ecdf.value(val - 1) <= actual && actual <= ecdf.value(val + 1)
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    fn single_use_ecdf_sample_min_minus_one_is_zero() {
        fn prop(xs: Samples) -> bool {
            let &min = xs.vec.iter().min().unwrap();

            ecdf(&xs.vec, min - 1) == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_sample_min_minus_one_is_zero() {
        fn prop(xs: Samples) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let ecdf = Ecdf::new(&xs.vec);

            ecdf.value(min - 1) == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_sample_max_is_one() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();

            ecdf(&xs.vec, max) == 1.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_sample_max_is_one() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();
            let ecdf = Ecdf::new(&xs.vec);

            ecdf.value(max) == 1.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples) -> bool {
            let &val = xs.vec.first().unwrap();
            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / xs.vec.len() as f64;

            ecdf(&xs.vec, val) == expected
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_ecdf_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples) -> bool {
            let &val = xs.vec.first().unwrap();
            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / xs.vec.len() as f64;

            let ecdf = Ecdf::new(&xs.vec);

            ecdf.value(val) == expected
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let length = xs.vec.len();

            if xs.vec.iter().any(|&x| x == val) {
                // Discard Vec containing val.
                return TestResult::discard();
            }

            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / length as f64;

            let actual = ecdf(&xs.vec, val);

            TestResult::from_bool(actual == expected)
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn multiple_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let length = xs.vec.len();

            if xs.vec.iter().any(|&x| x == val) {
                // Discard Vec containing val.
                return TestResult::discard();
            }

            let num_samples = xs.vec
                .iter()
                .filter(|&&x| x <= val)
                .count();
            let expected = num_samples as f64 / length as f64;

            let ecdf = Ecdf::new(&xs.vec);

            TestResult::from_bool(ecdf.value(val) == expected)
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn single_and_multiple_use_ecdf_agree() {
        fn prop(xs: Samples, val: u64) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);

            multiple_use.value(val) == ecdf(&xs.vec, val)
        }

        check(prop as fn(Samples, u64) -> bool);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn single_use_percentiles_panics_on_zero_percentile() {
        let xs: Vec<u64> = vec![0];

        percentile(&xs, 0);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn single_use_percentiles_panics_on_101_percentile() {
        let xs: Vec<u64> = vec![0];

        percentile(&xs, 101);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn multiple_use_percentiles_panics_on_zero_percentile() {
        let xs: Vec<u64> = vec![0];
        let ecdf = Ecdf::new(&xs);

        ecdf.percentile(0);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn multiple_use_percentiles_panics_on_101_percentile() {
        let xs: Vec<u64> = vec![0];
        let ecdf = Ecdf::new(&xs);

        ecdf.percentile(101);
    }

    #[test]
    fn single_use_percentile_between_samples_min_and_max() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let &max = xs.vec.iter().max().unwrap();

            let actual = percentile(&xs.vec, p.val);

            min <= actual && actual <= max
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_use_percentile_is_an_increasing_function() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let smaller = cmp::max(p.val - 1, 1);
            let larger = cmp::min(p.val + 1, 100);

            let actual = percentile(&xs.vec, p.val);

            percentile(&xs.vec, smaller) <= actual && actual <= percentile(&xs.vec, larger)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_use_percentile_100_is_sample_max() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();

            percentile(&xs.vec, 100) == max
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn multiple_use_percentile_between_samples_min_and_max() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let &max = xs.vec.iter().max().unwrap();

            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            min <= actual && actual <= max
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_is_an_increasing_function() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let smaller = cmp::max(p.val - 1, 1);
            let larger = cmp::min(p.val + 1, 100);

            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            ecdf.percentile(smaller) <= actual && actual <= ecdf.percentile(larger)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_100_is_sample_max() {
        fn prop(xs: Samples) -> bool {
            let &max = xs.vec.iter().max().unwrap();
            let ecdf = Ecdf::new(&xs.vec);

            ecdf.percentile(100) == max
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn single_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let actual = ecdf(&xs.vec, val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let single_use = percentile(&xs.vec, p);
                    TestResult::from_bool(single_use <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn single_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let actual = ecdf(&xs.vec, val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let multiple_use = Ecdf::new(&xs.vec);
                    TestResult::from_bool(multiple_use.percentile(p) <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn multiple_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let single_use = percentile(&xs.vec, p);
                    TestResult::from_bool(single_use <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn multiple_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
        fn prop(xs: Samples, val: u64) -> TestResult {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    TestResult::from_bool(ecdf.percentile(p) <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }

    #[test]
    fn single_use_percentile_followed_by_single_use_ecdf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let actual = percentile(&xs.vec, p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_use_percentile_followed_by_multiple_use_ecdf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let actual = percentile(&xs.vec, p.val);

            let ecdf = Ecdf::new(&xs.vec);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf.value(actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_followed_by_single_use_ecdf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.percentile(p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn multiple_use_percentile_followed_by_multiple_use_ecdf_is_geq_original_value() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf.value(actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn single_and_multiple_use_percentile_agree() {
        fn prop(xs: Samples, p: Percentile) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);

            multiple_use.percentile(p.val) == percentile(&xs.vec, p.val)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }

    #[test]
    fn min_is_leq_all_samples() {
        fn prop(xs: Samples) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.min();

            xs.vec.iter().all(|&x| actual <= x)
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn max_is_geq_all_samples() {
        fn prop(xs: Samples) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.max();

            xs.vec.iter().all(|&x| actual >= x)
        }

        check(prop as fn(Samples) -> bool);
    }
}

@@ -0,0 +1,5 @@
pub mod ecdf;
pub mod test;

pub use test::{test, test_f64, calculate_critical_value};
pub use ecdf::{Ecdf, ecdf, percentile};

@@ -0,0 +1,699 @@
//! Two Sample Kolmogorov-Smirnov Test

use std::cmp::{min, Ord, Ordering};

/// Two sample test result.
pub struct TestResult {
    pub is_rejected: bool,
    pub statistic: f64,
    pub reject_probability: f64,
    pub critical_value: f64,
    pub confidence: f64,
}

/// Perform a two sample Kolmogorov-Smirnov test on given samples.
///
/// The samples must have more than 7 elements for the test to be valid.
///
/// # Panics
///
/// There are assertion panics if either sequence has <= 7 elements.
///
/// # Errors
///
/// Returns an error if the underlying probability calculation fails to
/// converge.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let xs = vec!(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
/// let ys = vec!(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let confidence = 0.95;
///
/// let result = ks::test(&xs, &ys, confidence).unwrap();
///
/// if result.is_rejected {
///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
///              xs, ys, result.reject_probability);
/// }
/// ```
pub fn test<T: Ord + Clone>(xs: &[T], ys: &[T], confidence: f64) -> Result<TestResult, String> {
    assert!(xs.len() > 0 && ys.len() > 0);
    assert!(0.0 < confidence && confidence < 1.0);

    // Only supports samples of size > 7.
    assert!(xs.len() > 7 && ys.len() > 7);

    let statistic = calculate_statistic(xs, ys);
    let critical_value = calculate_critical_value(xs.len(), ys.len(), confidence)?;

    let reject_probability = calculate_reject_probability(statistic, xs.len(), ys.len())?;
    let is_rejected = reject_probability > confidence;

    Ok(TestResult {
        is_rejected: is_rejected,
        statistic: statistic,
        reject_probability: reject_probability,
        critical_value: critical_value,
        confidence: confidence,
    })
}

/// Wrapper type for f64 to implement Ord and make it usable with test.
#[derive(PartialEq, Clone)]
struct OrderableF64 {
    val: f64,
}

impl OrderableF64 {
    fn new(val: f64) -> OrderableF64 {
        OrderableF64 { val: val }
    }
}

impl Eq for OrderableF64 {}

impl PartialOrd for OrderableF64 {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        self.val.partial_cmp(&other.val)
    }
}

impl Ord for OrderableF64 {
    fn cmp(&self, other: &Self) -> Ordering {
        self.val.partial_cmp(&other.val).unwrap()
    }
}

/// Perform a two sample Kolmogorov-Smirnov test on given f64 samples.
///
/// This is necessary because f64 does not implement Ord in Rust as some
/// elements are incomparable, e.g. NaN. This function wraps the f64s in an
/// implementation of Ord which panics on incomparable elements.
///
/// The samples must have more than 7 elements for the test to be valid.
///
/// # Panics
///
/// There are assertion panics if either sequence has <= 7 elements.
///
/// There are also panics if any of the f64 elements in the input samples are
/// unorderable, e.g. NaN.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let xs = vec!(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
/// let ys = vec!(12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
/// let confidence = 0.95;
///
/// let result = ks::test_f64(&xs, &ys, confidence).unwrap();
///
/// if result.is_rejected {
///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
///              xs, ys, result.reject_probability);
/// }
/// ```
pub fn test_f64(xs: &[f64], ys: &[f64], confidence: f64) -> Result<TestResult, String> {
    let xs: Vec<OrderableF64> = xs.iter().map(|&f| OrderableF64::new(f)).collect();
    let ys: Vec<OrderableF64> = ys.iter().map(|&f| OrderableF64::new(f)).collect();

    test(&xs, &ys, confidence)
}
/// Calculate the test statistic for the two sample Kolmogorov-Smirnov test.
///
/// The test statistic is the maximum vertical distance between the ECDFs of
/// the two samples.
fn calculate_statistic<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 {
    let n = xs.len();
    let m = ys.len();

    assert!(n > 0 && m > 0);

    let mut xs = xs.to_vec();
    let mut ys = ys.to_vec();

    // xs and ys must be sorted for the stepwise ECDF calculations to work.
    xs.sort();
    ys.sort();

    // The current value testing for ECDF difference. Sweeps up through
    // elements present in xs and ys.
    let mut current: &T;

    // i, j index the first values in xs and ys that are greater than current.
    let mut i = 0;
    let mut j = 0;

    // ecdf_xs, ecdf_ys always hold the ECDF(current) of xs and ys.
    let mut ecdf_xs = 0.0;
    let mut ecdf_ys = 0.0;

    // The test statistic value computed over values <= current.
    let mut statistic = 0.0;

    while i < n && j < m {
        // Advance i through duplicate samples in xs.
        let x_i = &xs[i];
        while i + 1 < n && *x_i == xs[i + 1] {
            i += 1;
        }

        // Advance j through duplicate samples in ys.
        let y_j = &ys[j];
        while j + 1 < m && *y_j == ys[j + 1] {
            j += 1;
        }

        // Step to the next sample value in the ECDF sweep from low to high.
        current = min(x_i, y_j);

        // Update invariant conditions for i, j, ecdf_xs, and ecdf_ys.
        if current == x_i {
            ecdf_xs = (i + 1) as f64 / n as f64;
            i += 1;
        }
        if current == y_j {
            ecdf_ys = (j + 1) as f64 / m as f64;
            j += 1;
        }

        // Update invariant conditions for the test statistic.
        let diff = (ecdf_xs - ecdf_ys).abs();
        if diff > statistic {
            statistic = diff;
        }
    }

    // Don't need to walk the rest of the samples because one of the ecdfs is
    // already one and the other will be increasing up to one. This means the
    // difference will be monotonically decreasing, so we have our test
    // statistic value already.

    statistic
}
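
// A worked trace of the sweep above, illustrative only: for xs = [1, 2, 3]
// and ys = [2, 3, 4], the distinct values are visited in order 1, 2, 3.
// After value 1: ecdf_xs = 1/3, ecdf_ys = 0,   |diff| = 1/3.
// After value 2: ecdf_xs = 2/3, ecdf_ys = 1/3, |diff| = 1/3.
// After value 3: ecdf_xs = 1,   ecdf_ys = 2/3, |diff| = 1/3.
// The loop then ends (xs is exhausted) and the statistic is 1/3; the
// remaining value 4 could only shrink the difference since ecdf_xs is
// already 1.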

/// Calculate the probability that the null hypothesis is false for a two
/// sample Kolmogorov-Smirnov test. Can only reject the null hypothesis if
/// this evidence exceeds the confidence level required.
fn calculate_reject_probability(statistic: f64, n1: usize, n2: usize) -> Result<f64, String> {
    // Only supports samples of size > 7.
    assert!(n1 > 7 && n2 > 7);

    let n1 = n1 as f64;
    let n2 = n2 as f64;

    let factor = ((n1 * n2) / (n1 + n2)).sqrt();
    let term = (factor + 0.12 + 0.11 / factor) * statistic;

    let probability = probability_kolmogorov_smirnov(term)?;
    let reject_probability = 1.0 - probability;

    assert!(0.0 <= reject_probability && reject_probability <= 1.0);

    Ok(reject_probability)
}
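
// A numeric sketch of the asymptotic approximation above, illustrative only:
// for n1 = n2 = 100 and statistic = 0.2, factor = sqrt(10000 / 200) ≈ 7.071,
// term = (7.071 + 0.12 + 0.11 / 7.071) * 0.2 ≈ 1.441, and
// probability_kolmogorov_smirnov(1.441) ≈ 0.031, so the reject probability
// is roughly 0.969 and the null hypothesis is rejected at 0.95 confidence.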

/// Calculate the critical value for the two sample Kolmogorov-Smirnov test.
///
/// # Errors
///
/// Returns a no convergence error if the binary search does not locate the
/// critical value in less than 200 iterations.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let critical_value = ks::calculate_critical_value(256, 256, 0.95).unwrap();
/// println!("Critical value at 95% confidence for samples of size 256 is {}",
///          critical_value);
/// ```
pub fn calculate_critical_value(n1: usize, n2: usize, confidence: f64) -> Result<f64, String> {
    assert!(0.0 < confidence && confidence < 1.0);

    // Only supports samples of size > 7.
    assert!(n1 > 7 && n2 > 7);

    // The test statistic is between zero and one so can binary search quickly
    // for the critical value.
    let mut low = 0.0;
    let mut high = 1.0;

    for _ in 1..200 {
        if low + 1e-8 >= high {
            return Ok(high);
        }

        let mid = low + (high - low) / 2.0;
        let reject_probability = calculate_reject_probability(mid, n1, n2)?;

        if reject_probability > confidence {
            // Maintain invariant that reject_probability(high) > confidence.
            high = mid;
        } else {
            // Maintain invariant that reject_probability(low) <= confidence.
            low = mid;
        }
    }

    Err(format!("No convergence in calculate_critical_value({}, {}, {}).",
                n1,
                n2,
                confidence))
}

/// Calculate the Kolmogorov-Smirnov probability function.
fn probability_kolmogorov_smirnov(lambda: f64) -> Result<f64, String> {
    if lambda == 0.0 {
        return Ok(1.0);
    }

    let minus_two_lambda_squared = -2.0 * lambda * lambda;
    let mut q_ks = 0.0;

    for j in 1..200 {
        let sign = if j % 2 == 1 {
            1.0
        } else {
            -1.0
        };

        let j = j as f64;
        let term = sign * 2.0 * (minus_two_lambda_squared * j * j).exp();

        q_ks += term;

        if term.abs() < 1e-8 {
            // Trim results that exceed 1.
            return Ok(q_ks.min(1.0));
        }
    }

    Err(format!("No convergence in probability_kolmogorov_smirnov({}).",
                lambda))
}
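
// The series being summed above is the standard Kolmogorov-Smirnov tail
// probability:
//
//     Q_KS(lambda) = 2 * sum_{j=1}^{inf} (-1)^(j-1) * exp(-2 * j^2 * lambda^2)
//
// with Q_KS(0) = 1 and Q_KS decreasing towards 0 as lambda grows. The terms
// decay extremely fast for lambda of moderate size, which is why the loop
// typically returns after only a handful of iterations.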

#[cfg(test)]
mod tests {
    extern crate quickcheck;
    extern crate rand;

    use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, StdGen};
    use self::rand::Rng;
    use std::cmp;
    use std::usize;

    use super::test;
    use ecdf::Ecdf;

    const EPSILON: f64 = 1e-10;

    fn check<A: Testable>(f: A) {
        // Need - 1 to ensure space for creating non-overlapping samples.
        let g = StdGen::new(rand::thread_rng(), usize::MAX - 1);
        QuickCheck::new().gen(g).quickcheck(f);
    }

    /// Wrapper for generating sample data with QuickCheck.
    ///
    /// Samples must be sequences of u64 values with more than 7 elements.
    #[derive(Debug, Clone)]
    struct Samples {
        vec: Vec<u64>,
    }

    impl Samples {
        fn min(&self) -> u64 {
            let &min = self.vec.iter().min().unwrap();
            min
        }

        fn max(&self) -> u64 {
            let &max = self.vec.iter().max().unwrap();
            max
        }

        fn shuffle(&mut self) {
            let mut rng = rand::thread_rng();
            rng.shuffle(&mut self.vec);
        }
    }

    impl Arbitrary for Samples {
        fn arbitrary<G: Gen>(g: &mut G) -> Samples {
            // Limit size of generated sample set to 1024.
            let max = cmp::min(g.size(), 1024);

            let size = g.gen_range(8, max);
            let vec = (0..size).map(|_| u64::arbitrary(g)).collect();

            Samples { vec: vec }
        }

        fn shrink(&self) -> Box<Iterator<Item = Samples>> {
            let vec: Vec<u64> = self.vec.clone();
            let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();

            Box::new(shrunk.filter(|v| v.len() > 7).map(|v| Samples { vec: v }))
        }
    }

    #[test]
    #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")]
    fn test_panics_on_empty_samples_set() {
        let xs: Vec<u64> = vec![];
        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let _ = test(&xs, &ys, 0.95);
    }

    #[test]
    #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")]
    fn test_panics_on_empty_other_samples_set() {
        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let ys: Vec<u64> = vec![];
        let _ = test(&xs, &ys, 0.95);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")]
    fn test_panics_on_confidence_leq_zero() {
        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let _ = test(&xs, &ys, 0.0);
    }

    #[test]
    #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")]
    fn test_panics_on_confidence_geq_one() {
        let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
        let _ = test(&xs, &ys, 1.0);
    }

    /// Alternative calculation for the test statistic for the two sample
    /// Kolmogorov-Smirnov test. This simple implementation is used as a
    /// verification check against the actual calculation used.
    fn calculate_statistic_alt<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 {
        assert!(xs.len() > 0 && ys.len() > 0);

        let ecdf_xs = Ecdf::new(xs);
        let ecdf_ys = Ecdf::new(ys);

        let mut statistic = 0.0;

        for x in xs.iter() {
            let diff = (ecdf_xs.value(x.clone()) - ecdf_ys.value(x.clone())).abs();
            if diff > statistic {
                statistic = diff;
            }
        }

        for y in ys.iter() {
            let diff = (ecdf_xs.value(y.clone()) - ecdf_ys.value(y.clone())).abs();
            if diff > statistic {
                statistic = diff;
            }
        }

        statistic
    }

    #[test]
    fn test_calculate_statistic() {
        fn prop(xs: Samples, ys: Samples) -> bool {
            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let actual = result.statistic;
            let expected = calculate_statistic_alt(&xs.vec, &ys.vec);

            actual == expected
        }

        check(prop as fn(Samples, Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_between_zero_and_one() {
        fn prop(xs: Samples, ys: Samples) -> bool {
            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let actual = result.statistic;

            0.0 <= actual && actual <= 1.0
        }

        check(prop as fn(Samples, Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_zero_for_identical_samples() {
        fn prop(xs: Samples) -> bool {
            let ys = xs.clone();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.statistic == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_zero_for_permuted_sample() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();
            ys.shuffle();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.statistic == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_for_samples_with_no_overlap_in_support() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();

            // Shift ys so that ys.min > xs.max.
            let ys_min = xs.max() + 1;
            ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.statistic == 1.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_half_for_sample_with_non_overlapping_in_support_replicate_added() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();

            // Shift ys so that ys.min > xs.max.
            let ys_min = xs.max() + 1;
            ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect();

            // Add all the original items back too.
            for &x in xs.vec.iter() {
                ys.vec.push(x);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.statistic == 0.5
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_div_length_for_sample_with_additional_low_value() {
        fn prop(xs: Samples) -> bool {
            // Add an extra sample of early weight to ys.
            let min = xs.min();
            let mut ys = xs.clone();
            ys.vec.push(min - 1);

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = 1.0 / ys.vec.len() as f64;

            result.statistic == expected
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_div_length_for_sample_with_additional_high_value() {
        fn prop(xs: Samples) -> bool {
            // Add an extra sample of late weight to ys.
            let max = xs.max();
            let mut ys = xs.clone();
            ys.vec.push(max + 1);

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = 1.0 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_one_div_length_for_sample_with_additional_low_and_high_values() {
        fn prop(xs: Samples) -> bool {
            // Add extra samples of early and late weight to ys.
            let min = xs.min();
            let max = xs.max();

            let mut ys = xs.clone();

            ys.vec.push(min - 1);
            ys.vec.push(max + 1);

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = 1.0 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_values() {
        fn prop(xs: Samples, n: u8) -> bool {
            // Add n extra samples of early weight to ys.
            let min = xs.min();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(min - (j as u64) - 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = n as f64 / ys.vec.len() as f64;

            result.statistic == expected
        }

        check(prop as fn(Samples, u8) -> bool);
    }

    #[test]
    fn test_statistic_is_n_div_length_for_sample_with_additional_n_high_values() {
        fn prop(xs: Samples, n: u8) -> bool {
            // Add n extra samples of late weight to ys.
            let max = xs.max();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(max + (j as u64) + 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = n as f64 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples, u8) -> bool);
    }

    #[test]
    fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_and_high_values() {
        fn prop(xs: Samples, n: u8) -> bool {
            // Add n extra samples of early and late weight to ys.
            let min = xs.min();
            let max = xs.max();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(min - (j as u64) - 1);
                ys.vec.push(max + (j as u64) + 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = n as f64 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples, u8) -> bool);
    }

    #[test]
    fn test_statistic_is_n_or_m_div_length_for_sample_with_additional_n_low_and_m_high_values() {
        fn prop(xs: Samples, n: u8, m: u8) -> bool {
            // Add n extra samples of early weight and m extra samples of
            // late weight to ys.
            let min = xs.min();
            let max = xs.max();
            let mut ys = xs.clone();
            for j in 0..n {
                ys.vec.push(min - (j as u64) - 1);
            }

            for j in 0..m {
                ys.vec.push(max + (j as u64) + 1);
            }

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();
            let expected = cmp::max(n, m) as f64 / ys.vec.len() as f64;

            (result.statistic - expected).abs() < EPSILON
        }

        check(prop as fn(Samples, u8, u8) -> bool);
    }

    #[test]
    fn test_is_rejected_if_reject_probability_greater_than_confidence() {
        fn prop(xs: Samples, ys: Samples) -> bool {
            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            if result.is_rejected {
                result.reject_probability > 0.95
            } else {
                result.reject_probability <= 0.95
            }
        }

        check(prop as fn(Samples, Samples) -> bool);
    }

    #[test]
    fn test_reject_probability_is_zero_for_identical_samples() {
        fn prop(xs: Samples) -> bool {
            let ys = xs.clone();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.reject_probability == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }

    #[test]
    fn test_reject_probability_is_zero_for_permuted_sample() {
        fn prop(xs: Samples) -> bool {
            let mut ys = xs.clone();
            ys.shuffle();

            let result = test(&xs.vec, &ys.vec, 0.95).unwrap();

            result.reject_probability == 0.0
        }

        check(prop as fn(Samples) -> bool);
    }
}

@@ -0,0 +1,14 @@
[package]
name = "kstest"
version = "0.1.0"
authors = ["eugen.betke <betke@dkrz.de>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
kolmogorov_smirnov = {path = "../kslib"}
csv = "1.1"
glob = "0.3.0"
serde = { version = "1.0", features = ["derive"] }
chrono = "*"

@@ -0,0 +1,27 @@
#!/bin/bash

#7488914 19865984
#4296426 18672376
#5024292 17944118

#dataset_fn="../../datasets/job_codings_v4_confidential.csv"
#jobids=( )
#jobids=( ${jobids[@]} 19865984 )
#jobids=( ${jobids[@]} 18672376 )
#jobids=( ${jobids[@]} 17944118 )

dataset_fn="../../datasets/job_codings_v4.csv"
jobids=( )
jobids=( ${jobids[@]} 7488914 )
jobids=( ${jobids[@]} 4296426 )
jobids=( ${jobids[@]} 5024292 )

set -x
for jobid in ${jobids[@]}; do
  sim_fn="./ks_similarities_$jobid.csv"
  progress_fn="./ks_progress_$jobid.csv"
  log_fn="./ks_fail_$jobid.log"
  cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn &
done

@@ -0,0 +1,183 @@
extern crate kolmogorov_smirnov as ks;
extern crate csv;
extern crate serde;
extern crate chrono;

use std::fs::File;
use std::io::prelude::*;

use serde::Deserialize;
use serde::Serialize;
use std::collections::HashMap;
use std::env;
use std::io::LineWriter;

pub type Score = u32;
pub type JobCoding = Vec<Score>;
pub type Similarity = f32;

pub type Jobid = u32;
pub type QCodings = HashMap<Jobid, JobCoding>;

#[derive(Debug, Deserialize)]
pub struct Record {
    jobid: u32,
    //q16_coding: String,
    ks_coding: String,
}

#[derive(Debug, Serialize)]
pub struct SimilarityRow {
    pub jobid: u32,
    pub alg_id: u32,
    pub alg_name: String,
    pub similarity: f32,
}

#[derive(Debug, Serialize)]
pub struct ProgressRow {
    jobid: u32,
    alg_id: u32,
    alg_name: String,
    delta: i64,
}

/// Parse a colon-separated coding string into a vector of scores.
pub fn convert_to_coding(coding: String) -> Vec<Score> {
    let split = coding.split(":");
    let vec: Vec<Score> = split
        .filter(|s| !s.is_empty())
        .map(|s| s.parse().unwrap())
        .collect();
    vec
}
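
// Illustrative example of the parsing, not part of the original file:
// convert_to_coding("141:143:142".to_string()) yields vec![141, 143, 142],
// and empty segments are skipped, so "141::142".to_string() yields
// vec![141, 142].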

fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: String, log_fn: String) {
    let mut q_codings: QCodings = HashMap::new();
    let file = File::open(&dataset_fn).expect("Unable to open dataset.");
    let mut rdr = csv::Reader::from_reader(file);

    for result in rdr.deserialize() {
        let record: Record = result.expect("Malformed record in dataset.");
        let q_coding = convert_to_coding(record.ks_coding);
        // Insert non-zero jobs only.
        if q_coding.iter().sum::<Score>() > (0 as Score) {
            q_codings.insert(record.jobid, q_coding);
        }
    }

    let probe = q_codings[&jobid].clone();
    let similarities_file = File::create(&similarities_fn).expect("Unable to open");
    let mut wtr_similarities = csv::Writer::from_writer(&similarities_file);
    let alg_name = "ks";
    let alg_id = 6;

    let progress_file = File::create(&progress_fn).expect("Unable to open");
    let mut wtr_progress = csv::Writer::from_writer(&progress_file);
    let mut start = chrono::Utc::now();
    let mut counter = 1;

    let mut avail_codings: Vec<(u32, &JobCoding)>;

    avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect();
    let mut similarities: Vec<(Jobid, Similarity)> = Vec::new();

    let log_file = File::create(&log_fn).expect("Unable to open");
    let mut log_file = LineWriter::new(log_file);

    while let Some((jobid, q_coding)) = avail_codings.pop() {
        // Periodically record how long the last 10_000 comparisons took.
        if (counter % 10_000) == 0 {
            let stop = chrono::Utc::now();
            let progress_row = ProgressRow {
                jobid: jobid,
                alg_id: alg_id,
                alg_name: String::from(alg_name),
                delta: ((stop - start).num_nanoseconds().unwrap()),
            };
            wtr_progress.serialize(progress_row).unwrap();
            start = stop;
        }

        // A failed test (e.g. no convergence) is logged and treated as
        // maximal similarity.
        let confidence = 0.95;
        let similarity = match ks::test(q_coding, &probe, confidence) {
            Ok(sim) => (1.0 - sim.reject_probability) as Similarity,
            Err(e) => {
                let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
                log_file.write_all(message.as_bytes()).unwrap();
                1.0
            }
        };

        similarities.push((jobid, similarity));
        counter += 1;
    }

    for (jobid, similarity) in similarities.iter() {
        let similarity_row = SimilarityRow {
            jobid: *jobid,
            alg_id: alg_id,
            alg_name: String::from(alg_name),
            similarity: *similarity,
        };
        wtr_similarities.serialize(similarity_row).unwrap();
    }
    log_file.flush().unwrap();
}

fn main() {
    let args: Vec<String> = env::args().collect();
    let dataset_fn = args[1].clone();
    let jobid = args[2].parse::<u32>().unwrap();
    let sim_fn = args[3].clone();
    let progress_fn = args[4].clone();
    let log_fn = args[5].clone();
    println!("{:?}", args);

    run(dataset_fn, jobid, sim_fn, progress_fn, log_fn);
}

#[cfg(test)]
mod tests {
    use kolmogorov_smirnov as ks;

    // Small helper for the smoke test below (the original called an
    // undefined ks_test; this sketch assumes the library's test_f64 at the
    // 0.95 confidence level was intended): runs the two sample test and
    // panics if the test itself errors out.
    fn ks_test(xs: Vec<f64>, ys: Vec<f64>) {
        let result = ks::test_f64(&xs, &ys, 0.95).unwrap();
        println!("statistic = {}, reject probability = {}",
                 result.statistic,
                 result.reject_probability);
    }

    #[test]
    fn test_ks_test() {
        let xs = vec!(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
        let ys = vec!(12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);

        ks_test(xs, ys);

        let c1 = vec![141.0,143.0,142.0,238.0,132.0,486.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0];
        let c2 = vec![239.0,239.0,255.0,255.0,239.0,239.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,502.0,511.0,503.0];
        ks_test(c1, c2);

        let c1 = vec![2.0,2.0,2.0,9.0,3.0,0.0,0.0,0.0];
        let c2 = vec![2.0,2.0,2.0,2.0,8.0,3.0,0.0,10.0];
        ks_test(c1, c2);
    }
}