Kolmogorov-Smirnov-Test
This commit is contained in:
		
							parent
							
								
									88e994f997
								
							
						
					
					
						commit
						8d2ebf0e96
					
				
							
								
								
									
										36
									
								
								tools/kslib/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								tools/kslib/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,36 @@ | ||||
| [package] | ||||
| name = "kolmogorov_smirnov" | ||||
| description = "Implementation of the Kolmogorov-Smirnov statistical test as a Rust library." | ||||
| version = "1.1.0" | ||||
| homepage = "http://daithiocrualaoich.github.io/kolmogorov_smirnov" | ||||
| repository = "https://github.com/daithiocrualaoich/kolmogorov_smirnov" | ||||
| authors = ["Daithi O Crualaoich <daithi.ocrualaoich@gmail.com>"] | ||||
| license = "Apache-2.0" | ||||
| keywords = ["statistics"] | ||||
| exclude = [ | ||||
|   ".dockerignore", | ||||
|   ".gitignore", | ||||
|   ".travis.yml", | ||||
|   "Cargo.lock", | ||||
|   "Dockerfile", | ||||
|   "dat/*", | ||||
|   "doc/*", | ||||
| ] | ||||
| 
 | ||||
| [[bin]] | ||||
| name = "critical_values" | ||||
| 
 | ||||
| [[bin]] | ||||
| name = "normal" | ||||
| 
 | ||||
| [[bin]] | ||||
| name = "ks_f64" | ||||
| 
 | ||||
| [[bin]] | ||||
| name = "ks_i64" | ||||
| 
 | ||||
| [dependencies] | ||||
| rand = "0.3.12" | ||||
| 
 | ||||
| [dev-dependencies] | ||||
| quickcheck = "0.2" | ||||
							
								
								
									
										39
									
								
								tools/kslib/src/bin/critical_values.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								tools/kslib/src/bin/critical_values.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,39 @@ | ||||
| extern crate kolmogorov_smirnov as ks; | ||||
| 
 | ||||
| use ks::calculate_critical_value; | ||||
| 
 | ||||
| use std::env; | ||||
| 
 | ||||
| /// Calculate critical values dataset.
 | ||||
| ///
 | ||||
| /// # Examples
 | ||||
| ///
 | ||||
| /// ```bash
 | ||||
| /// cargo run --bin critical_values <confidence> <num_samples> <limit>
 | ||||
| /// ```
 | ||||
| ///
 | ||||
| /// This will print the critical values of the Kolmogorov-Smirnov two sample
 | ||||
| /// test for samples of size `<num_samples>` against samples of sizes 16
 | ||||
| /// through `<limit>` inclusive at the specified confidence level.
 | ||||
| ///
 | ||||
| /// `<num_samples>` and `<limit>` must be positive integers, `<confidence>` must
 | ||||
| /// be a floating point number strictly between zero and one.
 | ||||
| fn main() { | ||||
|     let args: Vec<String> = env::args().collect(); | ||||
| 
 | ||||
|     let confidence: f64 = args[1].parse().expect("<confidence> must be a floating point number."); | ||||
|     let n1: usize = args[2].parse().expect("<num_samples> must be an integer."); | ||||
|     let limit: usize = args[3].parse().expect("<limit> must be an integer."); | ||||
| 
 | ||||
|     assert!(n1 > 0 && limit > 0); | ||||
|     assert!(0.0 < confidence && confidence < 1.0); | ||||
| 
 | ||||
|     println!("n1\tn2\tconfidence\tcritical_value"); | ||||
|     for n2 in 16..(limit + 1) { | ||||
|         println!("{}\t{}\t{}\t{}", | ||||
|                  n1, | ||||
|                  n2, | ||||
|                  confidence, | ||||
|                  calculate_critical_value(n1, n2, confidence).unwrap()); | ||||
|     } | ||||
| } | ||||
							
								
								
									
										52
									
								
								tools/kslib/src/bin/ks_f64.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								tools/kslib/src/bin/ks_f64.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,52 @@ | ||||
| extern crate kolmogorov_smirnov as ks; | ||||
| 
 | ||||
| use ks::test_f64; | ||||
| 
 | ||||
| use std::env; | ||||
| use std::io::{BufReader, BufRead}; | ||||
| use std::fs::File; | ||||
| use std::path::Path; | ||||
| 
 | ||||
/// Parse one input line as an `f64`, panicking on malformed data.
fn parse_float(s: String) -> f64 {
    let parsed: f64 = s.parse().expect("Not a floating point number.");
    parsed
}
| 
 | ||||
| /// Runs a Kolmogorov-Smirnov test on floating point data files.
 | ||||
| ///
 | ||||
| /// Input files must be single-column headerless data files. The data samples
 | ||||
| /// are tested against each other at the 0.95 confidence level.
 | ||||
| ///
 | ||||
| /// # Examples
 | ||||
| ///
 | ||||
| /// ```bash
 | ||||
| /// cargo run --bin ks_f64 <file1> <file2>
 | ||||
| /// ```
 | ||||
| ///
 | ||||
| /// This will print the test result to standard output.
 | ||||
| fn main() { | ||||
|     let args: Vec<String> = env::args().collect(); | ||||
| 
 | ||||
|     let path1 = Path::new(&args[1]); | ||||
|     let path2 = Path::new(&args[2]); | ||||
| 
 | ||||
|     let file1 = BufReader::new(File::open(&path1).unwrap()); | ||||
|     let file2 = BufReader::new(File::open(&path2).unwrap()); | ||||
| 
 | ||||
|     let lines1 = file1.lines().map(|line| line.unwrap()); | ||||
|     let lines2 = file2.lines().map(|line| line.unwrap()); | ||||
| 
 | ||||
|     let xs: Vec<f64> = lines1.map(parse_float).collect(); | ||||
|     let ys: Vec<f64> = lines2.map(parse_float).collect(); | ||||
| 
 | ||||
|     let result = ks::test_f64(&xs, &ys, 0.95).unwrap(); | ||||
| 
 | ||||
|     if result.is_rejected { | ||||
|         println!("Samples are from different distributions."); | ||||
|     } else { | ||||
|         println!("Samples are from the same distribution."); | ||||
|     } | ||||
| 
 | ||||
|     println!("test statistic = {}", result.statistic); | ||||
|     println!("critical value = {}", result.critical_value); | ||||
|     println!("reject probability = {}", result.reject_probability); | ||||
| } | ||||
							
								
								
									
										52
									
								
								tools/kslib/src/bin/ks_i64.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								tools/kslib/src/bin/ks_i64.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,52 @@ | ||||
| extern crate kolmogorov_smirnov as ks; | ||||
| 
 | ||||
| use ks::test; | ||||
| 
 | ||||
| use std::env; | ||||
| use std::io::{BufReader, BufRead}; | ||||
| use std::fs::File; | ||||
| use std::path::Path; | ||||
| 
 | ||||
/// Parse one input line as an `i64`, panicking on malformed data.
fn parse_int(s: String) -> i64 {
    let parsed: i64 = s.parse().expect("Not an integer.");
    parsed
}
| 
 | ||||
| /// Runs a Kolmogorov-Smirnov test on integer data files.
 | ||||
| ///
 | ||||
| /// Input files must be single-column headerless data files. The data samples
 | ||||
| /// are tested against each other at the 0.95 confidence level.
 | ||||
| ///
 | ||||
| /// # Examples
 | ||||
| ///
 | ||||
| /// ```bash
 | ||||
| /// cargo run --bin ks_i64 <file1> <file2>
 | ||||
| /// ```
 | ||||
| ///
 | ||||
| /// This will print the test result to standard output.
 | ||||
| fn main() { | ||||
|     let args: Vec<String> = env::args().collect(); | ||||
| 
 | ||||
|     let path1 = Path::new(&args[1]); | ||||
|     let path2 = Path::new(&args[2]); | ||||
| 
 | ||||
|     let file1 = BufReader::new(File::open(&path1).unwrap()); | ||||
|     let file2 = BufReader::new(File::open(&path2).unwrap()); | ||||
| 
 | ||||
|     let lines1 = file1.lines().map(|line| line.unwrap()); | ||||
|     let lines2 = file2.lines().map(|line| line.unwrap()); | ||||
| 
 | ||||
|     let xs: Vec<i64> = lines1.map(parse_int).collect(); | ||||
|     let ys: Vec<i64> = lines2.map(parse_int).collect(); | ||||
| 
 | ||||
|     let result = ks::test(&xs, &ys, 0.95).unwrap(); | ||||
| 
 | ||||
|     if result.is_rejected { | ||||
|         println!("Samples are from different distributions."); | ||||
|     } else { | ||||
|         println!("Samples are from the same distribution."); | ||||
|     } | ||||
| 
 | ||||
|     println!("test statistic = {}", result.statistic); | ||||
|     println!("critical value = {}", result.critical_value); | ||||
|     println!("reject probability = {}", result.reject_probability); | ||||
| } | ||||
							
								
								
									
										35
									
								
								tools/kslib/src/bin/normal.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								tools/kslib/src/bin/normal.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,35 @@ | ||||
| extern crate rand; | ||||
| 
 | ||||
| use std::env; | ||||
| use rand::distributions::{Normal, IndependentSample}; | ||||
| 
 | ||||
| /// Prints a sequence of Normal deviates.
 | ||||
| ///
 | ||||
| /// # Examples
 | ||||
| ///
 | ||||
| /// ```bash
 | ||||
| /// cargo run --bin normal <num_deviates> <mean> <variance>
 | ||||
| /// ```
 | ||||
| /// This will print `<num_deviates>` float point numbers, one per line, to
 | ||||
| /// standard output. These numbers will have a Normal distribution with the
 | ||||
| /// specified mean and variance.
 | ||||
| ///
 | ||||
| /// `<num_deviates>` must be a positive integer, `<mean>` and `<variance>` may
 | ||||
| /// be integers or floating point numbers but `<variance>` must be positive.
 | ||||
| fn main() { | ||||
|     let args: Vec<String> = env::args().collect(); | ||||
| 
 | ||||
|     let n: u32 = args[1].parse().expect("<num_deviates> must be an integer."); | ||||
|     let mean: f64 = args[2].parse().expect("<mean> must be a floating point number."); | ||||
|     let variance: f64 = args[3].parse().expect("<variance> must be a floating point number."); | ||||
| 
 | ||||
|     assert!(n > 0 && variance.is_sign_positive()); | ||||
| 
 | ||||
|     let mut rng = rand::thread_rng(); | ||||
|     let normal = Normal::new(mean, variance.sqrt()); | ||||
| 
 | ||||
|     for _ in 0..n { | ||||
|         let x = normal.ind_sample(&mut rng); | ||||
|         println!("{}", x) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										853
									
								
								tools/kslib/src/ecdf.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										853
									
								
								tools/kslib/src/ecdf.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,853 @@ | ||||
| //! Empirical cumulative distribution function.
 | ||||
| 
 | ||||
/// Multiple-use representation of an empirical cumulative distribution
/// function, backed by a sorted copy of the sample.
pub struct Ecdf<T: Ord> {
    samples: Vec<T>,
    length: usize,
}

impl<T: Ord + Clone> Ecdf<T> {
    /// Construct a new representation of a cumulative distribution function for
    /// a given sample.
    ///
    /// The construction will involve computing a sorted clone of the given sample
    /// and may be inefficient or completely prohibitive for large samples. This
    /// computation is amortized significantly if there is heavy use of the value
    /// function.
    ///
    /// # Panics
    ///
    /// The sample set must be non-empty.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// ```
    pub fn new(samples: &[T]) -> Ecdf<T> {
        let length = samples.len();
        assert!(length > 0);

        // Keep a sorted copy so `value` can binary search the sample.
        let mut ordered = samples.to_vec();
        ordered.sort();

        Ecdf {
            samples: ordered,
            length: length,
        }
    }

    /// Calculate a value of the empirical cumulative distribution function for
    /// a given sample.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.value(4), 0.5);
    /// ```
    pub fn value(&self, t: T) -> f64 {
        // Count of samples <= t, derived from the binary search outcome.
        let leq_count = match self.samples.binary_search(&t) {
            Ok(found) => {
                // Equal samples may repeat and binary search gives an arbitrary
                // match; advance to the last index equal to t, then turn the
                // 0-based index into a count.
                let mut last = found;
                while last + 1 < self.length && self.samples[last + 1] == t {
                    last += 1;
                }

                last + 1
            }
            Err(insertion) => {
                // `insertion` is where t would be placed to keep the samples
                // sorted, so exactly that many samples are strictly less than
                // t (and none equal it).
                insertion
            }
        };

        leq_count as f64 / self.length as f64
    }

    /// Calculate a percentile for the sample using the Nearest Rank method.
    ///
    /// # Panics
    ///
    /// The percentile requested must be between 1 and 100 inclusive. In
    /// particular, there is no 0-percentile.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.percentile(50), 4);
    /// ```
    pub fn percentile(&self, p: u8) -> T {
        assert!(0 < p && p <= 100);

        // Nearest Rank: the 1-based rank is the ceiling of p% of the count.
        let rank = (p as f64 * self.length as f64 / 100.0).ceil() as usize;
        self.samples[rank - 1].clone()
    }

    /// Return the minimal element of the samples.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.min(), 0);
    /// ```
    pub fn min(&self) -> T {
        // Samples are stored sorted, so the minimum is the first element.
        self.samples[0].clone()
    }

    /// Return the maximal element of the samples.
    ///
    /// # Examples
    ///
    /// ```
    /// extern crate kolmogorov_smirnov as ks;
    ///
    /// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    /// let ecdf = ks::Ecdf::new(&samples);
    /// assert_eq!(ecdf.max(), 9);
    /// ```
    pub fn max(&self) -> T {
        // Samples are stored sorted, so the maximum is the last element.
        self.samples[self.length - 1].clone()
    }
}
| 
 | ||||
/// Calculate a one-time value of the empirical cumulative distribution function
/// for a given sample.
///
/// Computational running time of this function is O(n) but does not amortize
/// across multiple calls like Ecdf<T>::value. This function should only be
/// used in the case that a small number of ECDF values are required for the
/// sample. Otherwise, Ecdf::new should be used to create a structure that
/// takes the upfront O(n log n) sort cost but calculates values in O(log n).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let value = ks::ecdf(&samples, 4);
/// assert_eq!(value, 0.5);
/// ```
pub fn ecdf<T: Ord>(samples: &[T], t: T) -> f64 {
    let length = samples.len();
    assert!(length > 0);

    // Single O(n) pass: count the samples at or below t.
    let num_samples_leq_t = samples.iter().filter(|sample| **sample <= t).count();

    num_samples_leq_t as f64 / length as f64
}
| 
 | ||||
/// Calculate a one-time percentile for a given sample using the Nearest Rank
/// method and Quick Select.
///
/// Computational running time of this function is O(n) but does not amortize
/// across multiple calls like Ecdf<T>::percentile. This function should only be
/// used in the case that a small number of percentiles are required for the
/// sample. Otherwise, Ecdf::new should be used to create a structure that
/// takes the upfront O(n log n) sort cost but calculates percentiles in O(1).
///
/// # Panics
///
/// The sample set must be non-empty.
///
/// The percentile requested must be between 1 and 100 inclusive. In particular,
/// there is no 0-percentile.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let samples = vec!(9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let percentile = ks::percentile(&samples, 50);
/// assert_eq!(percentile, 4);
/// ```
pub fn percentile<T: Ord + Clone>(samples: &[T], p: u8) -> T {
    assert!(0 < p && p <= 100);

    let length = samples.len();
    assert!(length > 0);

    // Nearest Rank: the 1-based rank is the ceiling of p% of the count.
    let rank = (p as f64 * length as f64 / 100.0).ceil() as usize;

    // Quick Select the element at `rank` within a scratch copy. `low..high` is
    // the half-open window still known to contain the rank item.
    let mut work: Vec<T> = samples.to_vec();
    let mut low = 0;
    let mut high = length;

    loop {
        assert!(low < high);

        let pivot = work[low].clone();

        // A single-element window is fully resolved.
        if low >= high - 1 {
            return pivot;
        }

        // First determine whether the rank item is less than the pivot:
        // partition so every item strictly less than the pivot sits to the
        // left; afterwards `lo` is the count of such items.
        let mut lo = low;
        let mut hi = high - 1;

        while lo < hi {
            while lo < hi && work[lo] < pivot {
                lo += 1;
            }
            while lo < hi && work[hi] >= pivot {
                hi -= 1;
            }

            if lo < hi {
                work.swap(lo, hi);
            }
        }

        if rank <= lo {
            // Rank item is less than the pivot: drop the pivot and everything
            // not smaller than it.
            high = lo;
        } else {
            // Rank item is the pivot or in the larger half: drop the smaller
            // items.
            low = lo;

            // Second pass: decide whether the pivot itself is the rank item.
            // Move all pivot-equal items leftwards; afterwards `eq_lo` counts
            // the items less than or equal to the pivot.
            let mut eq_lo = low;
            let mut eq_hi = high - 1;

            while eq_lo < eq_hi {
                while eq_lo < eq_hi && work[eq_lo] == pivot {
                    eq_lo += 1;
                }
                while eq_lo < eq_hi && work[eq_hi] != pivot {
                    eq_hi -= 1;
                }

                if eq_lo < eq_hi {
                    work.swap(eq_lo, eq_hi);
                }
            }

            // The pivot occupies ranks (lo, eq_lo]; if the requested rank
            // falls there, the pivot is the answer.
            if rank <= eq_lo {
                return pivot;
            }

            // Rank item is strictly greater than the pivot: drop the pivot and
            // everything equal to or smaller than it.
            low = eq_lo;
        }
    }
}
| 
 | ||||
| 
 | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     extern crate quickcheck; | ||||
|     extern crate rand; | ||||
| 
 | ||||
|     use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, TestResult, StdGen}; | ||||
|     use std::cmp; | ||||
|     use std::usize; | ||||
|     use super::{Ecdf, ecdf, percentile}; | ||||
| 
 | ||||
|     fn check<A: Testable>(f: A) { | ||||
|         let g = StdGen::new(rand::thread_rng(), usize::MAX); | ||||
|         QuickCheck::new().gen(g).quickcheck(f); | ||||
|     } | ||||
| 
 | ||||
|     /// Wrapper for generating sample data with QuickCheck.
 | ||||
|     ///
 | ||||
|     /// Samples must be non-empty sequences of u64 values.
 | ||||
|     #[derive(Debug, Clone)] | ||||
|     struct Samples { | ||||
|         vec: Vec<u64>, | ||||
|     } | ||||
| 
 | ||||
|     impl Arbitrary for Samples { | ||||
|         fn arbitrary<G: Gen>(g: &mut G) -> Samples { | ||||
|             // Limit size of generated sample set to 1024
 | ||||
|             let max = cmp::min(g.size(), 1024); | ||||
| 
 | ||||
|             let size = g.gen_range(1, max); | ||||
|             let vec = (0..size).map(|_| u64::arbitrary(g)).collect(); | ||||
| 
 | ||||
|             Samples { vec: vec } | ||||
|         } | ||||
| 
 | ||||
|         fn shrink(&self) -> Box<Iterator<Item = Samples>> { | ||||
|             let vec: Vec<u64> = self.vec.clone(); | ||||
|             let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink(); | ||||
| 
 | ||||
|             Box::new(shrunk.filter(|v| v.len() > 0).map(|v| Samples { vec: v })) | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     /// Wrapper for generating percentile query value data with QuickCheck.
 | ||||
|     ///
 | ||||
|     /// Percentile must be u8 between 1 and 100 inclusive.
 | ||||
|     #[derive(Debug, Clone)] | ||||
|     struct Percentile { | ||||
|         val: u8, | ||||
|     } | ||||
| 
 | ||||
|     impl Arbitrary for Percentile { | ||||
|         fn arbitrary<G: Gen>(g: &mut G) -> Percentile { | ||||
|             let val = g.gen_range(1, 101) as u8; | ||||
| 
 | ||||
|             Percentile { val: val } | ||||
|         } | ||||
| 
 | ||||
|         fn shrink(&self) -> Box<Iterator<Item = Percentile>> { | ||||
|             let shrunk: Box<Iterator<Item = u8>> = self.val.shrink(); | ||||
| 
 | ||||
|             Box::new(shrunk.filter(|&v| 0u8 < v && v <= 100u8).map(|v| Percentile { val: v })) | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     #[should_panic(expected="assertion failed: length > 0")] | ||||
|     fn single_use_ecdf_panics_on_empty_samples_set() { | ||||
|         let xs: Vec<u64> = vec![]; | ||||
|         ecdf(&xs, 0); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     #[should_panic(expected="assertion failed: length > 0")] | ||||
|     fn multiple_use_ecdf_panics_on_empty_samples_set() { | ||||
|         let xs: Vec<u64> = vec![]; | ||||
|         Ecdf::new(&xs); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn single_use_ecdf_between_zero_and_one() { | ||||
|         fn prop(xs: Samples, val: u64) -> bool { | ||||
|             let actual = ecdf(&xs.vec, val); | ||||
| 
 | ||||
|             0.0 <= actual && actual <= 1.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u64) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn multiple_use_ecdf_between_zero_and_one() { | ||||
|         fn prop(xs: Samples, val: u64) -> bool { | ||||
|             let ecdf = Ecdf::new(&xs.vec); | ||||
|             let actual = ecdf.value(val); | ||||
| 
 | ||||
|             0.0 <= actual && actual <= 1.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u64) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn single_use_ecdf_is_an_increasing_function() { | ||||
|         fn prop(xs: Samples, val: u64) -> bool { | ||||
|             let actual = ecdf(&xs.vec, val); | ||||
| 
 | ||||
|             ecdf(&xs.vec, val - 1) <= actual && actual <= ecdf(&xs.vec, val + 1) | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u64) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn multiple_use_ecdf_is_an_increasing_function() { | ||||
|         fn prop(xs: Samples, val: u64) -> bool { | ||||
|             let ecdf = Ecdf::new(&xs.vec); | ||||
|             let actual = ecdf.value(val); | ||||
| 
 | ||||
|             ecdf.value(val - 1) <= actual && actual <= ecdf.value(val + 1) | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u64) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn single_use_ecdf_sample_min_minus_one_is_zero() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let &min = xs.vec.iter().min().unwrap(); | ||||
| 
 | ||||
|             ecdf(&xs.vec, min - 1) == 0.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn multiple_use_ecdf_sample_min_minus_one_is_zero() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let &min = xs.vec.iter().min().unwrap(); | ||||
|             let ecdf = Ecdf::new(&xs.vec); | ||||
| 
 | ||||
|             ecdf.value(min - 1) == 0.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn single_use_ecdf_sample_max_is_one() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let &max = xs.vec.iter().max().unwrap(); | ||||
| 
 | ||||
|             ecdf(&xs.vec, max) == 1.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn multiple_use_ecdf_sample_max_is_one() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let &max = xs.vec.iter().max().unwrap(); | ||||
|             let ecdf = Ecdf::new(&xs.vec); | ||||
| 
 | ||||
|             ecdf.value(max) == 1.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn single_use_ecdf_sample_val_is_num_samples_leq_val_div_length() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let &val = xs.vec.first().unwrap(); | ||||
|             let num_samples = xs.vec | ||||
|                                 .iter() | ||||
|                                 .filter(|&&x| x <= val) | ||||
|                                 .count(); | ||||
|             let expected = num_samples as f64 / xs.vec.len() as f64; | ||||
| 
 | ||||
|             ecdf(&xs.vec, val) == expected | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn multiple_use_ecdf_sample_val_is_num_samples_leq_val_div_length() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let &val = xs.vec.first().unwrap(); | ||||
|             let num_samples = xs.vec | ||||
|                                 .iter() | ||||
|                                 .filter(|&&x| x <= val) | ||||
|                                 .count(); | ||||
|             let expected = num_samples as f64 / xs.vec.len() as f64; | ||||
| 
 | ||||
|             let ecdf = Ecdf::new(&xs.vec); | ||||
| 
 | ||||
|             ecdf.value(val) == expected | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn single_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() { | ||||
|         fn prop(xs: Samples, val: u64) -> TestResult { | ||||
|             let length = xs.vec.len(); | ||||
| 
 | ||||
|             if xs.vec.iter().any(|&x| x == val) { | ||||
|                 // Discard Vec containing val.
 | ||||
|                 return TestResult::discard(); | ||||
|             } | ||||
| 
 | ||||
|             let num_samples = xs.vec | ||||
|                                 .iter() | ||||
|                                 .filter(|&&x| x <= val) | ||||
|                                 .count(); | ||||
|             let expected = num_samples as f64 / length as f64; | ||||
| 
 | ||||
|             let actual = ecdf(&xs.vec, val); | ||||
| 
 | ||||
|             TestResult::from_bool(actual == expected) | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u64) -> TestResult); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn multiple_use_ecdf_non_sample_val_is_num_samples_leq_val_div_length() { | ||||
|         fn prop(xs: Samples, val: u64) -> TestResult { | ||||
|             let length = xs.vec.len(); | ||||
| 
 | ||||
|             if xs.vec.iter().any(|&x| x == val) { | ||||
|                 // Discard Vec containing val.
 | ||||
|                 return TestResult::discard(); | ||||
|             } | ||||
| 
 | ||||
|             let num_samples = xs.vec | ||||
|                                 .iter() | ||||
|                                 .filter(|&&x| x <= val) | ||||
|                                 .count(); | ||||
|             let expected = num_samples as f64 / length as f64; | ||||
| 
 | ||||
|             let ecdf = Ecdf::new(&xs.vec); | ||||
| 
 | ||||
|             TestResult::from_bool(ecdf.value(val) == expected) | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u64) -> TestResult); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn single_and_multiple_use_ecdf_agree() { | ||||
|         fn prop(xs: Samples, val: u64) -> bool { | ||||
|             let multiple_use = Ecdf::new(&xs.vec); | ||||
| 
 | ||||
|             multiple_use.value(val) == ecdf(&xs.vec, val) | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u64) -> bool); | ||||
|     } | ||||
| 
 | ||||
    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn single_use_percentiles_panics_on_zero_percentile() {
        // The 0-percentile is undefined, so percentile() must assert on it.
        let xs: Vec<u64> = vec![0];

        percentile(&xs, 0);
    }
| 
 | ||||
    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn single_use_percentiles_panics_on_101_percentile() {
        // Percentiles above 100 are out of range, so percentile() must assert.
        let xs: Vec<u64> = vec![0];

        percentile(&xs, 101);
    }
| 
 | ||||
    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn multiple_use_percentiles_panics_on_zero_percentile() {
        // The precomputed Ecdf must enforce the same 1..=100 percentile range.
        let xs: Vec<u64> = vec![0];
        let ecdf = Ecdf::new(&xs);

        ecdf.percentile(0);
    }
| 
 | ||||
    #[test]
    #[should_panic(expected="assertion failed: 0 < p && p <= 100")]
    fn multiple_use_percentiles_panics_on_101_percentile() {
        // The precomputed Ecdf must reject percentiles above 100 as well.
        let xs: Vec<u64> = vec![0];
        let ecdf = Ecdf::new(&xs);

        ecdf.percentile(101);
    }
| 
 | ||||
|     #[test] | ||||
|     fn single_use_percentile_between_samples_min_and_max() { | ||||
|         fn prop(xs: Samples, p: Percentile) -> bool { | ||||
|             let &min = xs.vec.iter().min().unwrap(); | ||||
|             let &max = xs.vec.iter().max().unwrap(); | ||||
| 
 | ||||
|             let actual = percentile(&xs.vec, p.val); | ||||
| 
 | ||||
|             min <= actual && actual <= max | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, Percentile) -> bool); | ||||
|     } | ||||
| 
 | ||||
    #[test]
    fn single_use_percentile_is_an_increasing_function() {
        // Property: percentile is monotonically non-decreasing in p. The
        // neighbouring percentiles are clamped into the valid range [1, 100].
        fn prop(xs: Samples, p: Percentile) -> bool {
            let smaller = cmp::max(p.val - 1, 1);
            let larger = cmp::min(p.val + 1, 100);

            let actual = percentile(&xs.vec, p.val);

            percentile(&xs.vec, smaller) <= actual && actual <= percentile(&xs.vec, larger)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }
| 
 | ||||
|     #[test] | ||||
|     fn single_use_percentile_100_is_sample_max() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let &max = xs.vec.iter().max().unwrap(); | ||||
| 
 | ||||
|             percentile(&xs.vec, 100) == max | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
    #[test]
    fn multiple_use_percentile_between_samples_min_and_max() {
        // Property: the precomputed Ecdf's percentile also stays within the
        // sample's [min, max] range.
        fn prop(xs: Samples, p: Percentile) -> bool {
            let &min = xs.vec.iter().min().unwrap();
            let &max = xs.vec.iter().max().unwrap();

            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            min <= actual && actual <= max
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }
| 
 | ||||
    #[test]
    fn multiple_use_percentile_is_an_increasing_function() {
        // Property: Ecdf::percentile is monotonically non-decreasing in p.
        // Neighbouring percentiles are clamped into the valid range [1, 100].
        fn prop(xs: Samples, p: Percentile) -> bool {
            let smaller = cmp::max(p.val - 1, 1);
            let larger = cmp::min(p.val + 1, 100);

            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            ecdf.percentile(smaller) <= actual && actual <= ecdf.percentile(larger)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }
| 
 | ||||
|     #[test] | ||||
|     fn multiple_use_percentile_100_is_sample_max() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let &max = xs.vec.iter().max().unwrap(); | ||||
|             let ecdf = Ecdf::new(&xs.vec); | ||||
| 
 | ||||
|             ecdf.percentile(100) == max | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
    #[test]
    fn single_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
        // Property: round-tripping ECDF -> percentile can only undershoot the
        // original query value, never overshoot it.
        fn prop(xs: Samples, val: u64) -> TestResult {
            let actual = ecdf(&xs.vec, val);

            // Truncate the ECDF fraction to a whole percentile in 0..=100.
            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let single_use = percentile(&xs.vec, p);
                    TestResult::from_bool(single_use <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }
| 
 | ||||
    #[test]
    fn single_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
        // Property: one-shot ECDF followed by the precomputed Ecdf's
        // percentile can only undershoot the original query value.
        fn prop(xs: Samples, val: u64) -> TestResult {
            let actual = ecdf(&xs.vec, val);

            // Truncate the ECDF fraction to a whole percentile in 0..=100.
            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let multiple_use = Ecdf::new(&xs.vec);
                    TestResult::from_bool(multiple_use.percentile(p) <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }
| 
 | ||||
    #[test]
    fn multiple_use_ecdf_followed_by_single_use_percentile_is_leq_original_value() {
        // Property: precomputed Ecdf followed by one-shot percentile can only
        // undershoot the original query value.
        fn prop(xs: Samples, val: u64) -> TestResult {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            // Truncate the ECDF fraction to a whole percentile in 0..=100.
            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    let single_use = percentile(&xs.vec, p);
                    TestResult::from_bool(single_use <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }
| 
 | ||||
    #[test]
    fn multiple_use_ecdf_followed_by_multiple_use_percentile_is_leq_original_value() {
        // Property: round-tripping value() -> percentile() on one Ecdf can
        // only undershoot the original query value.
        fn prop(xs: Samples, val: u64) -> TestResult {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.value(val);

            // Truncate the ECDF fraction to a whole percentile in 0..=100.
            let p = (actual * 100.0).floor() as u8;

            match p {
                0 => {
                    // val is below the first percentile threshold. Can't
                    // calculate 0-percentile value so discard.
                    TestResult::discard()
                }
                _ => {
                    // Not equal because e.g. all percentiles of [0] are 0. So
                    // value of 1 gives ecdf == 1.0 and percentile(100) == 0.
                    TestResult::from_bool(ecdf.percentile(p) <= val)
                }
            }
        }

        check(prop as fn(Samples, u64) -> TestResult);
    }
| 
 | ||||
    #[test]
    fn single_use_percentile_followed_by_single_use_edf_is_geq_original_value() {
        // Property: round-tripping percentile -> ECDF can only overshoot the
        // original fraction p/100.
        fn prop(xs: Samples, p: Percentile) -> bool {
            let actual = percentile(&xs.vec, p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }
| 
 | ||||
|     #[test] | ||||
|     fn single_use_percentile_followed_by_multiple_use_edf_is_geq_original_value() { | ||||
|         fn prop(xs: Samples, p: Percentile) -> bool { | ||||
|             let actual = percentile(&xs.vec, p.val); | ||||
| 
 | ||||
|             let ecdf = Ecdf::new(&xs.vec); | ||||
| 
 | ||||
|             // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
 | ||||
|             // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
 | ||||
|             p.val as f64 / 100.0 <= ecdf.value(actual) | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, Percentile) -> bool); | ||||
|     } | ||||
| 
 | ||||
    #[test]
    fn multiple_use_percentile_followed_by_single_use_edf_is_geq_original_value() {
        // Property: precomputed percentile followed by one-shot ECDF can only
        // overshoot the original fraction p/100.
        fn prop(xs: Samples, p: Percentile) -> bool {
            let multiple_use = Ecdf::new(&xs.vec);
            let actual = multiple_use.percentile(p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf(&xs.vec, actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }
| 
 | ||||
    #[test]
    fn multiple_use_percentile_followed_by_multiple_use_edf_is_geq_original_value() {
        // Property: round-tripping percentile() -> value() on one Ecdf can
        // only overshoot the original fraction p/100.
        fn prop(xs: Samples, p: Percentile) -> bool {
            let ecdf = Ecdf::new(&xs.vec);
            let actual = ecdf.percentile(p.val);

            // Not equal because e.g. 1- through 50-percentiles of [0, 1] are 0.
            // So original value of 1 gives percentile == 0 and ecdf(0) == 0.5.
            p.val as f64 / 100.0 <= ecdf.value(actual)
        }

        check(prop as fn(Samples, Percentile) -> bool);
    }
| 
 | ||||
|     #[test] | ||||
|     fn single_and_multiple_use_percentile_agree() { | ||||
|         fn prop(xs: Samples, p: Percentile) -> bool { | ||||
|             let multiple_use = Ecdf::new(&xs.vec); | ||||
| 
 | ||||
|             multiple_use.percentile(p.val) == percentile(&xs.vec, p.val) | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, Percentile) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn min_is_leq_all_samples() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let multiple_use = Ecdf::new(&xs.vec); | ||||
|             let actual = multiple_use.min(); | ||||
| 
 | ||||
|             xs.vec.iter().all(|&x| actual <= x) | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn max_is_geq_all_samples() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let multiple_use = Ecdf::new(&xs.vec); | ||||
|             let actual = multiple_use.max(); | ||||
| 
 | ||||
|             xs.vec.iter().all(|&x| actual >= x) | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| } | ||||
							
								
								
									
										5
									
								
								tools/kslib/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								tools/kslib/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,5 @@ | ||||
| pub mod ecdf; | ||||
| pub mod test; | ||||
| 
 | ||||
| pub use test::{test, test_f64, calculate_critical_value}; | ||||
| pub use ecdf::{Ecdf, ecdf, percentile}; | ||||
							
								
								
									
										699
									
								
								tools/kslib/src/test.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										699
									
								
								tools/kslib/src/test.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,699 @@ | ||||
| //! Two Sample Kolmogorov-Smirnov Test
 | ||||
| 
 | ||||
| use std::cmp::{min, Ord, Ordering}; | ||||
| 
 | ||||
/// Two sample test result.
pub struct TestResult {
    /// True when the null hypothesis (both samples drawn from the same
    /// distribution) is rejected at the requested confidence level.
    pub is_rejected: bool,
    /// The Kolmogorov-Smirnov D statistic: the maximum vertical distance
    /// between the ECDFs of the two samples.
    pub statistic: f64,
    /// The computed probability supporting rejection of the null hypothesis.
    pub reject_probability: f64,
    /// The smallest statistic value that would trigger rejection at this
    /// confidence level.
    pub critical_value: f64,
    /// The confidence level the test was run at.
    pub confidence: f64,
}
| 
 | ||||
/// Perform a two sample Kolmogorov-Smirnov test on given samples.
///
/// The samples must have length > 7 elements for the test to be valid.
///
/// # Errors
///
/// Returns `Err` if the numerical routines fail to converge while computing
/// the critical value or the reject probability.
///
/// # Panics
///
/// There are assertion panics if either sequence has <= 7 elements, or if
/// confidence is not strictly between zero and one.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let xs = vec!(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
/// let ys = vec!(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
/// let confidence = 0.95;
///
/// let result = ks::test(&xs, &ys, confidence).unwrap();
///
/// if result.is_rejected {
///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
///       xs, ys, result.reject_probability);
/// }
/// ```
pub fn test<T: Ord + Clone>(xs: &[T], ys: &[T], confidence: f64) -> Result<TestResult, String> {
    assert!(xs.len() > 0 && ys.len() > 0);
    assert!(0.0 < confidence && confidence < 1.0);

    // Only supports samples of size > 7.
    assert!(xs.len() > 7 && ys.len() > 7);

    let statistic = calculate_statistic(xs, ys);
    let critical_value = calculate_critical_value(xs.len(), ys.len(), confidence)?;

    let reject_probability = calculate_reject_probability(statistic, xs.len(), ys.len())?;
    let is_rejected = reject_probability > confidence;

    Ok(TestResult {
        is_rejected: is_rejected,
        statistic: statistic,
        reject_probability: reject_probability,
        critical_value: critical_value,
        confidence: confidence,
    })
}
| 
 | ||||
/// Wrapper type for f64 to implement Ord and make usable with test.
///
/// All comparisons delegate to the wrapped f64. Total ordering is only
/// sound because `cmp` panics on incomparable values such as NaN; callers
/// must not construct OrderableF64 from NaN.
#[derive(PartialEq, Clone)]
struct OrderableF64 {
    val: f64,
}

impl OrderableF64 {
    /// Wrap an f64 value for ordered comparison.
    fn new(val: f64) -> OrderableF64 {
        OrderableF64 { val: val }
    }
}

impl Eq for OrderableF64 {}

impl PartialOrd for OrderableF64 {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // Returns None for incomparable values (NaN), mirroring f64.
        self.val.partial_cmp(&other.val)
    }
}

impl Ord for OrderableF64 {
    fn cmp(&self, other: &Self) -> Ordering {
        // Panics on NaN: the unwrap is the documented contract of this
        // wrapper (see test_f64's Panics section).
        self.val.partial_cmp(&other.val).unwrap()
    }
}
| 
 | ||||
/// Perform a two sample Kolmogorov-Smirnov test on given f64 samples.
///
/// This is necessary because f64 does not implement Ord in Rust as some
/// elements are incomparable, e.g. NaN. This function wraps the f64s in
/// implementation of Ord which panics on incomparable elements.
///
/// The samples must have length > 7 elements for the test to be valid.
///
/// # Errors
///
/// Returns `Err` if the numerical routines fail to converge while computing
/// the critical value or the reject probability.
///
/// # Panics
///
/// There are assertion panics if either sequence has <= 7 elements.
///
/// If any of the f64 elements in the input samples are unorderable, e.g. NaN.
///
/// # Examples
///
/// ```
/// extern crate kolmogorov_smirnov as ks;
///
/// let xs = vec!(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0);
/// let ys = vec!(12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
/// let confidence = 0.95;
///
/// let result = ks::test_f64(&xs, &ys, confidence).unwrap();
///
/// if result.is_rejected {
///     println!("{:?} and {:?} are not from the same distribution with probability {}.",
///       xs, ys, result.reject_probability);
/// }
/// ```
pub fn test_f64(xs: &[f64], ys: &[f64], confidence: f64) -> Result<TestResult, String> {
    let xs: Vec<OrderableF64> = xs.iter().map(|&f| OrderableF64::new(f)).collect();
    let ys: Vec<OrderableF64> = ys.iter().map(|&f| OrderableF64::new(f)).collect();

    test(&xs, &ys, confidence)
}
| 
 | ||||
/// Calculate the test statistic for the two sample Kolmogorov-Smirnov test.
///
/// The test statistic is the maximum vertical distance between the ECDFs of
/// the two samples.
///
/// Runs in O(n log n + m log m): two sorts followed by a single merge-style
/// sweep over the distinct values of both samples.
fn calculate_statistic<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 {
    let n = xs.len();
    let m = ys.len();

    assert!(n > 0 && m > 0);

    // Work on owned copies; sorting in place must not mutate the caller's
    // slices.
    let mut xs = xs.to_vec();
    let mut ys = ys.to_vec();

    // xs and ys must be sorted for the stepwise ECDF calculations to work.
    xs.sort();
    ys.sort();

    // The current value testing for ECDF difference. Sweeps up through elements
    // present in xs and ys.
    let mut current: &T;

    // i, j index the first values in xs and ys that are greater than current.
    let mut i = 0;
    let mut j = 0;

    // ecdf_xs, ecdf_ys always hold the ECDF(current) of xs and ys.
    let mut ecdf_xs = 0.0;
    let mut ecdf_ys = 0.0;

    // The test statistic value computed over values <= current.
    let mut statistic = 0.0;

    while i < n && j < m {
        // Advance i through duplicate samples in xs.
        let x_i = &xs[i];
        while i + 1 < n && *x_i == xs[i + 1] {
            i += 1;
        }

        // Advance j through duplicate samples in ys.
        let y_j = &ys[j];
        while j + 1 < m && *y_j == ys[j + 1] {
            j += 1;
        }

        // Step to the next sample value in the ECDF sweep from low to high.
        current = min(x_i, y_j);

        // Update invariant conditions for i, j, ecdf_xs, and ecdf_ys.
        if current == x_i {
            ecdf_xs = (i + 1) as f64 / n as f64;
            i += 1;
        }
        if current == y_j {
            ecdf_ys = (j + 1) as f64 / m as f64;
            j += 1;
        }

        // Update invariant conditions for the test statistic.
        let diff = (ecdf_xs - ecdf_ys).abs();
        if diff > statistic {
            statistic = diff;
        }
    }

    // Don't need to walk the rest of the samples because one of the ecdfs is
    // already one and the other will be increasing up to one. This means the
    // difference will be monotonically decreasing, so we have our test
    // statistic value already.

    statistic
}
| 
 | ||||
| /// Calculate the probability that the null hypothesis is false for a two sample
 | ||||
| /// Kolmogorov-Smirnov test. Can only reject the null hypothesis if this
 | ||||
| /// evidence exceeds the confidence level required.
 | ||||
| fn calculate_reject_probability(statistic: f64, n1: usize, n2: usize) -> Result<f64, String> { | ||||
|     // Only supports samples of size > 7.
 | ||||
|     assert!(n1 > 7 && n2 > 7); | ||||
| 
 | ||||
|     let n1 = n1 as f64; | ||||
|     let n2 = n2 as f64; | ||||
| 
 | ||||
|     let factor = ((n1 * n2) / (n1 + n2)).sqrt(); | ||||
|     let term = (factor + 0.12 + 0.11 / factor) * statistic; | ||||
| 
 | ||||
|     let probability = probability_kolmogorov_smirnov(term)?; | ||||
|     let reject_probability = 1.0 - probability; | ||||
| 
 | ||||
|     assert!(0.0 <= reject_probability && reject_probability <= 1.0); | ||||
| 
 | ||||
|     Ok(reject_probability) | ||||
| } | ||||
| 
 | ||||
| /// Calculate the critical value for the two sample Kolmogorov-Smirnov test.
 | ||||
| ///
 | ||||
| /// # Panics
 | ||||
| ///
 | ||||
| /// No convergence panic if the binary search does not locate the critical
 | ||||
| /// value in less than 200 iterations.
 | ||||
| ///
 | ||||
| /// # Examples
 | ||||
| ///
 | ||||
| /// ```
 | ||||
| /// extern crate kolmogorov_smirnov as ks;
 | ||||
| ///
 | ||||
| /// let critical_value = ks::calculate_critical_value(256, 256, 0.95);
 | ||||
| /// println!("Critical value at 95% confidence for samples of size 256 is {}",
 | ||||
| ///       critical_value);
 | ||||
| /// ```
 | ||||
| pub fn calculate_critical_value(n1: usize, n2: usize, confidence: f64) -> Result<f64, String> { | ||||
|     assert!(0.0 < confidence && confidence < 1.0); | ||||
| 
 | ||||
|     // Only supports samples of size > 7.
 | ||||
|     assert!(n1 > 7 && n2 > 7); | ||||
| 
 | ||||
|     // The test statistic is between zero and one so can binary search quickly
 | ||||
|     // for the critical value.
 | ||||
|     let mut low = 0.0; | ||||
|     let mut high = 1.0; | ||||
| 
 | ||||
|     for _ in 1..200 { | ||||
|         if low + 1e-8 >= high { | ||||
|             return Ok(high); | ||||
|         } | ||||
| 
 | ||||
|         let mid = low + (high - low) / 2.0; | ||||
|         let reject_probability = calculate_reject_probability(mid, n1, n2)?; | ||||
| 
 | ||||
|         if reject_probability > confidence { | ||||
|             // Maintain invariant that reject_probability(high) > confidence.
 | ||||
|             high = mid; | ||||
|         } else { | ||||
|             // Maintain invariant that reject_probability(low) <= confidence.
 | ||||
|             low = mid; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     Err(format!("No convergence in calculate_critical_value({}, {}, {}).", | ||||
|            n1, | ||||
|            n2, | ||||
|            confidence)) | ||||
|     //panic!("No convergence in calculate_critical_value({}, {}, {}).",
 | ||||
|     //       n1,
 | ||||
|     //       n2,
 | ||||
|     //       confidence);
 | ||||
| } | ||||
| 
 | ||||
/// Calculate the Kolmogorov-Smirnov probability function.
///
/// Computes Q_KS(lambda) = 2 * sum_{j>=1} (-1)^(j-1) * exp(-2 j^2 lambda^2),
/// summing the alternating series until a term falls below 1e-8 in absolute
/// value. Q_KS(0) is 1 by definition.
///
/// # Errors
///
/// Returns `Err` if the series has not converged after 199 terms.
fn probability_kolmogorov_smirnov(lambda: f64) -> Result<f64, String> {
    if lambda == 0.0 {
        return Ok(1.0);
    }

    let minus_two_lambda_squared = -2.0 * lambda * lambda;
    let mut q_ks = 0.0;

    for j in 1..200 {
        // Terms alternate in sign: positive for odd j, negative for even j.
        let sign = if j % 2 == 1 {
            1.0
        } else {
            -1.0
        };

        let j = j as f64;
        let term = sign * 2.0 * (minus_two_lambda_squared * j * j).exp();

        q_ks += term;

        if term.abs() < 1e-8 {
            // Trim results that exceed 1.
            return Ok(q_ks.min(1.0));
        }
    }

    Err(format!("No convergence in probability_kolmogorov_smirnov({}).",
           lambda))
}
| 
 | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     extern crate quickcheck; | ||||
|     extern crate rand; | ||||
| 
 | ||||
|     use self::quickcheck::{Arbitrary, Gen, QuickCheck, Testable, StdGen}; | ||||
|     use self::rand::Rng; | ||||
|     use std::cmp; | ||||
|     use std::usize; | ||||
| 
 | ||||
|     use super::test; | ||||
|     use ecdf::Ecdf; | ||||
| 
 | ||||
|     const EPSILON: f64 = 1e-10; | ||||
| 
 | ||||
    // Run a quickcheck property with an explicit generator whose size bound
    // is effectively unlimited; Samples::arbitrary caps its own length.
    fn check<A: Testable>(f: A) {
        // Need - 1 to ensure space for creating non-overlapping samples.
        let g = StdGen::new(rand::thread_rng(), usize::MAX - 1);
        QuickCheck::new().gen(g).quickcheck(f);
    }
| 
 | ||||
    /// Wrapper for generating sample data with QuickCheck.
    ///
    /// Samples must be sequences of u64 values with more than 7 elements.
    #[derive(Debug, Clone)]
    struct Samples {
        // Invariant (maintained by the Arbitrary impl): vec.len() > 7.
        vec: Vec<u64>,
    }
| 
 | ||||
    impl Samples {
        // Smallest sample element. unwrap is safe: Samples are never empty
        // (the Arbitrary impl generates at least 8 elements).
        fn min(&self) -> u64 {
            let &min = self.vec.iter().min().unwrap();
            min
        }

        // Largest sample element. Same non-empty invariant as min().
        fn max(&self) -> u64 {
            let &max = self.vec.iter().max().unwrap();
            max
        }

        // Randomly permute the sample in place.
        fn shuffle(&mut self) {
            let mut rng = rand::thread_rng();
            rng.shuffle(&mut self.vec);
        }
    }
| 
 | ||||
    impl Arbitrary for Samples {
        // Generate a random sample of 8..max u64 values, preserving the
        // "more than 7 elements" invariant the KS test requires.
        fn arbitrary<G: Gen>(g: &mut G) -> Samples {
            // Limit size of generated sample set to 1024
            let max = cmp::min(g.size(), 1024);

            let size = g.gen_range(8, max);
            let vec = (0..size).map(|_| u64::arbitrary(g)).collect();

            Samples { vec: vec }
        }

        // Shrink by shrinking the underlying Vec, discarding any candidate
        // that would break the > 7 elements invariant.
        fn shrink(&self) -> Box<Iterator<Item = Samples>> {
            let vec: Vec<u64> = self.vec.clone();
            let shrunk: Box<Iterator<Item = Vec<u64>>> = vec.shrink();

            Box::new(shrunk.filter(|v| v.len() > 7).map(|v| Samples { vec: v }))
        }
    }
| 
 | ||||
|     #[test] | ||||
|     #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")] | ||||
|     fn test_panics_on_empty_samples_set() { | ||||
|         let xs: Vec<u64> = vec![]; | ||||
|         let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; | ||||
|         test(&xs, &ys, 0.95); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     #[should_panic(expected="assertion failed: xs.len() > 0 && ys.len() > 0")] | ||||
|     fn test_panics_on_empty_other_samples_set() { | ||||
|         let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; | ||||
|         let ys: Vec<u64> = vec![]; | ||||
|         test(&xs, &ys, 0.95); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")] | ||||
|     fn test_panics_on_confidence_leq_zero() { | ||||
|         let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; | ||||
|         let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; | ||||
|         test(&xs, &ys, 0.0); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     #[should_panic(expected="assertion failed: 0.0 < confidence && confidence < 1.0")] | ||||
|     fn test_panics_on_confidence_geq_one() { | ||||
|         let xs: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; | ||||
|         let ys: Vec<u64> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; | ||||
|         test(&xs, &ys, 1.0); | ||||
|     } | ||||
| 
 | ||||
|     /// Alternative calculation for the test statistic for the two sample
 | ||||
|     /// Kolmogorov-Smirnov test. This simple implementation is used as a
 | ||||
|     /// verification check against actual calculation used.
 | ||||
|     fn calculate_statistic_alt<T: Ord + Clone>(xs: &[T], ys: &[T]) -> f64 { | ||||
|         assert!(xs.len() > 0 && ys.len() > 0); | ||||
| 
 | ||||
|         let ecdf_xs = Ecdf::new(xs); | ||||
|         let ecdf_ys = Ecdf::new(ys); | ||||
| 
 | ||||
|         let mut statistic = 0.0; | ||||
| 
 | ||||
|         for x in xs.iter() { | ||||
|             let diff = (ecdf_xs.value(x.clone()) - ecdf_ys.value(x.clone())).abs(); | ||||
|             if diff > statistic { | ||||
|                 statistic = diff; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         for y in ys.iter() { | ||||
|             let diff = (ecdf_xs.value(y.clone()) - ecdf_ys.value(y.clone())).abs(); | ||||
|             if diff > statistic { | ||||
|                 statistic = diff; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         statistic | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_calculate_statistic() { | ||||
|         fn prop(xs: Samples, ys: Samples) -> bool { | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
|             let actual = result.statistic; | ||||
|             let expected = calculate_statistic_alt(&xs.vec, &ys.vec); | ||||
| 
 | ||||
|             actual == expected | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_between_zero_and_one() { | ||||
|         fn prop(xs: Samples, ys: Samples) -> bool { | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
|             let actual = result.statistic; | ||||
| 
 | ||||
|             0.0 <= actual && actual <= 1.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_zero_for_identical_samples() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let ys = xs.clone(); | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
| 
 | ||||
|             result.statistic == 0.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_zero_for_permuted_sample() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let mut ys = xs.clone(); | ||||
|             ys.shuffle(); | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
| 
 | ||||
|             result.statistic == 0.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_one_for_samples_with_no_overlap_in_support() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let mut ys = xs.clone(); | ||||
| 
 | ||||
|             // Shift ys so that ys.min > xs.max.
 | ||||
|             let ys_min = xs.max() + 1; | ||||
|             ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect(); | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
| 
 | ||||
|             result.statistic == 1.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_one_half_for_sample_with_non_overlapping_in_support_replicate_added() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let mut ys = xs.clone(); | ||||
| 
 | ||||
|             // Shift ys so that ys.min > xs.max.
 | ||||
|             let ys_min = xs.max() + 1; | ||||
|             ys.vec = ys.vec.iter().map(|&y| cmp::max(y, ys_min)).collect(); | ||||
| 
 | ||||
|             // Add all the original items back too.
 | ||||
|             for &x in xs.vec.iter() { | ||||
|                 ys.vec.push(x); | ||||
|             } | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
| 
 | ||||
|             result.statistic == 0.5 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_one_div_length_for_sample_with_additional_low_value() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             // Add a extra sample of early weight to ys.
 | ||||
|             let min = xs.min(); | ||||
|             let mut ys = xs.clone(); | ||||
|             ys.vec.push(min - 1); | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
|             let expected = 1.0 / ys.vec.len() as f64; | ||||
| 
 | ||||
|             result.statistic == expected | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_one_div_length_for_sample_with_additional_high_value() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             // Add a extra sample of late weight to ys.
 | ||||
|             let max = xs.max(); | ||||
|             let mut ys = xs.clone(); | ||||
|             ys.vec.push(max + 1); | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
|             let expected = 1.0 / ys.vec.len() as f64; | ||||
| 
 | ||||
|             (result.statistic - expected).abs() < EPSILON | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_one_div_length_for_sample_with_additional_low_and_high_values() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             // Add a extra sample of late weight to ys.
 | ||||
|             let min = xs.min(); | ||||
|             let max = xs.max(); | ||||
| 
 | ||||
|             let mut ys = xs.clone(); | ||||
| 
 | ||||
|             ys.vec.push(min - 1); | ||||
|             ys.vec.push(max + 1); | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
|             let expected = 1.0 / ys.vec.len() as f64; | ||||
| 
 | ||||
|             (result.statistic - expected).abs() < EPSILON | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_values() { | ||||
|         fn prop(xs: Samples, n: u8) -> bool { | ||||
|             // Add extra sample of early weight to ys.
 | ||||
|             let min = xs.min(); | ||||
|             let mut ys = xs.clone(); | ||||
|             for j in 0..n { | ||||
|                 ys.vec.push(min - (j as u64) - 1); | ||||
|             } | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
|             let expected = n as f64 / ys.vec.len() as f64; | ||||
| 
 | ||||
|             result.statistic == expected | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u8) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_n_div_length_for_sample_with_additional_n_high_values() { | ||||
|         fn prop(xs: Samples, n: u8) -> bool { | ||||
|             // Add extra sample of early weight to ys.
 | ||||
|             let max = xs.max(); | ||||
|             let mut ys = xs.clone(); | ||||
|             for j in 0..n { | ||||
|                 ys.vec.push(max + (j as u64) + 1); | ||||
|             } | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
|             let expected = n as f64 / ys.vec.len() as f64; | ||||
| 
 | ||||
|             (result.statistic - expected).abs() < EPSILON | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u8) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_n_div_length_for_sample_with_additional_n_low_and_high_values() { | ||||
|         fn prop(xs: Samples, n: u8) -> bool { | ||||
|             // Add extra sample of early weight to ys.
 | ||||
|             let min = xs.min(); | ||||
|             let max = xs.max(); | ||||
|             let mut ys = xs.clone(); | ||||
|             for j in 0..n { | ||||
|                 ys.vec.push(min - (j as u64) - 1); | ||||
|                 ys.vec.push(max + (j as u64) + 1); | ||||
|             } | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
|             let expected = n as f64 / ys.vec.len() as f64; | ||||
| 
 | ||||
|             (result.statistic - expected).abs() < EPSILON | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u8) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_statistic_is_n_or_m_div_length_for_sample_with_additional_n_low_and_m_high_values() { | ||||
|         fn prop(xs: Samples, n: u8, m: u8) -> bool { | ||||
|             // Add extra sample of early weight to ys.
 | ||||
|             let min = xs.min(); | ||||
|             let max = xs.max(); | ||||
|             let mut ys = xs.clone(); | ||||
|             for j in 0..n { | ||||
|                 ys.vec.push(min - (j as u64) - 1); | ||||
|             } | ||||
| 
 | ||||
|             for j in 0..m { | ||||
|                 ys.vec.push(max + (j as u64) + 1); | ||||
|             } | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
|             let expected = cmp::max(n, m) as f64 / ys.vec.len() as f64; | ||||
| 
 | ||||
|             (result.statistic - expected).abs() < EPSILON | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, u8, u8) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_is_rejected_if_reject_probability_greater_than_confidence() { | ||||
|         fn prop(xs: Samples, ys: Samples) -> bool { | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
| 
 | ||||
|             if result.is_rejected { | ||||
|                 result.reject_probability > 0.95 | ||||
|             } else { | ||||
|                 result.reject_probability <= 0.95 | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples, Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_reject_probability_is_zero_for_identical_samples() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let ys = xs.clone(); | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
| 
 | ||||
|             result.reject_probability == 0.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_reject_probability_is_zero_for_permuted_sample() { | ||||
|         fn prop(xs: Samples) -> bool { | ||||
|             let mut ys = xs.clone(); | ||||
|             ys.shuffle(); | ||||
| 
 | ||||
|             let result = test(&xs.vec, &ys.vec, 0.95); | ||||
| 
 | ||||
|             result.reject_probability == 0.0 | ||||
|         } | ||||
| 
 | ||||
|         check(prop as fn(Samples) -> bool); | ||||
|     } | ||||
| } | ||||
							
								
								
									
										14
									
								
								tools/kstest/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								tools/kstest/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,14 @@ | ||||
| [package] | ||||
| name = "kstest" | ||||
| version = "0.1.0" | ||||
| authors = ["eugen.betke <betke@dkrz.de>"] | ||||
| edition = "2018" | ||||
| 
 | ||||
| # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||||
| 
 | ||||
| [dependencies] | ||||
| kolmogorov_smirnov = {path = "../kslib"} | ||||
| csv = "1.1" | ||||
| glob = "0.3.0" | ||||
| serde = { version = "1.0", features = ["derive"] } | ||||
| chrono = "*" # TODO: pin a concrete version — wildcard dependencies make builds non-reproducible and are rejected by crates.io | ||||
							
								
								
									
										27
									
								
								tools/kstest/run.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										27
									
								
								tools/kstest/run.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,27 @@ | ||||
#!/bin/bash
# Launch one background KS-similarity run per probe job id.
# Each run writes ks_similarities_<jobid>.csv, ks_progress_<jobid>.csv and
# ks_fail_<jobid>.log (see the kstest binary's CLI in src/main.rs).

# Alternative (confidential) dataset and its probe job ids, kept for reference:
#7488914 19865984
#4296426 18672376
#5024292 17944118

#dataset_fn="../../datasets/job_codings_v4_confidential.csv"
#jobids=( )
#jobids=( ${jobids[@]} 19865984 )
#jobids=( ${jobids[@]} 18672376 )
#jobids=( ${jobids[@]} 17944118 )


dataset_fn="../../datasets/job_codings_v4.csv"
jobids=( )
jobids=( "${jobids[@]}" 7488914 )
jobids=( "${jobids[@]}" 4296426 )
jobids=( "${jobids[@]}" 5024292 )

set -x
for jobid in "${jobids[@]}"; do
    sim_fn="./ks_similarities_${jobid}.csv"
    progress_fn="./ks_progress_${jobid}.csv"
    log_fn="./ks_fail_${jobid}.log"
    # Quote all arguments so paths containing spaces survive word splitting.
    cargo run --release -- "$dataset_fn" "$jobid" "$sim_fn" "$progress_fn" "$log_fn" &
done
							
								
								
									
										183
									
								
								tools/kstest/src/main.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										183
									
								
								tools/kstest/src/main.rs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,183 @@ | ||||
| extern crate kolmogorov_smirnov as ks; | ||||
| extern crate csv; | ||||
| extern crate serde; | ||||
| extern crate chrono; | ||||
| 
 | ||||
| use std::fs::File; | ||||
| use std::io::prelude::*; | ||||
| 
 | ||||
| //use std::fs::File;
 | ||||
| use serde::Deserialize; | ||||
| use serde::Serialize; | ||||
| use std::collections::HashMap; | ||||
| use std::env; | ||||
| use std::io::LineWriter; | ||||
| 
 | ||||
// Type aliases used throughout the KS-similarity computation.
pub type Score = u32;            // one parsed value from the colon-separated coding
pub type JobCoding = Vec<Score>; // the full sequence of scores for one job
pub type Similarity = f32;       // 1.0 - reject_probability of the KS test (see run)

pub type Jobid = u32;
// Maps a job id to its coding, parsed from the `ks_coding` CSV column.
pub type QCodings = HashMap<Jobid, JobCoding>;
| 
 | ||||
/// One row of the input dataset CSV, deserialized by the `csv` crate via serde.
#[derive(Debug, Deserialize)]
pub struct Record {
    jobid: u32,
    //q16_coding: String,
    // Colon-separated score list, e.g. "3:0:17"; parsed by convert_to_coding.
    ks_coding: String,
}
| 
 | ||||
/// Output row for the similarities CSV: how similar `jobid` is to the probe
/// job under the algorithm identified by (`alg_id`, `alg_name`).
#[derive(Debug, Serialize)]
pub struct SimilarityRow {
    pub jobid: u32,
    pub alg_id: u32,
    pub alg_name: String,
    pub similarity: f32
}
| 
 | ||||
/// Output row for the progress CSV: `delta` is the elapsed wall-clock time in
/// nanoseconds since the previous progress row (written every 10,000 jobs).
#[derive(Debug, Serialize)]
pub struct ProgressRow {
    jobid: u32,
    alg_id: u32,
    alg_name: String,
    delta: i64,
}
| 
 | ||||
| pub fn convert_to_coding(coding: String) -> Vec<Score> { | ||||
|     let split = coding.split(":"); | ||||
|     let vec: Vec<Score> = split | ||||
|         .filter(|s| !s.is_empty()) | ||||
|         //.map(|s| s.parse::<F>().unwrap())
 | ||||
|         .map(|s| s.parse().unwrap()) | ||||
|         .collect(); | ||||
|     vec | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| //fn ks_similarity(xs: &Vec<u32>, ys: &Vec<u32>) -> Result<Similarity, String> {
 | ||||
| //    let confidence = 0.95;
 | ||||
| //    ks::test(xs, ys, confidence)?
 | ||||
| 
 | ||||
| //    let reject_probability = match result {
 | ||||
| //        Ok(v) => v.reject_probability,
 | ||||
| //        Err(_) => 1.0,
 | ||||
| //    };
 | ||||
| //    //println!("is_rejected: {:?}\nstatistic: {:?}\nreject_probability: {:?}\ncritical_value: {:?}\nconfidence: {:?}", result.is_rejected, result.statistic, result.reject_probability, result.critical_value, result.confidence);
 | ||||
| //    (1.0 - reject_probability) as Similarity
 | ||||
| //}
 | ||||
| 
 | ||||
| 
 | ||||
| fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: String, log_fn: String) { | ||||
|     let mut q_codings: QCodings = HashMap::new(); | ||||
|     let file = File::open(&dataset_fn).expect("Unable to open dataset."); | ||||
|     let mut rdr = csv::Reader::from_reader(file); | ||||
| 
 | ||||
|     //for result in rdr.deserialize().take(10000) {
 | ||||
|     for result in rdr.deserialize() { | ||||
|         let record: Record = result.expect("bla bla"); | ||||
|         //let q_coding = convert_to_coding(record.q16_coding);
 | ||||
|         let q_coding = convert_to_coding(record.ks_coding); | ||||
|         // Insert Non-Zero jobs only
 | ||||
|         if q_coding.iter().sum::<Score>() > (0 as Score) { | ||||
|            q_codings.insert(record.jobid, q_coding); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     let probe = q_codings[&jobid].clone(); | ||||
|     let similarities_file = File::create(&similarities_fn).expect("Unable to open"); | ||||
|     let mut wtr_similarities = csv::Writer::from_writer(&similarities_file); | ||||
|     let alg_name = "ks"; | ||||
|     let alg_id = 6; | ||||
| 
 | ||||
|     let progress_file = File::create(&progress_fn).expect("Unable to open"); | ||||
|     let mut wtr_progress = csv::Writer::from_writer(&progress_file); | ||||
|     let mut start = chrono::Utc::now(); | ||||
|     let mut counter = 1; | ||||
| 
 | ||||
|     let mut avail_codings: Vec<(u32, &JobCoding)>; | ||||
| 
 | ||||
|     avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect(); | ||||
|     let mut similarities: Vec<(Jobid, Similarity)> = Vec::new(); | ||||
| 
 | ||||
|     let log_file = File::create(&log_fn).expect("Unable to open"); | ||||
|     let mut log_file = LineWriter::new(log_file); | ||||
| 
 | ||||
| 
 | ||||
|     while let Some((jobid, q_coding)) = avail_codings.pop() { | ||||
|         if (counter % 10_000) == 0 { | ||||
|             let stop = chrono::Utc::now(); | ||||
|             let progress_row = ProgressRow { | ||||
|                 jobid: jobid, | ||||
|                 alg_id: alg_id, | ||||
|                 alg_name: String::from(alg_name), | ||||
|                 delta: ((stop - start).num_nanoseconds().unwrap()) | ||||
|             }; | ||||
|             wtr_progress.serialize(progress_row).unwrap(); | ||||
|             start = stop; | ||||
|         } | ||||
| 
 | ||||
|         //println!("Processing {:?}", jobid);
 | ||||
|         //let similarity = ks_similarity(q_coding, &probe);
 | ||||
|         
 | ||||
|         let confidence = 0.95; | ||||
|         let similarity = match ks::test(q_coding, &probe, confidence) { | ||||
|             Ok(sim) => (1.0 - sim.reject_probability) as Similarity, | ||||
|             Err(e) => { | ||||
|                 let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e); | ||||
|                 log_file.write_all(message.as_bytes()).unwrap(); | ||||
|                 1.0 | ||||
|             } | ||||
|         }; | ||||
| 
 | ||||
|         similarities.push((jobid, similarity)); | ||||
|         counter += 1; | ||||
|     } | ||||
| 
 | ||||
|     for (jobid, similarity) in similarities.iter() { | ||||
|         let similarity_row = SimilarityRow { | ||||
|             jobid: *jobid, | ||||
|             alg_id: alg_id, | ||||
|             alg_name: String::from(alg_name), | ||||
|             similarity: *similarity, | ||||
|         }; | ||||
|         wtr_similarities.serialize(similarity_row).unwrap(); | ||||
|     } | ||||
|     log_file.flush().unwrap(); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| fn main() { | ||||
|     let args: Vec<String> = env::args().collect(); | ||||
|     let dataset_fn = args[1].clone(); | ||||
|     let jobid = args[2].parse::<u32>().unwrap(); | ||||
|     let sim_fn = args[3].clone(); | ||||
|     let progress_fn = args[4].clone(); | ||||
|     let log_fn = args[5].clone(); | ||||
|     println!("{:?}", args); | ||||
| 
 | ||||
|     run(dataset_fn, jobid, sim_fn, progress_fn, log_fn); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
// NOTE(review): this module is not gated with #[cfg(test)], so it is compiled
// into non-test builds as well; add the attribute once the tests build.
mod tests {
    //use super::*;

    // NOTE(review): `ks_test` is not defined anywhere in this file (only the
    // commented-out `ks_similarity` above exists), so this test does not
    // compile as written. It presumably should call `ks::test` — confirm the
    // library's return type (Result vs. direct struct) and fix accordingly.
    #[test]
    fn test_ks_test() {
        // Increasing vs. decreasing sequence: same multiset of values, so
        // their ECDFs are identical.
        let xs = vec!(0.0  , 1.0  , 2.0  , 3.0 , 4.0 , 5.0 , 6.0 , 7.0 , 8.0 , 9.0 , 10.0 , 11.0 , 12.0);
        let ys = vec!(12.0 , 11.0 , 10.0 , 9.0 , 8.0 , 7.0 , 6.0 , 5.0 , 4.0 , 3.0 , 2.0  , 1.0  , 0.0);

        ks_test(xs, ys);

        // Two codings with very different shapes and lengths.
        let c1 = vec![141.0,143.0,142.0,238.0,132.0,486.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,128.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0];
        let c2 = vec![239.0,239.0,255.0,255.0,239.0,239.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,502.0,511.0,503.0];
        ks_test(c1, c2);

        // Two short, similar codings.
        let c1 = vec![2.0,2.0,2.0,9.0,3.0,0.0,0.0,0.0];
        let c2 = vec![2.0,2.0,2.0,2.0,8.0,3.0,0.0,10.0];
        ks_test(c1, c2);
    }
}
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user