diff --git a/datasets/compress.sh b/datasets/compress.sh index 73c4805..a8187dd 100755 --- a/datasets/compress.sh +++ b/datasets/compress.sh @@ -5,6 +5,7 @@ filenames=( ${filenames[@]} "clustering_progress.csv" ) filenames=( ${filenames[@]} $(ls job_metadata*.csv) ) filenames=( ${filenames[@]} $( ls job_similarities_*.csv ) ) filenames=( ${filenames[@]} $( ls sim_computation_times_*.csv ) ) +filenames=( ${filenames[@]} $( ls ks_*.csv ) ) echo "${filenames[*]}" diff --git a/tools/kstest/run.sh b/tools/kstest/run.sh index a3e577f..0073b31 100755 --- a/tools/kstest/run.sh +++ b/tools/kstest/run.sh @@ -1,10 +1,5 @@ #!/bin/bash - -#7488914 19865984 -#4296426 18672376 -#5024292 17944118 - #dataset_fn="../../datasets/job_codings_v4_confidential.csv" #jobids=( ) #jobids=( ${jobids[@]} 19865984 ) @@ -20,8 +15,8 @@ jobids=( ${jobids[@]} 5024292 ) set -x for jobid in ${jobids[@]}; do - sim_fn="./ks_similarities_$jobid.csv" - progress_fn="./ks_progress_$jobid.csv" - log_fn="./ks_fail_$jobid.log" - cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn & + sim_fn="$output_dir/ks_similarities_$jobid.csv" + progress_fn="$output_dir/ks_progress_$jobid.csv" + log_fn="$output_dir/ks_fail_$jobid.log" + time cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn done diff --git a/tools/kstest/src/main.rs b/tools/kstest/src/main.rs index 980e49e..2c4e3e5 100644 --- a/tools/kstest/src/main.rs +++ b/tools/kstest/src/main.rs @@ -14,7 +14,8 @@ use std::env; use std::io::LineWriter; pub type Score = u32; -pub type JobCoding = Vec<Score>; +pub type MetricCoding = Vec<Score>; +pub type JobCoding = Vec<MetricCoding>; pub type Similarity = f32; pub type Jobid = u32; @@ -23,8 +24,15 @@ pub type QCodings = HashMap<Jobid, JobCoding>; #[derive(Debug, Deserialize)] pub struct Record { jobid: u32, - //q16_coding: String, - ks_coding: String, + ks_md_file_create: String, + ks_md_file_delete: String, + ks_md_mod: String, + ks_md_other: String, + ks_md_read: String, + ks_read_bytes: String, + ks_read_calls: String, + 
ks_write_bytes: String, + ks_write_calls: String, } #[derive(Debug, Serialize)] @@ -75,15 +83,24 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S //for result in rdr.deserialize().take(10000) { for result in rdr.deserialize() { let record: Record = result.expect("bla bla"); - //let q_coding = convert_to_coding(record.q16_coding); - let q_coding = convert_to_coding(record.ks_coding); - // Insert Non-Zero jobs only - if q_coding.iter().sum::<Score>() > (0 as Score) { + let q_coding = vec![ + convert_to_coding(record.ks_md_file_create), + convert_to_coding(record.ks_md_file_delete), + convert_to_coding(record.ks_md_mod), + convert_to_coding(record.ks_md_other), + convert_to_coding(record.ks_md_read), + convert_to_coding(record.ks_read_bytes), + convert_to_coding(record.ks_read_calls), + convert_to_coding(record.ks_write_bytes), + convert_to_coding(record.ks_write_calls), + ]; + + // Filter Zero-Jobs + if q_coding.iter().map(|x| x.iter().sum::<Score>()).sum::<Score>() > (0 as Score) { q_codings.insert(record.jobid, q_coding); } } - let probe = q_codings[&jobid].clone(); let similarities_file = File::create(&similarities_fn).expect("Unable to open"); let mut wtr_similarities = csv::Writer::from_writer(&similarities_file); let alg_name = "ks"; @@ -91,11 +108,9 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S let progress_file = File::create(&progress_fn).expect("Unable to open"); let mut wtr_progress = csv::Writer::from_writer(&progress_file); - let mut start = chrono::Utc::now(); let mut counter = 1; let mut avail_codings: Vec<(u32, &JobCoding)>; - avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect(); let mut similarities: Vec<(Jobid, Similarity)> = Vec::new(); @@ -103,6 +118,8 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S let mut log_file = LineWriter::new(log_file); + let probe = q_codings[&jobid].clone(); + let mut start = chrono::Utc::now(); while let Some((jobid, 
q_coding)) = avail_codings.pop() { if (counter % 10_000) == 0 { let stop = chrono::Utc::now(); @@ -118,16 +135,35 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S //println!("Processing {:?}", jobid); //let similarity = ks_similarity(q_coding, &probe); - + + let mut metric_similarities = vec![]; + let confidence = 0.95; - let similarity = match ks::test(q_coding, &probe, confidence) { - Ok(sim) => (1.0 - sim.reject_probability) as Similarity, - Err(e) => { - let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e); - log_file.write_all(message.as_bytes()).unwrap(); - 1.0 - } - }; + for metric_codings in q_coding.iter().zip(&probe) { + let metric_similarity = match ks::test(metric_codings.0, metric_codings.1, confidence) { + Ok(sim) => { + (1.0 - sim.reject_probability) as Similarity + } + Err(e) => { + let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e); + log_file.write_all(message.as_bytes()).unwrap(); + 1.0 + } + }; + metric_similarities.push(metric_similarity); + } + let similarity = metric_similarities.iter().sum::<Similarity>() / (metric_similarities.len() as f32); + + //let similarity = match ks::test(q_coding, &probe, confidence) { + // Ok(sim) => { + // (1.0 - sim.reject_probability) as Similarity, + // } + // Err(e) => { + // let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e); + // log_file.write_all(message.as_bytes()).unwrap(); + // 1.0 + // } + //}; similarities.push((jobid, similarity)); counter += 1;