Kolmogorov-Smirnov similarity is now computed per metric and averaged

This commit is contained in:
eugen.betke 2020-09-02 20:02:56 +02:00
parent d9fbbdb87f
commit cd307c98da
3 changed files with 60 additions and 28 deletions

View File

@ -5,6 +5,7 @@ filenames=( ${filenames[@]} "clustering_progress.csv" )
filenames=( ${filenames[@]} $(ls job_metadata*.csv) ) filenames=( ${filenames[@]} $(ls job_metadata*.csv) )
filenames=( ${filenames[@]} $( ls job_similarities_*.csv ) ) filenames=( ${filenames[@]} $( ls job_similarities_*.csv ) )
filenames=( ${filenames[@]} $( ls sim_computation_times_*.csv ) ) filenames=( ${filenames[@]} $( ls sim_computation_times_*.csv ) )
filenames=( ${filenames[@]} $( ls ks_*.csv ) )
echo "${filenames[*]}" echo "${filenames[*]}"

View File

@ -1,10 +1,5 @@
#!/bin/bash #!/bin/bash
#7488914 19865984
#4296426 18672376
#5024292 17944118
#dataset_fn="../../datasets/job_codings_v4_confidential.csv" #dataset_fn="../../datasets/job_codings_v4_confidential.csv"
#jobids=( ) #jobids=( )
#jobids=( ${jobids[@]} 19865984 ) #jobids=( ${jobids[@]} 19865984 )
@ -20,8 +15,8 @@ jobids=( ${jobids[@]} 5024292 )
set -x set -x
for jobid in ${jobids[@]}; do for jobid in ${jobids[@]}; do
sim_fn="./ks_similarities_$jobid.csv" sim_fn="$output_dir/ks_similarities_$jobid.csv"
progress_fn="./ks_progress_$jobid.csv" progress_fn="$output_dir/ks_progress_$jobid.csv"
log_fn="./ks_fail_$jobid.log" log_fn="$output_dir/ks_fail_$jobid.log"
cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn & time cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn
done done

View File

@ -14,7 +14,8 @@ use std::env;
use std::io::LineWriter; use std::io::LineWriter;
pub type Score = u32; pub type Score = u32;
pub type JobCoding = Vec<Score>; pub type MetricCoding = Vec<Score>;
pub type JobCoding = Vec<MetricCoding>;
pub type Similarity = f32; pub type Similarity = f32;
pub type Jobid = u32; pub type Jobid = u32;
@ -23,8 +24,15 @@ pub type QCodings = HashMap<Jobid, JobCoding>;
#[derive(Debug, Deserialize)] #[derive(Debug, Deserialize)]
pub struct Record { pub struct Record {
jobid: u32, jobid: u32,
//q16_coding: String, ks_md_file_create: String,
ks_coding: String, ks_md_file_delete: String,
ks_md_mod: String,
ks_md_other: String,
ks_md_read: String,
ks_read_bytes: String,
ks_read_calls: String,
ks_write_bytes: String,
ks_write_calls: String,
} }
#[derive(Debug, Serialize)] #[derive(Debug, Serialize)]
@ -75,15 +83,24 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
//for result in rdr.deserialize().take(10000) { //for result in rdr.deserialize().take(10000) {
for result in rdr.deserialize() { for result in rdr.deserialize() {
let record: Record = result.expect("bla bla"); let record: Record = result.expect("bla bla");
//let q_coding = convert_to_coding(record.q16_coding); let q_coding = vec![
let q_coding = convert_to_coding(record.ks_coding); convert_to_coding(record.ks_md_file_create),
// Insert Non-Zero jobs only convert_to_coding(record.ks_md_file_delete),
if q_coding.iter().sum::<Score>() > (0 as Score) { convert_to_coding(record.ks_md_mod),
convert_to_coding(record.ks_md_other),
convert_to_coding(record.ks_md_read),
convert_to_coding(record.ks_read_bytes),
convert_to_coding(record.ks_read_calls),
convert_to_coding(record.ks_write_bytes),
convert_to_coding(record.ks_write_calls),
];
// Filter Zero-Jobs
if q_coding.iter().map(|x| x.iter().sum::<Score>()).sum::<Score>() > (0 as Score) {
q_codings.insert(record.jobid, q_coding); q_codings.insert(record.jobid, q_coding);
} }
} }
let probe = q_codings[&jobid].clone();
let similarities_file = File::create(&similarities_fn).expect("Unable to open"); let similarities_file = File::create(&similarities_fn).expect("Unable to open");
let mut wtr_similarities = csv::Writer::from_writer(&similarities_file); let mut wtr_similarities = csv::Writer::from_writer(&similarities_file);
let alg_name = "ks"; let alg_name = "ks";
@ -91,11 +108,9 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
let progress_file = File::create(&progress_fn).expect("Unable to open"); let progress_file = File::create(&progress_fn).expect("Unable to open");
let mut wtr_progress = csv::Writer::from_writer(&progress_file); let mut wtr_progress = csv::Writer::from_writer(&progress_file);
let mut start = chrono::Utc::now();
let mut counter = 1; let mut counter = 1;
let mut avail_codings: Vec<(u32, &JobCoding)>; let mut avail_codings: Vec<(u32, &JobCoding)>;
avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect(); avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect();
let mut similarities: Vec<(Jobid, Similarity)> = Vec::new(); let mut similarities: Vec<(Jobid, Similarity)> = Vec::new();
@ -103,6 +118,8 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
let mut log_file = LineWriter::new(log_file); let mut log_file = LineWriter::new(log_file);
let probe = q_codings[&jobid].clone();
let mut start = chrono::Utc::now();
while let Some((jobid, q_coding)) = avail_codings.pop() { while let Some((jobid, q_coding)) = avail_codings.pop() {
if (counter % 10_000) == 0 { if (counter % 10_000) == 0 {
let stop = chrono::Utc::now(); let stop = chrono::Utc::now();
@ -119,15 +136,34 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
//println!("Processing {:?}", jobid); //println!("Processing {:?}", jobid);
//let similarity = ks_similarity(q_coding, &probe); //let similarity = ks_similarity(q_coding, &probe);
let mut metric_similarities = vec![];
let confidence = 0.95; let confidence = 0.95;
let similarity = match ks::test(q_coding, &probe, confidence) { for metric_codings in q_coding.iter().zip(&probe) {
Ok(sim) => (1.0 - sim.reject_probability) as Similarity, let metric_similarity = match ks::test(metric_codings.0, metric_codings.1, confidence) {
Ok(sim) => {
(1.0 - sim.reject_probability) as Similarity
}
Err(e) => { Err(e) => {
let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e); let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
log_file.write_all(message.as_bytes()).unwrap(); log_file.write_all(message.as_bytes()).unwrap();
1.0 1.0
} }
}; };
metric_similarities.push(metric_similarity);
}
let similarity = metric_similarities.iter().sum::<f32>() / (metric_similarities.len() as f32);
//let similarity = match ks::test(q_coding, &probe, confidence) {
// Ok(sim) => {
// (1.0 - sim.reject_probability) as Similarity,
// }
// Err(e) => {
// let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
// log_file.write_all(message.as_bytes()).unwrap();
// 1.0
// }
//};
similarities.push((jobid, similarity)); similarities.push((jobid, similarity));
counter += 1; counter += 1;