Kolmogorov-Smirnov changed to metric-wise computation

This commit is contained in:
eugen.betke 2020-09-02 20:02:56 +02:00
parent d9fbbdb87f
commit cd307c98da
3 changed files with 60 additions and 28 deletions

View File

@ -5,6 +5,7 @@ filenames=( ${filenames[@]} "clustering_progress.csv" )
# Extend the list of files with every match of each glob in the current directory.
# NOTE(review): the unquoted $(ls ...) results are word-split by the shell, so a
# file name containing whitespace would be broken into several entries.
filenames=( ${filenames[@]} $(ls job_metadata*.csv) )
filenames=( ${filenames[@]} $( ls job_similarities_*.csv ) )
filenames=( ${filenames[@]} $( ls sim_computation_times_*.csv ) )
# Kolmogorov-Smirnov result files (ks_*.csv).
filenames=( ${filenames[@]} $( ls ks_*.csv ) )
# Print the collected names on one line.
echo "${filenames[*]}"

View File

@ -1,10 +1,5 @@
#!/bin/bash
#7488914 19865984
#4296426 18672376
#5024292 17944118
#dataset_fn="../../datasets/job_codings_v4_confidential.csv"
#jobids=( )
#jobids=( ${jobids[@]} 19865984 )
@ -20,8 +15,8 @@ jobids=( ${jobids[@]} 5024292 )
set -x
# Run the KS similarity tool once per job id, handing it the dataset plus three
# per-job output paths: similarities CSV, progress CSV and failure log.
# NOTE(review): this view shows two variants of the loop body back to back
# (diff markers were lost). One writes to "./" and starts cargo in the
# background with `&`; the other writes to $output_dir and runs cargo in the
# foreground under `time`. Presumably only one variant is live in the real
# script - confirm against the file. $output_dir is not defined in the
# visible lines.
for jobid in ${jobids[@]}; do
sim_fn="./ks_similarities_$jobid.csv"
progress_fn="./ks_progress_$jobid.csv"
log_fn="./ks_fail_$jobid.log"
cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn &
sim_fn="$output_dir/ks_similarities_$jobid.csv"
progress_fn="$output_dir/ks_progress_$jobid.csv"
log_fn="$output_dir/ks_fail_$jobid.log"
time cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn
done

View File

@ -14,7 +14,8 @@ use std::env;
use std::io::LineWriter;
/// A single value inside a coding.
pub type Score = u32;
// NOTE(review): `JobCoding` is declared twice in this view. This flat form is
// presumably the pre-change definition and the nested form below its
// replacement - confirm against the actual file.
pub type JobCoding = Vec<Score>;
/// The sequence of scores belonging to one metric of a job.
pub type MetricCoding = Vec<Score>;
/// The coding of a whole job: one `MetricCoding` per metric.
pub type JobCoding = Vec<MetricCoding>;
/// Similarity value reported for a job compared against the probe job.
pub type Similarity = f32;
/// Numeric job identifier; used as the key of `QCodings`.
pub type Jobid = u32;
@ -23,8 +24,15 @@ pub type QCodings = HashMap<Jobid, JobCoding>;
/// One row of the input dataset, as produced by the reader's `deserialize()`.
///
/// Every `ks_*` field holds the raw string for one coding; `run` passes each
/// of them through `convert_to_coding`.
#[derive(Debug, Deserialize)]
pub struct Record {
// Identifier of the job this row describes.
jobid: u32,
//q16_coding: String,
// NOTE(review): `ks_coding` (single combined coding) appears alongside the
// per-metric fields below, presumably because this view merges pre- and
// post-change lines - confirm which fields the real struct has.
ks_coding: String,
// Per-metric codings: metadata operations.
ks_md_file_create: String,
ks_md_file_delete: String,
ks_md_mod: String,
ks_md_other: String,
ks_md_read: String,
// Per-metric codings: read/write volume and call counts.
ks_read_bytes: String,
ks_read_calls: String,
ks_write_bytes: String,
ks_write_calls: String,
}
#[derive(Debug, Serialize)]
@ -75,15 +83,24 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
//for result in rdr.deserialize().take(10000) {
for result in rdr.deserialize() {
let record: Record = result.expect("bla bla");
//let q_coding = convert_to_coding(record.q16_coding);
let q_coding = convert_to_coding(record.ks_coding);
// Insert Non-Zero jobs only
if q_coding.iter().sum::<Score>() > (0 as Score) {
let q_coding = vec![
convert_to_coding(record.ks_md_file_create),
convert_to_coding(record.ks_md_file_delete),
convert_to_coding(record.ks_md_mod),
convert_to_coding(record.ks_md_other),
convert_to_coding(record.ks_md_read),
convert_to_coding(record.ks_read_bytes),
convert_to_coding(record.ks_read_calls),
convert_to_coding(record.ks_write_bytes),
convert_to_coding(record.ks_write_calls),
];
// Filter Zero-Jobs
if q_coding.iter().map(|x| x.iter().sum::<Score>()).sum::<Score>() > (0 as Score) {
q_codings.insert(record.jobid, q_coding);
}
}
let probe = q_codings[&jobid].clone();
let similarities_file = File::create(&similarities_fn).expect("Unable to open");
let mut wtr_similarities = csv::Writer::from_writer(&similarities_file);
let alg_name = "ks";
@ -91,11 +108,9 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
let progress_file = File::create(&progress_fn).expect("Unable to open");
let mut wtr_progress = csv::Writer::from_writer(&progress_file);
let mut start = chrono::Utc::now();
let mut counter = 1;
let mut avail_codings: Vec<(u32, &JobCoding)>;
avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect();
let mut similarities: Vec<(Jobid, Similarity)> = Vec::new();
@ -103,6 +118,8 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
let mut log_file = LineWriter::new(log_file);
let probe = q_codings[&jobid].clone();
let mut start = chrono::Utc::now();
while let Some((jobid, q_coding)) = avail_codings.pop() {
if (counter % 10_000) == 0 {
let stop = chrono::Utc::now();
@ -119,15 +136,34 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
//println!("Processing {:?}", jobid);
//let similarity = ks_similarity(q_coding, &probe);
let mut metric_similarities = vec![];
let confidence = 0.95;
let similarity = match ks::test(q_coding, &probe, confidence) {
Ok(sim) => (1.0 - sim.reject_probability) as Similarity,
Err(e) => {
let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
log_file.write_all(message.as_bytes()).unwrap();
1.0
}
};
for metric_codings in q_coding.iter().zip(&probe) {
let metric_similarity = match ks::test(metric_codings.0, metric_codings.1, confidence) {
Ok(sim) => {
(1.0 - sim.reject_probability) as Similarity
}
Err(e) => {
let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
log_file.write_all(message.as_bytes()).unwrap();
1.0
}
};
metric_similarities.push(metric_similarity);
}
let similarity = metric_similarities.iter().sum::<f32>() / (metric_similarities.len() as f32);
//let similarity = match ks::test(q_coding, &probe, confidence) {
// Ok(sim) => {
// (1.0 - sim.reject_probability) as Similarity,
// }
// Err(e) => {
// let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
// log_file.write_all(message.as_bytes()).unwrap();
// 1.0
// }
//};
similarities.push((jobid, similarity));
counter += 1;