Kolmogorov-Smirnov changed to metric-wise computation
This commit is contained in:
parent
d9fbbdb87f
commit
cd307c98da
|
@ -5,6 +5,7 @@ filenames=( ${filenames[@]} "clustering_progress.csv" )
|
||||||
filenames=( ${filenames[@]} $(ls job_metadata*.csv) )
|
filenames=( ${filenames[@]} $(ls job_metadata*.csv) )
|
||||||
filenames=( ${filenames[@]} $( ls job_similarities_*.csv ) )
|
filenames=( ${filenames[@]} $( ls job_similarities_*.csv ) )
|
||||||
filenames=( ${filenames[@]} $( ls sim_computation_times_*.csv ) )
|
filenames=( ${filenames[@]} $( ls sim_computation_times_*.csv ) )
|
||||||
|
filenames=( ${filenames[@]} $( ls ks_*.csv ) )
|
||||||
|
|
||||||
echo "${filenames[*]}"
|
echo "${filenames[*]}"
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,5 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
|
|
||||||
#7488914 19865984
|
|
||||||
#4296426 18672376
|
|
||||||
#5024292 17944118
|
|
||||||
|
|
||||||
#dataset_fn="../../datasets/job_codings_v4_confidential.csv"
|
#dataset_fn="../../datasets/job_codings_v4_confidential.csv"
|
||||||
#jobids=( )
|
#jobids=( )
|
||||||
#jobids=( ${jobids[@]} 19865984 )
|
#jobids=( ${jobids[@]} 19865984 )
|
||||||
|
@ -20,8 +15,8 @@ jobids=( ${jobids[@]} 5024292 )
|
||||||
|
|
||||||
set -x
|
set -x
|
||||||
for jobid in ${jobids[@]}; do
|
for jobid in ${jobids[@]}; do
|
||||||
sim_fn="./ks_similarities_$jobid.csv"
|
sim_fn="$output_dir/ks_similarities_$jobid.csv"
|
||||||
progress_fn="./ks_progress_$jobid.csv"
|
progress_fn="$output_dir/ks_progress_$jobid.csv"
|
||||||
log_fn="./ks_fail_$jobid.log"
|
log_fn="$output_dir/ks_fail_$jobid.log"
|
||||||
cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn &
|
time cargo run --release -- $dataset_fn $jobid $sim_fn $progress_fn $log_fn
|
||||||
done
|
done
|
||||||
|
|
|
@ -14,7 +14,8 @@ use std::env;
|
||||||
use std::io::LineWriter;
|
use std::io::LineWriter;
|
||||||
|
|
||||||
pub type Score = u32;
|
pub type Score = u32;
|
||||||
pub type JobCoding = Vec<Score>;
|
pub type MetricCoding = Vec<Score>;
|
||||||
|
pub type JobCoding = Vec<MetricCoding>;
|
||||||
pub type Similarity = f32;
|
pub type Similarity = f32;
|
||||||
|
|
||||||
pub type Jobid = u32;
|
pub type Jobid = u32;
|
||||||
|
@ -23,8 +24,15 @@ pub type QCodings = HashMap<Jobid, JobCoding>;
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
pub struct Record {
|
pub struct Record {
|
||||||
jobid: u32,
|
jobid: u32,
|
||||||
//q16_coding: String,
|
ks_md_file_create: String,
|
||||||
ks_coding: String,
|
ks_md_file_delete: String,
|
||||||
|
ks_md_mod: String,
|
||||||
|
ks_md_other: String,
|
||||||
|
ks_md_read: String,
|
||||||
|
ks_read_bytes: String,
|
||||||
|
ks_read_calls: String,
|
||||||
|
ks_write_bytes: String,
|
||||||
|
ks_write_calls: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
|
@ -75,15 +83,24 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
|
||||||
//for result in rdr.deserialize().take(10000) {
|
//for result in rdr.deserialize().take(10000) {
|
||||||
for result in rdr.deserialize() {
|
for result in rdr.deserialize() {
|
||||||
let record: Record = result.expect("bla bla");
|
let record: Record = result.expect("bla bla");
|
||||||
//let q_coding = convert_to_coding(record.q16_coding);
|
let q_coding = vec![
|
||||||
let q_coding = convert_to_coding(record.ks_coding);
|
convert_to_coding(record.ks_md_file_create),
|
||||||
// Insert Non-Zero jobs only
|
convert_to_coding(record.ks_md_file_delete),
|
||||||
if q_coding.iter().sum::<Score>() > (0 as Score) {
|
convert_to_coding(record.ks_md_mod),
|
||||||
|
convert_to_coding(record.ks_md_other),
|
||||||
|
convert_to_coding(record.ks_md_read),
|
||||||
|
convert_to_coding(record.ks_read_bytes),
|
||||||
|
convert_to_coding(record.ks_read_calls),
|
||||||
|
convert_to_coding(record.ks_write_bytes),
|
||||||
|
convert_to_coding(record.ks_write_calls),
|
||||||
|
];
|
||||||
|
|
||||||
|
// Filter Zero-Jobs
|
||||||
|
if q_coding.iter().map(|x| x.iter().sum::<Score>()).sum::<Score>() > (0 as Score) {
|
||||||
q_codings.insert(record.jobid, q_coding);
|
q_codings.insert(record.jobid, q_coding);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let probe = q_codings[&jobid].clone();
|
|
||||||
let similarities_file = File::create(&similarities_fn).expect("Unable to open");
|
let similarities_file = File::create(&similarities_fn).expect("Unable to open");
|
||||||
let mut wtr_similarities = csv::Writer::from_writer(&similarities_file);
|
let mut wtr_similarities = csv::Writer::from_writer(&similarities_file);
|
||||||
let alg_name = "ks";
|
let alg_name = "ks";
|
||||||
|
@ -91,11 +108,9 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
|
||||||
|
|
||||||
let progress_file = File::create(&progress_fn).expect("Unable to open");
|
let progress_file = File::create(&progress_fn).expect("Unable to open");
|
||||||
let mut wtr_progress = csv::Writer::from_writer(&progress_file);
|
let mut wtr_progress = csv::Writer::from_writer(&progress_file);
|
||||||
let mut start = chrono::Utc::now();
|
|
||||||
let mut counter = 1;
|
let mut counter = 1;
|
||||||
|
|
||||||
let mut avail_codings: Vec<(u32, &JobCoding)>;
|
let mut avail_codings: Vec<(u32, &JobCoding)>;
|
||||||
|
|
||||||
avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect();
|
avail_codings = q_codings.iter().map(|(k, v)| (*k, v)).collect();
|
||||||
let mut similarities: Vec<(Jobid, Similarity)> = Vec::new();
|
let mut similarities: Vec<(Jobid, Similarity)> = Vec::new();
|
||||||
|
|
||||||
|
@ -103,6 +118,8 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
|
||||||
let mut log_file = LineWriter::new(log_file);
|
let mut log_file = LineWriter::new(log_file);
|
||||||
|
|
||||||
|
|
||||||
|
let probe = q_codings[&jobid].clone();
|
||||||
|
let mut start = chrono::Utc::now();
|
||||||
while let Some((jobid, q_coding)) = avail_codings.pop() {
|
while let Some((jobid, q_coding)) = avail_codings.pop() {
|
||||||
if (counter % 10_000) == 0 {
|
if (counter % 10_000) == 0 {
|
||||||
let stop = chrono::Utc::now();
|
let stop = chrono::Utc::now();
|
||||||
|
@ -119,15 +136,34 @@ fn run(dataset_fn: String, jobid: Jobid, similarities_fn: String, progress_fn: S
|
||||||
//println!("Processing {:?}", jobid);
|
//println!("Processing {:?}", jobid);
|
||||||
//let similarity = ks_similarity(q_coding, &probe);
|
//let similarity = ks_similarity(q_coding, &probe);
|
||||||
|
|
||||||
|
let mut metric_similarities = vec![];
|
||||||
|
|
||||||
let confidence = 0.95;
|
let confidence = 0.95;
|
||||||
let similarity = match ks::test(q_coding, &probe, confidence) {
|
for metric_codings in q_coding.iter().zip(&probe) {
|
||||||
Ok(sim) => (1.0 - sim.reject_probability) as Similarity,
|
let metric_similarity = match ks::test(metric_codings.0, metric_codings.1, confidence) {
|
||||||
|
Ok(sim) => {
|
||||||
|
(1.0 - sim.reject_probability) as Similarity
|
||||||
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
|
let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
|
||||||
log_file.write_all(message.as_bytes()).unwrap();
|
log_file.write_all(message.as_bytes()).unwrap();
|
||||||
1.0
|
1.0
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
metric_similarities.push(metric_similarity);
|
||||||
|
}
|
||||||
|
let similarity = metric_similarities.iter().sum::<f32>() / (metric_similarities.len() as f32);
|
||||||
|
|
||||||
|
//let similarity = match ks::test(q_coding, &probe, confidence) {
|
||||||
|
// Ok(sim) => {
|
||||||
|
// (1.0 - sim.reject_probability) as Similarity,
|
||||||
|
// }
|
||||||
|
// Err(e) => {
|
||||||
|
// let message = format!("jobid failed {:?}, because \" {:?}\"\n", jobid, e);
|
||||||
|
// log_file.write_all(message.as_bytes()).unwrap();
|
||||||
|
// 1.0
|
||||||
|
// }
|
||||||
|
//};
|
||||||
|
|
||||||
similarities.push((jobid, similarity));
|
similarities.push((jobid, similarity));
|
||||||
counter += 1;
|
counter += 1;
|
||||||
|
|
Loading…
Reference in New Issue