mistral-io-datasets/datasets/anonymise.py

79 lines
2.7 KiB
Python
Executable File

#!/usr/bin/env python3
'''
; ; User
; --- ; "username": "u241117"
+++ ; --- ; "user_id": 20391,
; --- ; "groupname": "ifmto",
+++ ; --- ; "group_id": 1597,
+++ ; --- ; "account": "ku0646",
; --- ; "parent_accounts": "/root/dkrz/ku0646/ku0646",
; ; Job configuration
+++ ; --- ; "jobname": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.ddt.job",
; --- ; "job_name": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.ddt.job",
; --- ; "work_dir": "/mnt/lustre01/work/ku0646/u241117/TiME/1deg_res/build_dbg",
; ; "time_limit": 1800,
+++ ; ; "total_cpus": 48,
+++ ; ; "total_nodes": 1,
+++ ; ; "ntasks_per_node": 1,
+++ ; ; "ntasks": 1,
+++ ; ; "cpus_per_task": 1,
; ; Job runtime statistics
+++ ; --- ; "jobid": 19611958,
+++ ; ; "cluster": "mistral",
+++ ; ; "nodes": " m11275 ",
+++ ; ; "partition": "compute",
+++ ; ; "@start": "2020-02-21T13:41:25",
+++ ; ; "@end": "2020-02-21T14:00:48",
; ; "@eligible": "2020-02-21T13:41:23",
; ; "@submit": "2020-02-21T13:41:23",
+++ ; ; "exit_code": "0:0",
+++ ; ; "state": "CANCELLED",
+++ ; ; "elapsed": 1163,
; ; "cpu_hours": 15.506667,
; ; Other
; --- ; "std_in": "/dev/null",
; --- ; "std_out": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.%j.out",
; --- ; "std_err": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.%j.err",
; ; "pack_job_id": 0,
; ; "qos": "normal",
; ; "alloc_node": "mlogin100",
; ; "pack_job_offset": 0,
; ; "derived_ec": "0:0",
; ; "queue_wait": 2,
'''
import os
#import time
#import json
#from difflib import SequenceMatcher
import numpy as np
import pandas as pd
# Input files to anonymise; each output name is the input name with the
# '_confidential' marker removed.
FNS = [
    'job_codings_v3_confidential.csv',
    'job_metadata_confidential.csv',
]

# Constants XOR-ed onto the integer identifier columns.
# NOTE(review): XOR with a fixed constant that is visible in this source is
# trivially reversible (x ^ k ^ k == x) -- confirm this level of masking is
# acceptable for the published data. The values are kept unchanged so that
# output stays consistent with files that were already generated.
JOBID_XOR_KEY = 22897682
USER_ID_XOR_KEY = 90235
GROUP_ID_XOR_KEY = 30235

# Columns removed entirely from the output.
DROPPED_COLUMNS = ('account', 'job_name', 'nodes')


def anonymise_frame(df):
    '''Mask the identifier columns of *df* in place and return it.

    - 'jobid' is required and is XOR-ed with JOBID_XOR_KEY
      (a missing 'jobid' column raises KeyError).
    - 'user_id' and 'group_id' are XOR-ed with their keys when present.
    - Every column of DROPPED_COLUMNS that is present is removed.
    '''
    df['jobid'] = df['jobid'] ^ JOBID_XOR_KEY
    if 'user_id' in df:
        df['user_id'] = df['user_id'] ^ USER_ID_XOR_KEY
    # The original tested for the misspelled 'grou_id', so this branch never
    # ran and group ids were written out unmasked.
    if 'group_id' in df:
        df['group_id'] = df['group_id'] ^ GROUP_ID_XOR_KEY
    # Drop whichever confidential columns exist rather than gating all of
    # them on 'account': that avoids a KeyError when only some are present
    # and never leaves 'job_name'/'nodes' behind when 'account' is absent.
    present = [col for col in DROPPED_COLUMNS if col in df.columns]
    if present:
        df.drop(columns=present, inplace=True)
    return df


def main():
    '''Write an anonymised copy of every file in FNS.

    A file whose output already exists is skipped, so existing results are
    never overwritten.
    '''
    for in_fn in FNS:
        out_fn = in_fn.replace('_confidential', '')
        if os.path.exists(out_fn):
            print('Skipping %s. File exists.' % in_fn)
            continue
        print('Processing %s' % in_fn)
        df = pd.read_csv(in_fn)
        anonymise_frame(df).to_csv(out_fn, index=False)


if __name__ == '__main__':
    main()