mistral-io-datasets/datasets/anonymise.py

#!/usr/bin/env python3
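'''Write anonymised copies of the ``*_confidential.csv`` job data files.

Sample job record, one field per line. The two leading marker columns
(``+++`` and ``---``) are annotations kept from the original author.
NOTE(review): their legend is not recorded in this file -- confirm before
relying on them.
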
        ;     ; User
        ; --- ; "username": "u241117"
    +++ ; --- ; "user_id": 20391,
        ; --- ; "groupname": "ifmto",
    +++ ; --- ; "group_id": 1597,
    +++ ; --- ; "account": "ku0646",
        ; --- ; "parent_accounts": "/root/dkrz/ku0646/ku0646",

        ;     ; Job configuration
    +++ ; --- ; "jobname": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.ddt.job",
        ; --- ; "job_name": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.ddt.job",
        ; --- ; "work_dir": "/mnt/lustre01/work/ku0646/u241117/TiME/1deg_res/build_dbg",
        ;     ; "time_limit": 1800,
    +++ ;     ; "total_cpus": 48,
    +++ ;     ; "total_nodes": 1,
    +++ ;     ; "ntasks_per_node": 1,
    +++ ;     ; "ntasks": 1,
    +++ ;     ; "cpus_per_task": 1,

        ;     ; Job runtime statistics
    +++ ; --- ; "jobid": 19611958,
    +++ ;     ; "cluster": "mistral",
    +++ ;     ; "nodes": " m11275 ",
    +++ ;     ; "partition": "compute",
    +++ ;     ; "@start": "2020-02-21T13:41:25",
    +++ ;     ; "@end": "2020-02-21T14:00:48",
        ;     ; "@eligible": "2020-02-21T13:41:23",
        ;     ; "@submit": "2020-02-21T13:41:23",
    +++ ;     ; "exit_code": "0:0",
    +++ ;     ; "state": "CANCELLED",
    +++ ;     ; "elapsed": 1163,
        ;     ; "cpu_hours": 15.506667,

        ;     ; Other
        ; --- ; "std_in": "/dev/null",
        ; --- ; "std_out": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.%j.out",
        ; --- ; "std_err": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.%j.err",
        ;     ; "pack_job_id": 0,
        ;     ; "qos": "normal",
        ;     ; "alloc_node": "mlogin100",
        ;     ; "pack_job_offset": 0,
        ;     ; "derived_ec": "0:0",
        ;     ; "queue_wait": 2,

'''

import os
#import time
#import json
#from difflib import SequenceMatcher
import numpy as np
import pandas as pd


def anonymise(df):
    '''Obfuscate the id columns of a job table and drop confidential columns.

    The frame is modified in place and also returned for convenience.

    * ``jobid`` is always XOR-ed with a fixed key (the column must exist).
    * ``user_id`` and ``group_id`` are XOR-ed with their own fixed keys when
      the column is present.
    * When an ``account`` column is present, ``account``, ``job_name`` and
      ``nodes`` are removed together.

    XOR with a constant is a one-to-one mapping, so distinct ids stay
    distinct. It is also its own inverse: anybody who knows the key can
    recover the original ids, so the keys must be treated as confidential.

    Parameters
    ----------
    df : pandas.DataFrame
        Job table with integer id columns.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, after modification.

    Raises
    ------
    KeyError
        If ``jobid`` is missing, or if ``account`` is present while
        ``job_name`` or ``nodes`` is not (``DataFrame.drop`` raises on
        missing labels by default).
    '''
    df['jobid'] = df['jobid'] ^ 22897682
    if 'user_id' in df:
        df['user_id'] = df['user_id'] ^ 90235
    # Fixed: the guard used to test the misspelt name 'grou_id', so it never
    # matched and group ids were written out unobfuscated.
    if 'group_id' in df:
        df['group_id'] = df['group_id'] ^ 30235
    if 'account' in df:
        df.drop(['account', 'job_name', 'nodes'], inplace=True, axis=1)
    return df


if __name__ == '__main__':
    # Confidential input files, looked up in the current working directory.
    FNS = [
        'job_codings_v4_confidential.csv',
        #'job_codings_v3_confidential.csv',
        #'job_metadata_confidential.csv',
        ]

    for in_fn in FNS:
        # The public file name is the input name without '_confidential'.
        # If the input name lacks that marker, out_fn equals in_fn, the file
        # "exists", and it is skipped -- an input is never overwritten.
        out_fn = in_fn.replace('_confidential', '')
        if os.path.exists(out_fn):
            # Never regenerate an already published file.
            print('Skipping %s. File exists.' % in_fn)
            continue

        print('Processing %s' % in_fn)
        df = anonymise(pd.read_csv(in_fn))
        df.to_csv(out_fn, index=False)
Anonymizer script 2020-08-18 17:57:13 +00:00			`#!/usr/bin/env python3`
			`'''`
			`; ; User`
			`; --- ; "username": "u241117"`
			`+++ ; --- ; "user_id": 20391,`
			`; --- ; "groupname": "ifmto",`
			`+++ ; --- ; "group_id": 1597,`
			`+++ ; --- ; "account": "ku0646",`
			`; --- ; "parent_accounts": "/root/dkrz/ku0646/ku0646",`

			`; ; Job configuration`
			`+++ ; --- ; "jobname": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.ddt.job",`
			`; --- ; "job_name": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.ddt.job",`
			`; --- ; "work_dir": "/mnt/lustre01/work/ku0646/u241117/TiME/1deg_res/build_dbg",`
			`; ; "time_limit": 1800,`
			`+++ ; ; "total_cpus": 48,`
			`+++ ; ; "total_nodes": 1,`
			`+++ ; ; "ntasks_per_node": 1,`
			`+++ ; ; "ntasks": 1,`
			`+++ ; ; "cpus_per_task": 1,`

			`; ; Job runtime statistics`
			`+++ ; --- ; "jobid": 19611958,`
			`+++ ; ; "cluster": "mistral",`
			`+++ ; ; "nodes": " m11275 ",`
			`+++ ; ; "partition": "compute",`
			`+++ ; ; "@start": "2020-02-21T13:41:25",`
			`+++ ; ; "@end": "2020-02-21T14:00:48",`
			`; ; "@eligible": "2020-02-21T13:41:23",`
			`; ; "@submit": "2020-02-21T13:41:23",`
			`+++ ; ; "exit_code": "0:0",`
			`+++ ; ; "state": "CANCELLED",`
			`+++ ; ; "elapsed": 1163,`
			`; ; "cpu_hours": 15.506667,`

			`; ; Other`
			`; --- ; "std_in": "/dev/null",`
			`; --- ; "std_out": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.%j.out",`
			`; --- ; "std_err": "/home/zmaw/u241117/wr-work/TiME/1deg_res/build_dbg/time.%j.err",`
			`; ; "pack_job_id": 0,`
			`; ; "qos": "normal",`
			`; ; "alloc_node": "mlogin100",`
			`; ; "pack_job_offset": 0,`
			`; ; "derived_ec": "0:0",`
			`; ; "queue_wait": 2,`

			`'''`

			`import os`
			`#import time`
			`#import json`
			`#from difflib import SequenceMatcher`
			`import numpy as np`
			`import pandas as pd`


			`if __name__ == '__main__':`
			`FNS = [`
Merge branch 'master' of http://git.hps.vi4io.org/eugen.betke/mistral-io-datasets into master 2020-09-02 11:46:52 +00:00			`'job_codings_v4_confidential.csv',`
			`#'job_codings_v3_confidential.csv',`
			`#'job_metadata_confidential.csv',`
Anonymizer script 2020-08-18 17:57:13 +00:00			`]`

			`for in_fn in FNS:`
			`#(name, ext) = os.path.splitext(in_fn)`
			`out_fn = in_fn.replace('_confidential', '')`
			`if not os.path.exists(out_fn):`
			`print('Processing %s' % in_fn)`
			`df = pd.read_csv(in_fn)`
			`df['jobid'] = df['jobid'] ^ 22897682`
			`if 'user_id' in df:`
			`df['user_id'] = df['user_id'] ^ 90235`
			`if 'grou_id' in df:`
			`df['group_id'] = df['group_id'] ^ 30235`
			`if 'account' in df:`
			`df.drop(['account', 'job_name', 'nodes'], inplace=True, axis=1)`
			`df.to_csv(out_fn, index=False)`
			`else:`
			`print('Skipping %s. File exists.' % in_fn)`