From 780964943e5164ecb28806f68f63bad694e08ca3 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Mon, 3 Jun 2019 17:06:03 +0200 Subject: [PATCH 01/21] added vagrantfiles to LSTM repository --- Vagrantfile | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ bootstrap.sh | 12 +++++++++ provision.sh | 14 ++++++++++ 3 files changed, 99 insertions(+) create mode 100644 Vagrantfile create mode 100644 bootstrap.sh create mode 100755 provision.sh
diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 0000000..434703f --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,73 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +# All Vagrant configuration is done below. The "2" in Vagrant.configure +# configures the configuration version (we support older styles for +# backwards compatibility). Please don't change it unless you know what +# you're doing. +Vagrant.configure("2") do |config| + # The most common configuration options are documented and commented below. + # For a complete reference, please see the online documentation at + # https://docs.vagrantup.com. + + # Every Vagrant development environment requires a box. You can search for + # boxes at https://atlas.hashicorp.com/search. + config.vm.box = "ubuntu/xenial64" + + # Disable automatic box update checking. If you disable this, then + # boxes will only be checked for updates when the user runs + # `vagrant box outdated`. This is not recommended. + # config.vm.box_check_update = false + + # Create a forwarded port mapping which allows access to a specific port + # within the machine from a port on the host machine. In the example below, + # accessing "localhost:8080" will access port 80 on the guest machine. + # config.vm.network "forwarded_port", guest: 80, host: 8080 + + # Create a private network, which allows host-only access to the machine + # using a specific IP. + # config.vm.network "private_network", ip: "192.168.33.10" + # config.vm.network "public_network", ip: "127.0.0.1", bridge: "enp0s25" + + + # Create a public network, which is generally matched to a bridged network. + # Bridged networks make the machine appear as another physical device on + # your network. + config.vm.network "public_network", :mac => "0A0100000000", :auto_config => false + + # Share an additional folder to the guest VM. The first argument is + # the path on the host to the actual folder. The second argument is + # the path on the guest to mount the folder. And the optional third + # argument is a set of non-required options. + # config.vm.synced_folder "../data", "/vagrant_data" + + # Provider-specific configuration so you can fine-tune various + # backing providers for Vagrant. These expose provider-specific options. + # Example for VirtualBox: + # + config.vm.provider "virtualbox" do |vb| + # # Display the VirtualBox GUI when booting the machine + vb.gui = false + # + # # Customize the amount of memory on the VM: + vb.memory = 4096 + vb.cpus = 2 + end + # + # View the documentation for the provider you are using for more + # information on available options. + + # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies + # such as FTP and Heroku are also available. See the documentation at + # https://docs.vagrantup.com/v2/push/atlas.html for more information. + # config.push.define "atlas" do |push| + # push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME" + # end + + # Enable provisioning with a shell script. 
Additional provisioners such as + # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the + # documentation for more information about their specific syntax and use. + config.vm.provision "shell", path: "provision.sh", privileged: false + config.vm.provision "shell", path: "bootstrap.sh", run: "always" + +end
diff --git a/bootstrap.sh b/bootstrap.sh new file mode 100644 index 0000000..20131cf --- /dev/null +++ b/bootstrap.sh @@ -0,0 +1,12 @@ + +############################## +# +# This is a bootstrap script which is +# run at every startup of the vagrant machine +# If you want to run something just once at provisioning +# and first bootup of the vagrant machine please see +# provision.sh +# +# Contributor: Bernhard Blieninger +############################## +
diff --git a/provision.sh b/provision.sh new file mode 100755 index 0000000..0a68924 --- /dev/null +++ b/provision.sh @@ -0,0 +1,14 @@ +#!/bin/bash +####################### +# +# This is a provision script +# it will be called once when the vagrant vm is first provisioned +# If you have commands that you want to run always please have a +# look at the bootstrap.sh script +# +# Contributor: Bernhard Blieninger, Robert Hamsch +###################### + +sudo apt update -qq +sudo apt install python3.5 python3-pip tmux -qq +
From 7cffaaf57d60d4b97165b696815698c6b46c138f Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Wed, 19 Jun 2019 13:05:26 +0200 Subject: [PATCH 02/21] structured python files into one folder --- CuDNNLSTM.py => python3-lstm/CuDNNLSTM.py | 0 Data_preparation.py => python3-lstm/Data_preparation.py | 0 Evaluation.py => python3-lstm/Evaluation.py | 0 Plotting.py => python3-lstm/Plotting.py | 0 parallel_search.py => python3-lstm/parallel_search.py | 0 prediction.py => python3-lstm/prediction.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename CuDNNLSTM.py => python3-lstm/CuDNNLSTM.py (100%) rename Data_preparation.py => python3-lstm/Data_preparation.py (100%) rename Evaluation.py => python3-lstm/Evaluation.py (100%) rename Plotting.py => python3-lstm/Plotting.py (100%) rename parallel_search.py => python3-lstm/parallel_search.py (100%) rename prediction.py => python3-lstm/prediction.py (100%) diff --git a/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py similarity index 100% rename from CuDNNLSTM.py rename to python3-lstm/CuDNNLSTM.py diff --git a/Data_preparation.py b/python3-lstm/Data_preparation.py similarity index 100% rename from Data_preparation.py rename to python3-lstm/Data_preparation.py diff --git a/Evaluation.py b/python3-lstm/Evaluation.py similarity index 100% rename from Evaluation.py rename to python3-lstm/Evaluation.py diff --git a/Plotting.py b/python3-lstm/Plotting.py similarity index 100% rename from Plotting.py rename to python3-lstm/Plotting.py diff --git a/parallel_search.py b/python3-lstm/parallel_search.py similarity index 100% rename from parallel_search.py rename to python3-lstm/parallel_search.py diff --git a/prediction.py b/python3-lstm/prediction.py similarity index 100% rename from prediction.py rename to python3-lstm/prediction.py
From df9c96e0d5872e3148d7ed639320889230566dec Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Thu, 5 Sep 2019 23:21:39 +0200 Subject: [PATCH 03/21] rewrote data_preparation --- data_preparation.py | 317 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 317 insertions(+) create mode 100644 data_preparation.py diff --git 
a/data_preparation.py b/data_preparation.py new file mode 100644 index 0000000..ce52853 --- /dev/null +++ b/data_preparation.py @@ -0,0 +1,317 @@ +import pickle +import sys +import numpy as np +import pandas as pd +from tqdm import tqdm +from keras.preprocessing.sequence import pad_sequences +import sqlite3 + +# After exporting the relational database to separate tables with .csv extension, the transformation can begin +# The first step is to read the cvs files as Dataframes +#df_taskset = pd.read_csv('TaskSet.csv') # import task-sets +# print(df_taskset.head()) if you want to see how the data look like + +#df_task = pd.read_csv('Task.csv') # import tasks +# print(df_task.head()) + +#df_job = pd.read_csv('Job.csv') # import jobs +# print(df_job.head()) + + +# PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label +PKGs = { + 'pi' : 0, + 'hey' : 1, + 'tumatmul' : 2, + 'cond_mod' : 3 + } + +# Integer encoding for Exit_Values from Jobs +Exit_Values = { + 'EXIT' : 1, + 'EXIT_CRITICAL' : 0, + 'EXIT_PERIOD' : 2, + 'OUT_OF_CAPS' : 3, + 'OUT_OF_QUOTA' : 4 + } + +# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 +Arg_Values = { + 1 : 1, + 4096 : 2, + 8192 : 3, + 16384 : 4, + 32768 : 5, + 65536 : 6, + 131072 : 7, + 262144 : 8, + 524288 : 9, + 1048576 : 10, + 2097152 : 11, + 847288609443 : 12, + 2541865828329 : 13, + 7625597484987 : 14, + 22876792454961 : 15, + 68630377364883 : 16, + 205891132094649 : 17 + } + + +print("Doing writing") + +DB_PATH = "/home/bernhard/panda_v4.db" +TASKS_DICT = {} + +def taskToFeatureList(task): + #returns a fature list for the corresponding task values + feature = [] + feature.append(task['Priority']) + feature.append(task['Period']) + feature.append(task['Number_of_Jobs']) + feature.append(task['PKG']) + feature.append(task['Arg']) + feature.append(task['CRITICALTIME']) + return feature + + +def getTaskFeatures(db_path): #c is the cursor for the db + # returns a dictionary + # { task_id : [ feature, list ] + conn = sqlite3.connect(db_path) + conn.row_factory = lambda C, R: { c[0]: R[i] for i, c in enumerate(C.description) } + db_cursor = conn.cursor() + db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task') + outputTable = db_cursor.fetchall() + + tasks_dict = {} + for row in outputTable: + row['Period'] = int(row['Period']/1000) + row['Number_of_Jobs'] = int(row['Number_of_Jobs']) + row['PKG'] = PKGs[row['PKG']] + row['CRITICALTIME'] = int(row['CRITICALTIME']/1000) + row['Arg'] = Arg_Values[row['Arg']] + tasks_dict[row['Task_ID']] = taskToFeatureList(row) + return tasks_dict + + +def processTaskset(tasksetData): + # tasksetData is a list of tuples returned from the DB in getTasksetData() + try: + label = tasksetData[0][-1] + features = [] + jobExitsByTask = {} + for tsData in tasksetData: + try: + jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) + except KeyError: + jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] + for taskIdNo in (1,2,3): + if tasksetData[0][taskIdNo] != -1: + features += TASKS_DICT[tasksetData[0][taskIdNo]] + features += jobExitsByTask[tasksetData[0][taskIdNo]] + except KeyError as k: + for t in tasksetData: + print(t) + raise k + return np.array(features), label + + +def getFeaturesLabels(db_path): + conn = sqlite3.connect(db_path) + db_cursor = conn.cursor() + command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, 
TaskSet.Successful'\ + ' FROM TaskSet JOIN Job'\ + ' ON TaskSet.Set_ID = Job.Set_ID and'\ + ' (TaskSet.TASK1_ID == Job.Task_ID or'\ + ' TaskSet.TASK2_ID == Job.Task_ID or'\ + ' TaskSet.TASK3_ID == Job.Task_ID);' + db_cursor.execute(command) + # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] + data_table = db_cursor.fetchall() + print('reading taskset_jobs join done') + finalFeatureList = [] + finalLabelList = [] + currentTset = data_table[0][0] # first taskset id + tSetJobs = [] + totalSize = len(data_table) + for row in data_table: + if row[0] % 1000 == 0: + print('processed',int(100 * (row[0]/totalSize)),'%' ) + if row[0] == currentTset: + #then still same setTset + tSetJobs.append(row) + else: + # job of next taskset + # process data and record new + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + tSetJobs = [] + currentTset = row[0] + tSetJobs.append(row) + # proess last taskset + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + return finalFeatureList, finalLabelList + + + +TASKS_DICT = getTaskFeatures(DB_PATH) +print('Tasks have been added to TASKS_DICT') +print('length of taskdict: ', len(TASKS_DICT)) +print('example task 222:',TASKS_DICT[222]) + +features, labels = getFeaturesLabels(DB_PATH) + +print("Done reading") + + +''' + + +# 2. data transformation + +# here starts data transformation +#ntn = df_task[['PKG']].values # get values from PKG in tasks. This step is equivalent to: Select distinct PKG from Task +#ntn1 = [] +#for n in ntn: +# ntn1.append(n[0]) +#print(np.unique(ntn1)) # print the unique values + + + + + + + + + +# 3. 
Features and Labels extraction +i = 0 + + + + +sys.exit() +features = [] # create an empty list for features +labels = [] # create an empty list for labels +new_task_list = [] +# loop in the task-set +with tqdm ( total=len( list(df_taskset.iterrows()))) as pbar: # the total length would be total=len(list(df_taskset.iterrows())) + + for task_set in taskset_table: + + if task_set['TASK1_ID']!= -1: + + new_task_list.append() + + + blub + + + for index, row in df_taskset.iterrows (): + + try: + + i += 1 + grid = int(df_taskset.loc[index, 'Set_ID']) # task_set ID + first_task = int(df_taskset.loc[index, 'TASK1_ID']) # first task_id + second_task = int(df_taskset.loc[index, 'TASK2_ID']) # second task_id + third_task = int(df_taskset.loc[index, 'TASK3_ID']) # third task_id + fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) # fourth task_id + tasks = [] # empty list of tasks where features are saved later + + if first_task != -1: # if the first task exists in this task-set then : + + task_info = df_task.loc[df_task['Task_ID'] == first_task] + tasks.append(int(task_info['Priority'])) # save the priority + tasks.append(int(task_info['Period']/1000)) # save the period in seconds + tasks.append(int(task_info['Number_of_Jobs'])) # save number of jobs + n = str(task_info['PKG'].item()) + tasks.append(PKGs[n]) #save the numerical value of PKG + av = int(task_info['Arg'].item()) + tasks.append(Arg_Values[av]) #save the scaled value of Arg + tasks.append(int(task_info['CRITICALTIME']/1000)) # save criticaltime in seconds + # for each job in that is in the task and has this task_set id + job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] + + for ind, r in job_info.iterrows(): + tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) # save the transformed exit value + + if second_task != -1: # if the second task exists in this task-set then : + first_task = second_task + task_info = df_task.loc[df_task['Task_ID'] == first_task] + tasks.append(int(task_info['Priority'])) + tasks.append(int(task_info['Period']/1000)) + tasks.append(int(task_info['Number_of_Jobs'])) + n = str(task_info['PKG'].item()) + tasks.append(PKGs[n]) + av = int(task_info['Arg'].item()) + tasks.append(Arg_Values[av]) + tasks.append(int(task_info['CRITICALTIME']/1000)) + print(tasks) + job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] + for ind, r in job_info.iterrows(): + tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) + + if third_task != -1: # if the third task exists in this task-set then : + first_task = third_task + task_info = df_task.loc[df_task['Task_ID'] == first_task] + tasks.append(int(task_info['Priority'])) + tasks.append(int(task_info['Period']/1000)) + tasks.append(int(task_info['Number_of_Jobs'])) + n = str(task_info['PKG'].item()) + tasks.append(PKGs[n]) + av = int(task_info['Arg'].item()) + tasks.append(Arg_Values[av]) + tasks.append(int(task_info['CRITICALTIME']/1000)) + + job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] + for ind, r in job_info.iterrows(): + tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) + + + if fourth_task != -1: # if the fourth task exists in this task-set then : + first_task = fourth_task + task_info = df_task.loc[df_task['Task_ID'] == first_task] + tasks.append(int(task_info['Priority'])) + tasks.append(int(task_info['Period']/1000)) + tasks.append(int(task_info['Number_of_Jobs'])) + n = str(task_info['PKG'].item()) + tasks.append(PKGs[n]) + av = 
int(task_info['Arg'].item()) + tasks.append(Arg_Values[av]) + tasks.append(int(task_info['CRITICALTIME']/1000)) + + job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] + for ind, r in job_info.iterrows(): + tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) + + + tasks = np.array(tasks) # to save the task list as numpy array + features.append(tasks) # values in tasks are features + labels.append(int(df_taskset.loc[index, 'Successful'])) # in the label list, append the value in the successful col from task-set + except Exception as e: # exception handler + print(e) + pass + pbar.update(1) + +''' + +labels = np.array(labels) # to save the labels list as numpy array + +# To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value +features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post') + +#print(features.shape) # the dimensionality of features +#print(labels.shape) # the dimensionality of labels + +# save both files for the training +with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary' + pickle.dump(features, outfile) + +with open ( '56_labels', 'wb' ) as outfile: + pickle.dump(labels, outfile) + 
From dd0c0921f5b654e98ae478a67dab92acf01674df Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Thu, 5 Sep 2019 23:24:30 +0200 Subject: [PATCH 04/21] added timestamps --- data_preparation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/data_preparation.py b/data_preparation.py index ce52853..1abab29 100644 --- a/data_preparation.py +++ b/data_preparation.py @@ -1,3 +1,4 @@ +from datetime import datetime import pickle import sys import numpy as np @@ -129,6 +130,7 @@ def getFeaturesLabels(db_path): # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] data_table = db_cursor.fetchall() print('reading taskset_jobs join done') + print('The current time is:',datetime.now()) finalFeatureList = [] finalLabelList = [] currentTset = data_table[0][0] # first taskset id tSetJobs = [] totalSize = len(data_table) for row in data_table: if row[0] % 1000 == 0: @@ -164,9 +166,12 @@ def getFeaturesLabels(db_path): features, labels = getFeaturesLabels(DB_PATH) -print("Done reading") +print('The current time is:',datetime.now()) + +print("Done reading") + + ''' @@ -314,4 +319,4 @@ def getFeaturesLabels(db_path): with open ( '56_labels', 'wb' ) as outfile: pickle.dump(labels, outfile) - +print('The current time is:',datetime.now())
From 2c2f519d7afe070ece521d8c955e228dcb20b492 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Thu, 5 Sep 2019 23:35:50 +0200 Subject: [PATCH 05/21] caught error where datasets are not fully completed in the jobs table --- data_preparation.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-)
diff --git a/data_preparation.py b/data_preparation.py index 1abab29..772aa28 100644 --- a/data_preparation.py +++ b/data_preparation.py @@ -33,7 +33,8 @@ 'EXIT_CRITICAL' : 0, 'EXIT_PERIOD' : 2, 'OUT_OF_CAPS' : 3, - 'OUT_OF_QUOTA' : 4 + 'OUT_OF_QUOTA' : 4, + 'EXIT_ERROR' : 5 } # ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 Arg_Values = { @@ -97,23 +98,21 @@ def processTaskset(tasksetData): # tasksetData is a list of tuples returned from the DB 
in getTasksetData() - try: - label = tasksetData[0][-1] - features = [] - jobExitsByTask = {} - for tsData in tasksetData: + label = tasksetData[0][-1] + features = [] + jobExitsByTask = {} + for tsData in tasksetData: + try: + jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) + except KeyError: + jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] + for taskIdNo in (1,2,3): + if tasksetData[0][taskIdNo] != -1: + features += TASKS_DICT[tasksetData[0][taskIdNo]] try: - jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) - except KeyError: - jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] - for taskIdNo in (1,2,3): - if tasksetData[0][taskIdNo] != -1: - features += TASKS_DICT[tasksetData[0][taskIdNo]] features += jobExitsByTask[tasksetData[0][taskIdNo]] - except KeyError as k: - for t in tasksetData: - print(t) - raise k + except KeyError: + features += [Exit_Values['EXIT_ERROR']] return np.array(features), label From f8ba1be68f71099b44639fcca56e8123e7d8321f Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Sun, 8 Sep 2019 00:20:05 +0200 Subject: [PATCH 06/21] changed prediction.py --- python3-lstm/prediction.py | 278 +++++++++++++++++++++---------------- 1 file changed, 155 insertions(+), 123 deletions(-) diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py index 3280154..b9fa43c 100644 --- a/python3-lstm/prediction.py +++ b/python3-lstm/prediction.py @@ -6,130 +6,162 @@ from keras.models import load_model import csv -df_taskset = pd.read_csv ( 'TaskSet.csv' ) -# df_taskset = df_taskset.sample(frac=0.0001, random_state=99) -df_task = pd.read_csv ( 'Task.csv' ) -df_job = pd.read_csv ( 'Job.csv' ) - -ntn = df_task[['PKG']].values -ntn1 = [] -for n in ntn: - ntn1.append ( n[0] ) - -PKGs = {} -PKGs['pi'] = 0 -PKGs['hey'] = 1 -PKGs['tumatmul'] = 2 -PKGs['cond_mod'] = 3 - -Exit_Values = {} -Exit_Values['EXIT'] = 1 -Exit_Values['EXIT_CRITICAL'] = 0 - -Arg_Values = {} -Arg_Values[1] = 1 -Arg_Values[4096] = 2 -Arg_Values[8192] = 3 -Arg_Values[16384] = 4 -Arg_Values[32768] = 5 -Arg_Values[65536] = 6 -Arg_Values[131072] = 7 -Arg_Values[262144] = 8 -Arg_Values[524288] = 9 -Arg_Values[1048576] = 10 -Arg_Values[2097152] = 11 -Arg_Values[847288609443] = 12 -Arg_Values[2541865828329] = 13 -Arg_Values[7625597484987] = 14 -Arg_Values[22876792454961] = 15 -Arg_Values[68630377364883] = 16 -Arg_Values[205891132094649] = 17 - -i = 0 -features = [] -labels = [] -with tqdm(total=len(list(df_taskset.iterrows()))) as pbar: - for index, row in df_taskset.iterrows(): - +# PKG has a fixed set of labels. 
Integer encoding is used where integer # value is assigned to each label +PKGs = { + 'pi': 0, + 'hey': 1, + 'tumatmul': 2, + 'cond_mod': 3 +} + +# Integer encoding for Exit_Values from Jobs +Exit_Values = { + 'EXIT': 1, + 'EXIT_CRITICAL': 0, + 'EXIT_PERIOD': 2, + 'OUT_OF_CAPS': 3, + 'OUT_OF_QUOTA': 4, + 'EXIT_ERROR': 5 +} + +# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 +Arg_Values = { + 1: 1, + 4096: 2, + 8192: 3, + 16384: 4, + 32768: 5, + 65536: 6, + 131072: 7, + 262144: 8, + 524288: 9, + 1048576: 10, + 2097152: 11, + 847288609443: 12, + 2541865828329: 13, + 7625597484987: 14, + 22876792454961: 15, + 68630377364883: 16, + 205891132094649: 17 +} + +print("Doing writing") + +DB_PATH = "/home/bernhard/panda_v4.db" +TASKS_DICT = {} + + +def taskToFeatureList(task): + # returns a fature list for the corresponding task values + feature = [] + feature.append(task['Priority']) + feature.append(task['Period']) + feature.append(task['Number_of_Jobs']) + feature.append(task['PKG']) + feature.append(task['Arg']) + feature.append(task['CRITICALTIME']) + return feature + + +def getTaskFeatures(db_path): # c is the cursor for the db + # returns a dictionary + # { task_id : [ feature, list ] + conn = sqlite3.connect(db_path) + conn.row_factory = lambda C, R: {c[0]: R[i] for i, c in enumerate(C.description)} + db_cursor = conn.cursor() + db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task') + outputTable = db_cursor.fetchall() + + tasks_dict = {} + for row in outputTable: + row['Period'] = int(row['Period'] / 1000) + row['Number_of_Jobs'] = int(row['Number_of_Jobs']) + row['PKG'] = PKGs[row['PKG']] + row['CRITICALTIME'] = int(row['CRITICALTIME'] / 1000) + row['Arg'] = Arg_Values[row['Arg']] + tasks_dict[row['Task_ID']] = taskToFeatureList(row) + return tasks_dict + + +def processTaskset(tasksetData): + # tasksetData is a list of tuples returned from the DB in getTasksetData() + label = tasksetData[0][-1] + features = [] + jobExitsByTask = {} + for tsData in tasksetData: try: - - i += 1 - grid = int(df_taskset.loc[index, 'Set_ID']) - res = int(df_taskset.loc[index, 'Successful']) - print(grid) - first_task = int(df_taskset.loc[index, 'TASK1_ID']) - second_task = int(df_taskset.loc[index, 'TASK2_ID']) - third_task = int(df_taskset.loc[index, 'TASK3_ID']) - fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) - tasks = [] - - if first_task != -1: - - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'] / 1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append(int(task_info['CRITICALTIME'] / 1000)) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if second_task != -1: - first_task = second_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'] / 1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append(int(task_info['CRITICALTIME'] / 1000)) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - 
tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if third_task != -1: - first_task = third_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'] / 1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append( int ( task_info['CRITICALTIME'] / 1000)) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if fourth_task != -1: - first_task = fourth_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'])) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append(int(task_info['CRITICALTIME'])) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - labels = np.array(int(df_taskset.loc[index, 'Successful'])) - - tasks = np.array(tasks) - features.append(tasks) - labels.append(res) - except Exception as e: - print(e) - pass - pbar.update(1) - -labels = np.array(labels) -features = pad_sequences(features, maxlen=42, value=-1, padding='post', truncating='post') + jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) + except KeyError: + jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] + for taskIdNo in (1, 2, 3): + if tasksetData[0][taskIdNo] != -1: + features += TASKS_DICT[tasksetData[0][taskIdNo]] + try: + features += jobExitsByTask[tasksetData[0][taskIdNo]] + except KeyError: + features += [Exit_Values['EXIT_ERROR']] + return np.array(features), label + + +def getFeaturesLabels(db_path): + conn = sqlite3.connect(db_path) + db_cursor = conn.cursor() + command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful' \ + ' FROM TaskSet JOIN Job' \ + ' ON TaskSet.Set_ID = Job.Set_ID and' \ + ' (TaskSet.TASK1_ID == Job.Task_ID or' \ + ' TaskSet.TASK2_ID == Job.Task_ID or' \ + ' TaskSet.TASK3_ID == Job.Task_ID);' + db_cursor.execute(command) + # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] + data_table = db_cursor.fetchall() + print('reading taskset_jobs join done') + print('The current time is:', datetime.now()) + finalFeatureList = [] + finalLabelList = [] + currentTset = data_table[0][0] # first taskset id + tSetJobs = [] + totalSize = len(data_table) + for row in data_table: + if row[0] % 1000 == 0: + print('processed', int(100 * (row[0] / totalSize)), '%') + if row[0] == currentTset: + # then still same setTset + tSetJobs.append(row) + else: + # job of next taskset + # process data and record new + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + tSetJobs = [] + currentTset = row[0] + tSetJobs.append(row) + # proess last taskset + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + return finalFeatureList, finalLabelList + + +TASKS_DICT = getTaskFeatures(DB_PATH) +print('Tasks have been added to TASKS_DICT') +print('length of taskdict: ', 
len(TASKS_DICT)) +print('example task 222:', TASKS_DICT[222]) + +features, labels = getFeaturesLabels(DB_PATH) + +print('The current time is:', datetime.now()) + +print("Done reading") + + + + +labels = np.array(labels) # to save the labels list as numpy array + +# To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value +features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post') model = load_model('My_LSTM_Model.h5') X = np.expand_dims(features, axis=2) From f83b34dcdc9966484d7f6fa445d9a8840773eb53 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Sun, 8 Sep 2019 00:44:31 +0200 Subject: [PATCH 07/21] enhanced data_preparation --- data_preparation.py | 145 -------------------------------------------- 1 file changed, 145 deletions(-) diff --git a/data_preparation.py b/data_preparation.py index 772aa28..42e4949 100644 --- a/data_preparation.py +++ b/data_preparation.py @@ -7,18 +7,6 @@ from keras.preprocessing.sequence import pad_sequences import sqlite3 -# After exporting the relational database to separate tables with .csv extension, the transformation can begin -# The first step is to read the cvs files as Dataframes -#df_taskset = pd.read_csv('TaskSet.csv') # import task-sets -# print(df_taskset.head()) if you want to see how the data look like - -#df_task = pd.read_csv('Task.csv') # import tasks -# print(df_task.head()) - -#df_job = pd.read_csv('Job.csv') # import jobs -# print(df_job.head()) - - # PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label PKGs = { 'pi' : 0, @@ -170,139 +158,6 @@ def getFeaturesLabels(db_path): print("Done reading") - -''' - - -# 2. data transformation - -# here starts data transformation -#ntn = df_task[['PKG']].values # get values from PKG in tasks. This step is equivalent to: Select distinct PKG from Task -#ntn1 = [] -#for n in ntn: -# ntn1.append(n[0]) -#print(np.unique(ntn1)) # print the unique values - - - - - - - - - -# 3. 
Features and Labels extraction -i = 0 - - - - -sys.exit() -features = [] # create an empty list for features -labels = [] # create an empty list for labels -new_task_list = [] -# loop in the task-set -with tqdm ( total=len( list(df_taskset.iterrows()))) as pbar: # the total length would be total=len(list(df_taskset.iterrows())) - - for task_set in taskset_table: - - if task_set['TASK1_ID']!= -1: - - new_task_list.append() - - - blub - - - for index, row in df_taskset.iterrows (): - - try: - - i += 1 - grid = int(df_taskset.loc[index, 'Set_ID']) # task_set ID - first_task = int(df_taskset.loc[index, 'TASK1_ID']) # first task_id - second_task = int(df_taskset.loc[index, 'TASK2_ID']) # second task_id - third_task = int(df_taskset.loc[index, 'TASK3_ID']) # third task_id - fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) # fourth task_id - tasks = [] # empty list of tasks where features are saved later - - if first_task != -1: # if the first task exists in this task-set then : - - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) # save the priority - tasks.append(int(task_info['Period']/1000)) # save the period in seconds - tasks.append(int(task_info['Number_of_Jobs'])) # save number of jobs - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) #save the numerical value of PKG - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) #save the scaled value of Arg - tasks.append(int(task_info['CRITICALTIME']/1000)) # save criticaltime in seconds - # for each job in that is in the task and has this task_set id - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) # save the transformed exit value - - if second_task != -1: # if the second task exists in this task-set then : - first_task = second_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - print(tasks) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if third_task != -1: # if the third task exists in this task-set then : - first_task = third_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - if fourth_task != -1: # if the fourth task exists in this task-set then : - first_task = fourth_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = 
int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - tasks = np.array(tasks) # to save the task list as numpy array - features.append(tasks) # values in tasks are features - labels.append(int(df_taskset.loc[index, 'Successful'])) # in the label list, append the value in the successful col from task-set - except Exception as e: # exception handler - print(e) - pass - pbar.update(1) - -''' - labels = np.array(labels) # to save the labels list as numpy array # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value From e1a22e3061bd8c80b777411c4b87bc2fa7c43738 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Sun, 8 Sep 2019 00:50:12 +0200 Subject: [PATCH 08/21] moved data_preparation and updated prediction, could be improved further --- data_preparation.py | 176 ------------------ python3-lstm/Data_preparation.py | 304 +++++++++++++++---------------- python3-lstm/prediction.py | 8 +- 3 files changed, 152 insertions(+), 336 deletions(-) delete mode 100644 data_preparation.py diff --git a/data_preparation.py b/data_preparation.py deleted file mode 100644 index 42e4949..0000000 --- a/data_preparation.py +++ /dev/null @@ -1,176 +0,0 @@ -from datetime import datetime -import pickle -import sys -import numpy as np -import pandas as pd -from tqdm import tqdm -from keras.preprocessing.sequence import pad_sequences -import sqlite3 - -# PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label -PKGs = { - 'pi' : 0, - 'hey' : 1, - 'tumatmul' : 2, - 'cond_mod' : 3 - } - -# Integer encoding for Exit_Values from Jobs -Exit_Values = { - 'EXIT' : 1, - 'EXIT_CRITICAL' : 0, - 'EXIT_PERIOD' : 2, - 'OUT_OF_CAPS' : 3, - 'OUT_OF_QUOTA' : 4, - 'EXIT_ERROR' : 5 - } - -# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 -Arg_Values = { - 1 : 1, - 4096 : 2, - 8192 : 3, - 16384 : 4, - 32768 : 5, - 65536 : 6, - 131072 : 7, - 262144 : 8, - 524288 : 9, - 1048576 : 10, - 2097152 : 11, - 847288609443 : 12, - 2541865828329 : 13, - 7625597484987 : 14, - 22876792454961 : 15, - 68630377364883 : 16, - 205891132094649 : 17 - } - - -print("Doing writing") - -DB_PATH = "/home/bernhard/panda_v4.db" -TASKS_DICT = {} - -def taskToFeatureList(task): - #returns a fature list for the corresponding task values - feature = [] - feature.append(task['Priority']) - feature.append(task['Period']) - feature.append(task['Number_of_Jobs']) - feature.append(task['PKG']) - feature.append(task['Arg']) - feature.append(task['CRITICALTIME']) - return feature - - -def getTaskFeatures(db_path): #c is the cursor for the db - # returns a dictionary - # { task_id : [ feature, list ] - conn = sqlite3.connect(db_path) - conn.row_factory = lambda C, R: { c[0]: R[i] for i, c in enumerate(C.description) } - db_cursor = conn.cursor() - db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task') - outputTable = db_cursor.fetchall() - - tasks_dict = {} - for row in outputTable: - row['Period'] = int(row['Period']/1000) - row['Number_of_Jobs'] = int(row['Number_of_Jobs']) - row['PKG'] = PKGs[row['PKG']] - row['CRITICALTIME'] = 
int(row['CRITICALTIME']/1000) - row['Arg'] = Arg_Values[row['Arg']] - tasks_dict[row['Task_ID']] = taskToFeatureList(row) - return tasks_dict - - -def processTaskset(tasksetData): - # tasksetData is a list of tuples returned from the DB in getTasksetData() - label = tasksetData[0][-1] - features = [] - jobExitsByTask = {} - for tsData in tasksetData: - try: - jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) - except KeyError: - jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] - for taskIdNo in (1,2,3): - if tasksetData[0][taskIdNo] != -1: - features += TASKS_DICT[tasksetData[0][taskIdNo]] - try: - features += jobExitsByTask[tasksetData[0][taskIdNo]] - except KeyError: - features += [Exit_Values['EXIT_ERROR']] - return np.array(features), label - - -def getFeaturesLabels(db_path): - conn = sqlite3.connect(db_path) - db_cursor = conn.cursor() - command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful'\ - ' FROM TaskSet JOIN Job'\ - ' ON TaskSet.Set_ID = Job.Set_ID and'\ - ' (TaskSet.TASK1_ID == Job.Task_ID or'\ - ' TaskSet.TASK2_ID == Job.Task_ID or'\ - ' TaskSet.TASK3_ID == Job.Task_ID);' - db_cursor.execute(command) - # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] - data_table = db_cursor.fetchall() - print('reading taskset_jobs join done') - print('The current time is:',datetime.now()) - finalFeatureList = [] - finalLabelList = [] - currentTset = data_table[0][0] # first taskset id - tSetJobs = [] - totalSize = len(data_table) - for row in data_table: - if row[0] % 1000 == 0: - print('processed',int(100 * (row[0]/totalSize)),'%' ) - if row[0] == currentTset: - #then still same setTset - tSetJobs.append(row) - else: - # job of next taskset - # process data and record new - features, label = processTaskset(tSetJobs) - finalFeatureList.append(features) - finalLabelList.append(label) - tSetJobs = [] - currentTset = row[0] - tSetJobs.append(row) - # proess last taskset - features, label = processTaskset(tSetJobs) - finalFeatureList.append(features) - finalLabelList.append(label) - return finalFeatureList, finalLabelList - - - -TASKS_DICT = getTaskFeatures(DB_PATH) -print('Tasks have been added to TASKS_DICT') -print('length of taskdict: ', len(TASKS_DICT)) -print('example task 222:',TASKS_DICT[222]) - -features, labels = getFeaturesLabels(DB_PATH) - - -print('The current time is:',datetime.now()) - -print("Done reading") - -labels = np.array(labels) # to save the labels list as numpy array - -# To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. 
if longer than 56 trim the value -features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post') - -#print(features.shape) # the dimensionality of features -#print(labels.shape) # the dimensionality of labels - -# save both files for the training -with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary' - pickle.dump(features, outfile) - -with open ( '56_labels', 'wb' ) as outfile: - pickle.dump(labels, outfile) - -print('The current time is:',datetime.now()) diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py index 0d847a1..d215f89 100644 --- a/python3-lstm/Data_preparation.py +++ b/python3-lstm/Data_preparation.py @@ -1,164 +1,160 @@ +from datetime import datetime import pickle import numpy as np -import pandas as pd -from tqdm import tqdm from keras.preprocessing.sequence import pad_sequences - -# After exporting the relational database to separate tables with .csv extension, the transformation can begin -# The first step is to read the cvs files as Dataframes -df_taskset = pd.read_csv('TaskSet.csv') # import task-sets -# print(df_taskset.head()) if you want to see how the data look like - -df_task = pd.read_csv('Task.csv') # import tasks -# print(df_task.head()) - -df_job = pd.read_csv('Job.csv') # import jobs -# print(df_job.head()) - - -# 2. data transformation - -# here starts data transformation -ntn = df_task[['PKG']].values # get values from PKG in tasks. This step is equivalent to: Select distinct PKG from Task -ntn1 = [] -for n in ntn: - ntn1.append(n[0]) -print(np.unique(ntn1)) # print the unique values - - +import sqlite3 # PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label -PKGs = {} -PKGs['pi'] = 0 -PKGs['hey'] = 1 -PKGs['tumatmul'] = 2 -PKGs['cond_mod'] = 3 - -# INteger encoding for Exit_Values from Jobs -Exit_Values = {} -Exit_Values['EXIT'] = 1 -Exit_Values['EXIT_CRITICAL'] = 0 - +PKGs = { + 'pi' : 0, + 'hey' : 1, + 'tumatmul' : 2, + 'cond_mod' : 3 + } + +# Integer encoding for Exit_Values from Jobs +Exit_Values = { + 'EXIT' : 1, + 'EXIT_CRITICAL' : 0, + 'EXIT_PERIOD' : 2, + 'OUT_OF_CAPS' : 3, + 'OUT_OF_QUOTA' : 4, + 'EXIT_ERROR' : 5 + } # ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 -Arg_Values = {} -Arg_Values[1] = 1 -Arg_Values[4096] = 2 -Arg_Values[8192] = 3 -Arg_Values[16384] = 4 -Arg_Values[32768] = 5 -Arg_Values[65536] = 6 -Arg_Values[131072] = 7 -Arg_Values[262144] = 8 -Arg_Values[524288] = 9 -Arg_Values[1048576] = 10 -Arg_Values[2097152] = 11 -Arg_Values[847288609443] = 12 -Arg_Values[2541865828329] = 13 -Arg_Values[7625597484987] = 14 -Arg_Values[22876792454961] = 15 -Arg_Values[68630377364883] = 16 -Arg_Values[205891132094649] = 17 - - - -# 3. 
Features and Labels extraction -i = 0 - -features = [] # create an empty list for features -labels = [] # create an empty list for labels -# loop in the task-set -with tqdm ( total=len ( - list(df_taskset.iterrows()))) as pbar: # the total length would be total=len(list(df_taskset.iterrows())) - for index, row in df_taskset.iterrows (): - +Arg_Values = { + 1 : 1, + 4096 : 2, + 8192 : 3, + 16384 : 4, + 32768 : 5, + 65536 : 6, + 131072 : 7, + 262144 : 8, + 524288 : 9, + 1048576 : 10, + 2097152 : 11, + 847288609443 : 12, + 2541865828329 : 13, + 7625597484987 : 14, + 22876792454961 : 15, + 68630377364883 : 16, + 205891132094649 : 17 + } + + +print("Doing writing") + +DB_PATH = "/home/bernhard/panda_v4.db" +TASKS_DICT = {} + +def taskToFeatureList(task): + #returns a fature list for the corresponding task values + feature = [] + feature.append(task['Priority']) + feature.append(task['Period']) + feature.append(task['Number_of_Jobs']) + feature.append(task['PKG']) + feature.append(task['Arg']) + feature.append(task['CRITICALTIME']) + return feature + + +def getTaskFeatures(db_path): #c is the cursor for the db + # returns a dictionary + # { task_id : [ feature, list ] + conn = sqlite3.connect(db_path) + conn.row_factory = lambda C, R: { c[0]: R[i] for i, c in enumerate(C.description) } + db_cursor = conn.cursor() + db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task') + outputTable = db_cursor.fetchall() + + tasks_dict = {} + for row in outputTable: + row['Period'] = int(row['Period']/1000) + row['Number_of_Jobs'] = int(row['Number_of_Jobs']) + row['PKG'] = PKGs[row['PKG']] + row['CRITICALTIME'] = int(row['CRITICALTIME']/1000) + row['Arg'] = Arg_Values[row['Arg']] + tasks_dict[row['Task_ID']] = taskToFeatureList(row) + return tasks_dict + + +def processTaskset(tasksetData): + # tasksetData is a list of tuples returned from the DB in getTasksetData() + label = tasksetData[0][-1] + features = [] + jobExitsByTask = {} + for tsData in tasksetData: try: - - i += 1 - grid = int(df_taskset.loc[index, 'Set_ID']) # task_set ID - first_task = int(df_taskset.loc[index, 'TASK1_ID']) # first task_id - second_task = int(df_taskset.loc[index, 'TASK2_ID']) # second task_id - third_task = int(df_taskset.loc[index, 'TASK3_ID']) # third task_id - fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) # fourth task_id - tasks = [] # empty list of tasks where features are saved later - - if first_task != -1: # if the first task exists in this task-set then : - - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) # save the priority - tasks.append(int(task_info['Period']/1000)) # save the period in seconds - tasks.append(int(task_info['Number_of_Jobs'])) # save number of jobs - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) #save the numerical value of PKG - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) #save the scaled value of Arg - tasks.append(int(task_info['CRITICALTIME']/1000)) # save criticaltime in seconds - # for each job in that is in the task and has this task_set id - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) # save the transformed exit value - - if second_task != -1: # if the second task exists in this task-set then : - first_task = second_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - 
tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - print(tasks) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if third_task != -1: # if the third task exists in this task-set then : - first_task = third_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - if fourth_task != -1: # if the fourth task exists in this task-set then : - first_task = fourth_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - tasks = np.array(tasks) # to save the task list as numpy array - features.append(tasks) # values in tasks are features - labels.append(int(df_taskset.loc[index, 'Successful'])) # in the label list, append the value in the successful col from task-set - except Exception as e: # exception handler - print(e) - pass - pbar.update(1) - - - + jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) + except KeyError: + jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] + for taskIdNo in (1,2,3): + if tasksetData[0][taskIdNo] != -1: + features += TASKS_DICT[tasksetData[0][taskIdNo]] + try: + features += jobExitsByTask[tasksetData[0][taskIdNo]] + except KeyError: + features += [Exit_Values['EXIT_ERROR']] + return np.array(features), label + + +def getFeaturesLabels(db_path): + conn = sqlite3.connect(db_path) + db_cursor = conn.cursor() + command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful'\ + ' FROM TaskSet JOIN Job'\ + ' ON TaskSet.Set_ID = Job.Set_ID and'\ + ' (TaskSet.TASK1_ID == Job.Task_ID or'\ + ' TaskSet.TASK2_ID == Job.Task_ID or'\ + ' TaskSet.TASK3_ID == Job.Task_ID);' + db_cursor.execute(command) + # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] + data_table = db_cursor.fetchall() + print('reading taskset_jobs join done') + print('The current time is:',datetime.now()) + finalFeatureList = [] + finalLabelList = [] + currentTset = data_table[0][0] # first taskset id + tSetJobs = [] + totalSize = len(data_table) + for row in data_table: + if row[0] % 1000 == 0: + print('processed',int(100 * 
(row[0]/totalSize)),'%' ) + if row[0] == currentTset: + #then still same setTset + tSetJobs.append(row) + else: + # job of next taskset + # process data and record new + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + tSetJobs = [] + currentTset = row[0] + tSetJobs.append(row) + # proess last taskset + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + return finalFeatureList, finalLabelList + + + +TASKS_DICT = getTaskFeatures(DB_PATH) +print('Tasks have been added to TASKS_DICT') +print('length of taskdict: ', len(TASKS_DICT)) +print('example task 222:',TASKS_DICT[222]) + +features, labels = getFeaturesLabels(DB_PATH) + + +print('The current time is:',datetime.now()) + +print("Done reading") + labels = np.array(labels) # to save the labels list as numpy array # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value @@ -173,3 +169,5 @@ with open ( '56_labels', 'wb' ) as outfile: pickle.dump(labels, outfile) + +print('The current time is:',datetime.now())
diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py index b9fa43c..c19c3fd 100644 --- a/python3-lstm/prediction.py +++ b/python3-lstm/prediction.py @@ -1,7 +1,4 @@ -import pickle import numpy as np -import pandas as pd -from tqdm import tqdm from keras.preprocessing.sequence import pad_sequences from keras.models import load_model import csv @@ -155,10 +152,7 @@ def getFeaturesLabels(db_path): print("Done reading") - - - -labels = np.array(labels) # to save the labels list as numpy array +labels = np.array(labels) # to save the labels list as numpy array # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post')
From 4f0b26a35deb445e4f5ed5fda0bcdba0d7061c54 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Sun, 8 Sep 2019 12:46:17 +0200 Subject: [PATCH 09/21] updated Data_preparation to accept a different DB --- python3-lstm/Data_preparation.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py index d215f89..4312e77 100644 --- a/python3-lstm/Data_preparation.py +++ b/python3-lstm/Data_preparation.py @@ -4,6 +4,10 @@ from keras.preprocessing.sequence import pad_sequences import sqlite3 +import sys +debug = False + + # PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label PKGs = { 'pi' : 0, @@ -44,9 +48,10 @@ } -print("Doing writing") +if debug: + print("Doing writing") -DB_PATH = "/home/bernhard/panda_v4.db" +DB_PATH = sys.argv[1] TASKS_DICT = {} def taskToFeatureList(task): @@ -135,7 +140,7 @@ def getFeaturesLabels(db_path): tSetJobs = [] currentTset = row[0] tSetJobs.append(row) - # proess last taskset +# process last taskset features, label = processTaskset(tSetJobs) finalFeatureList.append(features) finalLabelList.append(label) @@ -160,8 +165,9 @@ def getFeaturesLabels(db_path): # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. 
if longer than 56 trim the value
 features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post')
 
-#print(features.shape) # the dimensionality of features
-#print(labels.shape) # the dimensionality of labels
+if debug:
+    print(features.shape) # the dimensionality of features
+    print(labels.shape) # the dimensionality of labels
 
 # save both files for the training
 with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'

From 626490ec0e9e11d757ec1280756c49f0c75d4a Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Sun, 8 Sep 2019 12:47:55 +0200
Subject: [PATCH 10/21] added gitignore file

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9f11b75
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea/

From 78c0fd45f8ed537170bb757346dc9d14b001b167 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Sun, 8 Sep 2019 22:14:45 +0200
Subject: [PATCH 11/21] Updated environment setup and prediction

---
 bootstrap.sh                  |   3 +
 provision.sh                  |   3 +
 python3-lstm/prediction.py    | 160 ++--------------------------------
 python3-lstm/requirements.txt |   8 ++
 4 files changed, 21 insertions(+), 153 deletions(-)
 create mode 100644 python3-lstm/requirements.txt

diff --git a/bootstrap.sh b/bootstrap.sh
index 20131cf..f5e64fd 100644
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -10,3 +10,6 @@
 # Contributor: Bernhard Blieninger
 ##############################
 
+python3 -m venv lstm-virtenv
+source lstm-virtenv/bin/activate
+pip3 install -r python3-lstm/requirements.txt

diff --git a/provision.sh b/provision.sh
index 0a68924..5b4f244 100755
--- a/provision.sh
+++ b/provision.sh
@@ -10,5 +10,8 @@
 ######################
 
 sudo apt update -qq
+
 sudo apt install python3.5 python3-pip tmux -qq
+sudo apt install python3-venv
+#pip3 install --user virtualenv

diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py
index c19c3fd..dda7bbb 100644
--- a/python3-lstm/prediction.py
+++ b/python3-lstm/prediction.py
@@ -1,161 +1,15 @@
 import numpy as np
-from keras.preprocessing.sequence import pad_sequences
+import pickle
 from keras.models import load_model
 import csv
 
-# PKG has a fixed set of labels. Integer encoding is used where integer
-# value is assigned to each label
-PKGs = {
-    'pi': 0,
-    'hey': 1,
-    'tumatmul': 2,
-    'cond_mod': 3
-}
+with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'
+    #pickle.load(outfile features, outfile)
+    features = pickle.load(outfile, *, fix_imports=True, encoding="ASCII", errors="strict")
 
-# Integer encoding for Exit_Values from Jobs
-Exit_Values = {
-    'EXIT': 1,
-    'EXIT_CRITICAL': 0,
-    'EXIT_PERIOD': 2,
-    'OUT_OF_CAPS': 3,
-    'OUT_OF_QUOTA': 4,
-    'EXIT_ERROR': 5
-}
-
-# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled
-# to range from 1 to 17
-Arg_Values = {
-    1: 1,
-    4096: 2,
-    8192: 3,
-    16384: 4,
-    32768: 5,
-    65536: 6,
-    131072: 7,
-    262144: 8,
-    524288: 9,
-    1048576: 10,
-    2097152: 11,
-    847288609443: 12,
-    2541865828329: 13,
-    7625597484987: 14,
-    22876792454961: 15,
-    68630377364883: 16,
-    205891132094649: 17
-}
-
-print("Doing writing")
-
-DB_PATH = "/home/bernhard/panda_v4.db"
-TASKS_DICT = {}
-
-
-def taskToFeatureList(task):
-    # returns a fature list for the corresponding task values
-    feature = []
-    feature.append(task['Priority'])
-    feature.append(task['Period'])
-    feature.append(task['Number_of_Jobs'])
-    feature.append(task['PKG'])
-    feature.append(task['Arg'])
-    feature.append(task['CRITICALTIME'])
-    return feature
-
-
-def getTaskFeatures(db_path): # c is the cursor for the db
-    # returns a dictionary
-    # { task_id : [ feature, list ]
-    conn = sqlite3.connect(db_path)
-    conn.row_factory = lambda C, R: {c[0]: R[i] for i, c in enumerate(C.description)}
-    db_cursor = conn.cursor()
-    db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task')
-    outputTable = db_cursor.fetchall()
-
-    tasks_dict = {}
-    for row in outputTable:
-        row['Period'] = int(row['Period'] / 1000)
-        row['Number_of_Jobs'] = int(row['Number_of_Jobs'])
-        row['PKG'] = PKGs[row['PKG']]
-        row['CRITICALTIME'] = int(row['CRITICALTIME'] / 1000)
-        row['Arg'] = Arg_Values[row['Arg']]
-        tasks_dict[row['Task_ID']] = taskToFeatureList(row)
-    return tasks_dict
-
-
-def processTaskset(tasksetData):
-    # tasksetData is a list of tuples returned from the DB in getTasksetData()
-    label = tasksetData[0][-1]
-    features = []
-    jobExitsByTask = {}
-    for tsData in tasksetData:
-        try:
-            jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]])
-        except KeyError:
-            jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]]
-    for taskIdNo in (1, 2, 3):
-        if tasksetData[0][taskIdNo] != -1:
-            features += TASKS_DICT[tasksetData[0][taskIdNo]]
-            try:
-                features += jobExitsByTask[tasksetData[0][taskIdNo]]
-            except KeyError:
-                features += [Exit_Values['EXIT_ERROR']]
-    return np.array(features), label
-
-
-def getFeaturesLabels(db_path):
-    conn = sqlite3.connect(db_path)
-    db_cursor = conn.cursor()
-    command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful' \
-              ' FROM TaskSet JOIN Job' \
-              ' ON TaskSet.Set_ID = Job.Set_ID and' \
-              ' (TaskSet.TASK1_ID == Job.Task_ID or' \
-              ' TaskSet.TASK2_ID == Job.Task_ID or' \
-              ' TaskSet.TASK3_ID == Job.Task_ID);'
-    db_cursor.execute(command)
-    # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)]
-    data_table = db_cursor.fetchall()
-    print('reading taskset_jobs join done')
-    print('The current time is:', datetime.now())
-    finalFeatureList = []
-    finalLabelList = []
-    currentTset = data_table[0][0] # first taskset id
-    tSetJobs = []
-    totalSize = len(data_table)
-    for row in data_table:
-        if row[0] % 1000 == 0:
-            print('processed', int(100 * (row[0] / totalSize)), '%')
-        if row[0] == currentTset:
-            # then still same setTset
-            tSetJobs.append(row)
-        else:
-            # job of next taskset
-            # process data and record new
-            features, label = processTaskset(tSetJobs)
-            finalFeatureList.append(features)
-            finalLabelList.append(label)
-            tSetJobs = []
-            currentTset = row[0]
-            tSetJobs.append(row)
-    # proess last taskset
-    features, label = processTaskset(tSetJobs)
-    finalFeatureList.append(features)
-    finalLabelList.append(label)
-    return finalFeatureList, finalLabelList
-
-
-TASKS_DICT = getTaskFeatures(DB_PATH)
-print('Tasks have been added to TASKS_DICT')
-print('length of taskdict: ', len(TASKS_DICT))
-print('example task 222:', TASKS_DICT[222])
-
-features, labels = getFeaturesLabels(DB_PATH)
-
-print('The current time is:', datetime.now())
-
-print("Done reading")
-
-labels = np.array(labels) # to save the labels list as numpy array
-
-# To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value
-features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post')
+with open ( '56_labels', 'wb' ) as outfile:
+    labels = pickle.load(outfile, *, fix_imports=True, encoding="ASCII", errors="strict")
+    #pickle.dump(labels, outfile)
 
 model = load_model('My_LSTM_Model.h5')
 X = np.expand_dims(features, axis=2)

diff --git a/python3-lstm/requirements.txt b/python3-lstm/requirements.txt
new file mode 100644
index 0000000..99bb79b
--- /dev/null
+++ b/python3-lstm/requirements.txt
@@ -0,0 +1,8 @@
+keras==2.2.5
+matplotlib==3.1.1
+numpy==1.17.2
+pandas==0.25.1
+seaborn==0.9.0
+scikit_learn==0.21.3
+tensorboard==1.14.0
+tensorflow==1.14.0

From 32da32ceb992a5167c707a2d1b9a6f4831cecb97 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Sun, 8 Sep 2019 22:18:45 +0200
Subject: [PATCH 12/21] corrected typo

---
 python3-lstm/Evaluation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python3-lstm/Evaluation.py b/python3-lstm/Evaluation.py
index ffd9811..2b5e489 100644
--- a/python3-lstm/Evaluation.py
+++ b/python3-lstm/Evaluation.py
@@ -58,8 +58,8 @@
 plt.figure ( figsize=(5.5, 4) )
 sns.heatmap ( cm_df, annot=True, fmt='g' )
-plt.title ( 'Confusoin Matrix \n Accuracy:{0:.3f}'.format ( accuracy_score ( yt, yp ) ) )
+plt.title ( 'Confusion Matrix \n Accuracy:{0:.3f}'.format ( accuracy_score ( yt, yp ) ) )
 plt.ylabel ( 'True label' )
 plt.xlabel ( 'Predicted label' )
 plt.show ()
-plt.savefig ( 'Confusoin_Matrix.png' )
+plt.savefig ( 'Confusion_Matrix.png' )

From 35664b09923ad3bcda4e434ee080606712ccda0b Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Sun, 8 Sep 2019 22:40:19 +0200
Subject: [PATCH 13/21] fixed missing spaces in string

---
 python3-lstm/prediction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py
index dda7bbb..46d1db7 100644
--- a/python3-lstm/prediction.py
+++ b/python3-lstm/prediction.py
@@ -19,7 +19,7 @@
 for i in range(len(labels)):
     l = labels[i]
     p = np.argmax(preds[i])
-    print ( "the actual value is{0}and the predicted value is {1}".format(l, p))
+    print ( "the actual value is {0} and the predicted value is {1}".format(l, p))
     arr.append([i + 1, l, p])
 
 csvfile = "Predicion_results.csv"

From f7b1d458ce0810bceedf4c038a5db33ac91563cc Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 00:35:51 +0200
Subject: [PATCH 14/21] Added a fixed random seed to the dataset split

---
 python3-lstm/CuDNNLSTM.py  | 2 +-
 python3-lstm/Evaluation.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index 5fd9629..569ba2a 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -39,7 +39,7 @@
 # print ( count )
 
 # divide data into training and test sets
-X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3 )
+X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3, random_state=42 )
 # print ( X_train.shape )
 
 # LSTM input is fifty-six time-steps and one feature at each time-step is represented by the notation: (56,1).

diff --git a/python3-lstm/Evaluation.py b/python3-lstm/Evaluation.py
index 2b5e489..cc2c75c 100644
--- a/python3-lstm/Evaluation.py
+++ b/python3-lstm/Evaluation.py
@@ -30,7 +30,7 @@
 y = np.array ( newy )
 print ( count )
 
-X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3 )
+X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3, random_state=42 )
 print ( X_train.shape )
 
 model = load_model ( 'My_LSTM_Model.h5' ) # loading saved model

From 284e8cc1674ad545477d6782793c03e9b461ae81 Mon Sep 17 00:00:00 2001
From: Bernhard Blieninger <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 10:20:15 +0200
Subject: [PATCH 15/21] fixed minor typos

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 70d2e70..c488bc9 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ The first step is to preprocess the data. The database was imported and transfor
 5. Task Critical time: Integer
 6. Number of Jobs: Integer
 From Jobs only one feature was selected: Job Exit_Value: String.
-After exporting all tables, start with Data_preparation.py. Line 165 is responsible for the length of the feature vector.
+After exporting all tables, start with Data_preparation.py.
 Features and labels are saved at the end.
@@ -35,7 +35,7 @@ CuDNNLSTM.py. When using CPU, install Tensorflow and replace CuDNNLSTM with LSTM
 Evaluation.py. Evaluation prints the confusion matrix and classification report. Tensorboard can be launched by typing tensorboard --logdir=logs/ into the terminal and logs from trained models can be visualized
 **4. Prediction:**
-predictin.py. A CSV file will be save with actual and predictied values. The trained model should be loaded first.
+prediction.py. A CSV file will be saved with actual and predicted values. The trained model should be loaded first.
 **5. Plotting:**
 Plotting.py. Another way to visualize the model built.

From 9b224b6fba2df3483837800f0323170f30baec88 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 11:25:35 +0200
Subject: [PATCH 16/21] edited CuDNNLSTM and Data_preparation

---
 python3-lstm/CuDNNLSTM.py        | 3 +--
 python3-lstm/Data_preparation.py | 3 +++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index 569ba2a..2ceaab5 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -46,8 +46,7 @@
 input = Input ( shape=(56, 1) )
 
 # the first LSTM layer has 64 cells, the number must be equal/bigger than the input size. If you are using a CPU then change CuDNNLSTM to LSTM
-lstm = CuDNNLSTM ( 64, return_sequences=True ) (
-    input )  # Return_sequences is set true because the first LSTM has to return a sequence, which then can be fed into the 2nd LSTM
+lstm = CuDNNLSTM ( 64, return_sequences=True ) ( input ) # Return_sequences is set true because the first LSTM has to return a sequence, which then can be fed into the 2nd LSTM
 lstm = CuDNNLSTM ( 128, return_sequences=True ) ( lstm )
 lstm = CuDNNLSTM ( 256 ) ( lstm )

diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py
index 4312e77..7e1bb70 100644
--- a/python3-lstm/Data_preparation.py
+++ b/python3-lstm/Data_preparation.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 import pickle
+import sys
 import numpy as np
 from keras.preprocessing.sequence import pad_sequences
 import sqlite3
@@ -166,7 +167,9 @@ def getFeaturesLabels(db_path):
 
 if debug:
     print(features.shape) # the dimensionality of features
+    input()
     print(labels.shape) # the dimensionality of labels
+    input()
 
 # save both files for the training
 with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'

From 57201f58626a0436e120fdce6b3541bc0257615e Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 11:30:33 +0200
Subject: [PATCH 17/21] added a runtime timer to Data_preparation

---
 python3-lstm/Data_preparation.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py
index 7e1bb70..a5fd194 100644
--- a/python3-lstm/Data_preparation.py
+++ b/python3-lstm/Data_preparation.py
@@ -4,6 +4,9 @@
 import numpy as np
 from keras.preprocessing.sequence import pad_sequences
 import sqlite3
+import timeit
+
+start = timeit.default_timer()
 
 debug = False
@@ -178,4 +181,7 @@ def getFeaturesLabels(db_path):
 with open ( '56_labels', 'wb' ) as outfile:
     pickle.dump(labels, outfile)
 
-print('The current time is:',datetime.now())
+
+stop = timeit.default_timer()
+
+print('Time elapsed: ', stop - start)

From e3b8733d8c4c26cea03199f2e254a4c278f286fb Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 13:07:03 +0200
Subject: [PATCH 18/21] deleted timer again; timing will be done with the Linux time command

---
 python3-lstm/Data_preparation.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py
index a5fd194..4954465 100644
--- a/python3-lstm/Data_preparation.py
+++ b/python3-lstm/Data_preparation.py
@@ -4,9 +4,6 @@
 import numpy as np
 from keras.preprocessing.sequence import pad_sequences
 import sqlite3
-import timeit
-
-start = timeit.default_timer()
 
 debug = False
@@ -181,7 +178,3 @@ def getFeaturesLabels(db_path):
 with open ( '56_labels', 'wb' ) as outfile:
     pickle.dump(labels, outfile)
 
-
-stop = timeit.default_timer()
-
-print('Time elapsed: ', stop - start)

From 106c51783f32a789d7de5eccd0315b8600c4c601 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 13:50:17 +0200
Subject: [PATCH 19/21] adapted some values to fit a smaller tensor range

---
 python3-lstm/CuDNNLSTM.py        |  5 +++++
 python3-lstm/Data_preparation.py | 28 ++++++++++++----------------
 python3-lstm/prediction.py       | 10 ++++------
 3 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index 2ceaab5..ef79ca5 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -10,6 +10,11 @@
 from keras.optimizers import Adam
 from sklearn.model_selection import train_test_split
 
+
+#ignore deprecation warnings to get a better and cleaner output
+import tensorflow.python.util.deprecation as deprecation
+deprecation._PRINT_DEPRECATION_WARNINGS = False
+
 name = "logname-{}".format ( int ( time.time () ) )
 
 # both metrics and early stopping conditions are defined here and then saved in the log42 file

diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py
index 4954465..0f8bbad 100644
--- a/python3-lstm/Data_preparation.py
+++ b/python3-lstm/Data_preparation.py
@@ -1,4 +1,6 @@
-from datetime import datetime
+
+import warnings
+warnings.filterwarnings('ignore',category=FutureWarning)
 import pickle
 import sys
 import numpy as np
@@ -118,16 +120,13 @@ def getFeaturesLabels(db_path):
     db_cursor.execute(command)
     # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)]
     data_table = db_cursor.fetchall()
-    print('reading taskset_jobs join done')
-    print('The current time is:',datetime.now())
+
     finalFeatureList = []
     finalLabelList = []
     currentTset = data_table[0][0] # first taskset id
     tSetJobs = []
     totalSize = len(data_table)
     for row in data_table:
-        if row[0] % 1000 == 0:
-            print('processed',int(100 * (row[0]/totalSize)),'%' )
         if row[0] == currentTset:
             #then still same setTset
             tSetJobs.append(row)
@@ -149,21 +148,18 @@ def getFeaturesLabels(db_path):
 
 
 TASKS_DICT = getTaskFeatures(DB_PATH)
-print('Tasks have been added to TASKS_DICT')
-print('length of taskdict: ', len(TASKS_DICT))
-print('example task 222:',TASKS_DICT[222])
-
-features, labels = getFeaturesLabels(DB_PATH)
+if debug:
+    print('Tasks have been added to TASKS_DICT')
+    print('length of taskdict: ', len(TASKS_DICT))
+    print('example task 222:',TASKS_DICT[222])
 
-print('The current time is:',datetime.now())
+features, labels = getFeaturesLabels(DB_PATH)
 
-print("Done reading")
-
 labels = np.array(labels) # to save the labels list as numpy array
 
 # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value
-features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post')
+features = pad_sequences(features, maxlen=42, value=-1, padding='post', truncating='post')
 
 if debug:
     print(features.shape) # the dimensionality of features
@@ -172,9 +168,9 @@ def getFeaturesLabels(db_path):
     input()
 
 # save both files for the training
-with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'
+with open ( '42_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'
     pickle.dump(features, outfile)
-with open ( '56_labels', 'wb' ) as outfile:
+with open ( '42_labels', 'wb' ) as outfile:
     pickle.dump(labels, outfile)

diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py
index 46d1db7..8dc2274 100644
--- a/python3-lstm/prediction.py
+++ b/python3-lstm/prediction.py
@@ -3,13 +3,11 @@
 from keras.models import load_model
 import csv
 
-with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'
-    #pickle.load(outfile features, outfile)
-    features = pickle.load(outfile, *, fix_imports=True, encoding="ASCII", errors="strict")
+with open ( '42_features', 'rb' ) as outfile: # 'rb' is the file mode, it means 'read binary'
+    features = pickle.load(outfile, fix_imports=True)
 
-with open ( '56_labels', 'wb' ) as outfile:
-    labels = pickle.load(outfile, *, fix_imports=True, encoding="ASCII", errors="strict")
-    #pickle.dump(labels, outfile)
+with open ( '42_labels', 'rb' ) as outfile:
+    labels = pickle.load(outfile, fix_imports=True)
 
 model = load_model('My_LSTM_Model.h5')
 X = np.expand_dims(features, axis=2)

From 8ded36d9373608f4074170a0ef1c4dbbded2d31c Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 13:52:01 +0200
Subject: [PATCH 20/21] adapted values to fit the new name

---
 python3-lstm/CuDNNLSTM.py  | 4 ++--
 python3-lstm/Evaluation.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index ef79ca5..a96dd2d 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -23,9 +23,9 @@
 es = EarlyStopping ( monitor='val_loss', mode='min', verbose=1 ) # define early stopping criteria
 
 # Importing the extracted features and labels
-with open ( '56_features', 'rb' ) as fp:
+with open ( '42_features', 'rb' ) as fp:
     X = pickle.load ( fp )
-with open ( '56_labels', 'rb' ) as fp:
+with open ( '42_labels', 'rb' ) as fp:
     y = pickle.load ( fp )
 
 # LSTM's input shape argument expects a three-dimensional array as an input in this order: samples, timesteps and features. This is why we need to add another dimension to the numpy array.
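Before the Evaluation.py half of this patch, the comment above is worth unpacking: Keras recurrent layers consume a three-dimensional tensor of shape (samples, timesteps, features), while the pickled feature matrix is only two-dimensional. A minimal sketch of the reshape (the array contents here are made-up placeholders; only the shapes matter):

    import numpy as np

    # stand-in for the pickled feature matrix: 3 samples, 42 timesteps each
    X = np.full((3, 42), -1)
    print(X.shape)               # (3, 42)

    # append a trailing axis so every timestep carries exactly one feature
    X = np.expand_dims(X, axis=2)
    print(X.shape)               # (3, 42, 1), matching Input ( shape=(42, 1) )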
diff --git a/python3-lstm/Evaluation.py b/python3-lstm/Evaluation.py
index cc2c75c..1ffccc8 100644
--- a/python3-lstm/Evaluation.py
+++ b/python3-lstm/Evaluation.py
@@ -12,9 +12,9 @@
 from sklearn import metrics
 from sklearn.model_selection import train_test_split
 
-with open ( '56_features', 'rb' ) as fp:
+with open ( '42_features', 'rb' ) as fp:
     X = pickle.load ( fp )
-with open ( '56_labels', 'rb' ) as fp:
+with open ( '42_labels', 'rb' ) as fp:
     y = pickle.load ( fp )
 
 X = np.expand_dims ( X, axis=2 )

From b0d12c713923eaa090bc0594d715261dc9b40870 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 14:02:51 +0200
Subject: [PATCH 21/21] adopted new shape of features and fixed warning suppression

---
 python3-lstm/CuDNNLSTM.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index a96dd2d..1a28871 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -1,5 +1,10 @@
 import pickle
 import time
+import warnings
+warnings.filterwarnings('ignore',category=FutureWarning)
+#ignore deprecation warnings to get a better and cleaner output
+from tensorflow.python.util import deprecation
+deprecation._PRINT_DEPRECATION_WARNINGS = False
 import tensorflow as tf
 import numpy as np
 from keras.callbacks import TensorBoard
@@ -11,9 +16,6 @@
 from sklearn.model_selection import train_test_split
 
 
-#ignore deprecation warnings to get a better and cleaner output
-import tensorflow.python.util.deprecation as deprecation
-deprecation._PRINT_DEPRECATION_WARNINGS = False
 
 name = "logname-{}".format ( int ( time.time () ) )
 
@@ -48,7 +50,7 @@
 # print ( X_train.shape )
 
 # LSTM input is fifty-six time-steps and one feature at each time-step is represented by the notation: (56,1).
-input = Input ( shape=(56, 1) )
+input = Input ( shape=(42, 1) )
 
 # the first LSTM layer has 64 cells, the number must be equal/bigger than the input size. If you are using a CPU then change CuDNNLSTM to LSTM
 lstm = CuDNNLSTM ( 64, return_sequences=True ) ( input ) # Return_sequences is set true because the first LSTM has to return a sequence, which then can be fed into the 2nd LSTM
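After PATCH 21 the pieces fit together as follows: Data_preparation.py pads each feature vector to 42 values and pickles them as 42_features and 42_labels, and the model now takes Input ( shape=(42, 1) ), although the comment beside it still says fifty-six. A condensed sketch of the resulting load-and-predict flow in prediction.py, assuming 42_features, 42_labels and My_LSTM_Model.h5 exist in the working directory and the package versions pinned in requirements.txt are installed (this is an illustration of the flow, not the repository script verbatim):

    import pickle
    import numpy as np
    from keras.models import load_model

    # 'rb' is the file mode, it means 'read binary'; the files were written with pickle.dump
    with open('42_features', 'rb') as infile:
        features = pickle.load(infile)
    with open('42_labels', 'rb') as infile:
        labels = pickle.load(infile)

    model = load_model('My_LSTM_Model.h5')
    X = np.expand_dims(features, axis=2)   # (samples, 42) -> (samples, 42, 1)
    preds = model.predict(X)

    for i, label in enumerate(labels):
        predicted = np.argmax(preds[i])
        print("the actual value is {0} and the predicted value is {1}".format(label, predicted))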