From 780964943e5164ecb28806f68f63bad694e08ca3 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Mon, 3 Jun 2019 17:06:03 +0200 Subject: [PATCH 01/21] added vagrantfiles to LSTM repository --- Vagrantfile | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ bootstrap.sh | 12 +++++++++ provision.sh | 14 ++++++++++ 3 files changed, 99 insertions(+) create mode 100644 Vagrantfile create mode 100644 bootstrap.sh create mode 100755 provision.sh
diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 0000000..434703f --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,73 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +# All Vagrant configuration is done below. The "2" in Vagrant.configure +# configures the configuration version (we support older styles for +# backwards compatibility). Please don't change it unless you know what +# you're doing. +Vagrant.configure("2") do |config| + # The most common configuration options are documented and commented below. + # For a complete reference, please see the online documentation at + # https://docs.vagrantup.com. + + # Every Vagrant development environment requires a box. You can search for + # boxes at https://atlas.hashicorp.com/search. + config.vm.box = "ubuntu/xenial64" + + # Disable automatic box update checking. If you disable this, then + # boxes will only be checked for updates when the user runs + # `vagrant box outdated`. This is not recommended. + # config.vm.box_check_update = false + + # Create a forwarded port mapping which allows access to a specific port + # within the machine from a port on the host machine. In the example below, + # accessing "localhost:8080" will access port 80 on the guest machine. + # config.vm.network "forwarded_port", guest: 80, host: 8080 + + # Create a private network, which allows host-only access to the machine + # using a specific IP. + # config.vm.network "private_network", ip: "192.168.33.10" + # config.vm.network "public_network", ip: "127.0.0.1", bridge: "enp0s25" + + + # Create a public network, which is generally matched to a bridged network. + # Bridged networks make the machine appear as another physical device on + # your network. + config.vm.network "public_network", :mac => "0A0100000000", :auto_config => false + + # Share an additional folder to the guest VM. The first argument is + # the path on the host to the actual folder. The second argument is + # the path on the guest to mount the folder. And the optional third + # argument is a set of non-required options. + # config.vm.synced_folder "../data", "/vagrant_data" + + # Provider-specific configuration so you can fine-tune various + # backing providers for Vagrant. These expose provider-specific options. + # Example for VirtualBox: + # + config.vm.provider "virtualbox" do |vb| + # # Display the VirtualBox GUI when booting the machine + vb.gui = false + # + # # Customize the amount of memory on the VM: + vb.memory = 4096 + vb.cpus = 2 + end + # + # View the documentation for the provider you are using for more + # information on available options. + + # Define a Vagrant Push strategy for pushing to Atlas. Other push strategies + # such as FTP and Heroku are also available. See the documentation at + # https://docs.vagrantup.com/v2/push/atlas.html for more information. + # config.push.define "atlas" do |push| + # push.app = "YOUR_ATLAS_USERNAME/YOUR_APPLICATION_NAME" + # end + + # Enable provisioning with a shell script. 
Additional provisioners such as + # Puppet, Chef, Ansible, Salt, and Docker are also available. Please see the + # documentation for more information about their specific syntax and use. + config.vm.provision "shell", path: "provision.sh", privileged: false + config.vm.provision "shell", path: "bootstrap.sh", run: "always" + +end
diff --git a/bootstrap.sh b/bootstrap.sh new file mode 100644 index 0000000..20131cf --- /dev/null +++ b/bootstrap.sh @@ -0,0 +1,12 @@ + +############################## +# +# This is a bootstrap script which is +# run at every startup of the vagrant machine +# If you want to run something just once at provisioning +# and first bootup of the vagrant machine please see +# provision.sh +# +# Contributor: Bernhard Blieninger +############################## +
diff --git a/provision.sh b/provision.sh new file mode 100755 index 0000000..0a68924 --- /dev/null +++ b/provision.sh @@ -0,0 +1,14 @@ +#!/bin/bash +####################### +# +# This is a provision script +# it will be called once when the vagrant vm is first provisioned +# If you have commands that you want to run always please have a +# look at the bootstrap.sh script +# +# Contributor: Bernhard Blieninger, Robert Hamsch +###################### + +sudo apt update -qq +sudo apt install python3.5 python3-pip tmux -qq +
From 7cffaaf57d60d4b97165b696815698c6b46c138f Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Wed, 19 Jun 2019 13:05:26 +0200 Subject: [PATCH 02/21] structured python files into one folder --- CuDNNLSTM.py => python3-lstm/CuDNNLSTM.py | 0 Data_preparation.py => python3-lstm/Data_preparation.py | 0 Evaluation.py => python3-lstm/Evaluation.py | 0 Plotting.py => python3-lstm/Plotting.py | 0 parallel_search.py => python3-lstm/parallel_search.py | 0 prediction.py => python3-lstm/prediction.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename CuDNNLSTM.py => python3-lstm/CuDNNLSTM.py (100%) rename Data_preparation.py => python3-lstm/Data_preparation.py (100%) rename Evaluation.py => python3-lstm/Evaluation.py (100%) rename Plotting.py => python3-lstm/Plotting.py (100%) rename parallel_search.py => python3-lstm/parallel_search.py (100%) rename prediction.py => python3-lstm/prediction.py (100%) diff --git a/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py similarity index 100% rename from CuDNNLSTM.py rename to python3-lstm/CuDNNLSTM.py diff --git a/Data_preparation.py b/python3-lstm/Data_preparation.py similarity index 100% rename from Data_preparation.py rename to python3-lstm/Data_preparation.py diff --git a/Evaluation.py b/python3-lstm/Evaluation.py similarity index 100% rename from Evaluation.py rename to python3-lstm/Evaluation.py diff --git a/Plotting.py b/python3-lstm/Plotting.py similarity index 100% rename from Plotting.py rename to python3-lstm/Plotting.py diff --git a/parallel_search.py b/python3-lstm/parallel_search.py similarity index 100% rename from parallel_search.py rename to python3-lstm/parallel_search.py diff --git a/prediction.py b/python3-lstm/prediction.py similarity index 100% rename from prediction.py rename to python3-lstm/prediction.py
From df9c96e0d5872e3148d7ed639320889230566dec Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Thu, 5 Sep 2019 23:21:39 +0200 Subject: [PATCH 03/21] rewrote data_preparation --- data_preparation.py | 317 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 317 insertions(+) create mode 100644 data_preparation.py diff --git 
a/data_preparation.py b/data_preparation.py new file mode 100644 index 0000000..ce52853 --- /dev/null +++ b/data_preparation.py @@ -0,0 +1,317 @@ +import pickle +import sys +import numpy as np +import pandas as pd +from tqdm import tqdm +from keras.preprocessing.sequence import pad_sequences +import sqlite3 + +# After exporting the relational database to separate tables with .csv extension, the transformation can begin +# The first step is to read the cvs files as Dataframes +#df_taskset = pd.read_csv('TaskSet.csv') # import task-sets +# print(df_taskset.head()) if you want to see how the data look like + +#df_task = pd.read_csv('Task.csv') # import tasks +# print(df_task.head()) + +#df_job = pd.read_csv('Job.csv') # import jobs +# print(df_job.head()) + + +# PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label +PKGs = { + 'pi' : 0, + 'hey' : 1, + 'tumatmul' : 2, + 'cond_mod' : 3 + } + +# Integer encoding for Exit_Values from Jobs +Exit_Values = { + 'EXIT' : 1, + 'EXIT_CRITICAL' : 0, + 'EXIT_PERIOD' : 2, + 'OUT_OF_CAPS' : 3, + 'OUT_OF_QUOTA' : 4 + } + +# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 +Arg_Values = { + 1 : 1, + 4096 : 2, + 8192 : 3, + 16384 : 4, + 32768 : 5, + 65536 : 6, + 131072 : 7, + 262144 : 8, + 524288 : 9, + 1048576 : 10, + 2097152 : 11, + 847288609443 : 12, + 2541865828329 : 13, + 7625597484987 : 14, + 22876792454961 : 15, + 68630377364883 : 16, + 205891132094649 : 17 + } + + +print("Doing writing") + +DB_PATH = "/home/bernhard/panda_v4.db" +TASKS_DICT = {} + +def taskToFeatureList(task): + #returns a fature list for the corresponding task values + feature = [] + feature.append(task['Priority']) + feature.append(task['Period']) + feature.append(task['Number_of_Jobs']) + feature.append(task['PKG']) + feature.append(task['Arg']) + feature.append(task['CRITICALTIME']) + return feature + + +def getTaskFeatures(db_path): #c is the cursor for the db + # returns a dictionary + # { task_id : [ feature, list ] + conn = sqlite3.connect(db_path) + conn.row_factory = lambda C, R: { c[0]: R[i] for i, c in enumerate(C.description) } + db_cursor = conn.cursor() + db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task') + outputTable = db_cursor.fetchall() + + tasks_dict = {} + for row in outputTable: + row['Period'] = int(row['Period']/1000) + row['Number_of_Jobs'] = int(row['Number_of_Jobs']) + row['PKG'] = PKGs[row['PKG']] + row['CRITICALTIME'] = int(row['CRITICALTIME']/1000) + row['Arg'] = Arg_Values[row['Arg']] + tasks_dict[row['Task_ID']] = taskToFeatureList(row) + return tasks_dict + + +def processTaskset(tasksetData): + # tasksetData is a list of tuples returned from the DB in getTasksetData() + try: + label = tasksetData[0][-1] + features = [] + jobExitsByTask = {} + for tsData in tasksetData: + try: + jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) + except KeyError: + jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] + for taskIdNo in (1,2,3): + if tasksetData[0][taskIdNo] != -1: + features += TASKS_DICT[tasksetData[0][taskIdNo]] + features += jobExitsByTask[tasksetData[0][taskIdNo]] + except KeyError as k: + for t in tasksetData: + print(t) + raise k + return np.array(features), label + + +def getFeaturesLabels(db_path): + conn = sqlite3.connect(db_path) + db_cursor = conn.cursor() + command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, 
TaskSet.Successful'\ + ' FROM TaskSet JOIN Job'\ + ' ON TaskSet.Set_ID = Job.Set_ID and'\ + ' (TaskSet.TASK1_ID == Job.Task_ID or'\ + ' TaskSet.TASK2_ID == Job.Task_ID or'\ + ' TaskSet.TASK3_ID == Job.Task_ID);' + db_cursor.execute(command) + # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] + data_table = db_cursor.fetchall() + print('reading taskset_jobs join done') + finalFeatureList = [] + finalLabelList = [] + currentTset = data_table[0][0] # first taskset id + tSetJobs = [] + totalSize = len(data_table) + for row in data_table: + if row[0] % 1000 == 0: + print('processed',int(100 * (row[0]/totalSize)),'%' ) + if row[0] == currentTset: + #then still same setTset + tSetJobs.append(row) + else: + # job of next taskset + # process data and record new + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + tSetJobs = [] + currentTset = row[0] + tSetJobs.append(row) + # proess last taskset + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + return finalFeatureList, finalLabelList + + + +TASKS_DICT = getTaskFeatures(DB_PATH) +print('Tasks have been added to TASKS_DICT') +print('length of taskdict: ', len(TASKS_DICT)) +print('example task 222:',TASKS_DICT[222]) + +features, labels = getFeaturesLabels(DB_PATH) + +print("Done reading") + + +''' + + +# 2. data transformation + +# here starts data transformation +#ntn = df_task[['PKG']].values # get values from PKG in tasks. This step is equivalent to: Select distinct PKG from Task +#ntn1 = [] +#for n in ntn: +# ntn1.append(n[0]) +#print(np.unique(ntn1)) # print the unique values + + + + + + + + + +# 3. 
Features and Labels extraction +i = 0 + + + + +sys.exit() +features = [] # create an empty list for features +labels = [] # create an empty list for labels +new_task_list = [] +# loop in the task-set +with tqdm ( total=len( list(df_taskset.iterrows()))) as pbar: # the total length would be total=len(list(df_taskset.iterrows())) + + for task_set in taskset_table: + + if task_set['TASK1_ID']!= -1: + + new_task_list.append() + + + blub + + + for index, row in df_taskset.iterrows (): + + try: + + i += 1 + grid = int(df_taskset.loc[index, 'Set_ID']) # task_set ID + first_task = int(df_taskset.loc[index, 'TASK1_ID']) # first task_id + second_task = int(df_taskset.loc[index, 'TASK2_ID']) # second task_id + third_task = int(df_taskset.loc[index, 'TASK3_ID']) # third task_id + fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) # fourth task_id + tasks = [] # empty list of tasks where features are saved later + + if first_task != -1: # if the first task exists in this task-set then : + + task_info = df_task.loc[df_task['Task_ID'] == first_task] + tasks.append(int(task_info['Priority'])) # save the priority + tasks.append(int(task_info['Period']/1000)) # save the period in seconds + tasks.append(int(task_info['Number_of_Jobs'])) # save number of jobs + n = str(task_info['PKG'].item()) + tasks.append(PKGs[n]) #save the numerical value of PKG + av = int(task_info['Arg'].item()) + tasks.append(Arg_Values[av]) #save the scaled value of Arg + tasks.append(int(task_info['CRITICALTIME']/1000)) # save criticaltime in seconds + # for each job in that is in the task and has this task_set id + job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] + + for ind, r in job_info.iterrows(): + tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) # save the transformed exit value + + if second_task != -1: # if the second task exists in this task-set then : + first_task = second_task + task_info = df_task.loc[df_task['Task_ID'] == first_task] + tasks.append(int(task_info['Priority'])) + tasks.append(int(task_info['Period']/1000)) + tasks.append(int(task_info['Number_of_Jobs'])) + n = str(task_info['PKG'].item()) + tasks.append(PKGs[n]) + av = int(task_info['Arg'].item()) + tasks.append(Arg_Values[av]) + tasks.append(int(task_info['CRITICALTIME']/1000)) + print(tasks) + job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] + for ind, r in job_info.iterrows(): + tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) + + if third_task != -1: # if the third task exists in this task-set then : + first_task = third_task + task_info = df_task.loc[df_task['Task_ID'] == first_task] + tasks.append(int(task_info['Priority'])) + tasks.append(int(task_info['Period']/1000)) + tasks.append(int(task_info['Number_of_Jobs'])) + n = str(task_info['PKG'].item()) + tasks.append(PKGs[n]) + av = int(task_info['Arg'].item()) + tasks.append(Arg_Values[av]) + tasks.append(int(task_info['CRITICALTIME']/1000)) + + job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] + for ind, r in job_info.iterrows(): + tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) + + + if fourth_task != -1: # if the fourth task exists in this task-set then : + first_task = fourth_task + task_info = df_task.loc[df_task['Task_ID'] == first_task] + tasks.append(int(task_info['Priority'])) + tasks.append(int(task_info['Period']/1000)) + tasks.append(int(task_info['Number_of_Jobs'])) + n = str(task_info['PKG'].item()) + tasks.append(PKGs[n]) + av = 
int(task_info['Arg'].item()) + tasks.append(Arg_Values[av]) + tasks.append(int(task_info['CRITICALTIME']/1000)) + + job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] + for ind, r in job_info.iterrows(): + tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) + + + tasks = np.array(tasks) # to save the task list as numpy array + features.append(tasks) # values in tasks are features + labels.append(int(df_taskset.loc[index, 'Successful'])) # in the label list, append the value in the successful col from task-set + except Exception as e: # exception handler + print(e) + pass + pbar.update(1) + +''' + +labels = np.array(labels) # to save the labels list as numpy array + +# To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value +features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post') + +#print(features.shape) # the dimensionality of features +#print(labels.shape) # the dimensionality of labels + +# save both files for the training +with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary' + pickle.dump(features, outfile) + +with open ( '56_labels', 'wb' ) as outfile: + pickle.dump(labels, outfile) + 
From dd0c0921f5b654e98ae478a67dab92acf01674df Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Thu, 5 Sep 2019 23:24:30 +0200 Subject: [PATCH 04/21] added timestamps --- data_preparation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/data_preparation.py b/data_preparation.py index ce52853..1abab29 100644 --- a/data_preparation.py +++ b/data_preparation.py @@ -1,3 +1,4 @@ +from datetime import datetime import pickle import sys import numpy as np @@ -129,6 +130,7 @@ def getFeaturesLabels(db_path): # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] data_table = db_cursor.fetchall() print('reading taskset_jobs join done') + print('The current time is:',datetime.now()) finalFeatureList = [] finalLabelList = [] currentTset = data_table[0][0] # first taskset id tSetJobs = [] totalSize = len(data_table) for row in data_table: if row[0] % 1000 == 0: @@ -164,9 +166,12 @@ def getFeaturesLabels(db_path): features, labels = getFeaturesLabels(DB_PATH) -print("Done reading") +print('The current time is:',datetime.now()) + +print("Done reading") + + ''' @@ -314,4 +319,4 @@ def getFeaturesLabels(db_path): with open ( '56_labels', 'wb' ) as outfile: pickle.dump(labels, outfile) - +print('The current time is:',datetime.now())
From 2c2f519d7afe070ece521d8c955e228dcb20b492 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Thu, 5 Sep 2019 23:35:50 +0200 Subject: [PATCH 05/21] caught error where datasets are not fully completed in the jobs table --- data_preparation.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-)
diff --git a/data_preparation.py b/data_preparation.py index 1abab29..772aa28 100644 --- a/data_preparation.py +++ b/data_preparation.py @@ -33,7 +33,8 @@ 'EXIT_CRITICAL' : 0, 'EXIT_PERIOD' : 2, 'OUT_OF_CAPS' : 3, - 'OUT_OF_QUOTA' : 4 + 'OUT_OF_QUOTA' : 4, + 'EXIT_ERROR' : 5 } # ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 Arg_Values = { @@ -97,23 +98,21 @@ def processTaskset(tasksetData): # tasksetData is a list of tuples returned from the DB 
in getTasksetData() - try: - label = tasksetData[0][-1] - features = [] - jobExitsByTask = {} - for tsData in tasksetData: + label = tasksetData[0][-1] + features = [] + jobExitsByTask = {} + for tsData in tasksetData: + try: + jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) + except KeyError: + jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] + for taskIdNo in (1,2,3): + if tasksetData[0][taskIdNo] != -1: + features += TASKS_DICT[tasksetData[0][taskIdNo]] try: - jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) - except KeyError: - jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] - for taskIdNo in (1,2,3): - if tasksetData[0][taskIdNo] != -1: - features += TASKS_DICT[tasksetData[0][taskIdNo]] features += jobExitsByTask[tasksetData[0][taskIdNo]] - except KeyError as k: - for t in tasksetData: - print(t) - raise k + except KeyError: + features += [Exit_Values['EXIT_ERROR']] return np.array(features), label From f8ba1be68f71099b44639fcca56e8123e7d8321f Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Sun, 8 Sep 2019 00:20:05 +0200 Subject: [PATCH 06/21] changed prediction.py --- python3-lstm/prediction.py | 278 +++++++++++++++++++++---------------- 1 file changed, 155 insertions(+), 123 deletions(-) diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py index 3280154..b9fa43c 100644 --- a/python3-lstm/prediction.py +++ b/python3-lstm/prediction.py @@ -6,130 +6,162 @@ from keras.models import load_model import csv -df_taskset = pd.read_csv ( 'TaskSet.csv' ) -# df_taskset = df_taskset.sample(frac=0.0001, random_state=99) -df_task = pd.read_csv ( 'Task.csv' ) -df_job = pd.read_csv ( 'Job.csv' ) - -ntn = df_task[['PKG']].values -ntn1 = [] -for n in ntn: - ntn1.append ( n[0] ) - -PKGs = {} -PKGs['pi'] = 0 -PKGs['hey'] = 1 -PKGs['tumatmul'] = 2 -PKGs['cond_mod'] = 3 - -Exit_Values = {} -Exit_Values['EXIT'] = 1 -Exit_Values['EXIT_CRITICAL'] = 0 - -Arg_Values = {} -Arg_Values[1] = 1 -Arg_Values[4096] = 2 -Arg_Values[8192] = 3 -Arg_Values[16384] = 4 -Arg_Values[32768] = 5 -Arg_Values[65536] = 6 -Arg_Values[131072] = 7 -Arg_Values[262144] = 8 -Arg_Values[524288] = 9 -Arg_Values[1048576] = 10 -Arg_Values[2097152] = 11 -Arg_Values[847288609443] = 12 -Arg_Values[2541865828329] = 13 -Arg_Values[7625597484987] = 14 -Arg_Values[22876792454961] = 15 -Arg_Values[68630377364883] = 16 -Arg_Values[205891132094649] = 17 - -i = 0 -features = [] -labels = [] -with tqdm(total=len(list(df_taskset.iterrows()))) as pbar: - for index, row in df_taskset.iterrows(): - +# PKG has a fixed set of labels. 
Integer encoding is used where integer # value is assigned to each label +PKGs = { + 'pi': 0, + 'hey': 1, + 'tumatmul': 2, + 'cond_mod': 3 +} + +# Integer encoding for Exit_Values from Jobs +Exit_Values = { + 'EXIT': 1, + 'EXIT_CRITICAL': 0, + 'EXIT_PERIOD': 2, + 'OUT_OF_CAPS': 3, + 'OUT_OF_QUOTA': 4, + 'EXIT_ERROR': 5 +} + +# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 +Arg_Values = { + 1: 1, + 4096: 2, + 8192: 3, + 16384: 4, + 32768: 5, + 65536: 6, + 131072: 7, + 262144: 8, + 524288: 9, + 1048576: 10, + 2097152: 11, + 847288609443: 12, + 2541865828329: 13, + 7625597484987: 14, + 22876792454961: 15, + 68630377364883: 16, + 205891132094649: 17 +} + +print("Doing writing") + +DB_PATH = "/home/bernhard/panda_v4.db" +TASKS_DICT = {} + + +def taskToFeatureList(task): + # returns a fature list for the corresponding task values + feature = [] + feature.append(task['Priority']) + feature.append(task['Period']) + feature.append(task['Number_of_Jobs']) + feature.append(task['PKG']) + feature.append(task['Arg']) + feature.append(task['CRITICALTIME']) + return feature + + +def getTaskFeatures(db_path): # c is the cursor for the db + # returns a dictionary + # { task_id : [ feature, list ] + conn = sqlite3.connect(db_path) + conn.row_factory = lambda C, R: {c[0]: R[i] for i, c in enumerate(C.description)} + db_cursor = conn.cursor() + db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task') + outputTable = db_cursor.fetchall() + + tasks_dict = {} + for row in outputTable: + row['Period'] = int(row['Period'] / 1000) + row['Number_of_Jobs'] = int(row['Number_of_Jobs']) + row['PKG'] = PKGs[row['PKG']] + row['CRITICALTIME'] = int(row['CRITICALTIME'] / 1000) + row['Arg'] = Arg_Values[row['Arg']] + tasks_dict[row['Task_ID']] = taskToFeatureList(row) + return tasks_dict + + +def processTaskset(tasksetData): + # tasksetData is a list of tuples returned from the DB in getTasksetData() + label = tasksetData[0][-1] + features = [] + jobExitsByTask = {} + for tsData in tasksetData: try: - - i += 1 - grid = int(df_taskset.loc[index, 'Set_ID']) - res = int(df_taskset.loc[index, 'Successful']) - print(grid) - first_task = int(df_taskset.loc[index, 'TASK1_ID']) - second_task = int(df_taskset.loc[index, 'TASK2_ID']) - third_task = int(df_taskset.loc[index, 'TASK3_ID']) - fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) - tasks = [] - - if first_task != -1: - - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'] / 1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append(int(task_info['CRITICALTIME'] / 1000)) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if second_task != -1: - first_task = second_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'] / 1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append(int(task_info['CRITICALTIME'] / 1000)) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - 
tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if third_task != -1: - first_task = third_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'] / 1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append( int ( task_info['CRITICALTIME'] / 1000)) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if fourth_task != -1: - first_task = fourth_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period'])) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - tasks.append(int(task_info['Arg'])) - tasks.append(int(task_info['CRITICALTIME'])) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - labels = np.array(int(df_taskset.loc[index, 'Successful'])) - - tasks = np.array(tasks) - features.append(tasks) - labels.append(res) - except Exception as e: - print(e) - pass - pbar.update(1) - -labels = np.array(labels) -features = pad_sequences(features, maxlen=42, value=-1, padding='post', truncating='post') + jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) + except KeyError: + jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] + for taskIdNo in (1, 2, 3): + if tasksetData[0][taskIdNo] != -1: + features += TASKS_DICT[tasksetData[0][taskIdNo]] + try: + features += jobExitsByTask[tasksetData[0][taskIdNo]] + except KeyError: + features += [Exit_Values['EXIT_ERROR']] + return np.array(features), label + + +def getFeaturesLabels(db_path): + conn = sqlite3.connect(db_path) + db_cursor = conn.cursor() + command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful' \ + ' FROM TaskSet JOIN Job' \ + ' ON TaskSet.Set_ID = Job.Set_ID and' \ + ' (TaskSet.TASK1_ID == Job.Task_ID or' \ + ' TaskSet.TASK2_ID == Job.Task_ID or' \ + ' TaskSet.TASK3_ID == Job.Task_ID);' + db_cursor.execute(command) + # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] + data_table = db_cursor.fetchall() + print('reading taskset_jobs join done') + print('The current time is:', datetime.now()) + finalFeatureList = [] + finalLabelList = [] + currentTset = data_table[0][0] # first taskset id + tSetJobs = [] + totalSize = len(data_table) + for row in data_table: + if row[0] % 1000 == 0: + print('processed', int(100 * (row[0] / totalSize)), '%') + if row[0] == currentTset: + # then still same setTset + tSetJobs.append(row) + else: + # job of next taskset + # process data and record new + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + tSetJobs = [] + currentTset = row[0] + tSetJobs.append(row) + # proess last taskset + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + return finalFeatureList, finalLabelList + + +TASKS_DICT = getTaskFeatures(DB_PATH) +print('Tasks have been added to TASKS_DICT') +print('length of taskdict: ', 
len(TASKS_DICT)) +print('example task 222:', TASKS_DICT[222]) + +features, labels = getFeaturesLabels(DB_PATH) + +print('The current time is:', datetime.now()) + +print("Done reading") + + + + +labels = np.array(labels) # to save the labels list as numpy array + +# To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value +features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post') model = load_model('My_LSTM_Model.h5') X = np.expand_dims(features, axis=2) From f83b34dcdc9966484d7f6fa445d9a8840773eb53 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Sun, 8 Sep 2019 00:44:31 +0200 Subject: [PATCH 07/21] enhanced data_preparation --- data_preparation.py | 145 -------------------------------------------- 1 file changed, 145 deletions(-) diff --git a/data_preparation.py b/data_preparation.py index 772aa28..42e4949 100644 --- a/data_preparation.py +++ b/data_preparation.py @@ -7,18 +7,6 @@ from keras.preprocessing.sequence import pad_sequences import sqlite3 -# After exporting the relational database to separate tables with .csv extension, the transformation can begin -# The first step is to read the cvs files as Dataframes -#df_taskset = pd.read_csv('TaskSet.csv') # import task-sets -# print(df_taskset.head()) if you want to see how the data look like - -#df_task = pd.read_csv('Task.csv') # import tasks -# print(df_task.head()) - -#df_job = pd.read_csv('Job.csv') # import jobs -# print(df_job.head()) - - # PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label PKGs = { 'pi' : 0, @@ -170,139 +158,6 @@ def getFeaturesLabels(db_path): print("Done reading") - -''' - - -# 2. data transformation - -# here starts data transformation -#ntn = df_task[['PKG']].values # get values from PKG in tasks. This step is equivalent to: Select distinct PKG from Task -#ntn1 = [] -#for n in ntn: -# ntn1.append(n[0]) -#print(np.unique(ntn1)) # print the unique values - - - - - - - - - -# 3. 
Features and Labels extraction -i = 0 - - - - -sys.exit() -features = [] # create an empty list for features -labels = [] # create an empty list for labels -new_task_list = [] -# loop in the task-set -with tqdm ( total=len( list(df_taskset.iterrows()))) as pbar: # the total length would be total=len(list(df_taskset.iterrows())) - - for task_set in taskset_table: - - if task_set['TASK1_ID']!= -1: - - new_task_list.append() - - - blub - - - for index, row in df_taskset.iterrows (): - - try: - - i += 1 - grid = int(df_taskset.loc[index, 'Set_ID']) # task_set ID - first_task = int(df_taskset.loc[index, 'TASK1_ID']) # first task_id - second_task = int(df_taskset.loc[index, 'TASK2_ID']) # second task_id - third_task = int(df_taskset.loc[index, 'TASK3_ID']) # third task_id - fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) # fourth task_id - tasks = [] # empty list of tasks where features are saved later - - if first_task != -1: # if the first task exists in this task-set then : - - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) # save the priority - tasks.append(int(task_info['Period']/1000)) # save the period in seconds - tasks.append(int(task_info['Number_of_Jobs'])) # save number of jobs - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) #save the numerical value of PKG - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) #save the scaled value of Arg - tasks.append(int(task_info['CRITICALTIME']/1000)) # save criticaltime in seconds - # for each job in that is in the task and has this task_set id - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) # save the transformed exit value - - if second_task != -1: # if the second task exists in this task-set then : - first_task = second_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - print(tasks) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if third_task != -1: # if the third task exists in this task-set then : - first_task = third_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - if fourth_task != -1: # if the fourth task exists in this task-set then : - first_task = fourth_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = 
int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - tasks = np.array(tasks) # to save the task list as numpy array - features.append(tasks) # values in tasks are features - labels.append(int(df_taskset.loc[index, 'Successful'])) # in the label list, append the value in the successful col from task-set - except Exception as e: # exception handler - print(e) - pass - pbar.update(1) - -''' - labels = np.array(labels) # to save the labels list as numpy array # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value From e1a22e3061bd8c80b777411c4b87bc2fa7c43738 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Sun, 8 Sep 2019 00:50:12 +0200 Subject: [PATCH 08/21] moved data_preparation and updated prediction, could be improved further --- data_preparation.py | 176 ------------------ python3-lstm/Data_preparation.py | 304 +++++++++++++++---------------- python3-lstm/prediction.py | 8 +- 3 files changed, 152 insertions(+), 336 deletions(-) delete mode 100644 data_preparation.py diff --git a/data_preparation.py b/data_preparation.py deleted file mode 100644 index 42e4949..0000000 --- a/data_preparation.py +++ /dev/null @@ -1,176 +0,0 @@ -from datetime import datetime -import pickle -import sys -import numpy as np -import pandas as pd -from tqdm import tqdm -from keras.preprocessing.sequence import pad_sequences -import sqlite3 - -# PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label -PKGs = { - 'pi' : 0, - 'hey' : 1, - 'tumatmul' : 2, - 'cond_mod' : 3 - } - -# Integer encoding for Exit_Values from Jobs -Exit_Values = { - 'EXIT' : 1, - 'EXIT_CRITICAL' : 0, - 'EXIT_PERIOD' : 2, - 'OUT_OF_CAPS' : 3, - 'OUT_OF_QUOTA' : 4, - 'EXIT_ERROR' : 5 - } - -# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 -Arg_Values = { - 1 : 1, - 4096 : 2, - 8192 : 3, - 16384 : 4, - 32768 : 5, - 65536 : 6, - 131072 : 7, - 262144 : 8, - 524288 : 9, - 1048576 : 10, - 2097152 : 11, - 847288609443 : 12, - 2541865828329 : 13, - 7625597484987 : 14, - 22876792454961 : 15, - 68630377364883 : 16, - 205891132094649 : 17 - } - - -print("Doing writing") - -DB_PATH = "/home/bernhard/panda_v4.db" -TASKS_DICT = {} - -def taskToFeatureList(task): - #returns a fature list for the corresponding task values - feature = [] - feature.append(task['Priority']) - feature.append(task['Period']) - feature.append(task['Number_of_Jobs']) - feature.append(task['PKG']) - feature.append(task['Arg']) - feature.append(task['CRITICALTIME']) - return feature - - -def getTaskFeatures(db_path): #c is the cursor for the db - # returns a dictionary - # { task_id : [ feature, list ] - conn = sqlite3.connect(db_path) - conn.row_factory = lambda C, R: { c[0]: R[i] for i, c in enumerate(C.description) } - db_cursor = conn.cursor() - db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task') - outputTable = db_cursor.fetchall() - - tasks_dict = {} - for row in outputTable: - row['Period'] = int(row['Period']/1000) - row['Number_of_Jobs'] = int(row['Number_of_Jobs']) - row['PKG'] = PKGs[row['PKG']] - row['CRITICALTIME'] = 
int(row['CRITICALTIME']/1000) - row['Arg'] = Arg_Values[row['Arg']] - tasks_dict[row['Task_ID']] = taskToFeatureList(row) - return tasks_dict - - -def processTaskset(tasksetData): - # tasksetData is a list of tuples returned from the DB in getTasksetData() - label = tasksetData[0][-1] - features = [] - jobExitsByTask = {} - for tsData in tasksetData: - try: - jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) - except KeyError: - jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] - for taskIdNo in (1,2,3): - if tasksetData[0][taskIdNo] != -1: - features += TASKS_DICT[tasksetData[0][taskIdNo]] - try: - features += jobExitsByTask[tasksetData[0][taskIdNo]] - except KeyError: - features += [Exit_Values['EXIT_ERROR']] - return np.array(features), label - - -def getFeaturesLabels(db_path): - conn = sqlite3.connect(db_path) - db_cursor = conn.cursor() - command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful'\ - ' FROM TaskSet JOIN Job'\ - ' ON TaskSet.Set_ID = Job.Set_ID and'\ - ' (TaskSet.TASK1_ID == Job.Task_ID or'\ - ' TaskSet.TASK2_ID == Job.Task_ID or'\ - ' TaskSet.TASK3_ID == Job.Task_ID);' - db_cursor.execute(command) - # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] - data_table = db_cursor.fetchall() - print('reading taskset_jobs join done') - print('The current time is:',datetime.now()) - finalFeatureList = [] - finalLabelList = [] - currentTset = data_table[0][0] # first taskset id - tSetJobs = [] - totalSize = len(data_table) - for row in data_table: - if row[0] % 1000 == 0: - print('processed',int(100 * (row[0]/totalSize)),'%' ) - if row[0] == currentTset: - #then still same setTset - tSetJobs.append(row) - else: - # job of next taskset - # process data and record new - features, label = processTaskset(tSetJobs) - finalFeatureList.append(features) - finalLabelList.append(label) - tSetJobs = [] - currentTset = row[0] - tSetJobs.append(row) - # proess last taskset - features, label = processTaskset(tSetJobs) - finalFeatureList.append(features) - finalLabelList.append(label) - return finalFeatureList, finalLabelList - - - -TASKS_DICT = getTaskFeatures(DB_PATH) -print('Tasks have been added to TASKS_DICT') -print('length of taskdict: ', len(TASKS_DICT)) -print('example task 222:',TASKS_DICT[222]) - -features, labels = getFeaturesLabels(DB_PATH) - - -print('The current time is:',datetime.now()) - -print("Done reading") - -labels = np.array(labels) # to save the labels list as numpy array - -# To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. 
if longer than 56 trim the value -features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post') - -#print(features.shape) # the dimensionality of features -#print(labels.shape) # the dimensionality of labels - -# save both files for the training -with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary' - pickle.dump(features, outfile) - -with open ( '56_labels', 'wb' ) as outfile: - pickle.dump(labels, outfile) - -print('The current time is:',datetime.now()) diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py index 0d847a1..d215f89 100644 --- a/python3-lstm/Data_preparation.py +++ b/python3-lstm/Data_preparation.py @@ -1,164 +1,160 @@ +from datetime import datetime import pickle import numpy as np -import pandas as pd -from tqdm import tqdm from keras.preprocessing.sequence import pad_sequences - -# After exporting the relational database to separate tables with .csv extension, the transformation can begin -# The first step is to read the cvs files as Dataframes -df_taskset = pd.read_csv('TaskSet.csv') # import task-sets -# print(df_taskset.head()) if you want to see how the data look like - -df_task = pd.read_csv('Task.csv') # import tasks -# print(df_task.head()) - -df_job = pd.read_csv('Job.csv') # import jobs -# print(df_job.head()) - - -# 2. data transformation - -# here starts data transformation -ntn = df_task[['PKG']].values # get values from PKG in tasks. This step is equivalent to: Select distinct PKG from Task -ntn1 = [] -for n in ntn: - ntn1.append(n[0]) -print(np.unique(ntn1)) # print the unique values - - +import sqlite3 # PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label -PKGs = {} -PKGs['pi'] = 0 -PKGs['hey'] = 1 -PKGs['tumatmul'] = 2 -PKGs['cond_mod'] = 3 - -# INteger encoding for Exit_Values from Jobs -Exit_Values = {} -Exit_Values['EXIT'] = 1 -Exit_Values['EXIT_CRITICAL'] = 0 - +PKGs = { + 'pi' : 0, + 'hey' : 1, + 'tumatmul' : 2, + 'cond_mod' : 3 + } + +# Integer encoding for Exit_Values from Jobs +Exit_Values = { + 'EXIT' : 1, + 'EXIT_CRITICAL' : 0, + 'EXIT_PERIOD' : 2, + 'OUT_OF_CAPS' : 3, + 'OUT_OF_QUOTA' : 4, + 'EXIT_ERROR' : 5 + } # ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled # to range from 1 to 17 -Arg_Values = {} -Arg_Values[1] = 1 -Arg_Values[4096] = 2 -Arg_Values[8192] = 3 -Arg_Values[16384] = 4 -Arg_Values[32768] = 5 -Arg_Values[65536] = 6 -Arg_Values[131072] = 7 -Arg_Values[262144] = 8 -Arg_Values[524288] = 9 -Arg_Values[1048576] = 10 -Arg_Values[2097152] = 11 -Arg_Values[847288609443] = 12 -Arg_Values[2541865828329] = 13 -Arg_Values[7625597484987] = 14 -Arg_Values[22876792454961] = 15 -Arg_Values[68630377364883] = 16 -Arg_Values[205891132094649] = 17 - - - -# 3. 
Features and Labels extraction -i = 0 - -features = [] # create an empty list for features -labels = [] # create an empty list for labels -# loop in the task-set -with tqdm ( total=len ( - list(df_taskset.iterrows()))) as pbar: # the total length would be total=len(list(df_taskset.iterrows())) - for index, row in df_taskset.iterrows (): - +Arg_Values = { + 1 : 1, + 4096 : 2, + 8192 : 3, + 16384 : 4, + 32768 : 5, + 65536 : 6, + 131072 : 7, + 262144 : 8, + 524288 : 9, + 1048576 : 10, + 2097152 : 11, + 847288609443 : 12, + 2541865828329 : 13, + 7625597484987 : 14, + 22876792454961 : 15, + 68630377364883 : 16, + 205891132094649 : 17 + } + + +print("Doing writing") + +DB_PATH = "/home/bernhard/panda_v4.db" +TASKS_DICT = {} + +def taskToFeatureList(task): + #returns a fature list for the corresponding task values + feature = [] + feature.append(task['Priority']) + feature.append(task['Period']) + feature.append(task['Number_of_Jobs']) + feature.append(task['PKG']) + feature.append(task['Arg']) + feature.append(task['CRITICALTIME']) + return feature + + +def getTaskFeatures(db_path): #c is the cursor for the db + # returns a dictionary + # { task_id : [ feature, list ] + conn = sqlite3.connect(db_path) + conn.row_factory = lambda C, R: { c[0]: R[i] for i, c in enumerate(C.description) } + db_cursor = conn.cursor() + db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task') + outputTable = db_cursor.fetchall() + + tasks_dict = {} + for row in outputTable: + row['Period'] = int(row['Period']/1000) + row['Number_of_Jobs'] = int(row['Number_of_Jobs']) + row['PKG'] = PKGs[row['PKG']] + row['CRITICALTIME'] = int(row['CRITICALTIME']/1000) + row['Arg'] = Arg_Values[row['Arg']] + tasks_dict[row['Task_ID']] = taskToFeatureList(row) + return tasks_dict + + +def processTaskset(tasksetData): + # tasksetData is a list of tuples returned from the DB in getTasksetData() + label = tasksetData[0][-1] + features = [] + jobExitsByTask = {} + for tsData in tasksetData: try: - - i += 1 - grid = int(df_taskset.loc[index, 'Set_ID']) # task_set ID - first_task = int(df_taskset.loc[index, 'TASK1_ID']) # first task_id - second_task = int(df_taskset.loc[index, 'TASK2_ID']) # second task_id - third_task = int(df_taskset.loc[index, 'TASK3_ID']) # third task_id - fourth_task = int(df_taskset.loc[index, 'TASK4_ID']) # fourth task_id - tasks = [] # empty list of tasks where features are saved later - - if first_task != -1: # if the first task exists in this task-set then : - - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) # save the priority - tasks.append(int(task_info['Period']/1000)) # save the period in seconds - tasks.append(int(task_info['Number_of_Jobs'])) # save number of jobs - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) #save the numerical value of PKG - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) #save the scaled value of Arg - tasks.append(int(task_info['CRITICALTIME']/1000)) # save criticaltime in seconds - # for each job in that is in the task and has this task_set id - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) # save the transformed exit value - - if second_task != -1: # if the second task exists in this task-set then : - first_task = second_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - 
tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - print(tasks) - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - if third_task != -1: # if the third task exists in this task-set then : - first_task = third_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - if fourth_task != -1: # if the fourth task exists in this task-set then : - first_task = fourth_task - task_info = df_task.loc[df_task['Task_ID'] == first_task] - tasks.append(int(task_info['Priority'])) - tasks.append(int(task_info['Period']/1000)) - tasks.append(int(task_info['Number_of_Jobs'])) - n = str(task_info['PKG'].item()) - tasks.append(PKGs[n]) - av = int(task_info['Arg'].item()) - tasks.append(Arg_Values[av]) - tasks.append(int(task_info['CRITICALTIME']/1000)) - - job_info = df_job.loc[(df_job['Task_ID'] == first_task) & (df_job['Set_ID'] == grid)] - for ind, r in job_info.iterrows(): - tasks.append(Exit_Values[job_info.loc[ind, 'Exit_Value']]) - - - tasks = np.array(tasks) # to save the task list as numpy array - features.append(tasks) # values in tasks are features - labels.append(int(df_taskset.loc[index, 'Successful'])) # in the label list, append the value in the successful col from task-set - except Exception as e: # exception handler - print(e) - pass - pbar.update(1) - - - + jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]]) + except KeyError: + jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]] + for taskIdNo in (1,2,3): + if tasksetData[0][taskIdNo] != -1: + features += TASKS_DICT[tasksetData[0][taskIdNo]] + try: + features += jobExitsByTask[tasksetData[0][taskIdNo]] + except KeyError: + features += [Exit_Values['EXIT_ERROR']] + return np.array(features), label + + +def getFeaturesLabels(db_path): + conn = sqlite3.connect(db_path) + db_cursor = conn.cursor() + command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful'\ + ' FROM TaskSet JOIN Job'\ + ' ON TaskSet.Set_ID = Job.Set_ID and'\ + ' (TaskSet.TASK1_ID == Job.Task_ID or'\ + ' TaskSet.TASK2_ID == Job.Task_ID or'\ + ' TaskSet.TASK3_ID == Job.Task_ID);' + db_cursor.execute(command) + # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)] + data_table = db_cursor.fetchall() + print('reading taskset_jobs join done') + print('The current time is:',datetime.now()) + finalFeatureList = [] + finalLabelList = [] + currentTset = data_table[0][0] # first taskset id + tSetJobs = [] + totalSize = len(data_table) + for row in data_table: + if row[0] % 1000 == 0: + print('processed',int(100 * 
(row[0]/totalSize)),'%' ) + if row[0] == currentTset: + #then still same setTset + tSetJobs.append(row) + else: + # job of next taskset + # process data and record new + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + tSetJobs = [] + currentTset = row[0] + tSetJobs.append(row) + # proess last taskset + features, label = processTaskset(tSetJobs) + finalFeatureList.append(features) + finalLabelList.append(label) + return finalFeatureList, finalLabelList + + + +TASKS_DICT = getTaskFeatures(DB_PATH) +print('Tasks have been added to TASKS_DICT') +print('length of taskdict: ', len(TASKS_DICT)) +print('example task 222:',TASKS_DICT[222]) + +features, labels = getFeaturesLabels(DB_PATH) + + +print('The current time is:',datetime.now()) + +print("Done reading") + labels = np.array(labels) # to save the labels list as numpy array # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value @@ -173,3 +169,5 @@ with open ( '56_labels', 'wb' ) as outfile: pickle.dump(labels, outfile) + +print('The current time is:',datetime.now())
diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py index b9fa43c..c19c3fd 100644 --- a/python3-lstm/prediction.py +++ b/python3-lstm/prediction.py @@ -1,7 +1,4 @@ -import pickle import numpy as np -import pandas as pd -from tqdm import tqdm from keras.preprocessing.sequence import pad_sequences from keras.models import load_model import csv @@ -155,10 +152,7 @@ def getFeaturesLabels(db_path): print("Done reading") - - - -labels = np.array(labels) # to save the labels list as numpy array +labels = np.array(labels) # to save the labels list as numpy array # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post')
From 4f0b26a35deb445e4f5ed5fda0bcdba0d7061c54 Mon Sep 17 00:00:00 2001 From: privatereese <5148715+privatereese@users.noreply.github.com> Date: Sun, 8 Sep 2019 12:46:17 +0200 Subject: [PATCH 09/21] updated Data_preparation to accept a different DB --- python3-lstm/Data_preparation.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py index d215f89..4312e77 100644 --- a/python3-lstm/Data_preparation.py +++ b/python3-lstm/Data_preparation.py @@ -4,6 +4,10 @@ from keras.preprocessing.sequence import pad_sequences import sqlite3 +import sys +debug = False + + # PKG has a fixed set of labels. Integer encoding is used where integer # value is assigned to each label PKGs = { 'pi' : 0, @@ -44,9 +48,10 @@ } -print("Doing writing") +if debug: + print("Doing writing") -DB_PATH = "/home/bernhard/panda_v4.db" +DB_PATH = sys.argv[1] TASKS_DICT = {} def taskToFeatureList(task): @@ -135,7 +140,7 @@ def getFeaturesLabels(db_path): tSetJobs = [] currentTset = row[0] tSetJobs.append(row) - # proess last taskset +# process last taskset features, label = processTaskset(tSetJobs) finalFeatureList.append(features) finalLabelList.append(label) @@ -160,8 +165,9 @@ def getFeaturesLabels(db_path): # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. 
if longer than 56 trim the value
 features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post')
 
-#print(features.shape) # the dimensionality of features
-#print(labels.shape) # the dimensionality of labels
+if debug:
+    print(features.shape) # the dimensionality of features
+    print(labels.shape) # the dimensionality of labels
 
 # save both files for the training
 with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'

From 626490ec0e9e11d757ec1280756c49f0c75d4a Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Sun, 8 Sep 2019 12:47:55 +0200
Subject: [PATCH 10/21] added gitignore file

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9f11b75
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea/

From 78c0fd45f8ed537170bb757346dc9d14b001b167 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Sun, 8 Sep 2019 22:14:45 +0200
Subject: [PATCH 11/21] Updated environment setup and prediction

---
 bootstrap.sh                  |   3 +
 provision.sh                  |   3 +
 python3-lstm/prediction.py    | 160 ++--------------------------------
 python3-lstm/requirements.txt |   8 ++
 4 files changed, 21 insertions(+), 153 deletions(-)
 create mode 100644 python3-lstm/requirements.txt

diff --git a/bootstrap.sh b/bootstrap.sh
index 20131cf..f5e64fd 100644
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -10,3 +10,6 @@
 # Contributor: Bernhard Blieninger
 ##############################
 
+python3 -m venv lstm-virtenv
+source lstm-virtenv/bin/activate
+pip3 install -r python3-lstm/requirements.txt

diff --git a/provision.sh b/provision.sh
index 0a68924..5b4f244 100755
--- a/provision.sh
+++ b/provision.sh
@@ -10,5 +10,8 @@
 ######################
 
 sudo apt update -qq
+
 sudo apt install python3.5 python3-pip tmux -qq
+sudo apt install python3-venv
+#pip3 install --user virtualenv

diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py
index c19c3fd..dda7bbb 100644
--- a/python3-lstm/prediction.py
+++ b/python3-lstm/prediction.py
@@ -1,161 +1,15 @@
 import numpy as np
-from keras.preprocessing.sequence import pad_sequences
+import pickle
 from keras.models import load_model
 import csv
 
-# PKG has a fixed set of labels. Integer encoding is used where integer
-# value is assigned to each label
-PKGs = {
-    'pi': 0,
-    'hey': 1,
-    'tumatmul': 2,
-    'cond_mod': 3
-}
+with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'
+    #pickle.load(outfile features, outfile)
+    features = pickle.load(outfile, *, fix_imports=True, encoding="ASCII", errors="strict")
 
-# Integer encoding for Exit_Values from Jobs
-Exit_Values = {
-    'EXIT': 1,
-    'EXIT_CRITICAL': 0,
-    'EXIT_PERIOD': 2,
-    'OUT_OF_CAPS': 3,
-    'OUT_OF_QUOTA': 4,
-    'EXIT_ERROR': 5
-}
-
-# ARG values ranged from 1 to 205.891.132.094.649, these values were normalized and scaled
-# to range from 1 to 17
-Arg_Values = {
-    1: 1,
-    4096: 2,
-    8192: 3,
-    16384: 4,
-    32768: 5,
-    65536: 6,
-    131072: 7,
-    262144: 8,
-    524288: 9,
-    1048576: 10,
-    2097152: 11,
-    847288609443: 12,
-    2541865828329: 13,
-    7625597484987: 14,
-    22876792454961: 15,
-    68630377364883: 16,
-    205891132094649: 17
-}
-
-print("Doing writing")
-
-DB_PATH = "/home/bernhard/panda_v4.db"
-TASKS_DICT = {}
-
-
-def taskToFeatureList(task):
-    # returns a fature list for the corresponding task values
-    feature = []
-    feature.append(task['Priority'])
-    feature.append(task['Period'])
-    feature.append(task['Number_of_Jobs'])
-    feature.append(task['PKG'])
-    feature.append(task['Arg'])
-    feature.append(task['CRITICALTIME'])
-    return feature
-
-
-def getTaskFeatures(db_path): # c is the cursor for the db
-    # returns a dictionary
-    # { task_id : [ feature, list ]
-    conn = sqlite3.connect(db_path)
-    conn.row_factory = lambda C, R: {c[0]: R[i] for i, c in enumerate(C.description)}
-    db_cursor = conn.cursor()
-    db_cursor.execute('select Task_ID,Priority,Period,PKG,Arg,CRITICALTIME,Number_of_Jobs from Task')
-    outputTable = db_cursor.fetchall()
-
-    tasks_dict = {}
-    for row in outputTable:
-        row['Period'] = int(row['Period'] / 1000)
-        row['Number_of_Jobs'] = int(row['Number_of_Jobs'])
-        row['PKG'] = PKGs[row['PKG']]
-        row['CRITICALTIME'] = int(row['CRITICALTIME'] / 1000)
-        row['Arg'] = Arg_Values[row['Arg']]
-        tasks_dict[row['Task_ID']] = taskToFeatureList(row)
-    return tasks_dict
-
-
-def processTaskset(tasksetData):
-    # tasksetData is a list of tuples returned from the DB in getTasksetData()
-    label = tasksetData[0][-1]
-    features = []
-    jobExitsByTask = {}
-    for tsData in tasksetData:
-        try:
-            jobExitsByTask[tsData[4]].append(Exit_Values[tsData[5]])
-        except KeyError:
-            jobExitsByTask[tsData[4]] = [Exit_Values[tsData[5]]]
-    for taskIdNo in (1, 2, 3):
-        if tasksetData[0][taskIdNo] != -1:
-            features += TASKS_DICT[tasksetData[0][taskIdNo]]
-            try:
-                features += jobExitsByTask[tasksetData[0][taskIdNo]]
-            except KeyError:
-                features += [Exit_Values['EXIT_ERROR']]
-    return np.array(features), label
-
-
-def getFeaturesLabels(db_path):
-    conn = sqlite3.connect(db_path)
-    db_cursor = conn.cursor()
-    command = 'SELECT TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful' \
-              ' FROM TaskSet JOIN Job' \
-              ' ON TaskSet.Set_ID = Job.Set_ID and' \
-              ' (TaskSet.TASK1_ID == Job.Task_ID or' \
-              ' TaskSet.TASK2_ID == Job.Task_ID or' \
-              ' TaskSet.TASK3_ID == Job.Task_ID);'
-    db_cursor.execute(command)
-    # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)]
-    data_table = db_cursor.fetchall()
-    print('reading taskset_jobs join done')
-    print('The current time is:', datetime.now())
-    finalFeatureList = []
-    finalLabelList = []
-    currentTset = data_table[0][0] # first taskset id
-    tSetJobs = []
-    totalSize = len(data_table)
-    for row in data_table:
-        if row[0] % 1000 == 0:
-            print('processed', int(100 * (row[0] / totalSize)), '%')
-        if row[0] == currentTset:
-            # then still same setTset
-            tSetJobs.append(row)
-        else:
-            # job of next taskset
-            # process data and record new
-            features, label = processTaskset(tSetJobs)
-            finalFeatureList.append(features)
-            finalLabelList.append(label)
-            tSetJobs = []
-            currentTset = row[0]
-            tSetJobs.append(row)
-    # proess last taskset
-    features, label = processTaskset(tSetJobs)
-    finalFeatureList.append(features)
-    finalLabelList.append(label)
-    return finalFeatureList, finalLabelList
-
-
-TASKS_DICT = getTaskFeatures(DB_PATH)
-print('Tasks have been added to TASKS_DICT')
-print('length of taskdict: ', len(TASKS_DICT))
-print('example task 222:', TASKS_DICT[222])
-
-features, labels = getFeaturesLabels(DB_PATH)
-
-print('The current time is:', datetime.now())
-
-print("Done reading")
-
-labels = np.array(labels) # to save the labels list as numpy array
-
-# To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value
-features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post')
+with open ( '56_labels', 'wb' ) as outfile:
+    labels = pickle.load(outfile, *, fix_imports=True, encoding="ASCII", errors="strict")
+    #pickle.dump(labels, outfile)
 
 model = load_model('My_LSTM_Model.h5')
 X = np.expand_dims(features, axis=2)

diff --git a/python3-lstm/requirements.txt b/python3-lstm/requirements.txt
new file mode 100644
index 0000000..99bb79b
--- /dev/null
+++ b/python3-lstm/requirements.txt
@@ -0,0 +1,8 @@
+keras==2.2.5
+matplotlib==3.1.1
+numpy==1.17.2
+pandas==0.25.1
+seaborn==0.9.0
+scikit_learn==0.21.3
+tensorboard==1.14.0
+tensorflow==1.14.0

From 32da32ceb992a5167c707a2d1b9a6f4831cecb97 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Sun, 8 Sep 2019 22:18:45 +0200
Subject: [PATCH 12/21] corrected typo

---
 python3-lstm/Evaluation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python3-lstm/Evaluation.py b/python3-lstm/Evaluation.py
index ffd9811..2b5e489 100644
--- a/python3-lstm/Evaluation.py
+++ b/python3-lstm/Evaluation.py
@@ -58,8 +58,8 @@
 plt.figure ( figsize=(5.5, 4) )
 sns.heatmap ( cm_df, annot=True, fmt='g' )
-plt.title ( 'Confusoin Matrix \n Accuracy:{0:.3f}'.format ( accuracy_score ( yt, yp ) ) )
+plt.title ( 'Confusion Matrix \n Accuracy:{0:.3f}'.format ( accuracy_score ( yt, yp ) ) )
 plt.ylabel ( 'True label' )
 plt.xlabel ( 'Predicted label' )
 plt.show ()
-plt.savefig ( 'Confusoin_Matrix.png' )
+plt.savefig ( 'Confusion_Matrix.png' )

From 35664b09923ad3bcda4e434ee080606712ccda0b Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Sun, 8 Sep 2019 22:40:19 +0200
Subject: [PATCH 13/21] fixed missing spaces in string

---
 python3-lstm/prediction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py
index dda7bbb..46d1db7 100644
--- a/python3-lstm/prediction.py
+++ b/python3-lstm/prediction.py
@@ -19,7 +19,7 @@
 for i in range(len(labels)):
     l = labels[i]
     p = np.argmax(preds[i])
-    print ( "the actual value is{0}and the predicted value is {1}".format(l, p))
+    print ( "the actual value is {0} and the predicted value is {1}".format(l, p))
     arr.append([i + 1, l, p])
 
 csvfile = "Predicion_results.csv"

From f7b1d458ce0810bceedf4c038a5db33ac91563cc Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 00:35:51 +0200
Subject: [PATCH 14/21] Added a fixed random seed to the dataset split

---
 python3-lstm/CuDNNLSTM.py  | 2 +-
 python3-lstm/Evaluation.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index 5fd9629..569ba2a 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -39,7 +39,7 @@
 # print ( count )
 
 # divide data into training and test sets
-X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3 )
+X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3, random_state=42 )
 # print ( X_train.shape )
 
 # LSTM input is fifty-six time-steps and one feature at each time-step is represented by the notation: (56,1).

diff --git a/python3-lstm/Evaluation.py b/python3-lstm/Evaluation.py
index 2b5e489..cc2c75c 100644
--- a/python3-lstm/Evaluation.py
+++ b/python3-lstm/Evaluation.py
@@ -30,7 +30,7 @@
 y = np.array ( newy )
 print ( count )
 
-X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3 )
+X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.3, random_state=42 )
 print ( X_train.shape )
 
 model = load_model ( 'My_LSTM_Model.h5' ) # loading saved model

From 284e8cc1674ad545477d6782793c03e9b461ae81 Mon Sep 17 00:00:00 2001
From: Bernhard Blieninger <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 10:20:15 +0200
Subject: [PATCH 15/21] fixed minor typos

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 70d2e70..c488bc9 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ The first step is to preprocess the data. The database was imported and transfor
 5. Task Critical time: Integer
 6. Number of Jobs: Integer
 From Jobs only one feature was selected: Job Exit_Value: String.
-After exporting all tables, start with Data_preparation.py. Line 165 is responsible for the length of the feature vector.
+After exporting all tables, start with Data_preparation.py.
 Features and labels are saved at the end.
@@ -35,7 +35,7 @@ CuDNNLSTM.py. When using CPU, install Tensorflow and replace CuDNNLSTM with LSTM
 Evaluation.py. Evaluation prints the confusion matrix and classification report. Tensorboard can be launched by typing tensorboard --logdir=logs/ into the terminal and logs from trained models can be visualized
 **4. Prediction:**
-predictin.py. A CSV file will be save with actual and predictied values. The trained model should be loaded first.
+prediction.py. A CSV file will be saved with actual and predicted values. The trained model should be loaded first.
 **5. Plotting:**
 Plotting.py. Another way to visualize the model built.

From 9b224b6fba2df3483837800f0323170f30baec88 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 11:25:35 +0200
Subject: [PATCH 16/21] edited CuDNNLSTM and Data_preparation

---
 python3-lstm/CuDNNLSTM.py        | 3 +--
 python3-lstm/Data_preparation.py | 3 +++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index 569ba2a..2ceaab5 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -46,8 +46,7 @@
 input = Input ( shape=(56, 1) )
 
 # the first LSTM layer has 64 cells, the number must be equal/bigger than the input size. If you are using a CPU then change CuDNNLSTM to LSTM
-lstm = CuDNNLSTM ( 64, return_sequences=True ) (
-    input )  # Return_sequences is set true because the first LSTM has to return a sequence, which then can be fed into the 2nd LSTM
+lstm = CuDNNLSTM ( 64, return_sequences=True ) ( input ) # Return_sequences is set true because the first LSTM has to return a sequence, which then can be fed into the 2nd LSTM
 lstm = CuDNNLSTM ( 128, return_sequences=True ) ( lstm )
 lstm = CuDNNLSTM ( 256 ) ( lstm )

diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py
index 4312e77..7e1bb70 100644
--- a/python3-lstm/Data_preparation.py
+++ b/python3-lstm/Data_preparation.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 import pickle
+import sys
 import numpy as np
 from keras.preprocessing.sequence import pad_sequences
 import sqlite3
@@ -166,7 +167,9 @@ def getFeaturesLabels(db_path):
 
 if debug:
     print(features.shape) # the dimensionality of features
+    input()
     print(labels.shape) # the dimensionality of labels
+    input()
 
 # save both files for the training
 with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'

From 57201f58626a0436e120fdce6b3541bc0257615e Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 11:30:33 +0200
Subject: [PATCH 17/21] added a runtime timer to Data_preparation

---
 python3-lstm/Data_preparation.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py
index 7e1bb70..a5fd194 100644
--- a/python3-lstm/Data_preparation.py
+++ b/python3-lstm/Data_preparation.py
@@ -4,6 +4,9 @@
 import numpy as np
 from keras.preprocessing.sequence import pad_sequences
 import sqlite3
+import timeit
+
+start = timeit.default_timer()
 
 debug = False
@@ -178,4 +181,7 @@ def getFeaturesLabels(db_path):
 with open ( '56_labels', 'wb' ) as outfile:
     pickle.dump(labels, outfile)
 
-print('The current time is:',datetime.now())
+
+stop = timeit.default_timer()
+
+print('Time elapsed: ', stop - start)

From e3b8733d8c4c26cea03199f2e254a4c278f286fb Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 13:07:03 +0200
Subject: [PATCH 18/21] deleted timer again; timing will be done with the Linux time command

---
 python3-lstm/Data_preparation.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py
index a5fd194..4954465 100644
--- a/python3-lstm/Data_preparation.py
+++ b/python3-lstm/Data_preparation.py
@@ -4,9 +4,6 @@
 import numpy as np
 from keras.preprocessing.sequence import pad_sequences
 import sqlite3
-import timeit
-
-start = timeit.default_timer()
 
 debug = False
@@ -181,7 +178,3 @@ def getFeaturesLabels(db_path):
 with open ( '56_labels', 'wb' ) as outfile:
     pickle.dump(labels, outfile)
 
-
-stop = timeit.default_timer()
-
-print('Time elapsed: ', stop - start)

From 106c51783f32a789d7de5eccd0315b8600c4c601 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 13:50:17 +0200
Subject: [PATCH 19/21] adapted some values to fit a smaller tensor range

---
 python3-lstm/CuDNNLSTM.py        |  5 +++++
 python3-lstm/Data_preparation.py | 28 ++++++++++++----------------
 python3-lstm/prediction.py       | 10 ++++------
 3 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index 2ceaab5..ef79ca5 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -10,6 +10,11 @@
 from keras.optimizers import Adam
 from sklearn.model_selection import train_test_split
 
+
+#ignore deprecation warnings to get a better and cleaner output
+import tensorflow.python.util.deprecation as deprecation
+deprecation._PRINT_DEPRECATION_WARNINGS = False
+
 name = "logname-{}".format ( int ( time.time () ) )
 
 # both metrics and early stopping conditions are defined here and then saved in the log42 file

diff --git a/python3-lstm/Data_preparation.py b/python3-lstm/Data_preparation.py
index 4954465..0f8bbad 100644
--- a/python3-lstm/Data_preparation.py
+++ b/python3-lstm/Data_preparation.py
@@ -1,4 +1,6 @@
-from datetime import datetime
+
+import warnings
+warnings.filterwarnings('ignore',category=FutureWarning)
 import pickle
 import sys
 import numpy as np
@@ -118,16 +120,13 @@ def getFeaturesLabels(db_path):
     db_cursor.execute(command)
     # data_table format: [( TaskSet.Set_ID, TaskSet.TASK1_ID, TaskSet.TASK2_ID, TaskSet.TASK3_ID, Job.Task_ID, Job.Exit_Value, TaskSet.Successful)]
     data_table = db_cursor.fetchall()
-    print('reading taskset_jobs join done')
-    print('The current time is:',datetime.now())
+
     finalFeatureList = []
     finalLabelList = []
     currentTset = data_table[0][0] # first taskset id
     tSetJobs = []
     totalSize = len(data_table)
     for row in data_table:
-        if row[0] % 1000 == 0:
-            print('processed',int(100 * (row[0]/totalSize)),'%' )
         if row[0] == currentTset:
             #then still same setTset
             tSetJobs.append(row)
@@ -149,21 +148,18 @@ def getFeaturesLabels(db_path):
 
 
 TASKS_DICT = getTaskFeatures(DB_PATH)
-print('Tasks have been added to TASKS_DICT')
-print('length of taskdict: ', len(TASKS_DICT))
-print('example task 222:',TASKS_DICT[222])
-
-features, labels = getFeaturesLabels(DB_PATH)
+if debug:
+    print('Tasks have been added to TASKS_DICT')
+    print('length of taskdict: ', len(TASKS_DICT))
+    print('example task 222:',TASKS_DICT[222])
 
-print('The current time is:',datetime.now())
+features, labels = getFeaturesLabels(DB_PATH)
 
-print("Done reading")
-
 labels = np.array(labels) # to save the labels list as numpy array
 
 # To make a fixed length vector, if the vector is smaller than 56 then replace the empty values with -1. if longer than 56 trim the value
-features = pad_sequences(features, maxlen=56, value=-1, padding='post', truncating='post')
+features = pad_sequences(features, maxlen=42, value=-1, padding='post', truncating='post')
 
 if debug:
     print(features.shape) # the dimensionality of features
@@ -172,9 +168,9 @@ def getFeaturesLabels(db_path):
     input()
 
 # save both files for the training
-with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'
+with open ( '42_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'
     pickle.dump(features, outfile)
-with open ( '56_labels', 'wb' ) as outfile:
+with open ( '42_labels', 'wb' ) as outfile:
     pickle.dump(labels, outfile)

diff --git a/python3-lstm/prediction.py b/python3-lstm/prediction.py
index 46d1db7..8dc2274 100644
--- a/python3-lstm/prediction.py
+++ b/python3-lstm/prediction.py
@@ -3,13 +3,11 @@
 from keras.models import load_model
 import csv
 
-with open ( '56_features', 'wb' ) as outfile: # 'wb' is the file mode, it means 'write binary'
-    #pickle.load(outfile features, outfile)
-    features = pickle.load(outfile, *, fix_imports=True, encoding="ASCII", errors="strict")
+with open ( '42_features', 'rb' ) as outfile: # 'rb' is the file mode, it means 'read binary'
+    features = pickle.load(outfile, fix_imports=True)
 
-with open ( '56_labels', 'wb' ) as outfile:
-    labels = pickle.load(outfile, *, fix_imports=True, encoding="ASCII", errors="strict")
-    #pickle.dump(labels, outfile)
+with open ( '42_labels', 'rb' ) as outfile:
+    labels = pickle.load(outfile, fix_imports=True)
 
 model = load_model('My_LSTM_Model.h5')
 X = np.expand_dims(features, axis=2)

From 8ded36d9373608f4074170a0ef1c4dbbded2d31c Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 13:52:01 +0200
Subject: [PATCH 20/21] adapted values to fit the new name

---
 python3-lstm/CuDNNLSTM.py  | 4 ++--
 python3-lstm/Evaluation.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index ef79ca5..a96dd2d 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -23,9 +23,9 @@
 es = EarlyStopping ( monitor='val_loss', mode='min', verbose=1 ) # define early stopping criteria
 
 # Importing the extracted features and labels
-with open ( '56_features', 'rb' ) as fp:
+with open ( '42_features', 'rb' ) as fp:
     X = pickle.load ( fp )
-with open ( '56_labels', 'rb' ) as fp:
+with open ( '42_labels', 'rb' ) as fp:
     y = pickle.load ( fp )
 
 # LSTM's input shape argument expects a three-dimensional array as an input in this order: samples, timesteps and features. This is why we need to add another dimension to the numpy array.
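Before the Evaluation.py half of this patch, the comment above is worth unpacking: Keras recurrent layers consume a three-dimensional tensor of shape (samples, timesteps, features), while the pickled feature matrix is only two-dimensional. A minimal sketch of the reshape (the array contents here are made-up placeholders; only the shapes matter):

    import numpy as np

    # stand-in for the pickled feature matrix: 3 samples, 42 timesteps each
    X = np.full((3, 42), -1)
    print(X.shape)               # (3, 42)

    # append a trailing axis so every timestep carries exactly one feature
    X = np.expand_dims(X, axis=2)
    print(X.shape)               # (3, 42, 1), matching Input ( shape=(42, 1) )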
diff --git a/python3-lstm/Evaluation.py b/python3-lstm/Evaluation.py
index cc2c75c..1ffccc8 100644
--- a/python3-lstm/Evaluation.py
+++ b/python3-lstm/Evaluation.py
@@ -12,9 +12,9 @@
 from sklearn import metrics
 from sklearn.model_selection import train_test_split
 
-with open ( '56_features', 'rb' ) as fp:
+with open ( '42_features', 'rb' ) as fp:
     X = pickle.load ( fp )
-with open ( '56_labels', 'rb' ) as fp:
+with open ( '42_labels', 'rb' ) as fp:
     y = pickle.load ( fp )
 
 X = np.expand_dims ( X, axis=2 )

From b0d12c713923eaa090bc0594d715261dc9b40870 Mon Sep 17 00:00:00 2001
From: privatereese <5148715+privatereese@users.noreply.github.com>
Date: Mon, 9 Sep 2019 14:02:51 +0200
Subject: [PATCH 21/21] adopted new shape of features and fixed warning suppression

---
 python3-lstm/CuDNNLSTM.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/python3-lstm/CuDNNLSTM.py b/python3-lstm/CuDNNLSTM.py
index a96dd2d..1a28871 100644
--- a/python3-lstm/CuDNNLSTM.py
+++ b/python3-lstm/CuDNNLSTM.py
@@ -1,5 +1,10 @@
 import pickle
 import time
+import warnings
+warnings.filterwarnings('ignore',category=FutureWarning)
+#ignore deprecation warnings to get a better and cleaner output
+from tensorflow.python.util import deprecation
+deprecation._PRINT_DEPRECATION_WARNINGS = False
 import tensorflow as tf
 import numpy as np
 from keras.callbacks import TensorBoard
@@ -11,9 +16,6 @@
 from sklearn.model_selection import train_test_split
 
 
-#ignore deprecation warnings to get a better and cleaner output
-import tensorflow.python.util.deprecation as deprecation
-deprecation._PRINT_DEPRECATION_WARNINGS = False
 
 name = "logname-{}".format ( int ( time.time () ) )
 
@@ -48,7 +50,7 @@
 # print ( X_train.shape )
 
 # LSTM input is fifty-six time-steps and one feature at each time-step is represented by the notation: (56,1).
-input = Input ( shape=(56, 1) )
+input = Input ( shape=(42, 1) )
 
 # the first LSTM layer has 64 cells, the number must be equal/bigger than the input size. If you are using a CPU then change CuDNNLSTM to LSTM
 lstm = CuDNNLSTM ( 64, return_sequences=True ) ( input ) # Return_sequences is set true because the first LSTM has to return a sequence, which then can be fed into the 2nd LSTM
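After PATCH 21 the pieces fit together as follows: Data_preparation.py pads each feature vector to 42 values and pickles them as 42_features and 42_labels, and the model now takes Input ( shape=(42, 1) ), although the comment beside it still says fifty-six. A condensed sketch of the resulting load-and-predict flow in prediction.py, assuming 42_features, 42_labels and My_LSTM_Model.h5 exist in the working directory and the package versions pinned in requirements.txt are installed (this is an illustration of the flow, not the repository script verbatim):

    import pickle
    import numpy as np
    from keras.models import load_model

    # 'rb' is the file mode, it means 'read binary'; the files were written with pickle.dump
    with open('42_features', 'rb') as infile:
        features = pickle.load(infile)
    with open('42_labels', 'rb') as infile:
        labels = pickle.load(infile)

    model = load_model('My_LSTM_Model.h5')
    X = np.expand_dims(features, axis=2)   # (samples, 42) -> (samples, 42, 1)
    preds = model.predict(X)

    for i, label in enumerate(labels):
        predicted = np.argmax(preds[i])
        print("the actual value is {0} and the predicted value is {1}".format(label, predicted))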