-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathClassification.py
More file actions
147 lines (133 loc) · 6.29 KB
/
Classification.py
File metadata and controls
147 lines (133 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import pdb
import numpy as np
import scipy
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
class Classification_methods(object):
    """Run one of several scikit-learn learners and score its predictions.

    The instance only stores the arrays it is given. ``Classification_start``
    selects a learner by name, stores that learner's output in
    ``self.Prediction`` and then fills ``self.Accuracy``, ``self.F1_score``
    and ``self.BER``.

    Class labels are expected to be the integers ``1..N_CLASSES``: the
    per-class metrics and the cluster-to-class mapping iterate over exactly
    that range.
    """

    # Number of classes; labels are expected to be 1..N_CLASSES.
    N_CLASSES = 5
    # Number of clusters requested by both clustering learners.
    N_CLUSTERS = 20

    def __init__(self, xtrain, ytrain=None, xtest=None, ytest=None):
        """Store the train/test features and labels for later use."""
        self.xtrain = xtrain
        self.ytrain = ytrain
        self.xtest = xtest
        self.ytest = ytest

    def Classification_start(self, string):
        """Run the learner named ``string`` and compute all the metrics.

        INPUT:  - string: name of the learner to apply; one of 'SVC',
                  'Random_Forest', 'kNN', 'kMeans', 'HierarchicalClustering'.
        OUTPUT: - (self.Accuracy, self.F1_score, self.BER)
        RAISES: - ValueError if ``string`` is not a known learner name.
        """
        learners = {
            'SVC': self.SVC,
            'Random_Forest': self.Random_forest,
            'kNN': self.kNN,
            'kMeans': self.kMeans_supervised,
            'HierarchicalClustering': self.AggClustering,
        }
        # Fail loudly on a typo instead of scoring a missing (or stale)
        # self.Prediction left over from a previous run.
        if string not in learners:
            raise ValueError(
                'Unknown learner %r; expected one of: %s'
                % (string, ', '.join(sorted(learners))))

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Starting Learning step')
        print('----------------------')
        learners[string]()

        self.Accuracy_evaluation()
        self.F1_score_evaluation()
        self.BER_evaluation()
        return self.Accuracy, self.F1_score, self.BER

    def SVC(self):
        """Fit a linear-kernel Support Vector Classifier and predict xtest.

        Trains on (self.xtrain, self.ytrain) with C=1.0 and at most 10000
        solver iterations; the result is stored in self.Prediction.
        """
        # degree, gamma and coef0 are passed through unchanged; they are not
        # used by the linear kernel.
        clf = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto',
                      coef0=0.0, verbose=False, max_iter=10000)
        clf.fit(self.xtrain, self.ytrain)
        self.Prediction = clf.predict(self.xtest)

    def kNN(self):
        """Fit a 1-nearest-neighbour classifier and predict xtest.

        Uses uniform weights and a brute-force search, i.e. the distance to
        every training point is computed. Trains on (self.xtrain,
        self.ytrain); the result is stored in self.Prediction.
        """
        # p=2 with the minkowski metric is the euclidean distance.
        clf = KNeighborsClassifier(n_neighbors=1, weights='uniform',
                                   algorithm='brute', p=2,
                                   metric='minkowski')
        clf.fit(self.xtrain, self.ytrain)
        self.Prediction = clf.predict(self.xtest)

    def Random_forest(self):
        """Fit a Random Forest and predict xtest.

        The forest has 40 trees, a maximum depth of 8 and at most 25 leaf
        nodes per tree (limits meant to reduce overfitting), with a fixed
        random_state for reproducibility. Trains on (self.xtrain,
        self.ytrain); the result is stored in self.Prediction.
        """
        clf = RandomForestClassifier(n_estimators=40, max_depth=8,
                                     max_leaf_nodes=25, random_state=0)
        clf.fit(self.xtrain, self.ytrain)
        self.Prediction = clf.predict(self.xtest)

    def _label_clusters(self, cluster_ids, n_clusters):
        """Give every sample the majority true class of its cluster.

        INPUT:  - cluster_ids: cluster index of each clustered sample.
                - n_clusters: number of clusters to iterate over.
        OUTPUT: - float array, same length as cluster_ids, holding for each
                  sample the label (1..N_CLASSES) that is most frequent in
                  self.ytest among the members of its cluster. Ties go to
                  the smallest label.
        """
        cluster_ids = np.asarray(cluster_ids)
        truth = np.asarray(self.ytest)
        guesses = np.zeros(cluster_ids.size)
        for cluster in range(n_clusters):
            members = cluster_ids == cluster
            member_truth = truth[members]
            votes = [np.count_nonzero(member_truth == label)
                     for label in range(1, self.N_CLASSES + 1)]
            # Labels start at 1 while argmax is 0-based.
            guesses[members] = np.argmax(votes) + 1
        return guesses

    def kMeans_supervised(self):
        """Cluster xtrain with k-means and label clusters by majority vote.

        k-means++ initialisation is used with N_CLUSTERS clusters (more
        clusters than classes). Each cluster is then assigned the most
        frequent label found in self.ytest among its members, and those
        per-sample labels are stored in self.Prediction.

        NOTE(review): the clusters come from self.xtrain but are labelled
        with self.ytest, so ytest must be aligned sample-by-sample with
        xtrain - presumably callers pass the same data set for both; confirm
        against the caller.
        """
        # n_init is pinned to the historical default so that results do not
        # depend on the installed scikit-learn version.
        kmeans = KMeans(init='k-means++', n_clusters=self.N_CLUSTERS,
                        n_init=10, random_state=0)
        kmeans.fit(self.xtrain)
        self.Prediction = self._label_clusters(kmeans.labels_,
                                               self.N_CLUSTERS)

    def AggClustering(self):
        """Cluster xtrain hierarchically and label clusters by majority vote.

        Ward linkage (which merges the pair of clusters that minimises the
        within-cluster variance) is run on xtrain scaled by its global
        maximum, with N_CLUSTERS clusters. Each cluster is then assigned the
        most frequent label found in self.ytest among its members, and those
        per-sample labels are stored in self.Prediction.

        NOTE(review): as in kMeans_supervised, self.ytest must be aligned
        sample-by-sample with self.xtrain; confirm against the caller.
        """
        # Ward linkage only supports the euclidean distance, which is the
        # default. The explicit `affinity` keyword is omitted because newer
        # scikit-learn releases no longer accept it.
        clt = AgglomerativeClustering(n_clusters=self.N_CLUSTERS,
                                      linkage='ward')
        # Work in floats so the scaling is a true division on Python 2 as
        # well, and skip it when the maximum is 0 to avoid NaN/inf features.
        features = np.asarray(self.xtrain, dtype=float)
        scale = features.max()
        if scale != 0:
            features = features / scale
        clt.fit(features)
        self.Prediction = self._label_clusters(clt.labels_, self.N_CLUSTERS)

    def Accuracy_evaluation(self):
        """Store the accuracy of self.Prediction w.r.t. self.ytest."""
        self.Accuracy = accuracy_score(self.ytest, self.Prediction)

    def F1_score_evaluation(self):
        """Store the support-weighted mean of the per-class F1 scores.

        average='weighted' is used; 'micro' and 'macro' are the other
        common options (see the scikit-learn documentation).
        """
        self.F1_score = f1_score(self.ytest, self.Prediction,
                                 average='weighted')

    def BER_evaluation(self):
        """Store the balanced error rate of self.Prediction in self.BER.

        This is an adapted version: the recall of each class is computed
        separately and BER = 1 - mean(recall). Classes with no sample in
        self.ytest are left out of the mean, since their recall is
        undefined.

        RAISES: - ValueError if none of the labels 1..N_CLASSES occurs in
                  self.ytest.
        """
        truth = np.asarray(self.ytest)
        predicted = np.asarray(self.Prediction)
        recalls = []
        for label in range(1, self.N_CLASSES + 1):
            is_label = truth == label
            support = np.count_nonzero(is_label)
            if support == 0:
                # Recall is undefined for an absent class; skip it rather
                # than dividing by zero.
                continue
            hits = np.count_nonzero(is_label & (predicted == label))
            recalls.append(hits / float(support))
        if not recalls:
            raise ValueError(
                'BER is undefined: ytest contains none of the labels 1..%d'
                % self.N_CLASSES)
        self.BER = 1 - np.mean(recalls)

    def Confusion_matrix(self):
        """Print the recall of each class, one value per line.

        A N_CLASSES x N_CLASSES confusion matrix (rows: true label, columns:
        predicted label) is accumulated, then for every class the diagonal
        entry divided by the row total is printed. A class that never occurs
        in self.ytest prints ``nan``.
        """
        confusion_matrix = np.zeros([self.N_CLASSES, self.N_CLASSES])
        for true_label, predicted_label in zip(self.ytest, self.Prediction):
            # Labels are 1-based, matrix indices are 0-based.
            confusion_matrix[int(true_label) - 1,
                             int(predicted_label) - 1] += 1
        # 0/0 for an absent class is reported as nan without a warning.
        with np.errstate(divide='ignore', invalid='ignore'):
            for i in range(self.N_CLASSES):
                print(confusion_matrix[i, i] / confusion_matrix[i].sum())