# Pam50Classification_algorithms/Main.py at master · ABurrello/Pam50Classification_algorithms · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import numpy as np
import pdb
import scipy
from sklearn.model_selection import KFold
import copy
import scipy.io
#myclasses
from DR import DimReduction
from Classification import Classification_methods
from Opening_design import GUI_design
from Imbalance_manager import Imbalance_classes

# Number of folds used by the K-fold cross validation.
NFOLDS = 10
# Number of features kept after the reduction step, one constant per method.
# The value applies both to feature-selection and to feature-reduction
# methods.
NfeaturesPCA = 180
NfeaturesLDA = 4
NfeaturesGA = 1800
# Pipeline configuration.
# The pipeline choices are obtained from GUI_design.Pipeline_construction();
# according to the original note they come either from the launching command
# arguments or from an interactive prompt at program start.
displayer = GUI_design()
Arguments = displayer.Pipeline_construction()
Feature_reduction = Arguments[0]
Imbalance_model = Arguments[1]
Classification_choice = Arguments[2]

# Output dimensionality associated with each supported reduction method.
_NFEATURES_BY_METHOD = {
    'PCA': NfeaturesPCA,
    'LDA': NfeaturesLDA,
    'GA': NfeaturesGA,
}
# Fail fast with an explicit message: previously an unrecognised method left
# Nfeatures unbound and the script crashed later with a NameError when the
# DimReduction object was built.
if Feature_reduction not in _NFEATURES_BY_METHOD:
    raise ValueError(
        'Unsupported feature reduction {!r}; expected one of {}'.format(
            Feature_reduction, sorted(_NFEATURES_BY_METHOD)))
Nfeatures = _NFEATURES_BY_METHOD[Feature_reduction]

# Seed NumPy's global random generator so that runs are repeatable.
np.random.seed(0)

# Load the dataset from the MATLAB file in the working directory.
dataset = scipy.io.loadmat('dataset.mat')
# The first row of both the data matrix and the label vector is skipped.
# NOTE(review): presumably a header/placeholder row -- confirm against the
# .mat file.
X = dataset['Transcriptome'][1:, :]
Y2 = dataset['Y'][1:]
# NOTE(review): feat_name is not filtered together with the columns of X
# below, so it no longer lines up with X once all-zero features are removed.
feat_name = dataset['Transcriptome_labels']
del dataset

# Integer code for every (space-padded) label string stored in the dataset.
# Codes 1-5 are the five subtypes that are kept; code 6 marks samples to
# exclude (healthy patients and samples without a usable label).
_LABEL_CODES = (
    ('Basal-like   ', 1),
    ('Normal-like  ', 2),
    ('Luminal A    ', 3),
    ('Luminal B    ', 4),
    ('HER2-enriched', 5),
    ('Healty       ', 6),
    ('NA           ', 6),
    ('Not present  ', 6),
)
# NOTE(review): a label matching none of the strings above keeps code 0 and
# therefore survives the "< 6" filter -- confirm this is intended.
Y = np.zeros([len(Y2)])
for idx in range(len(Y)):
    # Labels are compared with == (not hashed) so this behaves the same for
    # whatever element type loadmat produced for Y.
    for label, code in _LABEL_CODES:
        if Y2[idx] == label:
            Y[idx] = code

# Exclude all the healthy patients and the patients without a label.
keep = Y < 6
X = X[keep, :]
Y = Y[keep]
# Remove the features that are 0 for every remaining sample. Testing each
# entry (instead of the column sum used before) keeps a feature whose
# non-zero values happen to cancel out, and avoids a Python-level loop over
# the rows of X.
X = X[:, (X != 0).any(axis=0)]
# Run the selected pipeline and compute Accuracy, F1 score and BER.
#
# Unsupervised pipeline (kMeans / HierarchicalClustering): the whole dataset
# is reduced without labels and clustered in one shot. No K-fold is performed
# and no class-imbalance handling is applied, because the labels are not used
# for training; they are only handed to the classifier (ytest=Y) so that the
# scores can be computed at the end.
# NOTE(review): the original comment says the reduction is "forced to PCA"
# here, but this script passes the user-selected Feature_reduction as is --
# confirm whether that is enforced upstream (e.g. in the GUI).
if Classification_choice in ('kMeans', 'HierarchicalClustering'):
    FeatR = DimReduction(mdl_type=Feature_reduction, dim_out=Nfeatures)
    X_train = FeatR.Dataset_reduction(X, supervised=False)
    classifier = Classification_methods(X_train, ytest=Y)
    Accuracy, F1_score, BER = classifier.Classification_start(Classification_choice)
else:
    # Supervised pipeline: shuffled K-fold cross validation with a fixed
    # random_state so the splits are the same on every run.
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=2)
    # Running sums of the per-fold metrics; averaged after the loop.
    Accuracy = 0
    F1_score = 0
    BER = 0
    for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        # Feature reduction: a fresh DimReduction object per fold, given the
        # training labels only.
        FeatR = DimReduction(mdl_type=Feature_reduction, dim_out=Nfeatures)
        X_train, X_test = FeatR.Dataset_reduction(
            X_train=X_train, supervised=True, X_test=X_test, labels=y_train)
        # Class imbalance handling, applied to the training split only.
        model0 = Imbalance_classes(X_train, y_train)
        X_train, y_train = model0.apply(Imbalance_model)
        # Re-seed before learning so every fold starts from the same
        # random state.
        np.random.seed(0)
        # Learning step
        classifier = Classification_methods(X_train, y_train, X_test, y_test)
        Acc, F1, BER_int = classifier.Classification_start(Classification_choice)
        # print(...) with a single argument behaves the same on Python 2 and
        # Python 3, unlike the print statement.
        print('Summary of fold {:.0f}'.format(fold))
        print('-----------------')
        print('Accuracy = {:.2f} \nF1 score = {:.2f} \nBER = {:.2f}'
              .format(Acc, F1, BER_int))
        Accuracy += Acc
        F1_score += F1
        BER += BER_int
    # Average the metrics over the folds. See the corresponding class for
    # how each metric is computed.
    Accuracy = Accuracy / float(NFOLDS)
    F1_score = F1_score / float(NFOLDS)
    BER = BER / float(NFOLDS)
# Report the final metrics (fold averages for the supervised pipeline, the
# single-run scores for the clustering pipeline). Each print(...) call takes a
# single argument, so the output is identical on Python 2 and Python 3.
print('Final Summary')
print('-------------')
print('Accuracy = {:.2f} \nF1 score = {:.2f} \nBER = {:.2f}'
      .format(Accuracy, F1_score, BER))