-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathClusterModule.py
More file actions
72 lines (47 loc) · 1.69 KB
/
ClusterModule.py
File metadata and controls
72 lines (47 loc) · 1.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#-*-coding:utf-8-*-
from __future__ import division
from scipy import sparse
from scipy.sparse.linalg.eigen import arpack
from numpy import *
from pyspark.mllib.clustering import KMeans, KMeansModel
import Utils
import FileParser as fp
import PredefinedValues as pv
def getClusterModel(sc, mat, rawdata, clusterNum, dimensionReductionNum, targetEigenVecFile):
    """Run the spectral-clustering style pipeline on ``mat`` and return the model.

    Steps, in order: scale the matrix with ``getLaplacianMatrix``, compute
    ``dimensionReductionNum`` eigenvectors with ``computeEigenValsVectors``,
    row-normalize them with ``unification``, write them out through
    ``fp.outputMatrix``, distribute them with ``sc.parallelize`` and cluster
    them with ``kMeans``.

    Args:
        sc: object providing ``parallelize`` (presumably a SparkContext --
            TODO confirm against the caller).
        mat: matrix handed to ``getLaplacianMatrix``.
        rawdata: not used by this function; kept so existing callers work.
        clusterNum: number of clusters passed to ``kMeans``.
        dimensionReductionNum: number of eigenvectors to compute.
        targetEigenVecFile: destination passed to ``fp.outputMatrix``.

    Returns:
        Tuple ``(model, unifiedRDDVecs)``: the value returned by ``kMeans``
        and the parallelized row-normalized eigenvectors it was trained on.
    """
    # The eigenvalues are computed but only the eigenvectors are used below.
    _, eigenVectors = computeEigenValsVectors(
        getLaplacianMatrix(mat), dimensionReductionNum)

    normalizedVectors = unification(eigenVectors)
    fp.outputMatrix(normalizedVectors, targetEigenVecFile)

    vectorsRDD = sc.parallelize(normalizedVectors)
    clusterModel = kMeans(vectorsRDD, clusterNum)

    if pv.outputDebugMsg:
        Utils.logMessage("\nCluster finished")
    return clusterModel, vectorsRDD
def getLaplacianMatrix(mat):
    """Scale ``mat`` symmetrically by the inverse square root of its row sums.

    Builds the sparse diagonal matrix ``S`` whose i-th diagonal entry is
    ``sqrt(1 / sum(mat[i, :]))`` and returns ``S * mat * S``.

    Args:
        mat: square matrix supporting ``sum(1)`` and multiplication with a
            SciPy sparse matrix (presumably a SciPy sparse affinity matrix --
            TODO confirm against the caller).

    Returns:
        The product ``S * mat * S``.
    """
    rowSums = mat.sum(1)
    # NOTE(review): a row whose sum is zero causes a division by zero here.
    invSqrtRowSums = sqrt(1 / rowSums)
    size = len(invSqrtRowSums)
    # The values are transposed before being passed to spdiags as the
    # data for diagonal 0 of a size x size matrix.
    scaling = sparse.spdiags(invSqrtRowSums.T, 0, size, size)
    if pv.outputDebugMsg:
        Utils.logMessage("\nConvert to Laplacian Matrix finished")
    return scaling * mat * scaling
def computeEigenValsVectors(mat, dimensionReductionNum):
    """Compute ``dimensionReductionNum`` eigenpairs of ``mat`` with ARPACK.

    Calls ``arpack.eigs`` with ``which="LM"`` (largest-magnitude eigenvalues)
    and ``tol=0``.

    Args:
        mat: matrix passed straight to ``arpack.eigs``.
        dimensionReductionNum: number of eigenvalues/eigenvectors requested
            (the ``k`` argument of ``eigs``).

    Returns:
        Tuple ``(eigenVals, eigenVecs)`` as produced by ``arpack.eigs``.
    """
    spectrum = arpack.eigs(mat, k=dimensionReductionNum, tol=0, which="LM")
    if pv.outputDebugMsg:
        Utils.logMessage("\nCompute eigen values vectors finished")
    values, vectors = spectrum
    return values, vectors
def unification(vecs):
    """Scale each row of ``vecs`` in place by the root of its sum of squares.

    For every row ``i`` the divisor is ``sqrt(sum_j vecs[i, j] * vecs[i, j])``.
    The result is written back into ``vecs`` itself, so the caller's array is
    modified, and the same object is returned.

    Args:
        vecs: two-dimensional NumPy array or matrix; anything that does not
            have exactly two dimensions raises ``ValueError`` on unpacking
            its shape.

    Returns:
        ``vecs``, after the in-place row scaling.
    """
    rows, _ = shape(vecs)
    # NOTE(review): the squares are taken without complex conjugation, so for
    # entries with a non-zero imaginary part this is not the Euclidean norm.
    # NOTE(review): an all-zero row yields a zero divisor -- confirm callers
    # never pass one.
    sq_sum = sqrt(multiply(vecs, vecs).sum(1))
    # One broadcast division replaces the per-element Python loop; reshaping
    # the divisors to a column lines them up with the rows of ``vecs``.
    # Assigning through a slice keeps the update in place, preserving the
    # input's dtype and identity.
    vecs[:, :] = vecs / reshape(sq_sum, (rows, 1))
    if pv.outputDebugMsg:
        Utils.logMessage("\nUnification finished")
    return vecs
def kMeans(vecs, clusterNum):
    """Train a k-means model on ``vecs`` with ``clusterNum`` clusters.

    Delegates to ``KMeans.train`` using random initialization, at most 10
    iterations and ``runs=10``.

    Args:
        vecs: training data passed to ``KMeans.train`` (presumably an RDD of
            vectors -- TODO confirm against the caller).
        clusterNum: number of clusters to fit.

    Returns:
        The model object returned by ``KMeans.train``.
    """
    trainingOptions = {
        "maxIterations": 10,
        "runs": 10,
        "initializationMode": "random",
    }
    model = KMeans.train(vecs, clusterNum, **trainingOptions)
    if pv.outputDebugMsg:
        Utils.logMessage("\nKmean cluster finished")
    return model