diffusion_maps/diffusion_maps.py at master · kaynakb/diffusion_maps · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
# diffusion_maps.py is a compact python module for a modified version of the (an)isotropic
# diffusion maps technique with a variable bandwidth based on the research article by Coifman and
# Lafon [1]. The version of the modification used in the module is based on the discussion
# presented in the research article by Zelnic-Manor and Perona [2].

# [1] R. R. Coifman, S. Lafon, Diffusion maps, Appl. Comput. Harmon. Anal., 21 (2006) 5-30.
# [2] L. Zelnic-Manor, P. Parona, Self-tuning spectral clustering, Adv. NIPS. 17,
# (MIT, Cambridge, MA, 2004) 1601-1608.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

from timeit import default_timer as timer
import numpy as np
from scipy.spatial.distance import cdist
from scipy.linalg import svd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigsh, svds
from sklearn.metrics import pairwise_distances
from sklearn.metrics import pairwise_kernels

__author__ = 'Burak T. Kaynak'
__license__ = 'GPLv3'
__version__ = '0.5'
__email__ = 'kaynakb@gmail.com'


class Diffusion_Maps:
    'An implementation of (an)isotropic diffusion maps technique.'

    __slots__ = [
        '_data', '_alpha', '_time', '_epsilon', '_epsilon_const',
        '_n_neighbors', '_n_eigvecs', '_n_jobs', '_dist', '_kernel_alpha',
        '_d_alpha', '_eigvals', '_eigvecs', '_diffusion_map'
    ]

    def __init__(self,
                 data,
                 alpha=1.0,
                 time=0.0,
                 epsilon=None,
                 epsilon_const=1.0,
                 n_neighbors=10,
                 n_eigvecs=6,
                 n_jobs=None):
        '''
Docstring:
Diffusion_Maps(data, alpha=1.0, time=0.0, epsilon=None, epsilon_const=1.0, n_neighbors=10, n_eigvecs=6, n_jobs=None)

Instantiate an object of type Diffusion_Maps.

Parameters
----------
data : array like
     Data
alpha : float, optional (default = 1.0)
      Anisotropic diffusion parameter, which takes values one of the following
      values: 0.0, 1/2, 1.0
time : float, optional (default = 0.0)
     Time steps.
epsilon : float, optional (default = None)
        Scale parameter.
epsilon_const : float, optional (default = 1.0)
        Scale parameter, multiplying the variable bandwidth, epsilon_const = O(1).
n_neighbors : int, optional (default = 10)
            Number of neighbors to calcualte epsilon.
n_eigvecs : int, optional (default = 6)
          Dimension of the embedding. It should be set to 'all' to calculate all the eigenvectors.
n_jobs : int, optional (default = None)
       The number of parallel jobs to run when the distance matrix is to be calculated.
If -1, then the number of jobs is set to the number of CPU cores.'''

        self._data = data
        self._alpha = alpha
        self._time = time
        self._epsilon = epsilon
        self._epsilon_const = epsilon_const
        if epsilon is None:
            self._n_neighbors = n_neighbors
        else:
            self._n_neighbors = None
        self._n_eigvecs = n_eigvecs
        self._n_jobs = n_jobs
        self._dist = None
        self._kernel_alpha = None
        self._d_alpha = None
        self._eigvals = None
        self._eigvecs = None
        self._diffusion_map = None

        self.generate_diffusion_map()

    @property
    def data(self):
        '''
Docstring:
data

Return the data.'''

        return self._data

    @property
    def alpha(self):
        '''
Docstring:
alpha

Return the value of alpha.'''

        return self._alpha

    @property
    def time(self):
        '''
Docstring:
times

Return the value of time.'''

        return self._time

    @property
    def epsilon(self):
        '''
Docstring:
epsilon

Return the value of epsilon.'''

        return self._epsilon

    @property
    def epsilon_const(self):
        '''
Docstring:
epsilon_const

Return the value of epsilon_const.'''

        return self._epsilon_const

    @property
    def n_neighbors(self):
        '''
Docstring:
n_neighbors

Return the number of neighbors.'''

        return self._n_neighbors

    @property
    def n_eigvecs(self):
        '''
Docstring:
n_eigvecs

Return the number of eigenvectors.'''

        if self._n_eigvecs != 'all':
            return self._n_eigvecs
        else:
            return self._eigvals.size

    @property
    def kernel(self):
        '''
Docstring:
kernel

Return the renormalized kernel.'''

        return self._kernel_alpha

    @property
    def measure(self):
        '''
Docstring:
measure

Return the Riemannian measure.'''

        return self._d_alpha / np.sum(self._d_alpha)

    @property
    def probability(self):
        '''
Docstring:
probability

Return the transition probability.'''

        return np.diag(1 / self._d_alpha) @ self._kernel_alpha

    @property
    def eigvals(self):
        '''
Docstring:
eigvals

Return the eigenvalues.'''

        return self._eigvals

    @property
    def eigvecs(self):
        '''
Docstring:
eigvecs

Return the renormalized eigenvectors of the diffusion operator.'''

        return self._eigvecs

    @property
    def diffusion_map(self):
        '''
Docstring:
diffusion_map

Return the diffusion map.'''

        return self._diffusion_map

    def __calc_epsilon(self):

        # if the data is not large, then cdist from scipy.spatial.distance
        # will be faster.

        self._dist = pairwise_distances(self._data, n_jobs=self._n_jobs)
        dsn = np.median(np.sort(self._dist, 0)[1:self._n_neighbors + 1], 0)
        self._epsilon = self._epsilon_const * dsn.reshape(-1, 1) * dsn

    def __generate_kernel(self):

        t0 = timer()

        if self._n_neighbors is not None:
            self.__calc_epsilon()
            k_init = np.exp(-self._dist**2 / self._epsilon)
        else:
            k_init = pairwise_kernels(
                self._data,
                metric='rbf',
                gamma=1 / self._epsilon,
                n_jobs=self._n_jobs)
        # k_init = k_init - np.eye(k_init.shape[0])  # prohibits self-transitions

        d_init = np.sum(k_init, 1)
        d_init_alpha = d_init**(-self._alpha)
        d_init_alpha_mat = d_init_alpha.reshape(-1, 1) * d_init_alpha
        self._kernel_alpha = k_init / d_init_alpha_mat
        if np.allclose(self._kernel_alpha, self._kernel_alpha.T):
            pass
        else:
            self._kernel_alpha = (
                self._kernel_alpha + self._kernel_alpha.T) / 2
        self._d_alpha = np.sum(self._kernel_alpha, 1)

        t1 = timer()
        t10 = round(t1 - t0, 3)
        print('time elapsed for the computation of the kernel: {}'.format(t10))

    def __calc_eigs(self):

        if self._n_eigvecs == 'all':

            ts = self._d_alpha**(-1 / 2)
            dts = np.diag(ts)
            kt = self._kernel_alpha * ts.reshape(-1, 1) * ts
            if np.allclose(kt, kt.T):
                pass
            else:
                kt = (kt + kt.T) / 2

            u, s, vh = svd(kt)
            self._eigvals = s
            self._eigvecs = dts @ u
            self._eigvecs = self._eigvecs / self._eigvecs[0, 0]
            self._diffusion_map = self._eigvecs * self._eigvals**self._time

        else:

            l, v = eigsh(
                self._kernel_alpha,
                k=self._n_eigvecs,
                M=csr_matrix(np.diag(self._d_alpha)))
            self._eigvals = l[::-1]
            self._eigvecs = v[:, ::-1]
            self._eigvecs = self._eigvecs / self._eigvecs[0, 0]

            # The following method can be used to obtain the diffusion maps as
            # well. It gives the same result possibly up to a complex phase
            # such that some of the vectors could be multiplied by -1.
            # If a larger set of eigenvalues & eigenvectors is sought, then
            # the generalized eigenvalue problem method may not be reliable, so
            # the next method should be used. However, computing a large set of
            # eigenvalues & eigenvectors is slower than computing the whole set
            # thereof.

            # ts = self._d_alpha ** (-1 / 2)
            # dts = np.diag(ts)
            # kt = self._kernel_alpha * ts.reshape(-1, 1) * ts
            # if np.allclose(kt, kt.T):
            #     pass
            # else:
            #     kt = (kt + kt.T) / 2

            # u, s, vh = svds(kt, k=self._n_eigvecs)
            # self._eigvals = s[::-1]
            # self._eigvecs = dts @ u[:, ::-1]
            # self._eigvecs = self._eigvecs / self._eigvecs[0, 0]
            # self._diffusion_map = self._eigvecs * self._eigvals**self._time

    def generate_diffusion_map(self,
                               alpha=None,
                               time=None,
                               n_neighbors=None,
                               epsilon=None,
                               epsilon_const=None,
                               n_eigvecs=None):
        '''
Docstring:
generate_diffusion_map(alpha=None, time=None, epsilon=None, epsilon_const=None, n_neighbors=None, n_eigvecs=None)

Generate another diffusion map for the same instance with a different set of parameters.

Parameters
----------
alpha : float, optional (default = None)
      Anisotropic diffusion parameter, which takes values one of the following
      values: 0.0, 1/2, 1.0
time : float, optional (default = None)
     Time steps.
epsilon : float, optional (default = None)
        Scale parameter.
epsilon_const : float, optional (default = 1.0)
              Scale parameter, multiplying the variable bandwidth, epsilon_const = O(1).
n_neighbors : int, optional (default = None)
            Number of neighbors to calcualte epsilon.
n_eigvecs : int, optional (default = None)
          Dimension of the embedding. It should be set to 'all' to calculate all the eigenvectors.'''

        t0 = timer()

        if self._kernel_alpha is None:
            self.__generate_kernel()
            self.__calc_eigs()

        if (alpha is None) and (epsilon is None) and (n_neighbors is None):
            if n_eigvecs is not None:
                self._n_eigvecs = n_eigvecs
                self.__calc_eigs()
        else:
            if alpha is not None:
                self._alpha = alpha

            if epsilon is not None:
                self._epsilon = epsilon
                self._n_neighbors = None

            if n_neighbors is not None:
                if epsilon_const is not None:
                    self._epsilon_const = epsilon_const
                self._n_neighbors = n_neighbors

            self.__generate_kernel()

            if n_eigvecs is not None:
                self._n_eigvecs = n_eigvecs

            self.__calc_eigs()

        if time is not None:
            self._time = time

        self._diffusion_map = self._eigvecs * self._eigvals**self._time

        t1 = timer()
        t10 = round(t1 - t0, 3)
        print('time elapsed for the computation of the diffusion maps: {}'.
              format(t10))