# String-Sanitization-Project/runner.py at main · Yebulabula/String-Sanitization-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# runner.py
# 03-04-2021
# This file processes some .txt files and converts them to the string type.
# It also includes all static functions that other scripts may need.
#
# https://github.com/Yebulabula/String-Sanitization-Project
#
# Author Ye Mao
# King's College London
# Version 1.0


import time
import sys
from optparse import OptionParser
from model import solver
import warnings
import DataProcessing


def default(str):
    """
        Build the help text for a command-line option.

        The given message is returned with ' [Default: %default]' appended;
        optparse replaces the '%default' placeholder with the option's
        default value when it prints the help screen.
    """
    placeholder = ' [Default: %default]'
    return str + placeholder


def readCommand(argv):
    """
        Parse the command-line options and load the input files they name.

        argv: the command-line arguments without the program name
              (e.g. sys.argv[1:]).

        Returns a dict with the keys 'w', 'k', 'delta', 'z',
        'sensitive_patterns', 'tau', 'omega', 'c', 'max_simulations' and
        'tolerance'. 'w' and 'z' hold whatever DataProcessing.readFile returns
        for the two file names, 'sensitive_patterns' holds the result of
        DataProcessing.readMultiLineFile, and the remaining keys hold the
        parsed option values.

        Raises Exception if positional arguments are left over after option
        parsing, or if DataProcessing.readMultiLineFile returns None for the
        sensitive-pattern file.
    """
    usageStr = """
    USAGE:      python runner.py <options>
    EXAMPLES:   (1) python runner.py
                    - starts deletion strategy test case
                (2) python runner.py -m 2000
                    - select best deleted symbol by 2000 iterations.
    """

    parser = OptionParser(usageStr)

    # Input files.
    parser.add_option('-w', '--originalFile', dest='w_filename', type='string',
                      help=default('The string for sanitization(W)'), default='test/test_w.txt')
    parser.add_option('-z', '--sanitizedFile', dest='z_filename', type='string',
                      help=default('The string for sanitization(Z)'), default='test/test_z.txt')
    parser.add_option('-t', '--tau', dest='tau', type='int',
                      help=default('The tau value to identify spurious pattern'), default=1)
    parser.add_option('-o', '--omega', dest='omega', type='float',
                      help=default('The weight of non-spurious pattern'), default=1)
    parser.add_option('-s', '--sensitivePatterns', dest='sensitive_pat', type='string',
                      help=default('A file that consists of all sensitive patterns in W'),
                      default='test/sen_pattern_test.txt')

    # Algorithm parameters.
    parser.add_option('-k', dest='k', type='int',
                      help=default('The length of each pattern'), default=4)
    parser.add_option('-c', dest='c', type='int',
                      help=default('The exploration parameter for UCB1 formula'), default=20)
    parser.add_option('-d', '--delta', dest='delta', type='int',
                      help=default('The number of deletions'), default=5)
    parser.add_option('-e', '--E', dest='tolerance', type='int',
                      help=default('The pruning parameter for ELLS-ALGO'), default=10)
    parser.add_option('-m', '--max', dest='max_simulations', type='int',
                      help=default('The number of iterations per selection in ELLS-ALGO'), default=3)

    options, otherjunk = parser.parse_args(argv)
    # Only options are accepted; any leftover positional argument is an error.
    if otherjunk:
        raise Exception('Command line input not understood: ' + str(otherjunk))

    args = dict()
    args['w'] = DataProcessing.readFile(options.w_filename)
    args['k'] = options.k
    args['delta'] = options.delta
    args['z'] = DataProcessing.readFile(options.z_filename)
    args['sensitive_patterns'] = DataProcessing.readMultiLineFile(options.sensitive_pat)
    if args['sensitive_patterns'] is None:
        # Report the file that actually failed to load: the sensitive-pattern
        # file, not the original-string file.
        raise Exception(
            "The file " + options.sensitive_pat + " cannot be found")
    args['tau'] = options.tau
    args['omega'] = options.omega
    args['c'] = options.c
    args['max_simulations'] = options.max_simulations
    args['tolerance'] = options.tolerance

    return args


if __name__ == '__main__':
    # Script entry point: build the solver from the command-line options, run
    # it on the sanitized string Z, then report distortion, ghost/lost pattern
    # counts and the elapsed run time.
    args = readCommand(sys.argv[1:])
    # Bind the instance to its own name so the imported `solver` is not
    # overwritten by the object it creates.
    sanitizer = solver(**args)

    # d_baseline = sanitizer._get_distortion(sanitizer.baseline())
    # Count for the input string Z, taken before the run.
    ghosts_origin = sanitizer._get_number_of_spurious(sanitizer.Z)
    # print('Baseline Distortion Reduction\nSpurious:', sp - d_baseline[0], 'non-spurious:', nsp - d_baseline[1])
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    print('---------------------------')

    # Time only the run itself, so that printing and the post-run pattern
    # count do not inflate the reported figure. perf_counter is the clock
    # intended for measuring short elapsed intervals.
    tick = time.perf_counter()
    result = sanitizer.run()
    tock = time.perf_counter()

    # d_csd_plus = sanitizer._get_distortion(result)
    ghosts_after = sanitizer._get_number_of_spurious(result)

    print('Distortion in sanitized string (Z)', sanitizer._get_distortion(sanitizer.Z))
    print('Distortion in the CSD-PLUS result (H)',sanitizer._get_distortion(result))
    print('The number of ghost/lost patterns in Z:', ghosts_origin)
    print('The number of ghost/lost patterns in H:', ghosts_after)
    print('Time consumption in CSD-Plus is', tock - tick)
    # sanitizer._exhaustive_search(lst=list(range(len(sanitizer.Z))), n=sanitizer.delta)
    # print('Exhaustive Search:', min(sanitizer.EX))