109 changes: 44 additions & 65 deletions exploration/pareto.py
@@ -5,13 +5,12 @@
import heapq
import numpy as np
from eden.util import timeit
### from graphlearn.local_substitution_graph_grammar import LocalSubstitutionGraphGrammar as lsggold ########## TMP
from graphlearn.cipcorevector import LsggCoreVec as lsgg
from toolz.curried import compose, map, concat
from exploration.pareto_funcs import _manage_int_or_float
logger = logging.getLogger(__name__)
import structout as so
from exploration import pareto_funcs as paretof, cost_estimator as costs
from exploration import pareto_options, pareto_funcs as paretof, cost_estimator as costs
from extensions import lsggscramble as lsggs
from sklearn.metrics.pairwise import euclidean_distances

@@ -223,39 +222,43 @@ def optimize(self, graphs):

def optimize_step(self, graphs):
# filter, expand, check duplicates
costs = self.get_costs(graphs)
status = self.checkstatus(costs, graphs)
step_start_time = time.time()
graphlen_start = len(graphs)
graphs, status = self.filter_by_cost(graphs)
if status: return [],True,None
graphs = self.filter_by_cost(costs, graphs)
graphlen_filter = len(graphs)
logger.log(10, f"cost_filter: Got {graphlen_start} graphs, reduced to {graphlen_filter} ({time.time()-step_start_time})")
num_graphs = len(graphs)
if self.grammar.cipselector == new_cipselector0: ### SPECIAL CASE
logger.log(10, "USING CIPSELECTOR 0")
graphs = self._expand_neighbors2(graphs)
else:
graphs = self._expand_neighbors(graphs)
avg_productions = len(graphs)/num_graphs
logger.log(10, f"Average productions per graph: {avg_productions}")
graphlen_expand = len(graphs)
avg_productions = graphlen_expand/graphlen_filter
logger.log(10, f"graph generation: Got {avg_productions} per graph. ({time.time()-step_start_time})")
graphs = self.duplicate_rm(graphs)
logger.log(10, f"duplicate_rm: {graphlen_expand} -> {len(graphs)} graphs. ({time.time()-step_start_time})")
return graphs, status, avg_productions



def filter_by_cost(self,costs,graphs):
def filter_by_cost(self,graphs):
"""expand "keepgraphs" graphs, divided between top graphs in everything
and pareto front, discard rest"""
timenow=time.time()
in_count = len(graphs)
keepgraphs = self.keepgraphs

if True: # Greedy cost calculation doesn't work at the moment... self.pareto_option != 'greedy':
costs = self.get_costs(graphs)
status = self.checkstatus(costs, graphs)
if status:
# Some graph has distance == 0
return graphs, True

if in_count <= self.keepgraphs:
if len(graphs) <= self.keepgraphs:
# Only a few graphs remain, so just return all of them.
logger.debug('cost_filter: keep all %d graphs' % in_count)
return graphs
logger.log(10, "cost_filter: keep all graphs")
return graphs, False

elif self.pareto_option == "random":
# Return randomly selected graphs without any application of pareto.
res = random.sample(graphs, keepgraphs)

## elif self.prefilter_kick!=0:
## # DELETE THE 25% worst in each category
## assert False
@@ -264,45 +267,37 @@ def filter_by_cost(self,costs,graphs):
## keep = [i for i in range(len(graphs)) if i not in trash]
## graphs = [graphs[i] for i in keep]
## costs = costs[keep]

elif self.pareto_option == 'greedy':
# Return graphs with the lowest euclidean distance to the target vector
distances = []
for g in graphs:
distances.append(euclidean_distances(self.target_graph_vector, vertex_vec(g, self.decomposer).sum(axis=0))[0][0])
ranked_distances = np.argsort(distances)[:keepgraphs]
res = [graphs[i] for i in ranked_distances]
return pareto_options.greedy(graphs, self.target_graph_vector, self.decomposer, keepgraphs)

elif self.pareto_option == "random":
# Return randomly selected graphs without any application of pareto.
return random.sample(graphs, keepgraphs), False

elif self.pareto_option == "default":
# Take best graphs from estimators and pareto front
costs_ranked = np.argsort(costs,axis=0)[:int(keepgraphs/6),[0,1,3]]
want , counts = np.unique(costs_ranked,return_counts=True)
res = [graphs[idd] for idd,count in zip( want,counts) if count > 0 ]
dontwant = [i for i in range(len(graphs)) if i not in want]
restgraphs = [graphs[i] for i in dontwant]
restcosts = costs[dontwant][:,[0,1,2]]
paretoselectedgraphs = paretof._pareto_set(restgraphs, restcosts)
random.shuffle(paretoselectedgraphs)
res += paretoselectedgraphs[:int(keepgraphs/2)]
return pareto_options.default(graphs, costs, keepgraphs), False

elif self.pareto_option == "paretogreed":
# 1. choose pareto graphs
# 2. new score is the average rank over all costs
# 3. choose k best of those
graphs, costs = paretof._pareto_set(graphs, costs,return_costs=True)
costs_ranked = np.argsort(costs,axis=0).sum(axis=1)
choosegr = np.argsort(costs_ranked)
res = [graphs[x] for x in choosegr[:keepgraphs]]
else:
paretoselectedgraphs = paretof._pareto_set(graphs, costs)
random.shuffle(paretoselectedgraphs)
return pareto_options.paretogreed(graphs, costs, keepgraphs), False

paretoselectedgraphs = paretof._pareto_set(graphs, costs)
random.shuffle(paretoselectedgraphs)

if self.pareto_option == "pareto_only":
# Return only graphs from the pareto front
res = paretoselectedgraphs[:keepgraphs]

elif self.pareto_option == "all":
# Return ALL graphs from the pareto front
res = paretoselectedgraphs
if self.pareto_option == "pareto_only":
# Return only graphs from the pareto front
return paretoselectedgraphs[:keepgraphs], False

elif self.pareto_option == "all":
# Return ALL graphs from the pareto front
return paretoselectedgraphs, False
else:
raise ValueError("Invalid Pareto Option")

## # DEBUG TO SHOW THE REAL DISTANCE
## if self.cheat:
@@ -320,16 +315,10 @@ def filter_by_cost(self,costs,graphs):
## from util import util
## util.dumpfile(graphs[costs_ranked[stuff][0]],"gr")
## print ("graph dumped")
logger.log(10, f"cost_filter: got {in_count} graphs, reduced to {len(res)} (%.2fs)"%(time.time()-timenow))

return res


def duplicate_rm(self,graphs):
timenow=time.time()
count = len(graphs)
graphs = list(self._duplicate_rm(graphs))
logger.debug("duplicate_rm: %d -> %d graphs (%.2fs)" % (count, len(graphs), time.time()-timenow))
return graphs

def _duplicate_rm(self,graphs):
@@ -360,7 +349,7 @@ def get_costs(self, graphs):
for i,e in enumerate(costs[:,2]):
nucol[i,2] = resdic[e]
costs = np.hstack((costs, np.sum(nucol,axis =1).reshape(-1,1)))
logger.debug("costs: best dist: %f (%.2fs)" % (np.min(costs[:,0]) ,time.time()-timenow))
logger.log(10, f"costs: best dist: {np.min(costs[:,0])} ({time.time()-timenow})")
return costs

def _get_neighbors(self, graph):
@@ -373,17 +362,14 @@ def _get_neighbors(self, graph):
return neighs

def _expand_neighbors(self, graphs):
timenow = time.time()
global _decomposer ##### Stupid hack but I don't know how else to allow lambda functions in multiprocessing
_decomposer = self.decomposer #####
if self.multiproc>1:
with multiprocessing.Pool(self.multiproc) as p:
res = list(concat(p.map(self._get_neighbors,graphs)))
logger.debug("graph generation: %.2fs" % (time.time()-timenow))
return res
else:
res = list(concat(map(self._get_neighbors,graphs)))
logger.debug("graph generation: %.2fs" % (time.time()-timenow))
return res


Expand All @@ -398,14 +384,11 @@ def _get_score_substitution(self, graph): #

def _expand_neighbors2(self, graphs): #
"""Only used with Cipselector Option 0. Replaces _expand_neighbors"""
timenow = time.time()
if self.multiproc>1:
with multiprocessing.Pool(self.multiproc) as p:
res = list(concat(p.map(self._get_score_substitution,graphs)))
logger.debug("graph generation: %.2fs" % (time.time()-timenow))
else:
res = list(concat(map(self._get_score_substitution,graphs)))
logger.debug("graph generation: %.2fs" % (time.time()-timenow))
res.sort(reverse=True, key=lambda a: a[0])
counter = 0
grlist = []
@@ -499,10 +482,6 @@ def __init__(
self.output_k_best = output_k_best
self.decomposer = decomposer
self.grammar = lsgg_size_hack(radii=core_sizes, thickness=context_size, core_vec_decomposer=decomposer, cipselector=cipselector, nodelevel_radius_and_thickness=True) #cip_root_all=False, half_step_distance=True)
### self.grammar = lsggold(nodelevel_radius_and_thickness=True) #cip_root_all=False, half_step_distance=True)
## self.grammar.radii = core_sizes #self.grammar.set_core_size(core_sizes)
## self.grammar.thickness = context_size #self.grammar.decomposition_args['thickness_list'] = [context_size]
#self.grammar.set_min_count(min_count) interfacecount 1 makes no sense
self.grammar.filter_min_cip = min_count #self.grammar.filter_args['min_cip_count'] = min_count
self.optiopts = kwargs
self.cs2cs = cs2cs
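For orientation, here is a minimal sketch of the refactored control flow, not the actual implementation: under the new contract, filter_by_cost computes the costs itself and returns a (graphs, done) pair, so optimize_step can exit as soon as a distance of 0 signals a successful reconstruction.

    # Illustrative sketch only; simplified from the diff above.
    def optimize_step(self, graphs):
        graphs, done = self.filter_by_cost(graphs)  # costs computed inside
        if done:                                    # some graph reached the target
            return [], True, None
        kept = len(graphs)
        graphs = self._expand_neighbors(graphs)     # apply grammar productions
        avg_productions = len(graphs) / kept
        graphs = self.duplicate_rm(graphs)          # remove duplicate graphs
        return graphs, False, avg_productions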
53 changes: 53 additions & 0 deletions exploration/pareto_options.py
@@ -0,0 +1,53 @@
import exploration.pareto_funcs as paretof
from graphlearn.cipcorevector import vertex_vec
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
import random


def greedy(graphs, target, decomposer, keepgraphs):
"""
Return graphs with the lowest euclidean distance to the target vector.
Also returns whether one of the distances equals 0.
"""
distances = []
#### distances = euclidean_distances(target, vertex_vec(graphs, decomposer))
for g in graphs:
distances.append(euclidean_distances(target, vertex_vec(g, decomposer).sum(axis=0))[0][0])
if min(distances) == 0:
return graphs, True
ranked_distances = np.argsort(distances)[:keepgraphs]
res = [graphs[i] for i in ranked_distances]
if distances[ranked_distances[0]] < 0.000001:
## => At least 1 distance is 0 => Successful reconstruction
return res, True
return res, False


def default(graphs, costs, keepgraphs):
"""
Take best graphs from estimators and pareto front.
"""
costs_ranked = np.argsort(costs,axis=0)[:int(keepgraphs/6),[0,1,3]]
want , counts = np.unique(costs_ranked,return_counts=True)
res = [graphs[idd] for idd,count in zip( want,counts) if count > 0 ]
dontwant = [i for i in range(len(graphs)) if i not in want]
restgraphs = [graphs[i] for i in dontwant]
restcosts = costs[dontwant][:,[0,1,2]]
paretoselectedgraphs = paretof._pareto_set(restgraphs, restcosts)
random.shuffle(paretoselectedgraphs)
res += paretoselectedgraphs[:int(keepgraphs/2)]
return res


def paretogreed(graphs, costs, keepgraphs):
"""
1. choose pareto graphs
2. new score is the average rank over all costs
3. choose k best of those
"""
graphs, costs = paretof._pareto_set(graphs, costs,return_costs=True)
costs_ranked = np.argsort(costs,axis=0).sum(axis=1)
choosegr = np.argsort(costs_ranked)
res = [graphs[x] for x in choosegr[:keepgraphs]]
return res
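A small usage sketch of the new module, with toy stand-ins: paretogreed only indexes into graphs, so any sequence works for illustration, while a real caller passes networkx graphs together with the (n, 4) cost matrix produced by get_costs().

    import numpy as np
    from exploration import pareto_options

    # Toy data: 100 "graphs" (only indexed, never inspected here) and a
    # cost matrix shaped like the output of get_costs().
    rng = np.random.default_rng(0)
    graphs = list(range(100))
    costs = rng.random((100, 4))

    keep = pareto_options.paretogreed(graphs, costs, keepgraphs=30)
    print(len(keep))  # at most 30: best average cost ranks on the pareto front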
15 changes: 10 additions & 5 deletions reconstruct.py
@@ -45,7 +45,7 @@ def maketasks(params):
## OPTIONS FOR GRAPHS
##########################################

EXPERIMENT_REPEATS = 50 #### CHANGE THIS BACK TO 100! 50 only for chemsets
EXPERIMENT_REPEATS = 50
# 1. param dict

params_graphs = {
@@ -117,24 +117,27 @@ def maketasks(params):
# Pareto Option "default": (3*5 best graphs for each category + 15 pareto front)
# Pareto Option "random": (No pareto front and no 3*5 best graphs. Just take 30 random graphs total)
# Pareto Option "greedy": (Instead of using the pareto front, take graphs with the lowest direct distance to the target)
# Pareto Option "paretogreed": (Greedy approach for pareto front)
# Pareto Option "pareto_only": (Instead of using the 3*5 best graphs it takes double the graphs from the pareto front.
# Pareto Option: "all": (Takes EVERY graph from the pareto front)
parser = argparse.ArgumentParser()
parser.add_argument('--core_sizes', nargs='*', type=int, default=[0,1,2],
help='Core sizes/Radii')
parser.add_argument('--context_size', nargs=1, type=float, default=[1],
help='Context sizes/Thickness')
parser.add_argument('--cipselector_option', nargs=1, type=int, default=[1], ## Change this back
parser.add_argument('--cipselector_option', nargs=1, type=int, default=[2], ## Change this back
choices=[0, 1, 2],
help='1: Take k best from all, 2: Take k best from each current cip')
parser.add_argument('--cipselector_k', nargs=1, type=int, default=[100],
parser.add_argument('--cipselector_k', nargs=1, type=int, default=[10],
help='k for Cipselector')
parser.add_argument('--pareto_option', nargs=1, type=str, default=['greedy'],
choices=['default', 'random', 'greedy', 'pareto_only', 'all'],
choices=['default', 'random', 'greedy', 'paretogreed', 'pareto_only', 'all'],
help='Pareto option for optimization')
parser.add_argument('--keepgraphs', nargs=1, type=int, default=[30],
help='Number of graphs kept from the pareto part')
parser.add_argument('--use_normalization', nargs=1, type=int, default=[1], choices=[1,0],
help='If 1, normalization will be applied for cipselection')
parser.add_argument('--min_count', nargs=1, type=int, default=[2],
parser.add_argument('--min_count', nargs=1, type=int, default=[1],
help='Also called min_cip')
parser.add_argument('--graph_size_limiter', nargs=1, type=int, default=[1], choices=[1,0],
help='If 0, graph size limiter is only used with a graphs >100')
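One detail worth noting about the options above: every argument is declared with nargs=1, so argparse stores one-element lists (hence list defaults like [30]), and consuming code is expected to unpack index 0. A minimal self-contained illustration:

    import argparse

    # nargs=1 always yields a list, even for a single value.
    parser = argparse.ArgumentParser()
    parser.add_argument('--pareto_option', nargs=1, type=str, default=['greedy'])
    args = parser.parse_args(['--pareto_option', 'paretogreed'])
    assert args.pareto_option == ['paretogreed']
    pareto_option = args.pareto_option[0]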
@@ -339,10 +342,12 @@ def report(folder = '.res', tasklist=None):
lsuccess = [int(succ) for data in dat.values() for v in data.values() for succ,steps,times,avg in v]
avg_productions = np.array([int(avg) for data in dat.values() for v in data.values() for succ,steps,times,avg in v])
rnd = [int(steps) for data in dat.values() for v in data.values() for succ,steps,times,avg in v]
times_arr = np.array([int(times) for data in dat.values() for v in data.values() for succ,steps,times,avg in v]) # renamed so the time module is not shadowed
print ("nores",nores)
print ('nosucc',nosucc)
print ("sumsuccess:", sum(lsuccess), lsuccess)
print ("Average productions:", avg_productions.mean(), avg_productions)
print ("Average times:", np.average(time), time)
# print ("maxrnd:", max([int(b) for c in dat.values() for a,b,_ in c.values()]))
print("maxrnd:", max(rnd))

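The report() additions unpack per-run tuples of (succ, steps, times, avg) and now also average the run times. A hedged sketch of that aggregation with made-up numbers:

    import numpy as np

    # Hypothetical runs in the (succ, steps, times, avg) layout report() unpacks.
    runs = [(1, 12, 34.0, 80.5), (0, 25, 70.2, 75.0), (1, 9, 21.7, 91.0)]
    times = np.array([int(t) for _, _, t, _ in runs])
    print("Average times:", np.average(times), times)  # mirrors the new print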