From 8de8767eee47e6e215f32588304f41e24d9a3c9d Mon Sep 17 00:00:00 2001
From: Patrick O'Hara <patrick.h.o-hara@warwick.ac.uk>
Date: Wed, 16 Aug 2023 18:38:33 +0100
Subject: [PATCH] Rename dataset metricness command to stats and report more graph statistics

---
 pctsp/apps/dataset_app.py | 154 +++++++++++++++++++++++++-------------
 1 file changed, 102 insertions(+), 52 deletions(-)

diff --git a/pctsp/apps/dataset_app.py b/pctsp/apps/dataset_app.py
index 1557027..d2b3aba 100644
--- a/pctsp/apps/dataset_app.py
+++ b/pctsp/apps/dataset_app.py
@@ -7,8 +7,12 @@
 import typer
 from tspwplib import (
     BaseTSP,
+    EdgeWeightType,
     Generation,
     ProfitsProblem,
+    NotConnectedException,
+    asymmetric_from_undirected,
+    biggest_vertex_id_from_graph,
     build_path_to_londonaq_yaml,
     build_path_to_oplib_instance,
     metricness,
@@ -16,84 +20,130 @@
     rename_edge_attributes,
     rename_node_attributes,
     sparsify_uid,
+    split_head,
     total_cost,
     total_prize,
 )
-from ..preprocessing import remove_one_connected_components
+from ..compare import params
+from ..preprocessing import remove_one_connected_components, undirected_vertex_disjoint_paths_map, vertex_disjoint_cost_map
+from ..suurballe import suurballe_shortest_vertex_disjoint_paths
+from ..utils import get_pctsp_logger
 from ..vial import DatasetName
 
 from .options import LondonaqRootOption, OPLibRootOption
-from ..compare import params
 
 dataset_app = typer.Typer(name="dataset", help="Making and summarizing datasets")
 
 
-@dataset_app.command(name="metricness")
-def metricness_of_dataset(
+@dataset_app.command(name="stats")
+def stats_of_dataset(
     dataset: DatasetName,
     londonaq_root: Path = LondonaqRootOption,
     oplib_root: Path = OPLibRootOption,
 ) -> pd.DataFrame:
     """Create a pandas dataframe of the metricness and write to CSV"""
-    dataset_stats: Dict[str, List[Any]] = {
-        "num_nodes": [],
-        "num_edges": [],
-        "total_cost": [],
-        "total_prize": [],
-        "metricness": [],
-    }
+    logger = get_pctsp_logger("dataset-stats")
+    dataset_stats: List[Dict[str, float]] = []
     names = []
+    index = None
     if dataset == DatasetName.londonaq:
-        names = params.LONDONAQ_GRAPH_NAME_LIST
-    elif dataset == DatasetName.tspwplib:
-        names = params.TSPLIB_GRAPH_NAME_LIST
-    for graph_name in names:
-        # load the graph
-        if dataset == DatasetName.londonaq:
+        logger.info("Calculating stats for londonaq dataset.")
+        for graph_name in params.LONDONAQ_GRAPH_NAME_LIST:
+            logger.info("Loading %s", graph_name.value)
             problem_path = build_path_to_londonaq_yaml(londonaq_root, graph_name)
             tsp = BaseTSP.from_yaml(problem_path)
-        elif dataset == DatasetName.tspwplib:
-            problem_path = build_path_to_oplib_instance(
-                oplib_root,
-                Generation.gen3,
-                graph_name,
-            )
-            # load the problem from file
-            problem = ProfitsProblem().load(problem_path)
-            tsp = BaseTSP.from_tsplib95(problem)
-        # get the graph in networkx
-        graph = tsp.get_graph()
-        rename_edge_attributes(graph, {"weight": "cost"}, del_old_attr=True)
-        try:  # londonaq dataset
+            graph = tsp.get_graph()
+            rename_edge_attributes(graph, {"weight": "cost"}, del_old_attr=True)
             rename_node_attributes(graph, {"demand": "prize"}, del_old_attr=True)
-        except KeyError:  # tsplib dataset
-            nx.set_node_attributes(graph, problem.get_node_score(), name="prize")
+            logger.info("Calculating stats for %s", graph_name.value)
+            dataset_stats.append(get_graph_stats(graph, tsp.depots[0]))
+            names.append(graph_name.value)
+        index = pd.Index(names, name="graph_name")
 
-        # if removing edges
-        if dataset == DatasetName.tspwplib:
-            graph = sparsify_uid(graph, 5)
-            new_cost = mst_cost(graph, cost_attr="cost")
-            nx.set_edge_attributes(graph, new_cost, name="cost")
-
-        # preprocessing
-        graph = remove_one_connected_components(graph, tsp.depots[0])
+    elif dataset == DatasetName.tspwplib:
+        for graph_name in params.TSPLIB_GRAPH_NAME_LIST:
+            for gen in Generation:
+                # each (cost, kappa) pair below reloads the instance and rebuilds its graph from scratch
+                for cost in params.TSPLIB_COST_FUNCTIONS:
+                    for kappa in params.TSPLIB_KAPPA_LIST:
+                        logger.info("Loading %s on generation %s with cost %s and kappa %s", graph_name.value, gen.value, cost.value, kappa)
+                        problem_path = build_path_to_oplib_instance(
+                            oplib_root,
+                            gen,
+                            graph_name,
+                        )
+                        # load the problem from file
+                        problem = ProfitsProblem().load(problem_path)
+                        tsp = BaseTSP.from_tsplib95(problem)
+                        graph = tsp.get_graph()
+                        nx.set_node_attributes(graph, problem.get_node_score(), name="prize")
+                        rename_edge_attributes(graph, {"weight": "cost"}, del_old_attr=True)
+                        graph = sparsify_uid(graph, kappa)
+                        if cost == EdgeWeightType.MST:
+                            new_cost = mst_cost(graph, cost_attr="cost")
+                            nx.set_edge_attributes(graph, new_cost, name="cost")
+                        logger.info("Calculating stats for %s", graph_name.value)
+                        dataset_stats.append(get_graph_stats(graph, tsp.depots[0]))
+                        names.append((graph_name.value, gen.value, cost.value, kappa))
+        index = pd.MultiIndex.from_tuples(names, names=["graph_name", "generation", "cost_function", "kappa"])
 
-        # count the number of edges, vertices, total prize, total cost and the metricness
-        dataset_stats["num_nodes"].append(graph.number_of_nodes())
-        dataset_stats["num_edges"].append(graph.number_of_edges())
-        dataset_stats["total_cost"].append(
-            total_cost(nx.get_edge_attributes(graph, "cost"), list(graph.edges()))
-        )
-        dataset_stats["total_prize"].append(
-            total_prize(nx.get_node_attributes(graph, "prize"), list(graph.nodes()))
-        )
-        dataset_stats["metricness"].append(metricness(graph))
-    df = pd.DataFrame(dataset_stats, index=names)
+    logger.info("Creating dataframe from dataset stats.")
+    df = pd.DataFrame(dataset_stats, index=index)
     print(df)
     if dataset == DatasetName.londonaq:
         filepath = londonaq_root / "londonaq_dataset.csv"
     elif dataset == DatasetName.tspwplib:
         filepath = oplib_root / "tsplib_dataset.csv"
-    df.index = df.index.rename("graph_name")
+    logger.info("Writing dataframe to CSV at %s", filepath)
     df.to_csv(filepath, index=True)
     return df
+
+
+def get_graph_stats(graph: nx.Graph, root_vertex: int) -> Dict[str, float]:
+    """Compute summary statistics of a graph before and after preprocessing.
+
+    Reports node/edge counts, total cost, total prize and metricness, the
+    biggest prize found over the vertex-disjoint path pairs, and the same
+    counts after preprocessing. Prize ratios are NaN if total prize is zero.
+    """
+    instance_stats: Dict[str, float] = {}
+    instance_stats["num_nodes"] = graph.number_of_nodes()
+    instance_stats["num_edges"] = graph.number_of_edges()
+    instance_stats["total_cost"] = total_cost(nx.get_edge_attributes(graph, "cost"), list(graph.edges()))
+    og_prize = total_prize(nx.get_node_attributes(graph, "prize"), list(graph.nodes()))
+    instance_stats["total_prize"] = og_prize
+    try:
+        instance_stats["metricness"] = metricness(graph)
+    except NotConnectedException:
+        # graph is not connected: measure the largest connected component instead
+        largest_component_graph = graph.subgraph(max(nx.connected_components(graph), key=len))
+        instance_stats["metricness"] = metricness(largest_component_graph)
+
+    # evaluate the largest prize of any least-cost vertex-disjoint paths
+    biggest_vertex = biggest_vertex_id_from_graph(graph)
+    asymmetric_graph = asymmetric_from_undirected(graph)
+    tree = suurballe_shortest_vertex_disjoint_paths(
+        asymmetric_graph, split_head(biggest_vertex, root_vertex), weight="cost"
+    )
+    vertex_disjoint_paths_map = undirected_vertex_disjoint_paths_map(tree, biggest_vertex)
+    biggest_prize = 0
+    prize_map = nx.get_node_attributes(graph, "prize")
+    for u, (p1, p2) in vertex_disjoint_paths_map.items():
+        # presumably u and the root lie on both paths, so each is subtracted once
+        prize = total_prize(prize_map, p1) + total_prize(prize_map, p2) - prize_map[u] - prize_map[root_vertex]
+        biggest_prize = max(biggest_prize, prize)
+    instance_stats["biggest_disjoint_prize"] = biggest_prize
+    instance_stats["disjoint_prize_ratio"] = float(biggest_prize) / float(og_prize) if og_prize else float("nan")
+
+    # preprocessing
+    graph = remove_one_connected_components(graph, root_vertex)
+
+    # re-evaluate stats after preprocessing
+    instance_stats["preprocessed_num_nodes"] = graph.number_of_nodes()
+    instance_stats["preprocessed_num_edges"] = graph.number_of_edges()
+    instance_stats["preprocessed_total_cost"] = total_cost(nx.get_edge_attributes(graph, "cost"), list(graph.edges()))
+    pp_prize = total_prize(nx.get_node_attributes(graph, "prize"), list(graph.nodes()))
+    instance_stats["preprocessed_total_prize"] = pp_prize
+    instance_stats["preprocessed_metricness"] = metricness(graph)
+    instance_stats["preprocessed_prize_ratio"] = float(pp_prize) / float(og_prize) if og_prize else float("nan")
+    return instance_stats