Fix problem where only one cluster/label was found in the training data and that sometimes the number of expected features was wrong
ejhusom committed Sep 26, 2024
1 parent 3059aa7 commit e220f67
Showing 3 changed files with 39 additions and 10 deletions.
8 changes: 7 additions & 1 deletion src/api.py
@@ -199,7 +199,13 @@ def post(self):
         yaml.dump(params, open("params.yaml", "w"), allow_unicode=True)
 
         # Run DVC to create virtual sensor.
-        subprocess.run(["dvc", "repro"], check=True)
+        try:
+            subprocess.run(["dvc", "repro"], check=True)
+        except subprocess.CalledProcessError as e:
+            print(e)
+            # If DVC fails, rerun the full pipeline using --force.
+            print("DVC failed. Forcing rerun of full pipeline.")
+            subprocess.run(["dvc", "repro", "--force"], check=True)
 
         # Reread params-file, in case it is changed during pipeline execution
         # (e.g., the number of clusters).
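For context: with check=True, subprocess.run() raises subprocess.CalledProcessError when the command exits with a non-zero status, which is what triggers the fallback above; dvc repro --force then re-executes every stage even if DVC considers it up to date. A minimal standalone sketch of the same retry pattern:

import subprocess

def run_pipeline():
    """Run `dvc repro`, falling back to a forced full rerun on failure."""
    try:
        subprocess.run(["dvc", "repro"], check=True)
    except subprocess.CalledProcessError as e:
        # e.returncode holds the exit status of the failed command.
        print(f"dvc repro failed with exit code {e.returncode}. Forcing rerun.")
        subprocess.run(["dvc", "repro", "--force"], check=True)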
14 changes: 14 additions & 0 deletions src/cluster_utils.py
@@ -309,6 +309,20 @@ def calculate_model_metrics(model, feature_vectors, labels):
"""

# Set invalid default values to indicate that they were not computed
metrics = {
"silhouette_score": -1000,
"calinski_harabasz_score": -1000,
"davies_bouldin_score": 1000
}


n_detected_clusters = np.unique(labels)

if len(n_detected_clusters) == 1:
print("Only one cluster detected. Skipping evaluation.")
return metrics

silhouette = silhouette_score(feature_vectors, labels)
chs = calinski_harabasz_score(feature_vectors, labels)
dbs = davies_bouldin_score(feature_vectors, labels)
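The early return above guards against a real failure mode: scikit-learn's clustering metrics require at least two distinct labels. A minimal sketch, with made-up data, of the error the guard avoids:

import numpy as np
from sklearn.metrics import silhouette_score

feature_vectors = np.random.rand(10, 3)  # made-up feature vectors
labels = np.zeros(10, dtype=int)         # every sample in the same cluster

try:
    silhouette_score(feature_vectors, labels)
except ValueError as e:
    print(e)  # complains that the number of labels must be at least 2

The sentinel defaults are safely out of range for real results: silhouette scores lie in [-1, 1], while the Calinski-Harabasz and Davies-Bouldin scores are non-negative.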
27 changes: 18 additions & 9 deletions src/train.py
@@ -107,18 +107,16 @@ def train(dir_path=""):

     unique_labels = np.unique(labels)
 
+    # print("=======================")
+    # print("BEFORE")
+    # print(feature_vectors.shape)
+    # print(model.cluster_centers_.shape)
+    # print("=======================")
+
     # These lines will remove the noise cluster of DBSCAN
     # if unique_labels[0] == -1:
     #     unique_labels = unique_labels[1:]
 
-    n_clusters = len(unique_labels)
-    params["train"]["n_clusters"] = n_clusters
-
-    # TODO: Not sure if it is a good idea to rewrite params.yaml during
-    # execution of the pipeline.
-    with open("params.yaml", "w") as params_file:
-        yaml.dump(params, params_file)
-
     # If the model does not calculate its own cluster centers, do the
     # computation manually based on core samples or similar. Applies for the
     # following clustering algorithms:
@@ -161,7 +159,12 @@ def train(dir_path=""):

     cluster_centers = np.array(cluster_centers)
 
-    assert n_clusters == cluster_centers.shape[0]
+    params["train"]["n_clusters"] = cluster_centers.shape[0]
+
+    # TODO: Not sure if it is a good idea to rewrite params.yaml during
+    # execution of the pipeline.
+    with open("params.yaml", "w") as params_file:
+        yaml.dump(params, params_file)
 
     # Clustering algorithms like AffinityPropagation might fail to converge,
     # so MiniBatchKMeans serves as a fallback method.
@@ -177,6 +180,12 @@
     pd.DataFrame(labels).to_csv(LABELS_PATH)
     pd.DataFrame(cluster_centers).to_csv(CLUSTER_CENTERS_PATH)
 
+    # print("=======================")
+    # print("AFTER")
+    # print(feature_vectors.shape)
+    # print(cluster_centers.shape)
+    # print("=======================")
+
 
 def build_model(learning_method, n_clusters, max_iter):
     """Build clustering model.
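Taken together, the train.py hunks replace the earlier assert with a single source of truth: n_clusters is now derived from cluster_centers.shape[0] after the centers exist, so params.yaml always records the number of clusters that was actually found. For algorithms that expose labels but no cluster_centers_ attribute, the manual computation mentioned in the first hunk could look roughly like this (a hypothetical sketch, not the repository's actual implementation):

import numpy as np

def compute_cluster_centers(feature_vectors, labels):
    """Average the feature vectors of each cluster, skipping DBSCAN's noise label (-1)."""
    centers = [
        feature_vectors[labels == label].mean(axis=0)
        for label in np.unique(labels)
        if label != -1
    ]
    return np.array(centers)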
