Fix problem where only one cluster/label was found in the training data and that sometimes the number of expected features was wrong
ejhusom committed Sep 26, 2024
1 parent 3059aa7 commit e220f67
Showing 3 changed files with 39 additions and 10 deletions.
8 changes: 7 additions & 1 deletion src/api.py
@@ -199,7 +199,13 @@ def post(self):
         yaml.dump(params, open("params.yaml", "w"), allow_unicode=True)
 
         # Run DVC to create virtual sensor.
-        subprocess.run(["dvc", "repro"], check=True)
+        try:
+            subprocess.run(["dvc", "repro"], check=True)
+        except subprocess.CalledProcessError as e:
+            print(e)
+            # If DVC fails, rerun the full pipeline using --force.
+            print("DVC failed. Forcing rerun of full pipeline.")
+            subprocess.run(["dvc", "repro", "--force"], check=True)
 
         # Reread params-file, in case it is changed during pipeline execution
         # (e.g., the number of clusters).
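For context: with check=True, subprocess.run() raises subprocess.CalledProcessError when the command exits with a non-zero status, which is what triggers the fallback above; dvc repro --force then re-executes every stage even if DVC considers it up to date. A minimal standalone sketch of the same retry pattern:

import subprocess

def run_pipeline():
    """Run `dvc repro`, falling back to a forced full rerun on failure."""
    try:
        subprocess.run(["dvc", "repro"], check=True)
    except subprocess.CalledProcessError as e:
        # e.returncode holds the exit status of the failed command.
        print(f"dvc repro failed with exit code {e.returncode}. Forcing rerun.")
        subprocess.run(["dvc", "repro", "--force"], check=True)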
14 changes: 14 additions & 0 deletions src/cluster_utils.py
@@ -309,6 +309,20 @@ def calculate_model_metrics(model, feature_vectors, labels):
"""

# Set invalid default values to indicate that they were not computed
metrics = {
"silhouette_score": -1000,
"calinski_harabasz_score": -1000,
"davies_bouldin_score": 1000
}


n_detected_clusters = np.unique(labels)

if len(n_detected_clusters) == 1:
print("Only one cluster detected. Skipping evaluation.")
return metrics

silhouette = silhouette_score(feature_vectors, labels)
chs = calinski_harabasz_score(feature_vectors, labels)
dbs = davies_bouldin_score(feature_vectors, labels)
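The early return above guards against a real failure mode: scikit-learn's clustering metrics require at least two distinct labels. A minimal sketch, with made-up data, of the error the guard avoids:

import numpy as np
from sklearn.metrics import silhouette_score

feature_vectors = np.random.rand(10, 3)  # made-up feature vectors
labels = np.zeros(10, dtype=int)         # every sample in the same cluster

try:
    silhouette_score(feature_vectors, labels)
except ValueError as e:
    print(e)  # complains that the number of labels must be at least 2

The sentinel defaults are safely out of range for real results: silhouette scores lie in [-1, 1], while the Calinski-Harabasz and Davies-Bouldin scores are non-negative.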
27 changes: 18 additions & 9 deletions src/train.py
@@ -107,18 +107,16 @@ def train(dir_path=""):

     unique_labels = np.unique(labels)
 
+    # print("=======================")
+    # print("BEFORE")
+    # print(feature_vectors.shape)
+    # print(model.cluster_centers_.shape)
+    # print("=======================")
+
     # These lines will remove the noise cluster of DBSCAN
     # if unique_labels[0] == -1:
     #     unique_labels = unique_labels[1:]
 
-    n_clusters = len(unique_labels)
-    params["train"]["n_clusters"] = n_clusters
-
-    # TODO: Not sure if it is a good idea to rewrite params.yaml during
-    # execution of the pipeline.
-    with open("params.yaml", "w") as params_file:
-        yaml.dump(params, params_file)
-
     # If the model does not calculate its own cluster centers, do the
     # computation manually based on core samples or similar. Applies for the
     # following clustering algorithms:
@@ -161,7 +159,12 @@ def train(dir_path=""):

     cluster_centers = np.array(cluster_centers)
 
-    assert n_clusters == cluster_centers.shape[0]
+    params["train"]["n_clusters"] = cluster_centers.shape[0]
+
+    # TODO: Not sure if it is a good idea to rewrite params.yaml during
+    # execution of the pipeline.
+    with open("params.yaml", "w") as params_file:
+        yaml.dump(params, params_file)
 
     # Clustering algorithms like AffinityPropagation might fail to converge,
     # so MiniBatchKMeans serves as a fallback method.
@@ -177,6 +180,12 @@
     pd.DataFrame(labels).to_csv(LABELS_PATH)
     pd.DataFrame(cluster_centers).to_csv(CLUSTER_CENTERS_PATH)
 
+    # print("=======================")
+    # print("AFTER")
+    # print(feature_vectors.shape)
+    # print(cluster_centers.shape)
+    # print("=======================")
+
 
 def build_model(learning_method, n_clusters, max_iter):
     """Build clustering model.
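Taken together, the train.py hunks replace the earlier assert with a single source of truth: n_clusters is now derived from cluster_centers.shape[0] after the centers exist, so params.yaml always records the number of clusters that was actually found. For algorithms that expose labels but no cluster_centers_ attribute, the manual computation mentioned in the first hunk could look roughly like this (a hypothetical sketch, not the repository's actual implementation):

import numpy as np

def compute_cluster_centers(feature_vectors, labels):
    """Average the feature vectors of each cluster, skipping DBSCAN's noise label (-1)."""
    centers = [
        feature_vectors[labels == label].mean(axis=0)
        for label in np.unique(labels)
        if label != -1
    ]
    return np.array(centers)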
