From 87daaef53102ffffd3f4ab45b8c7278b7ccf73b8 Mon Sep 17 00:00:00 2001 From: Brian Gunnarson Date: Thu, 19 Oct 2023 12:53:28 -0700 Subject: [PATCH 1/2] fix typo in batch.py that causes a bug --- CHANGELOG.md | 4 ++++ merlin/study/batch.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f994a4d05..d709ccaa4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to Merlin will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] +### Fixed +- Typo in `batch.py` that caused lsf launches to fail (`ALL_SGPUS` changed to `ALL_GPUS`) + ## [1.11.0] ### Added - New reserved variable: diff --git a/merlin/study/batch.py b/merlin/study/batch.py index e02a65a32..66ccd1570 100644 --- a/merlin/study/batch.py +++ b/merlin/study/batch.py @@ -299,7 +299,7 @@ def construct_scheduler_legend(parsed_batch: Dict, nodes: int) -> Dict: "lsf": { "check cmd": ["jsrun", "--help"], "expected check output": b"jsrun", - "launch": f"jsrun -a 1 -c ALL_CPUS -g ALL_SGPUS --bind=none -n {nodes}", + "launch": f"jsrun -a 1 -c ALL_CPUS -g ALL_GPUS --bind=none -n {nodes}", }, # pbs is mainly a placeholder in case a user wants to try it (we don't have it at the lab so it's mostly untested) "pbs": { @@ -335,12 +335,16 @@ def construct_worker_launch_command(parsed_batch: Dict, nodes: int) -> str: scheduler_legend: Dict = construct_scheduler_legend(parsed_batch, nodes) workload_manager: str = get_batch_type(scheduler_legend) + print(f"parsed_batch: {parsed_batch}") + if parsed_batch["btype"] == "pbs" and workload_manager == parsed_batch["btype"]: raise TypeError("The PBS scheduler is only enabled for 'batch: flux' type") if parsed_batch["btype"] == "slurm" and workload_manager not in ("lsf", "flux", "pbs"): workload_manager = "slurm" + print(f"workload_manager: {workload_manager}") + try: launch_command = scheduler_legend[workload_manager]["launch"] except KeyError as e: # pylint: disable=C0103 From 85855b3bf56df8ba3fbb260f76f597aee8df25f2 Mon Sep 17 00:00:00 2001 From: Brian Gunnarson Date: Mon, 23 Oct 2023 10:45:52 -0700 Subject: [PATCH 2/2] change print statements to log statements --- merlin/study/batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/merlin/study/batch.py b/merlin/study/batch.py index 66ccd1570..1b96cd282 100644 --- a/merlin/study/batch.py +++ b/merlin/study/batch.py @@ -335,7 +335,7 @@ def construct_worker_launch_command(parsed_batch: Dict, nodes: int) -> str: scheduler_legend: Dict = construct_scheduler_legend(parsed_batch, nodes) workload_manager: str = get_batch_type(scheduler_legend) - print(f"parsed_batch: {parsed_batch}") + LOG.debug(f"parsed_batch: {parsed_batch}") if parsed_batch["btype"] == "pbs" and workload_manager == parsed_batch["btype"]: raise TypeError("The PBS scheduler is only enabled for 'batch: flux' type") @@ -343,7 +343,7 @@ def construct_worker_launch_command(parsed_batch: Dict, nodes: int) -> str: if parsed_batch["btype"] == "slurm" and workload_manager not in ("lsf", "flux", "pbs"): workload_manager = "slurm" - print(f"workload_manager: {workload_manager}") + LOG.debug(f"workload_manager: {workload_manager}") try: launch_command = scheduler_legend[workload_manager]["launch"]