__main__ CLI working

ml-energy · Aug 2, 2023 · 2561338 · 2561338
1 parent 9c6dfac
commit 2561338
Show file tree

Hide file tree

Showing 6 changed files with 127 additions and 19 deletions.
diff --git a/setup.py b/setup.py
@@ -50,6 +50,8 @@
         "scikit-learn",
         "nvidia-ml-py",
         "pydantic",
+        "rich",
+        "typer[all]",
     ],
     python_requires=">=3.8",
     extras_require=extras_require,

diff --git a/tests/test_monitor.py b/tests/test_monitor.py
@@ -41,7 +41,7 @@
 @pytest.fixture
 def pynvml_mock(mocker: MockerFixture):
     """Mock the entire pynvml module."""
-    mock = mocker.patch("zeus.monitor.pynvml", autospec=True)
+    mock = mocker.patch("zeus.monitor.energy.pynvml", autospec=True)
 
     # Except for the arch constants.
     mock.NVML_DEVICE_ARCH_PASCAL = pynvml.NVML_DEVICE_ARCH_PASCAL
@@ -153,20 +153,20 @@ def test_monitor(pynvml_mock, mock_gpus, mocker: MockerFixture, tmp_path: Path):
     is_old_torch = {index: arch < pynvml.NVML_DEVICE_ARCH_VOLTA for index, arch in zip(torch_gpu_indices, gpu_archs)}
     num_old_archs = sum(is_old_nvml.values())
 
-    mkdtemp_mock = mocker.patch("zeus.monitor.tempfile.mkdtemp", return_value="mock_log_dir")
-    which_mock = mocker.patch("zeus.monitor.shutil.which", return_value="zeus_monitor")
-    popen_mock = mocker.patch("zeus.monitor.subprocess.Popen", autospec=True)
-    mocker.patch("zeus.monitor.atexit.register")
+    mkdtemp_mock = mocker.patch("zeus.monitor.energy.tempfile.mkdtemp", return_value="mock_log_dir")
+    which_mock = mocker.patch("zeus.monitor.energy.shutil.which", return_value="zeus_monitor")
+    popen_mock = mocker.patch("zeus.monitor.energy.subprocess.Popen", autospec=True)
+    mocker.patch("zeus.monitor.energy.atexit.register")
 
     monotonic_counter = itertools.count(start=4, step=1)
-    mocker.patch("zeus.monitor.time.monotonic", side_effect=monotonic_counter)
+    mocker.patch("zeus.monitor.energy.time.monotonic", side_effect=monotonic_counter)
 
     energy_counters = {
         f"handle{i}": itertools.count(start=1000, step=3)
         for i in nvml_gpu_indices if not is_old_nvml[i]
     }
     pynvml_mock.nvmlDeviceGetTotalEnergyConsumption.side_effect = lambda handle: next(energy_counters[handle])
-    energy_mock = mocker.patch("zeus.monitor.analyze.energy")
+    energy_mock = mocker.patch("zeus.monitor.energy.analyze.energy")
 
     log_file = tmp_path / "log.csv"
 

diff --git a/zeus/monitor/__init__.py b/zeus/monitor/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (C) 2023 Jae-Won Chung <jwnchung@umich.edu>
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Time, energy, and power monitors for Zeus.
+
+The main class of this module is [`ZeusMonitor`](zeus.monitor.energy.ZeusMonitor).
+
+If users wish to monitor power consumption over time, the [`power`](zeus.monitor.power)
+module can come in handy.
+"""
+
+from zeus.monitor.energy import ZeusMonitor, Measurement
diff --git a/zeus/monitor/__main__.py b/zeus/monitor/__main__.py
@@ -0,0 +1,48 @@
+
+from __future__ import annotations
+
+import time
+from typing import Optional
+
+import rich
+import typer
+
+from zeus.monitor.energy import ZeusMonitor
+
+# Typer application object; the `energy` and `power` functions in this module are
+# registered on it as CLI sub-commands. Shell-completion options are disabled.
+app = typer.Typer(add_completion=False)
+
+
+@app.command()
+def energy(gpu_indices: Optional[list[int]] = None) -> None:
+    """Measure the time and energy of GPUs during the duration of the CLI program.
+
+    This uses the `ZeusMonitor` class for measurement, and thus `gpu_indices` respects
+    the `CUDA_VISIBLE_DEVICES` environment variable.
+    For instance, if `CUDA_VISIBLE_DEVICES=2,3`, GPU index `1` passed into `gpu_indices`
+    will be interpreted as CUDA device `3`.
+
+    A measurement window is opened as soon as the command starts. The measurement
+    is printed when the program is interrupted (e.g., with Ctrl-C).
+
+    Args:
+        gpu_indices: Indices of GPUs to monitor. If omitted, all GPUs will be monitored.
+    """
+    # NOTE(review): `list[int]` is only subscriptable at runtime on Python 3.9+.
+    # If Typer evaluates this annotation, Python 3.8 may fail here -- confirm, or
+    # switch to `typing.List[int]`.
+    monitor = ZeusMonitor(gpu_indices)
+    monitor.begin_window("zeus.monitor.energy")
+
+    # Block for up to a year; the window is only closed and printed when the sleep
+    # is interrupted by KeyboardInterrupt. If the sleep returns normally, nothing
+    # is printed.
+    try:
+        time.sleep(365 * 24 * 60 * 60)
+    except KeyboardInterrupt:
+        rich.print(monitor.end_window("zeus.monitor.energy"))
+
+
+@app.command()
+def power(gpu_indices: Optional[list[int]] = None) -> None:
+    """Monitor the power consumption of GPUs during the duration of the CLI program.
+
+    Args:
+        gpu_indices: Indices of GPUs to monitor. If omitted, all GPUs will be monitored.
+    """
+    # NOTE(review): placeholder implementation. `gpu_indices` is not used and no
+    # power reading is taken; the loop only prints a message once per second and
+    # never returns.
+    while True:
+        print("Hi")
+        time.sleep(1)
+
+# Entry point: dispatch to the Typer app when this module is run as a script.
+if __name__ == "__main__":
+    app()
diff --git a/zeus/monitor.py → zeus/monitor/energy.py b/zeus/monitor.py → zeus/monitor/energy.py
@@ -58,11 +58,18 @@ class ZeusMonitor:
     """Measure the GPU energy and time consumption of a block of code.
 
     Works for multi-GPU and heterogeneous GPU types. Aware of `CUDA_VISIBLE_DEVICES`.
+    For instance, if `CUDA_VISIBLE_DEVICES=2,3`, GPU index `1` passed into `gpu_indices`
+    will be interpreted as CUDA device `3`.
 
     You can mark the beginning and end of a measurement window, during which the GPU
     energy and time consumed will be recorded. Multiple concurrent measurement windows
     are supported.
 
+    For Volta or newer GPUs, energy consumption is measured very cheaply with the
+    `nvmlDeviceGetTotalEnergyConsumption` API. On older architectures, this API is
+    not supported, so a separate Python process is used to poll `nvmlDeviceGetPowerUsage`
+    to get power samples over time, which are integrated to compute energy consumption.
+
     ## Integration Example
 
     ```python
@@ -105,28 +112,20 @@ def __init__(
     ) -> None:
         """Instantiate the monitor.
 
-        For Volta or newer GPUs, energy consumption is measured very cheaply with the
-        `nvmlDeviceGetTotalEnergyConsumption` API. The API is not supported on older
-        architectures, so the `zeus_monitor` binary is used to poll `nvmlDeviceGetPowerUsage`
-        and write to a temporary CSV file, which is then integrated over time to compute
-        energy consumption.
-
         Args:
             gpu_indices: Indices of all the CUDA devices to monitor. Time/Energy measurements
                 will begin and end at the same time for these GPUs (i.e., synchronized).
                 If None, all the GPUs available will be used. `CUDA_VISIBLE_DEVICES`
                 is respected if set, e.g., GPU index `1` passed into `gpu_indices` when
                 `CUDA_VISIBLE_DEVICES=2,3` will be interpreted as CUDA device `3`.
                 `CUDA_VISIBLE_DEVICES`s formatted with comma-separated indices are supported.
-                (Default: `None`)
             approx_instant_energy: When the execution time of a measurement window is
-                shorter than the NVML energy counter's update period, energy consumption will
-                be shows as zero. In this case, if `approx_instant_energy` is True, the
+                shorter than the NVML energy counter's update period, energy consumption may
+                be observed as zero. In this case, if `approx_instant_energy` is True, the
                 window's energy consumption will be approximated by multiplying the current
                 instantaneous power consumption with the window's execution time. This should
                 be a better estimate than zero, but it's still an approximation.
-                (Default: `False`)
-            monitor_exec: Zeus monitor executable. (Default: `"zeus_monitor"`)
+            monitor_exec: Zeus monitor executable.
             log_file: Path to the log CSV file. If `None`, logging will be disabled.
         """
         # Save arguments.
@@ -366,7 +365,7 @@ def end_window(
                     end_time - self.monitor_start_time,
                 )
 
-        # Approximate zero energy consumption.
+        # Approximate energy consumption if the measurement window is too short.
         if self.approx_instant_energy:
             for gpu_index in self.gpu_indices:
                 if energy_consumption[gpu_index] == 0.0:

diff --git a/zeus/monitor/power.py b/zeus/monitor/power.py
@@ -0,0 +1,36 @@
+# Copyright (C) 2023 Jae-Won Chung <jwnchung@umich.edu>
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helpers that poll power usage from GPUs."""
+
+from __future__ import annotations
+
+import pynvml
+
+
+def infer_counter_update_period() -> float:
+    """Infer the update period of the NVML power counter.
+
+    NVML counters can sometimes be as slow as 10 Hz.
+
+    Returns:
+        The update period. This is currently a fixed constant; nothing is
+        actually measured or inferred.
+    """
+    # NOTE(review): presumably milliseconds (10 Hz corresponds to a 100 ms period);
+    # confirm the unit with callers.
+    return 100.0
+
+
+class PowerMonitor:
+    """Monitor power usage from GPUs."""
+
+    def __init__(self, gpu_indices: list[int] | None = None) -> None:
+        """Initialize the power monitor.
+
+        Args:
+            gpu_indices: Indices of GPUs to monitor.
+                NOTE(review): accepted but not used by the constructor yet.
+        """
+        # Initialize NVML.
+        pynvml.nvmlInit()