Skip to content

Commit

Permalink
__main__ CLI working
Browse files Browse the repository at this point in the history
  • Loading branch information
jaywonchung committed Aug 2, 2023
1 parent 9c6dfac commit 2561338
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 19 deletions.
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
"scikit-learn",
"nvidia-ml-py",
"pydantic",
"rich",
"typer[all]",
],
python_requires=">=3.8",
extras_require=extras_require,
Expand Down
14 changes: 7 additions & 7 deletions tests/test_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
@pytest.fixture
def pynvml_mock(mocker: MockerFixture):
"""Mock the entire pynvml module."""
mock = mocker.patch("zeus.monitor.pynvml", autospec=True)
mock = mocker.patch("zeus.monitor.energy.pynvml", autospec=True)

# Except for the arch constants.
mock.NVML_DEVICE_ARCH_PASCAL = pynvml.NVML_DEVICE_ARCH_PASCAL
Expand Down Expand Up @@ -153,20 +153,20 @@ def test_monitor(pynvml_mock, mock_gpus, mocker: MockerFixture, tmp_path: Path):
is_old_torch = {index: arch < pynvml.NVML_DEVICE_ARCH_VOLTA for index, arch in zip(torch_gpu_indices, gpu_archs)}
num_old_archs = sum(is_old_nvml.values())

mkdtemp_mock = mocker.patch("zeus.monitor.tempfile.mkdtemp", return_value="mock_log_dir")
which_mock = mocker.patch("zeus.monitor.shutil.which", return_value="zeus_monitor")
popen_mock = mocker.patch("zeus.monitor.subprocess.Popen", autospec=True)
mocker.patch("zeus.monitor.atexit.register")
mkdtemp_mock = mocker.patch("zeus.monitor.energy.tempfile.mkdtemp", return_value="mock_log_dir")
which_mock = mocker.patch("zeus.monitor.energy.shutil.which", return_value="zeus_monitor")
popen_mock = mocker.patch("zeus.monitor.energy.subprocess.Popen", autospec=True)
mocker.patch("zeus.monitor.energy.atexit.register")

monotonic_counter = itertools.count(start=4, step=1)
mocker.patch("zeus.monitor.time.monotonic", side_effect=monotonic_counter)
mocker.patch("zeus.monitor.energy.time.monotonic", side_effect=monotonic_counter)

energy_counters = {
f"handle{i}": itertools.count(start=1000, step=3)
for i in nvml_gpu_indices if not is_old_nvml[i]
}
pynvml_mock.nvmlDeviceGetTotalEnergyConsumption.side_effect = lambda handle: next(energy_counters[handle])
energy_mock = mocker.patch("zeus.monitor.analyze.energy")
energy_mock = mocker.patch("zeus.monitor.energy.analyze.energy")

log_file = tmp_path / "log.csv"

Expand Down
23 changes: 23 additions & 0 deletions zeus/monitor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright (C) 2023 Jae-Won Chung <jwnchung@umich.edu>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Time, energy, and power monitors for Zeus.
The main class of this module is [`ZeusMonitor`](zeus.monitor.energy.ZeusMonitor).
If users wish to monitor power consumption over time, the [`power`](zeus.monitor.power)
module can come in handy.
"""

from zeus.monitor.energy import ZeusMonitor, Measurement
48 changes: 48 additions & 0 deletions zeus/monitor/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@

from __future__ import annotations

import time
from typing import List, Optional

import rich
import typer

from zeus.monitor.energy import ZeusMonitor

app = typer.Typer(add_completion=False)


@app.command()
def energy(gpu_indices: Optional[List[int]] = None) -> None:
    """Measure the time and energy of GPUs during the duration of the CLI program.

    This uses the `ZeusMonitor` class for measurement, and thus `gpu_indices` respect
    the `CUDA_VISIBLE_DEVICES` environment variable.
    For instance, if `CUDA_VISIBLE_DEVICES=2,3`, GPU index `1` passed into `gpu_indices`
    will be interpreted as CUDA device `3`.

    Args:
        gpu_indices: Indices of GPUs to monitor. If omitted, all GPUs will be monitored.
    """
    # NOTE: the annotation uses `typing.List` rather than the builtin generic
    # `list[int]` because Typer resolves annotations at runtime, and subscripting
    # the builtin `list` fails on Python 3.8.
    window_name = "zeus.monitor.energy"

    monitor = ZeusMonitor(gpu_indices)
    monitor.begin_window(window_name)

    try:
        # Block (for up to a year) until the user interrupts with Ctrl-C.
        time.sleep(365 * 24 * 60 * 60)
    except KeyboardInterrupt:
        # Close the measurement window and pretty-print the result.
        rich.print(monitor.end_window(window_name))


@app.command()
def power(gpu_indices: Optional[List[int]] = None) -> None:
    """Monitor the power consumption of GPUs during the duration of the CLI program.

    Args:
        gpu_indices: Indices of GPUs to monitor. If omitted, all GPUs will be monitored.
    """
    # NOTE: the annotation uses `typing.List` rather than the builtin generic
    # `list[int]` because Typer resolves annotations at runtime, and subscripting
    # the builtin `list` fails on Python 3.8.
    #
    # NOTE(review): this is a placeholder implementation; `gpu_indices` is not
    # used yet and no power readings are taken. It just prints once per second
    # until the process is interrupted.
    while True:
        print("Hi")
        time.sleep(1)

# Run the Typer application when this module is executed as a script
# (this file is `zeus/monitor/__main__.py`, i.e. `python -m zeus.monitor`).
if __name__ == "__main__":
    app()
23 changes: 11 additions & 12 deletions zeus/monitor.py → zeus/monitor/energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,18 @@ class ZeusMonitor:
"""Measure the GPU energy and time consumption of a block of code.
Works for multi-GPU and heterogeneous GPU types. Aware of `CUDA_VISIBLE_DEVICES`.
For instance, if `CUDA_VISIBLE_DEVICES=2,3`, GPU index `1` passed into `gpu_indices`
will be interpreted as CUDA device `3`.
You can mark the beginning and end of a measurement window, during which the GPU
energy and time consumed will be recorded. Multiple concurrent measurement windows
are supported.
For Volta or newer GPUs, energy consumption is measured very cheaply with the
`nvmlDeviceGetTotalEnergyConsumption` API. On older architectures, this API is
not supported, so a separate Python process is used to poll `nvmlDeviceGetPowerUsage`
to get power samples over time, which are integrated to compute energy consumption.
## Integration Example
```python
Expand Down Expand Up @@ -105,28 +112,20 @@ def __init__(
) -> None:
"""Instantiate the monitor.
For Volta or newer GPUs, energy consumption is measured very cheaply with the
`nvmlDeviceGetTotalEnergyConsumption` API. The API is not supported on older
architectures, so the `zeus_monitor` binary is used to poll `nvmlDeviceGetPowerUsage`
and write to a temporary CSV file, which is then integrated over time to compute
energy consumption.
Args:
gpu_indices: Indices of all the CUDA devices to monitor. Time/Energy measurements
will begin and end at the same time for these GPUs (i.e., synchronized).
If None, all the GPUs available will be used. `CUDA_VISIBLE_DEVICES`
is respected if set, e.g., GPU index `1` passed into `gpu_indices` when
`CUDA_VISIBLE_DEVICES=2,3` will be interpreted as CUDA device `3`.
`CUDA_VISIBLE_DEVICES` values formatted as comma-separated indices are supported.
(Default: `None`)
approx_instant_energy: When the execution time of a measurement window is
shorter than the NVML energy counter's update period, energy consumption will
be shows as zero. In this case, if `approx_instant_energy` is True, the
shorter than the NVML energy counter's update period, energy consumption may
be observed as zero. In this case, if `approx_instant_energy` is True, the
window's energy consumption will be approximated by multiplying the current
instantaneous power consumption with the window's execution time. This should
be a better estimate than zero, but it's still an approximation.
(Default: `False`)
monitor_exec: Zeus monitor executable. (Default: `"zeus_monitor"`)
monitor_exec: Zeus monitor executable.
log_file: Path to the log CSV file. If `None`, logging will be disabled.
"""
# Save arguments.
Expand Down Expand Up @@ -366,7 +365,7 @@ def end_window(
end_time - self.monitor_start_time,
)

# Approximate zero energy consumption.
# Approximate energy consumption if the measurement window is too short.
if self.approx_instant_energy:
for gpu_index in self.gpu_indices:
if energy_consumption[gpu_index] == 0.0:
Expand Down
36 changes: 36 additions & 0 deletions zeus/monitor/power.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (C) 2023 Jae-Won Chung <jwnchung@umich.edu>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helpers that poll power usage from GPUs."""

from __future__ import annotations

import pynvml


def infer_counter_update_period() -> float:
    """Infer the update period of the NVML power counter.

    NVML counters can sometimes be as slow as 10 Hz.

    Returns:
        The update period as a float. Currently a fixed value of `100.0`;
        presumably milliseconds, since 10 Hz corresponds to a 100 ms period
        (TODO confirm the unit against callers).
    """
    # NOTE(review): nothing is measured yet; this is a hard-coded estimate.
    return 100.0


class PowerMonitor:
    """Monitor power usage from GPUs."""

    def __init__(self, gpu_indices: list[int] | None = None) -> None:
        """Initialize the power monitor.

        Args:
            gpu_indices: Indices of the GPUs to monitor. The value is kept
                unchanged on the instance as `gpu_indices` (including `None`).
        """
        # Keep the requested indices instead of silently discarding them.
        self.gpu_indices = gpu_indices

        # Initialize NVML.
        pynvml.nvmlInit()

0 comments on commit 2561338

Please sign in to comment.