
zeus.device.gpu.nvidia

NVIDIA GPUs.

NVIDIAGPU

Bases: GPU

Implementation of GPU for NVIDIA GPUs.

Source code in zeus/device/gpu/nvidia.py
class NVIDIAGPU(gpu_common.GPU):
    """Implementation of `GPU` for NVIDIA GPUs."""

    def __init__(self, gpu_index: int) -> None:
        """Initialize the GPU object."""
        super().__init__(gpu_index)
        self._get_handle()
        self._supportsGetTotalEnergyConsumption = None

        # Check if it's a Grace Hopper chip
        try:
            c2c_mode_info = pynvml.nvmlDeviceGetC2cModeInfoV(self.handle)
            self._is_grace_hopper = c2c_mode_info.isC2cEnabled
        except pynvml.NVMLError as e:
            e_value = e.value  # pyright: ignore[reportAttributeAccessIssue]
            if e_value != pynvml.NVML_ERROR_NOT_SUPPORTED:
                logger.warning(
                    "Attempted to check whether the current chip is a Grace Hopper chip "
                    "by calling `nvmlDeviceGetC2cModeInfoV`, which we expected to either "
                    "return a valid response or raise `NVML_ERROR_NOT_SUPPORTED`. "
                    "Instead, it raised an unexpected error: '%s'. Treating this as "
                    "not a Grace Hopper chip.",
                    e,
                )
            self._is_grace_hopper = False

    _exception_map = {
        pynvml.NVML_ERROR_UNINITIALIZED: gpu_common.ZeusGPUInitError,
        pynvml.NVML_ERROR_INVALID_ARGUMENT: gpu_common.ZeusGPUInvalidArgError,
        pynvml.NVML_ERROR_NOT_SUPPORTED: gpu_common.ZeusGPUNotSupportedError,
        pynvml.NVML_ERROR_NO_PERMISSION: gpu_common.ZeusGPUNoPermissionError,
        pynvml.NVML_ERROR_ALREADY_INITIALIZED: gpu_common.ZeusGPUAlreadyInitializedError,
        pynvml.NVML_ERROR_NOT_FOUND: gpu_common.ZeusGPUNotFoundError,
        pynvml.NVML_ERROR_INSUFFICIENT_SIZE: gpu_common.ZeusGPUInsufficientSizeError,
        pynvml.NVML_ERROR_INSUFFICIENT_POWER: gpu_common.ZeusGPUInsufficientPowerError,
        pynvml.NVML_ERROR_DRIVER_NOT_LOADED: gpu_common.ZeusGPUDriverNotLoadedError,
        pynvml.NVML_ERROR_TIMEOUT: gpu_common.ZeusGPUTimeoutError,
        pynvml.NVML_ERROR_IRQ_ISSUE: gpu_common.ZeusGPUIRQError,
        pynvml.NVML_ERROR_LIBRARY_NOT_FOUND: gpu_common.ZeusGPULibraryNotFoundError,
        pynvml.NVML_ERROR_FUNCTION_NOT_FOUND: gpu_common.ZeusGPUFunctionNotFoundError,
        pynvml.NVML_ERROR_CORRUPTED_INFOROM: gpu_common.ZeusGPUCorruptedInfoROMError,
        pynvml.NVML_ERROR_GPU_IS_LOST: gpu_common.ZeusGPULostError,
        pynvml.NVML_ERROR_RESET_REQUIRED: gpu_common.ZeusGPUResetRequiredError,
        pynvml.NVML_ERROR_OPERATING_SYSTEM: gpu_common.ZeusGPUOperatingSystemError,
        pynvml.NVML_ERROR_LIB_RM_VERSION_MISMATCH: gpu_common.ZeusGPULibRMVersionMismatchError,
        pynvml.NVML_ERROR_MEMORY: gpu_common.ZeusGPUMemoryError,
        pynvml.NVML_ERROR_UNKNOWN: gpu_common.ZeusGPUUnknownError,
    }

    @_handle_nvml_errors
    def _get_handle(self):
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_index)

    @_handle_nvml_errors
    def get_name(self) -> str:
        """Return the name of the GPU model."""
        return pynvml.nvmlDeviceGetName(self.handle)

    @property
    def supports_nonblocking_setters(self) -> bool:
        """Return True if the GPU object supports non-blocking configuration setters."""
        return False

    @_handle_nvml_errors
    def get_power_management_limit_constraints(self) -> tuple[int, int]:
        """Return the minimum and maximum power management limits. Units: mW."""
        min_, max_ = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.handle)
        return (min_, max_)

    @_handle_nvml_errors
    def get_power_management_limit(self) -> int:
        """Return the current power management limit. Units: mW."""
        return pynvml.nvmlDeviceGetPowerManagementLimit(self.handle)

    @_handle_nvml_errors
    def set_power_management_limit(self, power_limit_mw: int, block: bool = True) -> None:
        """Set the GPU's power management limit. Unit: mW."""
        current_limit = self.get_power_management_limit()
        if current_limit != power_limit_mw:
            self._warn_sys_admin()
            pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, power_limit_mw)

    @_handle_nvml_errors
    def reset_power_management_limit(self, block: bool = True) -> None:
        """Reset the GPU's power management limit to the default value."""
        default_limit = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle)
        current_limit = self.get_power_management_limit()
        if current_limit != default_limit:
            self._warn_sys_admin()
            pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, default_limit)

    @_handle_nvml_errors
    def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
        """Set persistence mode."""
        current_mode = pynvml.nvmlDeviceGetPersistenceMode(self.handle)
        desired_mode = pynvml.NVML_FEATURE_ENABLED if enabled else pynvml.NVML_FEATURE_DISABLED
        if current_mode != desired_mode:
            self._warn_sys_admin()
            pynvml.nvmlDeviceSetPersistenceMode(self.handle, desired_mode)

    @_handle_nvml_errors
    def get_supported_memory_clocks(self) -> list[int]:
        """Return a list of supported memory clock frequencies. Units: MHz."""
        return pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle)

    @_handle_nvml_errors
    def set_memory_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
        """Lock the memory clock to a specified range. Units: MHz."""
        self._warn_sys_admin()
        pynvml.nvmlDeviceSetMemoryLockedClocks(self.handle, min_clock_mhz, max_clock_mhz)

    @_handle_nvml_errors
    def reset_memory_locked_clocks(self, block: bool = True) -> None:
        """Reset the locked memory clocks to the default."""
        self._warn_sys_admin()
        pynvml.nvmlDeviceResetMemoryLockedClocks(self.handle)

    @_handle_nvml_errors
    def get_supported_graphics_clocks(self, memory_clock_mhz: int | None = None) -> list[int]:
        """Return a list of supported graphics clock frequencies. Units: MHz.

        Args:
            memory_clock_mhz: Memory clock frequency to use. Some GPUs have
                different supported graphics clocks depending on the memory clock.
        """
        return pynvml.nvmlDeviceGetSupportedGraphicsClocks(self.handle, memory_clock_mhz)

    @_handle_nvml_errors
    def set_gpu_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
        """Lock the GPU clock to a specified range. Units: MHz."""
        self._warn_sys_admin()
        pynvml.nvmlDeviceSetGpuLockedClocks(self.handle, min_clock_mhz, max_clock_mhz)

    @_handle_nvml_errors
    def reset_gpu_locked_clocks(self, block: bool = True) -> None:
        """Reset the locked GPU clocks to the default."""
        self._warn_sys_admin()
        pynvml.nvmlDeviceResetGpuLockedClocks(self.handle)

    @_handle_nvml_errors
    def get_average_power_usage(self) -> int:
        """Return the average power draw of the GPU. Units: mW."""
        if self._is_grace_hopper:
            fields = [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MODULE)]
        else:
            fields = [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_GPU)]

        metric = pynvml.nvmlDeviceGetFieldValues(self.handle, fields)[0]
        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
            raise pynvml.NVMLError(ret)
        return metric.value.uiVal

    @_handle_nvml_errors
    def get_instant_power_usage(self) -> int:
        """Return the current power draw of the GPU. Units: mW."""
        if self._is_grace_hopper:
            fields = [(pynvml.NVML_FI_DEV_POWER_INSTANT, pynvml.NVML_POWER_SCOPE_MODULE)]
        else:
            fields = [(pynvml.NVML_FI_DEV_POWER_INSTANT, pynvml.NVML_POWER_SCOPE_GPU)]

        metric = pynvml.nvmlDeviceGetFieldValues(self.handle, fields)[0]
        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
            raise pynvml.NVMLError(ret)
        return metric.value.uiVal

    @_handle_nvml_errors
    def get_average_memory_power_usage(self) -> int:
        """Return the average power draw of the GPU's memory. Units: mW.

        !!! Warning
            This isn't exactly documented in NVML at the time of writing, but `nvidia-smi`
            makes use of this API.

            Confirmed working on H100 80GB HBM3. Confirmed not working on A40.
        """
        metric = pynvml.nvmlDeviceGetFieldValues(
            self.handle,
            [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY)],
        )[0]
        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
            raise pynvml.NVMLError(ret)
        power = metric.value.uiVal
        if power == 0:
            warnings.warn(
                "Average memory power returned 0. The current GPU may not be supported.",
                stacklevel=1,
            )
        return power

    @_handle_nvml_errors
    def supports_get_total_energy_consumption(self) -> bool:
        """Check if the GPU supports retrieving total energy consumption."""
        # Supported on Volta or newer microarchitectures
        if self._supportsGetTotalEnergyConsumption is None:
            self._supportsGetTotalEnergyConsumption = (
                pynvml.nvmlDeviceGetArchitecture(self.handle) >= pynvml.NVML_DEVICE_ARCH_VOLTA
            )

        return self._supportsGetTotalEnergyConsumption

    @_handle_nvml_errors
    def get_total_energy_consumption(self) -> int:
        """Return the total energy consumption of the specified GPU. Units: mJ."""
        return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)

    @_handle_nvml_errors
    def get_gpu_temperature(self) -> int:
        """Return the current GPU temperature. Units: Celsius."""
        temperature = pynvml.nvmlDeviceGetTemperatureV(self.handle, pynvml.NVML_TEMPERATURE_GPU)
        return temperature  # type: ignore
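
Below is a minimal usage sketch of NVIDIAGPU, assuming nvidia-ml-py is installed and NVML is initialized manually (the higher-level NVIDIAGPUs class normally handles initialization); GPU index 0 is only an example.

import pynvml
from zeus.device.gpu.nvidia import NVIDIAGPU

pynvml.nvmlInit()  # NVIDIAGPU assumes NVML has already been initialized
gpu = NVIDIAGPU(gpu_index=0)

print(gpu.get_name())
min_mw, max_mw = gpu.get_power_management_limit_constraints()
print(f"Power limit range: {min_mw / 1000:.0f}-{max_mw / 1000:.0f} W")
print(f"Instant power draw: {gpu.get_instant_power_usage() / 1000:.1f} W")

pynvml.nvmlShutdown()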

supports_nonblocking_setters property

supports_nonblocking_setters

Return True if the GPU object supports non-blocking configuration setters.

__init__

__init__(gpu_index)
Source code in zeus/device/gpu/nvidia.py
def __init__(self, gpu_index: int) -> None:
    """Initialize the GPU object."""
    super().__init__(gpu_index)
    self._get_handle()
    self._supportsGetTotalEnergyConsumption = None

    # Check if it's a Grace Hopper chip
    try:
        c2c_mode_info = pynvml.nvmlDeviceGetC2cModeInfoV(self.handle)
        self._is_grace_hopper = c2c_mode_info.isC2cEnabled
    except pynvml.NVMLError as e:
        e_value = e.value  # pyright: ignore[reportAttributeAccessIssue]
        if e_value != pynvml.NVML_ERROR_NOT_SUPPORTED:
            logger.warning(
                "Attempted to check whether the current chip is a Grace Hopper chip "
                "by calling `nvmlDeviceGetC2cModeInfoV`, which we expected to either "
                "return a valid response or raise `NVML_ERROR_NOT_SUPPORTED`. "
                "Instead, it raised an unexpected error: '%s'. Treating this as "
                "not a Grace Hopper chip.",
                e,
            )
        self._is_grace_hopper = False

get_name

get_name()

Return the name of the GPU model.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_name(self) -> str:
    """Return the name of the GPU model."""
    return pynvml.nvmlDeviceGetName(self.handle)

get_power_management_limit_constraints

get_power_management_limit_constraints()

Return the minimum and maximum power management limits. Units: mW.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_power_management_limit_constraints(self) -> tuple[int, int]:
    """Return the minimum and maximum power management limits. Units: mW."""
    min_, max_ = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.handle)
    return (min_, max_)

get_power_management_limit

get_power_management_limit()

Return the current power management limit. Units: mW.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_power_management_limit(self) -> int:
    """Return the current power management limit. Units: mW."""
    return pynvml.nvmlDeviceGetPowerManagementLimit(self.handle)

set_power_management_limit

set_power_management_limit(power_limit_mw, block=True)

Set the GPU's power management limit. Unit: mW.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def set_power_management_limit(self, power_limit_mw: int, block: bool = True) -> None:
    """Set the GPU's power management limit. Unit: mW."""
    current_limit = self.get_power_management_limit()
    if current_limit != power_limit_mw:
        self._warn_sys_admin()
        pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, power_limit_mw)

reset_power_management_limit

reset_power_management_limit(block=True)

Reset the GPU's power management limit to the default value.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def reset_power_management_limit(self, block: bool = True) -> None:
    """Reset the GPU's power management limit to the default value."""
    default_limit = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle)
    current_limit = self.get_power_management_limit()
    if current_limit != default_limit:
        self._warn_sys_admin()
        pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, default_limit)

set_persistence_mode

set_persistence_mode(enabled, block=True)

Set persistence mode.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
    """Set persistence mode."""
    current_mode = pynvml.nvmlDeviceGetPersistenceMode(self.handle)
    desired_mode = pynvml.NVML_FEATURE_ENABLED if enabled else pynvml.NVML_FEATURE_DISABLED
    if current_mode != desired_mode:
        self._warn_sys_admin()
        pynvml.nvmlDeviceSetPersistenceMode(self.handle, desired_mode)

get_supported_memory_clocks

get_supported_memory_clocks()

Return a list of supported memory clock frequencies. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_supported_memory_clocks(self) -> list[int]:
    """Return a list of supported memory clock frequencies. Units: MHz."""
    return pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle)

set_memory_locked_clocks

set_memory_locked_clocks(min_clock_mhz, max_clock_mhz, block=True)

Lock the memory clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def set_memory_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
    """Lock the memory clock to a specified range. Units: MHz."""
    self._warn_sys_admin()
    pynvml.nvmlDeviceSetMemoryLockedClocks(self.handle, min_clock_mhz, max_clock_mhz)

reset_memory_locked_clocks

reset_memory_locked_clocks(block=True)

Reset the locked memory clocks to the default.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def reset_memory_locked_clocks(self, block: bool = True) -> None:
    """Reset the locked memory clocks to the default."""
    self._warn_sys_admin()
    pynvml.nvmlDeviceResetMemoryLockedClocks(self.handle)

get_supported_graphics_clocks

get_supported_graphics_clocks(memory_clock_mhz=None)

Return a list of supported graphics clock frequencies. Units: MHz.

Parameters:

    memory_clock_mhz (int | None, default None): Memory clock frequency to use. Some GPUs have different supported graphics clocks depending on the memory clock.
Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_supported_graphics_clocks(self, memory_clock_mhz: int | None = None) -> list[int]:
    """Return a list of supported graphics clock frequencies. Units: MHz.

    Args:
        memory_clock_mhz: Memory clock frequency to use. Some GPUs have
            different supported graphics clocks depending on the memory clock.
    """
    return pynvml.nvmlDeviceGetSupportedGraphicsClocks(self.handle, memory_clock_mhz)
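
As a sketch of how the memory and graphics clock queries fit together (assuming NVML is initialized and GPU index 0; some GPUs only accept a specific memory clock here):

from zeus.device.gpu.nvidia import NVIDIAGPU

gpu = NVIDIAGPU(gpu_index=0)  # assumes pynvml.nvmlInit() was already called
for mem_clock in gpu.get_supported_memory_clocks():
    gfx_clocks = gpu.get_supported_graphics_clocks(mem_clock)
    print(f"Memory {mem_clock} MHz: {len(gfx_clocks)} supported graphics clocks")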

set_gpu_locked_clocks

set_gpu_locked_clocks(min_clock_mhz, max_clock_mhz, block=True)

Lock the GPU clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def set_gpu_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
    """Lock the GPU clock to a specified range. Units: MHz."""
    self._warn_sys_admin()
    pynvml.nvmlDeviceSetGpuLockedClocks(self.handle, min_clock_mhz, max_clock_mhz)

reset_gpu_locked_clocks

reset_gpu_locked_clocks(block=True)

Reset the locked GPU clocks to the default.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def reset_gpu_locked_clocks(self, block: bool = True) -> None:
    """Reset the locked GPU clocks to the default."""
    self._warn_sys_admin()
    pynvml.nvmlDeviceResetGpuLockedClocks(self.handle)
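
Because locking clocks requires elevated privileges and persists until reset, a common pattern is to reset them in a finally block. This is only a sketch; the 1200-1410 MHz range and run_workload are purely illustrative placeholders.

from zeus.device.gpu.nvidia import NVIDIAGPU


def run_workload() -> None:
    """Placeholder for the caller's actual GPU workload."""


gpu = NVIDIAGPU(gpu_index=0)  # assumes NVML is initialized and SYS_ADMIN is available
try:
    # Illustrative range; query get_supported_graphics_clocks() for real values.
    gpu.set_gpu_locked_clocks(min_clock_mhz=1200, max_clock_mhz=1410)
    run_workload()
finally:
    gpu.reset_gpu_locked_clocks()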

get_average_power_usage

get_average_power_usage()

Return the average power draw of the GPU. Units: mW.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_average_power_usage(self) -> int:
    """Return the average power draw of the GPU. Units: mW."""
    if self._is_grace_hopper:
        fields = [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MODULE)]
    else:
        fields = [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_GPU)]

    metric = pynvml.nvmlDeviceGetFieldValues(self.handle, fields)[0]
    if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(ret)
    return metric.value.uiVal

get_instant_power_usage

get_instant_power_usage()

Return the current power draw of the GPU. Units: mW.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_instant_power_usage(self) -> int:
    """Return the current power draw of the GPU. Units: mW."""
    if self._is_grace_hopper:
        fields = [(pynvml.NVML_FI_DEV_POWER_INSTANT, pynvml.NVML_POWER_SCOPE_MODULE)]
    else:
        fields = [(pynvml.NVML_FI_DEV_POWER_INSTANT, pynvml.NVML_POWER_SCOPE_GPU)]

    metric = pynvml.nvmlDeviceGetFieldValues(self.handle, fields)[0]
    if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(ret)
    return metric.value.uiVal
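
A small sketch of polling the instantaneous power draw and comparing it with the driver-reported average; both values are in mW, and the ten 100 ms samples are an arbitrary choice:

import time

from zeus.device.gpu.nvidia import NVIDIAGPU

gpu = NVIDIAGPU(gpu_index=0)  # assumes NVML is initialized

samples = []
for _ in range(10):  # ten samples, 100 ms apart (arbitrary)
    samples.append(gpu.get_instant_power_usage())
    time.sleep(0.1)

print(f"Polled mean:    {sum(samples) / len(samples) / 1000:.1f} W")
print(f"Driver average: {gpu.get_average_power_usage() / 1000:.1f} W")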

get_average_memory_power_usage

get_average_memory_power_usage()

Return the average power draw of the GPU's memory. Units: mW.

Warning

This isn't exactly documented in NVML at the time of writing, but nvidia-smi makes use of this API.

Confirmed working on H100 80GB HBM3. Confirmed not working on A40.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_average_memory_power_usage(self) -> int:
    """Return the average power draw of the GPU's memory. Units: mW.

    !!! Warning
        This isn't exactly documented in NVML at the time of writing, but `nvidia-smi`
        makes use of this API.

        Confirmed working on H100 80GB HBM3. Confirmed not working on A40.
    """
    metric = pynvml.nvmlDeviceGetFieldValues(
        self.handle,
        [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY)],
    )[0]
    if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(ret)
    power = metric.value.uiVal
    if power == 0:
        warnings.warn(
            "Average memory power returned 0. The current GPU may not be supported.",
            stacklevel=1,
        )
    return power

supports_get_total_energy_consumption

supports_get_total_energy_consumption()

Check if the GPU supports retrieving total energy consumption.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def supports_get_total_energy_consumption(self) -> bool:
    """Check if the GPU supports retrieving total energy consumption."""
    # Supported on Volta or newer microarchitectures
    if self._supportsGetTotalEnergyConsumption is None:
        self._supportsGetTotalEnergyConsumption = (
            pynvml.nvmlDeviceGetArchitecture(self.handle) >= pynvml.NVML_DEVICE_ARCH_VOLTA
        )

    return self._supportsGetTotalEnergyConsumption

get_total_energy_consumption

get_total_energy_consumption()

Return the total energy consumption of the specified GPU. Units: mJ.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_total_energy_consumption(self) -> int:
    """Return the total energy consumption of the specified GPU. Units: mJ."""
    return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
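
Since the energy counter is cumulative (mJ), the energy of a code region can be measured as the difference between two readings, guarded by the support check. A sketch; run_workload is a placeholder:

from zeus.device.gpu.nvidia import NVIDIAGPU


def run_workload() -> None:
    """Placeholder for the caller's actual GPU workload."""


gpu = NVIDIAGPU(gpu_index=0)  # assumes NVML is initialized

if gpu.supports_get_total_energy_consumption():
    start_mj = gpu.get_total_energy_consumption()
    run_workload()
    end_mj = gpu.get_total_energy_consumption()
    print(f"Energy consumed: {(end_mj - start_mj) / 1000:.1f} J")
else:
    print("This GPU does not expose the total energy counter (pre-Volta).")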

get_gpu_temperature

get_gpu_temperature()

Return the current GPU temperature. Units: Celsius.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def get_gpu_temperature(self) -> int:
    """Return the current GPU temperature. Units: Celsius."""
    temperature = pynvml.nvmlDeviceGetTemperatureV(self.handle, pynvml.NVML_TEMPERATURE_GPU)
    return temperature  # type: ignore

ZeusdNVIDIAGPU

Bases: NVIDIAGPU

An NVIDIAGPU that sets GPU knobs that require SYS_ADMIN via zeusd.

Some NVML APIs (e.g., setting persistence mode, power limit, or frequency) require the Linux security capability SYS_ADMIN, which is virtually sudo. This class overrides those methods so that they send a request to the Zeus daemon instead.

See https://ml.energy/zeus/getting_started/#system-privileges for details on the system privileges required.

Source code in zeus/device/gpu/nvidia.py
class ZeusdNVIDIAGPU(NVIDIAGPU):
    """An NVIDIAGPU that sets GPU knobs that require `SYS_ADMIN` via zeusd.

    Some NVML APIs (e.g., setting persistence mode, power limit, frequency)
    require the Linux security capability `SYS_ADMIN`, which is virtually `sudo`.
    This class overrides those methods so that they send a request to the
    Zeus daemon.

    See [here](https://ml.energy/zeus/getting_started/#system-privileges)
    for details on system privileges required.
    """

    def __init__(
        self,
        gpu_index: int,
        zeusd_sock_path: str = "/var/run/zeusd.sock",
    ) -> None:
        """Initialize NVML and sets up the GPUs.

        Args:
            gpu_index (int): Index of the GPU.
            zeusd_sock_path (str): Path to the Zeus daemon socket.
        """
        super().__init__(gpu_index)
        self.zeusd_sock_path = zeusd_sock_path

        self._client = httpx.Client(transport=httpx.HTTPTransport(uds=zeusd_sock_path))
        self._url_prefix = f"http://zeusd/gpu/{gpu_index}"

    @property
    def supports_nonblocking_setters(self) -> bool:
        """Return True if the GPU object supports non-blocking configuration setters."""
        return True

    def set_power_management_limit(self, power_limit_mw: int, block: bool = True) -> None:
        """Set the GPU's power management limit. Unit: mW."""
        current_limit = self.get_power_management_limit()
        if current_limit == power_limit_mw:
            return

        resp = self._client.post(
            self._url_prefix + "/set_power_limit",
            json=dict(power_limit_mw=power_limit_mw, block=block),
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to set power management limit: {resp.text}")
        logger.debug("Took %s ms to set power limit", resp.elapsed.microseconds / 1000)

    @_handle_nvml_errors
    def reset_power_management_limit(self, block: bool = True) -> None:
        """Reset the GPU's power management limit to the default value."""
        self.set_power_management_limit(
            pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle),
            block,
        )

    def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
        """Set persistence mode."""
        resp = self._client.post(
            self._url_prefix + "/set_persistence_mode",
            json=dict(enabled=enabled, block=block),
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to set persistence mode: {resp.text}")
        logger.debug("Took %s ms to set persistence mode", resp.elapsed.microseconds / 1000)

    def set_memory_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
        """Lock the memory clock to a specified range. Units: MHz."""
        resp = self._client.post(
            self._url_prefix + "/set_mem_locked_clocks",
            json=dict(min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block),
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to set memory locked clocks: {resp.text}")
        logger.debug("Took %s ms to set memory locked clocks", resp.elapsed.microseconds / 1000)

    def reset_memory_locked_clocks(self, block: bool = True) -> None:
        """Reset the locked memory clocks to the default."""
        resp = self._client.post(self._url_prefix + "/reset_mem_locked_clocks", json=dict(block=block))
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to reset memory locked clocks: {resp.text}")

    def set_gpu_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
        """Lock the GPU clock to a specified range. Units: MHz."""
        resp = self._client.post(
            self._url_prefix + "/set_gpu_locked_clocks",
            json=dict(min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block),
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to set GPU locked clocks: {resp.text}")

    def reset_gpu_locked_clocks(self, block: bool = True) -> None:
        """Reset the locked GPU clocks to the default."""
        resp = self._client.post(self._url_prefix + "/reset_gpu_locked_clocks", json=dict(block=block))
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to reset GPU locked clocks: {resp.text}")
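
A minimal sketch of using ZeusdNVIDIAGPU directly, assuming the Zeus daemon is running on the default socket path and NVML is initialized; the 200 W power limit is only an example:

import pynvml

from zeus.device.gpu.nvidia import ZeusdNVIDIAGPU

pynvml.nvmlInit()  # read-only NVML calls still happen in this process
gpu = ZeusdNVIDIAGPU(gpu_index=0, zeusd_sock_path="/var/run/zeusd.sock")

# Privileged setters are forwarded to zeusd, so this process needs no SYS_ADMIN.
gpu.set_power_management_limit(power_limit_mw=200_000, block=False)
# ... run the workload ...
gpu.reset_power_management_limit()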

supports_nonblocking_setters property

supports_nonblocking_setters

Return True if the GPU object supports non-blocking configuration setters.

__init__

__init__(gpu_index, zeusd_sock_path='/var/run/zeusd.sock')

Parameters:

    gpu_index (int, required): Index of the GPU.
    zeusd_sock_path (str, default '/var/run/zeusd.sock'): Path to the Zeus daemon socket.
Source code in zeus/device/gpu/nvidia.py
def __init__(
    self,
    gpu_index: int,
    zeusd_sock_path: str = "/var/run/zeusd.sock",
) -> None:
    """Initialize NVML and sets up the GPUs.

    Args:
        gpu_index (int): Index of the GPU.
        zeusd_sock_path (str): Path to the Zeus daemon socket.
    """
    super().__init__(gpu_index)
    self.zeusd_sock_path = zeusd_sock_path

    self._client = httpx.Client(transport=httpx.HTTPTransport(uds=zeusd_sock_path))
    self._url_prefix = f"http://zeusd/gpu/{gpu_index}"

set_power_management_limit

set_power_management_limit(power_limit_mw, block=True)

Set the GPU's power management limit. Unit: mW.

Source code in zeus/device/gpu/nvidia.py
def set_power_management_limit(self, power_limit_mw: int, block: bool = True) -> None:
    """Set the GPU's power management limit. Unit: mW."""
    current_limit = self.get_power_management_limit()
    if current_limit == power_limit_mw:
        return

    resp = self._client.post(
        self._url_prefix + "/set_power_limit",
        json=dict(power_limit_mw=power_limit_mw, block=block),
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to set power management limit: {resp.text}")
    logger.debug("Took %s ms to set power limit", resp.elapsed.microseconds / 1000)

reset_power_management_limit

reset_power_management_limit(block=True)

Reset the GPU's power management limit to the default value.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def reset_power_management_limit(self, block: bool = True) -> None:
    """Reset the GPU's power management limit to the default value."""
    self.set_power_management_limit(
        pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle),
        block,
    )

set_persistence_mode

set_persistence_mode(enabled, block=True)

Set persistence mode.

Source code in zeus/device/gpu/nvidia.py
def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
    """Set persistence mode."""
    resp = self._client.post(
        self._url_prefix + "/set_persistence_mode",
        json=dict(enabled=enabled, block=block),
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to set persistence mode: {resp.text}")
    logger.debug("Took %s ms to set persistence mode", resp.elapsed.microseconds / 1000)

set_memory_locked_clocks

set_memory_locked_clocks(min_clock_mhz, max_clock_mhz, block=True)

Lock the memory clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
def set_memory_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
    """Lock the memory clock to a specified range. Units: MHz."""
    resp = self._client.post(
        self._url_prefix + "/set_mem_locked_clocks",
        json=dict(min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block),
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to set memory locked clocks: {resp.text}")
    logger.debug("Took %s ms to set memory locked clocks", resp.elapsed.microseconds / 1000)

reset_memory_locked_clocks

reset_memory_locked_clocks(block=True)

Reset the locked memory clocks to the default.

Source code in zeus/device/gpu/nvidia.py
def reset_memory_locked_clocks(self, block: bool = True) -> None:
    """Reset the locked memory clocks to the default."""
    resp = self._client.post(self._url_prefix + "/reset_mem_locked_clocks", json=dict(block=block))
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to reset memory locked clocks: {resp.text}")

set_gpu_locked_clocks

set_gpu_locked_clocks(min_clock_mhz, max_clock_mhz, block=True)

Lock the GPU clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
def set_gpu_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
    """Lock the GPU clock to a specified range. Units: MHz."""
    resp = self._client.post(
        self._url_prefix + "/set_gpu_locked_clocks",
        json=dict(min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block),
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to set GPU locked clocks: {resp.text}")

reset_gpu_locked_clocks

reset_gpu_locked_clocks(block=True)

Reset the locked GPU clocks to the default.

Source code in zeus/device/gpu/nvidia.py
def reset_gpu_locked_clocks(self, block: bool = True) -> None:
    """Reset the locked GPU clocks to the default."""
    resp = self._client.post(self._url_prefix + "/reset_gpu_locked_clocks", json=dict(block=block))
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to reset GPU locked clocks: {resp.text}")

NVIDIAGPUs

Bases: GPUs

Implementation of GPUs for NVIDIA GPUs.

The CUDA_VISIBLE_DEVICES environment variable is respected if set. For example, if there are 4 GPUs on the node and CUDA_VISIBLE_DEVICES=0,2, only GPUs 0 and 2 are instantiated. In this case, the GPU with CUDA index 0 is accessed with index 0, and the GPU with CUDA index 2 with index 1.

If you have the Zeus daemon deployed, make sure you have set the ZEUSD_SOCK_PATH environment variable to the path of the Zeus daemon socket. This class will automatically use ZeusdNVIDIAGPU if ZEUSD_SOCK_PATH is set.

Note

For Grace Hopper, the power and energy values are for the entire superchip/module.

Source code in zeus/device/gpu/nvidia.py
class NVIDIAGPUs(gpu_common.GPUs):
    """Implementation of `GPUs` for NVIDIA GPUs.

    `CUDA_VISIBLE_DEVICES` environment variable is respected if set.
    For example, if there are 4 GPUs on the node and `CUDA_VISIBLE_DEVICES=0,2`,
    only GPUs 0 and 2 are instantiated. In this case, to access
    GPU of CUDA index 0, use the index 0, and for CUDA index 2, use the index 1.

    If you have the Zeus daemon deployed, make sure you have set the `ZEUSD_SOCK_PATH`
    environment variable to the path of the Zeus daemon socket. This class will
    automatically use [`ZeusdNVIDIAGPU`][zeus.device.gpu.nvidia.ZeusdNVIDIAGPU]
    if `ZEUSD_SOCK_PATH` is set.

    !!! Note
        For Grace Hopper, the power and energy values are for the entire superchip/module.
    """

    def __init__(self, ensure_homogeneous: bool = False) -> None:
        """Initialize NVML and sets up the GPUs.

        Args:
            ensure_homogeneous (bool): If True, ensures that all tracked GPUs have the same name.
        """
        try:
            pynvml.nvmlInit()
            self._init_gpus()
            if ensure_homogeneous:
                self._ensure_homogeneous()
        except pynvml.NVMLError as e:
            exception_class = NVIDIAGPU._exception_map.get(
                e.value,  # pyright: ignore[reportAttributeAccessIssue]
                gpu_common.ZeusBaseGPUError,
            )
            raise exception_class(
                e.msg  # pyright: ignore[reportAttributeAccessIssue]
            ) from e

    @property
    def gpus(self) -> Sequence[NVIDIAGPU]:
        """Return a list of NVIDIAGPU objects being tracked."""
        return self._gpus

    def _init_gpus(self) -> None:
        # Must respect `CUDA_VISIBLE_DEVICES` if set
        if (visible_device := os.environ.get("CUDA_VISIBLE_DEVICES")) is not None:
            if not visible_device:
                raise gpu_common.ZeusGPUInitError(
                    "CUDA_VISIBLE_DEVICES is set to an empty string. "
                    "It should either be unset or a comma-separated list of GPU indices."
                )
            if visible_device.startswith("MIG"):
                raise gpu_common.ZeusGPUInitError(
                    "CUDA_VISIBLE_DEVICES contains MIG devices. NVML (the library used by Zeus) "
                    "currently does not support measuring the power or energy consumption of MIG "
                    "slices. You can still measure the whole GPU by temporarily setting "
                    "CUDA_VISIBLE_DEVICES to integer GPU indices and restoring it afterwards."
                )
            visible_indices = [int(idx) for idx in visible_device.split(",")]
        else:
            visible_indices = list(range(pynvml.nvmlDeviceGetCount()))

        # If `ZEUSD_SOCK_PATH` is set, always use ZeusdNVIDIAGPU
        if (sock_path := os.environ.get("ZEUSD_SOCK_PATH")) is not None:
            if not Path(sock_path).exists():
                raise ZeusdError(f"ZEUSD_SOCK_PATH points to non-existent file: {sock_path}")
            if not Path(sock_path).is_socket():
                raise ZeusdError(f"ZEUSD_SOCK_PATH is not a socket: {sock_path}")
            if not os.access(sock_path, os.W_OK):
                raise ZeusdError(f"ZEUSD_SOCK_PATH is not writable: {sock_path}")
            self._gpus = [ZeusdNVIDIAGPU(gpu_num, sock_path) for gpu_num in visible_indices]
            # Disable the warning about SYS_ADMIN capabilities for each GPU
            for gpu in self._gpus:
                gpu._disable_sys_admin_warning = True

        # Otherwise just use NVIDIAGPU
        else:
            self._gpus = [NVIDIAGPU(gpu_num) for gpu_num in visible_indices]

    def __del__(self) -> None:
        """Shut down NVML."""
        with contextlib.suppress(pynvml.NVMLError):
            pynvml.nvmlShutdown()
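
A usage sketch for NVIDIAGPUs; the environment variable values below are optional and only illustrate how GPU selection and zeusd routing are controlled:

import os

from zeus.device.gpu.nvidia import NVIDIAGPUs

# Optional: restrict to a subset of GPUs. Indices are then remapped to 0..N-1.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"
# Optional: route privileged setters through the Zeus daemon.
# os.environ["ZEUSD_SOCK_PATH"] = "/var/run/zeusd.sock"

gpus = NVIDIAGPUs()  # initializes NVML; pass ensure_homogeneous=True to require identical models
for i, gpu in enumerate(gpus.gpus):
    print(i, gpu.get_name(), f"{gpu.get_instant_power_usage() / 1000:.1f} W")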

gpus property

gpus

Return a list of NVIDIAGPU objects being tracked.

__init__

__init__(ensure_homogeneous=False)

Parameters:

    ensure_homogeneous (bool, default False): If True, ensures that all tracked GPUs have the same name.
Source code in zeus/device/gpu/nvidia.py
def __init__(self, ensure_homogeneous: bool = False) -> None:
    """Initialize NVML and sets up the GPUs.

    Args:
        ensure_homogeneous (bool): If True, ensures that all tracked GPUs have the same name.
    """
    try:
        pynvml.nvmlInit()
        self._init_gpus()
        if ensure_homogeneous:
            self._ensure_homogeneous()
    except pynvml.NVMLError as e:
        exception_class = NVIDIAGPU._exception_map.get(
            e.value,  # pyright: ignore[reportAttributeAccessIssue]
            gpu_common.ZeusBaseGPUError,
        )
        raise exception_class(
            e.msg  # pyright: ignore[reportAttributeAccessIssue]
        ) from e

__del__

__del__()

Shut down NVML.

Source code in zeus/device/gpu/nvidia.py
def __del__(self) -> None:
    """Shut down NVML."""
    with contextlib.suppress(pynvml.NVMLError):
        pynvml.nvmlShutdown()

nvml_is_available cached

nvml_is_available()

Check if NVML is available.

Source code in zeus/device/gpu/nvidia.py
@lru_cache(maxsize=1)
def nvml_is_available() -> bool:
    """Check if NVML is available."""
    try:
        import pynvml
    except ImportError:
        logger.info("Failed to import `pynvml`. Make sure you have `nvidia-ml-py` installed.")
        return False

    # Detect unofficial pynvml packages.
    # If detected, this should be a critical error.
    if not hasattr(pynvml, "_nvmlGetFunctionPointer"):
        logger.error("Unoffical pynvml package detected!")
        raise ImportError(
            "Unofficial pynvml package detected! "
            "This causes conflicts with the official NVIDIA bindings. "
            "Please remove with `pip uninstall pynvml` and instead use the official "
            "bindings from NVIDIA: `nvidia-ml-py`. "
        )

    try:
        pynvml.nvmlInit()
        logger.info("pynvml is available and initialized.")
        return True
    except pynvml.NVMLError as e:
        logger.info("pynvml is available but could not initialize NVML: %s.", e)
        return False
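
nvml_is_available() can be used as a guard before constructing the GPU manager, for example:

from zeus.device.gpu.nvidia import NVIDIAGPUs, nvml_is_available

if nvml_is_available():
    gpus = NVIDIAGPUs()
    print([gpu.get_name() for gpu in gpus.gpus])
else:
    print("NVML is not available; NVIDIA GPU monitoring is disabled.")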