Skip to content

gpu

zeus.device.gpu

GPU management module for Zeus.

ZeusGPUInitError

Bases: ZeusBaseGPUError

Import error or GPU library initialization failures.

Source code in zeus/device/gpu.py
23
24
25
26
27
28
class ZeusGPUInitError(ZeusBaseGPUError):
    """Import error or GPU library initialization failures.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_UNINITIALIZED`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
26
27
28
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUInvalidArgError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Invalid Argument.

Source code in zeus/device/gpu.py
31
32
33
34
35
36
class ZeusGPUInvalidArgError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Invalid Argument.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_INVALID_ARGUMENT`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
34
35
36
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUNotSupportedError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Not Supported Operation on GPU.

Source code in zeus/device/gpu.py
39
40
41
42
43
44
class ZeusGPUNotSupportedError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Not Supported Operation on GPU.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_NOT_SUPPORTED`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
42
43
44
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUNoPermissionError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for No Permission to perform GPU operation.

Source code in zeus/device/gpu.py
47
48
49
50
51
52
class ZeusGPUNoPermissionError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for No Permission to perform GPU operation.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_NO_PERMISSION`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
50
51
52
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUAlreadyInitializedError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Already Initialized GPU.

Source code in zeus/device/gpu.py
55
56
57
58
59
60
class ZeusGPUAlreadyInitializedError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Already Initialized GPU.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_ALREADY_INITIALIZED`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
58
59
60
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUNotFoundError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Not Found GPU.

Source code in zeus/device/gpu.py
63
64
65
66
67
68
class ZeusGPUNotFoundError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Not Found GPU.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_NOT_FOUND`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
66
67
68
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUInsufficientSizeError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Insufficient Size.

Source code in zeus/device/gpu.py
71
72
73
74
75
76
class ZeusGPUInsufficientSizeError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Insufficient Size.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_INSUFFICIENT_SIZE`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
74
75
76
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUInsufficientPowerError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Insufficient Power.

Source code in zeus/device/gpu.py
79
80
81
82
83
84
class ZeusGPUInsufficientPowerError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Insufficient Power.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_INSUFFICIENT_POWER`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
82
83
84
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUDriverNotLoadedError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Driver Error.

Source code in zeus/device/gpu.py
87
88
89
90
91
92
class ZeusGPUDriverNotLoadedError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Driver Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_DRIVER_NOT_LOADED`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
90
91
92
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUTimeoutError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Timeout Error.

Source code in zeus/device/gpu.py
 95
 96
 97
 98
 99
100
class ZeusGPUTimeoutError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Timeout Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_TIMEOUT`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
 98
 99
100
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUIRQError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for IRQ Error.

Source code in zeus/device/gpu.py
103
104
105
106
107
108
class ZeusGPUIRQError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for IRQ Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_IRQ_ISSUE`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
106
107
108
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPULibraryNotFoundError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Library Not Found Error.

Source code in zeus/device/gpu.py
111
112
113
114
115
116
class ZeusGPULibraryNotFoundError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Library Not Found Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_LIBRARY_NOT_FOUND`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
114
115
116
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUFunctionNotFoundError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Function Not Found Error.

Source code in zeus/device/gpu.py
119
120
121
122
123
124
class ZeusGPUFunctionNotFoundError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Function Not Found Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_FUNCTION_NOT_FOUND`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
122
123
124
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUCorruptedInfoROMError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Corrupted Info ROM Error.

Source code in zeus/device/gpu.py
127
128
129
130
131
132
class ZeusGPUCorruptedInfoROMError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Corrupted Info ROM Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_CORRUPTED_INFOROM`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
130
131
132
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPULostError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Lost GPU Error.

Source code in zeus/device/gpu.py
135
136
137
138
139
140
class ZeusGPULostError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Lost GPU Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_GPU_IS_LOST`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
138
139
140
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUResetRequiredError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Reset Required Error.

Source code in zeus/device/gpu.py
143
144
145
146
147
148
class ZeusGPUResetRequiredError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Reset Required Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_RESET_REQUIRED`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
146
147
148
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUOperatingSystemError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Operating System Error.

Source code in zeus/device/gpu.py
151
152
153
154
155
156
class ZeusGPUOperatingSystemError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Operating System Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_OPERATING_SYSTEM`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
154
155
156
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPULibRMVersionMismatchError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for LibRM Version Mismatch Error.

Source code in zeus/device/gpu.py
159
160
161
162
163
164
class ZeusGPULibRMVersionMismatchError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for LibRM Version Mismatch Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_LIB_RM_VERSION_MISMATCH`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
162
163
164
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUMemoryError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Insufficient Memory Error.

Source code in zeus/device/gpu.py
167
168
169
170
171
172
class ZeusGPUMemoryError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Insufficient Memory Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_MEMORY`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
170
171
172
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

ZeusGPUUnknownError

Bases: ZeusBaseGPUError

Zeus GPU exception class Wrapper for Unknown Error.

Source code in zeus/device/gpu.py
175
176
177
178
179
180
class ZeusGPUUnknownError(ZeusBaseGPUError):
    """Zeus GPU exception class Wrapper for Unknown Error.

    `NVIDIAGPU._exception_map` pairs this type with
    `pynvml.NVML_ERROR_UNKNOWN`.
    """

    def __init__(self, message: str) -> None:
        """Initialize Zeus Exception.

        Args:
            message: Description of the failure; passed unchanged to the
                base class initializer.
        """
        super().__init__(message)

__init__

1
__init__(message)
Source code in zeus/device/gpu.py
178
179
180
def __init__(self, message: str) -> None:
    """Initialize Zeus Exception.

    Args:
        message: Description of the failure; passed unchanged to the
            base class initializer.
    """
    super().__init__(message)

GPU

Bases: ABC

Abstract base class for GPU management.

This class defines the interface for interacting with GPUs; subclasses should implement its methods to interact with specific GPU libraries (e.g., NVML for NVIDIA GPUs).

Source code in zeus/device/gpu.py
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
class GPU(abc.ABC):
    """Vendor-neutral interface for managing a single GPU.

    Every method except `__init__` is abstract: a subclass must override all
    of them, binding each one to a specific GPU library (e.g., NVML for
    NVIDIA GPUs), before it can be instantiated.
    """

    def __init__(self, gpu_index: int) -> None:
        """Record which GPU this object refers to.

        Args:
            gpu_index: Index of the GPU; stored as `self.gpu_index`.
        """
        self.gpu_index = gpu_index

    @abc.abstractmethod
    def getPowerManagementLimitConstraints(self) -> tuple[int, int]:
        """Report the (minimum, maximum) power management limits of the GPU. Units: mW."""

    @abc.abstractmethod
    def setPersistenceMode(self, enable: bool) -> None:
        """Turn persistence mode on (`enable=True`) or off (`enable=False`) for the GPU."""

    @abc.abstractmethod
    def setPowerManagementLimit(self, value: int) -> None:
        """Apply `value` as the GPU's power management limit. Unit: mW."""

    @abc.abstractmethod
    def resetPowerManagementLimit(self) -> None:
        """Restore the GPU's power management limit to its default value."""

    @abc.abstractmethod
    def setMemoryLockedClocks(self, minMemClockMHz: int, maxMemClockMHz: int) -> None:
        """Pin the memory clock inside `[minMemClockMHz, maxMemClockMHz]`. Units: MHz."""

    @abc.abstractmethod
    def getSupportedMemoryClocks(self) -> list[int]:
        """List the memory clock frequencies the GPU supports. Units: MHz."""

    @abc.abstractmethod
    def getSupportedGraphicsClocks(self, freq: int) -> list[int]:
        """List the graphics clock frequencies supported at memory frequency `freq`. Units: MHz."""

    @abc.abstractmethod
    def getName(self) -> str:
        """Report the GPU's name."""

    @abc.abstractmethod
    def setGpuLockedClocks(self, minGpuClockMHz: int, maxGpuClockMHz: int) -> None:
        """Pin the GPU clock inside `[minGpuClockMHz, maxGpuClockMHz]`. Units: MHz."""

    @abc.abstractmethod
    def resetMemoryLockedClocks(self) -> None:
        """Undo memory clock locking, returning to the default values."""

    @abc.abstractmethod
    def resetGpuLockedClocks(self) -> None:
        """Undo GPU clock locking, returning to the default values."""

    @abc.abstractmethod
    def getPowerUsage(self) -> int:
        """Report the GPU's current power usage. Units: mW."""

    @abc.abstractmethod
    def supportsGetTotalEnergyConsumption(self) -> bool:
        """Tell whether `getTotalEnergyConsumption` is supported by the GPU."""

    @abc.abstractmethod
    def getTotalEnergyConsumption(self) -> int:
        """Report the GPU's total energy consumption since driver load. Units: mJ."""

__init__

1
__init__(gpu_index)
Source code in zeus/device/gpu.py
193
194
195
def __init__(self, gpu_index: int) -> None:
    """Initialize the GPU with a specified index.

    Args:
        gpu_index: Index of the GPU; stored as `self.gpu_index`.
    """
    self.gpu_index = gpu_index

getPowerManagementLimitConstraints abstractmethod

1
getPowerManagementLimitConstraints()

Return the minimum and maximum power management limits for the GPU. Units: mW.

Source code in zeus/device/gpu.py
197
198
199
200
@abc.abstractmethod
def getPowerManagementLimitConstraints(self) -> tuple[int, int]:
    """Report the (minimum, maximum) power management limits of the GPU. Units: mW."""

setPersistenceMode abstractmethod

1
setPersistenceMode(enable)

Enable or disable persistence mode for the GPU, depending on `enable`.

Source code in zeus/device/gpu.py
202
203
204
205
@abc.abstractmethod
def setPersistenceMode(self, enable: bool) -> None:
    """Turn persistence mode on (`enable=True`) or off (`enable=False`) for the GPU."""

setPowerManagementLimit abstractmethod

1
setPowerManagementLimit(value)

Set the power management limit for the GPU to the specified value. Unit: mW.

Source code in zeus/device/gpu.py
207
208
209
210
@abc.abstractmethod
def setPowerManagementLimit(self, value: int) -> None:
    """Apply `value` as the GPU's power management limit. Unit: mW."""

resetPowerManagementLimit abstractmethod

1
resetPowerManagementLimit()

Resets the power management limit for the specified GPU to the default value.

Source code in zeus/device/gpu.py
212
213
214
215
@abc.abstractmethod
def resetPowerManagementLimit(self) -> None:
    """Restore the GPU's power management limit to its default value."""

setMemoryLockedClocks abstractmethod

1
setMemoryLockedClocks(minMemClockMHz, maxMemClockMHz)

Lock the memory clock to a specified range. Units: MHz.

Source code in zeus/device/gpu.py
217
218
219
220
@abc.abstractmethod
def setMemoryLockedClocks(self, minMemClockMHz: int, maxMemClockMHz: int) -> None:
    """Pin the memory clock inside `[minMemClockMHz, maxMemClockMHz]`. Units: MHz."""

getSupportedMemoryClocks abstractmethod

1
getSupportedMemoryClocks()

Return a list of supported memory clock frequencies for the GPU. Units: MHz.

Source code in zeus/device/gpu.py
222
223
224
225
@abc.abstractmethod
def getSupportedMemoryClocks(self) -> list[int]:
    """List the memory clock frequencies the GPU supports. Units: MHz."""

getSupportedGraphicsClocks abstractmethod

1
getSupportedGraphicsClocks(freq)

Return a list of supported graphics clock frequencies for a given memory frequency. Units: MHz.

Source code in zeus/device/gpu.py
227
228
229
230
@abc.abstractmethod
def getSupportedGraphicsClocks(self, freq: int) -> list[int]:
    """List the graphics clock frequencies supported at memory frequency `freq`. Units: MHz."""

getName abstractmethod

1
getName()

Return the name of the GPU.

Source code in zeus/device/gpu.py
232
233
234
235
@abc.abstractmethod
def getName(self) -> str:
    """Report the GPU's name."""

setGpuLockedClocks abstractmethod

1
setGpuLockedClocks(minGpuClockMHz, maxGpuClockMHz)

Lock the GPU clock to a specified range. Units: MHz.

Source code in zeus/device/gpu.py
237
238
239
240
@abc.abstractmethod
def setGpuLockedClocks(self, minGpuClockMHz: int, maxGpuClockMHz: int) -> None:
    """Pin the GPU clock inside `[minGpuClockMHz, maxGpuClockMHz]`. Units: MHz."""

resetMemoryLockedClocks abstractmethod

1
resetMemoryLockedClocks()

Reset the memory locked clocks to default values.

Source code in zeus/device/gpu.py
242
243
244
245
@abc.abstractmethod
def resetMemoryLockedClocks(self) -> None:
    """Undo memory clock locking, returning to the default values."""

resetGpuLockedClocks abstractmethod

1
resetGpuLockedClocks()

Reset the GPU locked clocks to default values.

Source code in zeus/device/gpu.py
247
248
249
250
@abc.abstractmethod
def resetGpuLockedClocks(self) -> None:
    """Undo GPU clock locking, returning to the default values."""

getPowerUsage abstractmethod

1
getPowerUsage()

Return the current power usage of the GPU. Units: mW.

Source code in zeus/device/gpu.py
252
253
254
255
@abc.abstractmethod
def getPowerUsage(self) -> int:
    """Report the GPU's current power usage. Units: mW."""

supportsGetTotalEnergyConsumption abstractmethod

1
supportsGetTotalEnergyConsumption()

Check if the GPU supports retrieving total energy consumption.

Source code in zeus/device/gpu.py
257
258
259
260
@abc.abstractmethod
def supportsGetTotalEnergyConsumption(self) -> bool:
    """Tell whether retrieving total energy consumption is supported by the GPU."""

getTotalEnergyConsumption abstractmethod

1
getTotalEnergyConsumption()

Return the total energy consumption of the GPU since driver load. Units: mJ.

Source code in zeus/device/gpu.py
262
263
264
265
@abc.abstractmethod
def getTotalEnergyConsumption(self) -> int:
    """Report the GPU's total energy consumption since driver load. Units: mJ."""

NVIDIAGPU

Bases: GPU

Control a Single NVIDIA GPU.

Uses the NVML library to control and query the GPU. There is a 1:1 mapping between the methods in this class and the NVML library functions. Zeus GPU exceptions are raised when NVML errors occur. For efficiency, this class caches values (e.g., the device handle) to avoid repeated calls to NVML.

Source code in zeus/device/gpu.py
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
class NVIDIAGPU(GPU):
    """Manage one NVIDIA GPU through NVML (via `pynvml`).

    Each public method mirrors the NVML function of the same name, one to one.
    When NVML reports an error, a Zeus GPU exception is raised instead.
    Values that do not change (e.g., the device handle) are cached on the
    instance so NVML is not queried repeatedly.
    """

    # NVML error code -> Zeus exception type.
    # Presumably consumed by `_handle_nvml_errors`, which is defined outside
    # this view -- confirm there.
    _exception_map = {
        pynvml.NVML_ERROR_UNINITIALIZED: ZeusGPUInitError,
        pynvml.NVML_ERROR_INVALID_ARGUMENT: ZeusGPUInvalidArgError,
        pynvml.NVML_ERROR_NOT_SUPPORTED: ZeusGPUNotSupportedError,
        pynvml.NVML_ERROR_NO_PERMISSION: ZeusGPUNoPermissionError,
        pynvml.NVML_ERROR_ALREADY_INITIALIZED: ZeusGPUAlreadyInitializedError,
        pynvml.NVML_ERROR_NOT_FOUND: ZeusGPUNotFoundError,
        pynvml.NVML_ERROR_INSUFFICIENT_SIZE: ZeusGPUInsufficientSizeError,
        pynvml.NVML_ERROR_INSUFFICIENT_POWER: ZeusGPUInsufficientPowerError,
        pynvml.NVML_ERROR_DRIVER_NOT_LOADED: ZeusGPUDriverNotLoadedError,
        pynvml.NVML_ERROR_TIMEOUT: ZeusGPUTimeoutError,
        pynvml.NVML_ERROR_IRQ_ISSUE: ZeusGPUIRQError,
        pynvml.NVML_ERROR_LIBRARY_NOT_FOUND: ZeusGPULibraryNotFoundError,
        pynvml.NVML_ERROR_FUNCTION_NOT_FOUND: ZeusGPUFunctionNotFoundError,
        pynvml.NVML_ERROR_CORRUPTED_INFOROM: ZeusGPUCorruptedInfoROMError,
        pynvml.NVML_ERROR_GPU_IS_LOST: ZeusGPULostError,
        pynvml.NVML_ERROR_RESET_REQUIRED: ZeusGPUResetRequiredError,
        pynvml.NVML_ERROR_OPERATING_SYSTEM: ZeusGPUOperatingSystemError,
        pynvml.NVML_ERROR_LIB_RM_VERSION_MISMATCH: ZeusGPULibRMVersionMismatchError,
        pynvml.NVML_ERROR_MEMORY: ZeusGPUMemoryError,
        pynvml.NVML_ERROR_UNKNOWN: ZeusGPUUnknownError,
    }

    def __init__(self, gpu_index: int) -> None:
        """Bind this object to the GPU at `gpu_index`.

        The NVML device handle is obtained immediately through
        `pynvml.nvmlDeviceGetHandleByIndex` and kept on the instance.

        Args:
            gpu_index: Index of the GPU to control.
        """
        super().__init__(gpu_index)
        self._get_handle()
        # Filled in on the first `supportsGetTotalEnergyConsumption` call;
        # None means "not queried yet".
        self._supportsGetTotalEnergyConsumption = None

    @_handle_nvml_errors
    def _get_handle(self):
        # Look the handle up once and keep it as `self.handle` for all later calls.
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_index)

    @_handle_nvml_errors
    def getPowerManagementLimitConstraints(self) -> tuple[int, int]:
        """Query the (minimum, maximum) power management limits of this GPU. Units: mW."""
        lower, upper = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.handle)
        return lower, upper

    @_handle_nvml_errors
    def setPersistenceMode(self, enable: bool) -> None:
        """Switch persistence mode on when `enable` is truthy, off otherwise."""
        mode = (
            pynvml.NVML_FEATURE_ENABLED if enable else pynvml.NVML_FEATURE_DISABLED
        )
        pynvml.nvmlDeviceSetPersistenceMode(self.handle, mode)

    @_handle_nvml_errors
    def setPowerManagementLimit(self, value: int) -> None:
        """Apply `value` as this GPU's power management limit. Unit: mW."""
        pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, value)

    @_handle_nvml_errors
    def resetPowerManagementLimit(self) -> None:
        """Put this GPU's power management limit back to the default NVML reports."""
        default_limit = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle)
        pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, default_limit)

    @_handle_nvml_errors
    def setMemoryLockedClocks(self, minMemClockMHz: int, maxMemClockMHz: int) -> None:
        """Pin this GPU's memory clock inside `[minMemClockMHz, maxMemClockMHz]`. Units: MHz."""
        pynvml.nvmlDeviceSetMemoryLockedClocks(self.handle, minMemClockMHz, maxMemClockMHz)

    @_handle_nvml_errors
    def getSupportedMemoryClocks(self) -> list[int]:
        """Query the memory clock frequencies this GPU supports. Units: MHz."""
        return pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle)

    @_handle_nvml_errors
    def getSupportedGraphicsClocks(self, freq: int) -> list[int]:
        """Query the graphics clock frequencies this GPU supports at frequency `freq`. Units: MHz."""
        return pynvml.nvmlDeviceGetSupportedGraphicsClocks(self.handle, freq)

    @_handle_nvml_errors
    def getName(self) -> str:
        """Query this GPU's name."""
        return pynvml.nvmlDeviceGetName(self.handle)

    @_handle_nvml_errors
    def setGpuLockedClocks(self, minGpuClockMHz: int, maxGpuClockMHz: int) -> None:
        """Pin this GPU's clock inside `[minGpuClockMHz, maxGpuClockMHz]`. Units: MHz."""
        pynvml.nvmlDeviceSetGpuLockedClocks(self.handle, minGpuClockMHz, maxGpuClockMHz)

    @_handle_nvml_errors
    def resetMemoryLockedClocks(self) -> None:
        """Undo memory clock locking on this GPU, returning to the default values."""
        pynvml.nvmlDeviceResetMemoryLockedClocks(self.handle)

    @_handle_nvml_errors
    def resetGpuLockedClocks(self) -> None:
        """Undo GPU clock locking on this GPU, returning to the default values."""
        pynvml.nvmlDeviceResetGpuLockedClocks(self.handle)

    @_handle_nvml_errors
    def getPowerUsage(self) -> int:
        """Query this GPU's power usage. Units: mW."""
        return pynvml.nvmlDeviceGetPowerUsage(self.handle)

    @_handle_nvml_errors
    def supportsGetTotalEnergyConsumption(self) -> bool:
        """Tell whether this GPU can report its total energy consumption.

        The answer is computed on the first call and cached on the instance.
        """
        supported = self._supportsGetTotalEnergyConsumption
        if supported is None:
            # NVIDIA GPUs Volta or newer support this method
            architecture = pynvml.nvmlDeviceGetArchitecture(self.handle)
            supported = architecture >= pynvml.NVML_DEVICE_ARCH_VOLTA
            self._supportsGetTotalEnergyConsumption = supported
        return supported

    @_handle_nvml_errors
    def getTotalEnergyConsumption(self) -> int:
        """Query this GPU's total energy consumption. Units: mJ."""
        return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)

__init__

1
__init__(gpu_index)
Source code in zeus/device/gpu.py
291
292
293
294
295
def __init__(self, gpu_index: int) -> None:
    """Initializes the NVIDIAGPU object with a specified GPU index. Acquires a handle to the GPU using `pynvml.nvmlDeviceGetHandleByIndex`.

    Args:
        gpu_index: Index of the GPU to control; forwarded to the base class initializer.
    """
    super().__init__(gpu_index)
    # Acquire the device handle up front so later calls can reuse it.
    self._get_handle()
    # Filled in on the first `supportsGetTotalEnergyConsumption` call; None means "not queried yet".
    self._supportsGetTotalEnergyConsumption = None

getPowerManagementLimitConstraints

1
getPowerManagementLimitConstraints()

Returns the minimum and maximum power management limits for the specified GPU. Units: mW.

Source code in zeus/device/gpu.py
324
325
326
327
328
@_handle_nvml_errors
def getPowerManagementLimitConstraints(self) -> tuple[int, int]:
    """Query the (minimum, maximum) power management limits of this GPU. Units: mW."""
    lower, upper = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.handle)
    return lower, upper

setPersistenceMode

1
setPersistenceMode(enable)

If enable = True, enables persistence mode for the specified GPU. If enable = False, disables persistence mode.

Source code in zeus/device/gpu.py
330
331
332
333
334
335
336
337
338
339
340
@_handle_nvml_errors
def setPersistenceMode(self, enable: bool) -> None:
    """Switch persistence mode on when `enable` is truthy, off otherwise."""
    mode = pynvml.NVML_FEATURE_ENABLED if enable else pynvml.NVML_FEATURE_DISABLED
    pynvml.nvmlDeviceSetPersistenceMode(self.handle, mode)

setPowerManagementLimit

1
setPowerManagementLimit(value)

Sets the power management limit for the specified GPU to the given value. Unit: mW.

Source code in zeus/device/gpu.py
342
343
344
345
@_handle_nvml_errors
def setPowerManagementLimit(self, value: int) -> None:
    """Sets the power management limit for the specified GPU to the given value. Unit: mW.

    Args:
        value: New power management limit; passed to
            `pynvml.nvmlDeviceSetPowerManagementLimit` along with `self.handle`.
    """
    pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, value)

resetPowerManagementLimit

1
resetPowerManagementLimit()

Resets the power management limit for the specified GPU to the default value.

Source code in zeus/device/gpu.py
347
348
349
350
351
352
353
@_handle_nvml_errors
def resetPowerManagementLimit(self) -> None:
    """Put this GPU's power management limit back to the default NVML reports."""
    default_limit = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle)
    pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, default_limit)

setMemoryLockedClocks

1
setMemoryLockedClocks(minMemClockMHz, maxMemClockMHz)

Locks the memory clock of the specified GPU to a range defined by the minimum and maximum memory clock frequencies. Units: MHz.

Source code in zeus/device/gpu.py
355
356
357
358
359
360
@_handle_nvml_errors
def setMemoryLockedClocks(self, minMemClockMHz: int, maxMemClockMHz: int) -> None:
    """Locks the memory clock of the specified GPU to a range defined by the minimum and maximum memory clock frequencies. Units: MHz.

    Args:
        minMemClockMHz: Lower bound of the locked memory clock range.
        maxMemClockMHz: Upper bound of the locked memory clock range.
    """
    pynvml.nvmlDeviceSetMemoryLockedClocks(
        self.handle, minMemClockMHz, maxMemClockMHz
    )

getSupportedMemoryClocks

1
getSupportedMemoryClocks()

Returns a list of supported memory clock frequencies for the specified GPU. Units: MHz.

Source code in zeus/device/gpu.py
362
363
364
365
@_handle_nvml_errors
def getSupportedMemoryClocks(self) -> list[int]:
    """Returns a list of supported memory clock frequencies for the specified GPU. Units: MHz.

    Returns:
        Whatever `pynvml.nvmlDeviceGetSupportedMemoryClocks` returns for `self.handle`.
    """
    return pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle)

getSupportedGraphicsClocks

1
getSupportedGraphicsClocks(freq)

Returns a list of supported graphics clock frequencies for the specified GPU at a given memory clock frequency. Units: MHz.

Source code in zeus/device/gpu.py
367
368
369
370
@_handle_nvml_errors
def getSupportedGraphicsClocks(self, freq: int) -> list[int]:
    """Returns a list of supported graphics clock frequencies for the specified GPU at a given frequency. Units: MHz.

    Args:
        freq: Frequency passed to `pynvml.nvmlDeviceGetSupportedGraphicsClocks`.
            The abstract `GPU` interface describes it as the memory frequency.

    Returns:
        Whatever `pynvml.nvmlDeviceGetSupportedGraphicsClocks` returns for `self.handle` and `freq`.
    """
    return pynvml.nvmlDeviceGetSupportedGraphicsClocks(self.handle, freq)

getName

1
getName()

Returns the name of the specified GPU.

Source code in zeus/device/gpu.py
372
373
374
375
@_handle_nvml_errors
def getName(self) -> str:
    """Returns the name of the specified GPU.

    Returns:
        The value of `pynvml.nvmlDeviceGetName(self.handle)`, returned as-is.
    """
    # NOTE(review): the result is not decoded here; if the installed pynvml
    # returns bytes rather than str, the annotation would be wrong -- confirm.
    return pynvml.nvmlDeviceGetName(self.handle)

setGpuLockedClocks

1
setGpuLockedClocks(minGpuClockMHz, maxGpuClockMHz)

Locks the GPU clock of the specified GPU to a range defined by the minimum and maximum GPU clock frequencies. Units: MHz.

Source code in zeus/device/gpu.py
377
378
379
380
@_handle_nvml_errors
def setGpuLockedClocks(self, minGpuClockMHz: int, maxGpuClockMHz: int) -> None:
    """Lock this GPU's core clock to the range bounded by the two given frequencies. Units: MHz.

    Args:
        minGpuClockMHz: Lower bound of the locked GPU clock range.
        maxGpuClockMHz: Upper bound of the locked GPU clock range.
    """
    device = self.handle
    pynvml.nvmlDeviceSetGpuLockedClocks(device, minGpuClockMHz, maxGpuClockMHz)

resetMemoryLockedClocks

1
resetMemoryLockedClocks()

Resets the memory locked clocks of the specified GPU to their default values.

Source code in zeus/device/gpu.py
382
383
384
385
@_handle_nvml_errors
def resetMemoryLockedClocks(self) -> None:
    """Undo any memory clock lock on this GPU, restoring the default values."""
    device = self.handle
    pynvml.nvmlDeviceResetMemoryLockedClocks(device)

resetGpuLockedClocks

1
resetGpuLockedClocks()

Resets the GPU locked clocks of the specified GPU to their default values.

Source code in zeus/device/gpu.py
387
388
389
390
@_handle_nvml_errors
def resetGpuLockedClocks(self) -> None:
    """Undo any GPU clock lock on this GPU, restoring the default values."""
    device = self.handle
    pynvml.nvmlDeviceResetGpuLockedClocks(device)

getPowerUsage

1
getPowerUsage()

Returns the power usage of the specified GPU. Units: mW.

Source code in zeus/device/gpu.py
392
393
394
395
@_handle_nvml_errors
def getPowerUsage(self) -> int:
    """Query NVML for the power draw of this GPU. Units: mW."""
    power_mw = pynvml.nvmlDeviceGetPowerUsage(self.handle)
    return power_mw

supportsGetTotalEnergyConsumption

1
supportsGetTotalEnergyConsumption()

Returns True if the specified GPU supports retrieving the total energy consumption.

Source code in zeus/device/gpu.py
397
398
399
400
401
402
403
404
405
406
407
@_handle_nvml_errors
def supportsGetTotalEnergyConsumption(self) -> bool:
    """Tell whether this GPU can report its total energy consumption.

    The answer is computed once from the device architecture and then stored on
    the instance, so later calls do not query NVML again.
    """
    cached = self._supportsGetTotalEnergyConsumption
    if cached is not None:
        return cached

    # NVIDIA GPUs of the Volta architecture or newer support this method.
    architecture = pynvml.nvmlDeviceGetArchitecture(self.handle)
    self._supportsGetTotalEnergyConsumption = (
        architecture >= pynvml.NVML_DEVICE_ARCH_VOLTA
    )
    return self._supportsGetTotalEnergyConsumption

getTotalEnergyConsumption

1
getTotalEnergyConsumption()

Returns the total energy consumption of the specified GPU. Units: mJ.

Source code in zeus/device/gpu.py
409
410
411
412
@_handle_nvml_errors
def getTotalEnergyConsumption(self) -> int:
    """Query NVML for the total energy consumed by this GPU. Units: mJ."""
    energy_mj = pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
    return energy_mj

UnprivilegedNVIDIAGPU

Bases: NVIDIAGPU

Control a Single NVIDIA GPU with no SYS_ADMIN privileges.

Uses the NVML library to control and query the GPU. There is a 1:1 mapping between the methods in this class and the NVML library functions. Zeus GPU exceptions are raised when NVML errors occur. For computational efficiency, this class uses caching (e.g., it saves the device handle) to avoid repeated calls to NVML.

Source code in zeus/device/gpu.py
415
416
417
418
419
420
421
422
423
class UnprivilegedNVIDIAGPU(NVIDIAGPU):
    """Control a Single NVIDIA GPU with no SYS_ADMIN privileges.

    This subclass currently adds nothing of its own; every method is inherited from `NVIDIAGPU`.
    As in the parent, NVML is used to control and query the GPU, methods map 1:1 onto NVML library functions,
    Zeus GPU Exceptions are raised when NVML errors occur, and the device handle is cached to avoid repeated NVML calls.
    """

AMDGPU

Bases: GPU

Control a Single AMD GPU.

Uses the amdsmi library to control and query the GPU. There is a 1:1 mapping between the methods in this class and the amdsmi library functions. Zeus GPU exceptions are raised when amdsmi errors occur. For computational efficiency, this class uses caching (e.g., it saves the device handle) to avoid repeated calls to amdsmi.

Source code in zeus/device/gpu.py
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
class AMDGPU(GPU):
    """Control a Single AMD GPU.

    Uses amdsmi Library to control and query GPU. There is a 1:1 mapping between the methods in this class and the amdsmi library functions.
    Zeus GPU Exceptions are raised when amdsmi errors occur.
    To ensure computational efficiency, this class utilizes caching (e.g. saves the handle) to avoid repeated calls to amdsmi.

    NOTE(review): the amdsmi query functions used here return plain dicts, so
    results are read by key. Units returned/accepted by amdsmi may differ from
    the units advertised in these docstrings -- confirm before relying on them.
    """

    def __init__(self, gpu_index: int) -> None:
        """Initializes the AMDGPU object with a specified GPU index. Acquires a handle to the GPU using `amdsmi.amdsmi_get_processor_handles()`."""
        super().__init__(gpu_index)
        self._get_handle()

    # Empty for now; presumably consulted by `_handle_amdsmi_errors` -- confirm.
    _exception_map = {}

    @_handle_amdsmi_errors
    def _get_handle(self) -> None:
        """Look up and store the amdsmi processor handle for `self.gpu_index`."""
        self.handle = amdsmi.amdsmi_get_processor_handles()[self.gpu_index]

    @_handle_amdsmi_errors
    def getPowerManagementLimitConstraints(self) -> tuple[int, int]:
        """Returns the minimum and maximum power management limits for the specified GPU. Units: mW."""
        info = amdsmi.amdsmi_get_power_cap_info(self.handle)
        # The result is a dict; attribute access (`info.min_power_cap`) would fail.
        # NOTE(review): confirm the unit amdsmi reports and convert to mW if needed.
        return (info["min_power_cap"], info["max_power_cap"])

    @_handle_amdsmi_errors
    def setPersistenceMode(self, enable: bool) -> None:
        """Not implemented for AMD GPUs; always raises.

        Raises:
            ZeusGPUNotSupportedError: Unconditionally.
        """
        # TODO: find out the correct power profile and call
        # `amdsmi.amdsmi_set_gpu_power_profile(self.handle, 0, profile)`.
        raise ZeusGPUNotSupportedError(
            "Persistence mode is not supported for AMD GPUs yet"
        )

    @_handle_amdsmi_errors
    def setPowerManagementLimit(self, value: int) -> None:
        """Sets the power management limit for the specified GPU to the given value. Unit: mW."""
        # The sensor index (0) is passed positionally so the call does not
        # depend on the keyword name used by the installed amdsmi version.
        # NOTE(review): confirm the unit amdsmi expects for the cap.
        amdsmi.amdsmi_set_power_cap(self.handle, 0, value)

    @_handle_amdsmi_errors
    def resetPowerManagementLimit(self) -> None:
        """Resets the power management limit for the specified GPU to the default value."""
        info = amdsmi.amdsmi_get_power_cap_info(self.handle)
        # Dict lookup, and positional sensor index, for the reasons given above.
        amdsmi.amdsmi_set_power_cap(self.handle, 0, info["default_power_cap"])

    @_handle_amdsmi_errors
    def setMemoryLockedClocks(self, minMemClockMHz: int, maxMemClockMHz: int) -> None:
        """Locks the memory clock of the specified GPU to a range defined by the minimum and maximum memory clock frequencies. Units: MHz."""
        amdsmi.amdsmi_set_gpu_clk_range(
            self.handle,
            minMemClockMHz,
            maxMemClockMHz,
            clk_type=amdsmi.AmdSmiClkType.MEM,
        )

    @_handle_amdsmi_errors
    def getSupportedMemoryClocks(self) -> list[int]:
        """Returns a list of supported memory clock frequencies for the specified GPU. Units: MHz."""
        info = amdsmi.amdsmi_get_clk_freq(
            self.handle, clk_type=amdsmi.AmdSmiClkType.MEM
        )  # TODO: Figure out correct clk_type
        # Only the first `num_supported` entries of `frequency` are valid.
        # NOTE(review): confirm the unit of the returned frequencies.
        return info["frequency"][: info["num_supported"]]

    @_handle_amdsmi_errors
    def getSupportedGraphicsClocks(self, freq: int) -> list[int]:
        """Not implemented for AMD GPUs; always raises.

        Raises:
            ZeusGPUNotSupportedError: Unconditionally.
        """
        raise ZeusGPUNotSupportedError(
            "Getting supported graphics clocks is not supported for AMD GPUs yet"
        )

    @_handle_amdsmi_errors
    def getName(self) -> str:
        """Returns the name of the specified GPU."""
        # The ASIC info is a dict; unpacking it as a tuple would yield its keys.
        info = amdsmi.amdsmi_get_gpu_asic_info(self.handle)
        return info["market_name"]  # TODO: Does this return correct string

    @_handle_amdsmi_errors
    def setGpuLockedClocks(self, minGpuClockMHz: int, maxGpuClockMHz: int) -> None:
        """Locks the GPU clock of the specified GPU to a range defined by the minimum and maximum GPU clock frequencies.  Units: MHz."""
        amdsmi.amdsmi_set_gpu_clk_range(
            self.handle,
            minGpuClockMHz,
            maxGpuClockMHz,
            # Same enum family as the memory-clock methods above.
            clk_type=amdsmi.AmdSmiClkType.GFX,
        )

    @_handle_amdsmi_errors
    def resetMemoryLockedClocks(self) -> None:
        """Resets the memory locked clocks of the specified GPU to their default values."""
        # Targets MEM so that this undoes what `setMemoryLockedClocks` locked.
        amdsmi.amdsmi_reset_gpu_clk(
            self.handle, clk_type=amdsmi.AmdSmiClkType.MEM
        )  # TODO: check docs

    @_handle_amdsmi_errors
    def resetGpuLockedClocks(self) -> None:
        """Resets the GPU locked clocks of the specified GPU to their default values."""
        amdsmi.amdsmi_reset_gpu_clk(
            self.handle, clk_type=amdsmi.AmdSmiClkType.GFX
        )  # TODO: check docs

    @_handle_amdsmi_errors
    def getPowerUsage(self) -> int:
        """Not implemented for AMD GPUs; always raises.

        Raises:
            ZeusGPUNotSupportedError: Unconditionally.
        """
        raise ZeusGPUNotSupportedError(
            "Getting power usage is not supported for AMD GPUs yet"
        )

    @_handle_amdsmi_errors
    def supportsGetTotalEnergyConsumption(self) -> bool:
        """Not implemented for AMD GPUs; always raises.

        Raises:
            ZeusGPUNotSupportedError: Unconditionally.
        """
        raise ZeusGPUNotSupportedError(
            "Getting total energy consumption is not supported for AMD GPUs yet"
        )

    @_handle_amdsmi_errors
    def getTotalEnergyConsumption(self) -> int:
        """Not implemented for AMD GPUs; always raises.

        Raises:
            ZeusGPUNotSupportedError: Unconditionally.
        """
        raise ZeusGPUNotSupportedError(
            "Getting total energy consumption is not supported for AMD GPUs yet"
        )

__init__

1
__init__(gpu_index)
Source code in zeus/device/gpu.py
446
447
448
449
def __init__(self, gpu_index: int) -> None:
    """Set up an AMDGPU for the GPU at `gpu_index`.

    After the base-class initialization, the amdsmi handle is obtained through
    `self._get_handle()`, which uses `amdsmi.amdsmi_get_processor_handles()`.
    """
    # Base class first, so `self.gpu_index` is available to `_get_handle`.
    super().__init__(gpu_index)
    self._get_handle()

getPowerManagementLimitConstraints

1
getPowerManagementLimitConstraints()

Returns the minimum and maximum power management limits for the specified GPU. Units: mW.

Source code in zeus/device/gpu.py
457
458
459
460
461
@_handle_amdsmi_errors
def getPowerManagementLimitConstraints(self) -> tuple[int, int]:
    """Returns the minimum and maximum power management limits for the specified GPU. Units: mW."""
    info = amdsmi.amdsmi_get_power_cap_info(self.handle)
    # The result is a dict; attribute access (`info.min_power_cap`) would fail.
    # NOTE(review): confirm the unit amdsmi reports and convert to mW if needed.
    return (info["min_power_cap"], info["max_power_cap"])

setPersistenceMode

1
setPersistenceMode(enable)

Enables persistence mode for the specified GPU.

Source code in zeus/device/gpu.py
463
464
465
466
467
468
469
470
@_handle_amdsmi_errors
def setPersistenceMode(self, enable: bool) -> None:
    """Not implemented for AMD GPUs; always raises.

    Raises:
        ZeusGPUNotSupportedError: Unconditionally.
    """
    # TODO: find out the correct power profile and call
    # `amdsmi.amdsmi_set_gpu_power_profile(self.handle, 0, profile)`.
    # The statements that used to follow the `raise` were unreachable.
    raise ZeusGPUNotSupportedError(
        "Persistence mode is not supported for AMD GPUs yet"
    )

setPowerManagementLimit

1
setPowerManagementLimit(value)

Sets the power management limit for the specified GPU to the given value. Unit: mW.

Source code in zeus/device/gpu.py
472
473
474
475
@_handle_amdsmi_errors
def setPowerManagementLimit(self, value: int) -> None:
    """Sets the power management limit for the specified GPU to the given value. Unit: mW."""
    # The sensor index (0) is passed positionally so the call does not depend
    # on the keyword name used by the installed amdsmi version.
    # NOTE(review): confirm the unit amdsmi expects for the cap.
    amdsmi.amdsmi_set_power_cap(self.handle, 0, value)

resetPowerManagementLimit

1
resetPowerManagementLimit()

Resets the power management limit for the specified GPU to the default value.

Source code in zeus/device/gpu.py
477
478
479
480
481
482
483
@_handle_amdsmi_errors
def resetPowerManagementLimit(self) -> None:
    """Resets the power management limit for the specified GPU to the default value."""
    info = amdsmi.amdsmi_get_power_cap_info(self.handle)
    # The result is a dict, so the default cap is read by key; the sensor
    # index (0) is passed positionally to avoid depending on its keyword name.
    amdsmi.amdsmi_set_power_cap(self.handle, 0, info["default_power_cap"])

setMemoryLockedClocks

1
setMemoryLockedClocks(minMemClockMHz, maxMemClockMHz)

Locks the memory clock of the specified GPU to a range defined by the minimum and maximum memory clock frequencies. Units: MHz.

Source code in zeus/device/gpu.py
485
486
487
488
489
490
491
492
493
@_handle_amdsmi_errors
def setMemoryLockedClocks(self, minMemClockMHz: int, maxMemClockMHz: int) -> None:
    """Lock this GPU's memory clock to the range bounded by the two given frequencies. Units: MHz.

    Args:
        minMemClockMHz: Lower bound of the locked memory clock range.
        maxMemClockMHz: Upper bound of the locked memory clock range.
    """
    device = self.handle
    amdsmi.amdsmi_set_gpu_clk_range(
        device, minMemClockMHz, maxMemClockMHz, clk_type=amdsmi.AmdSmiClkType.MEM
    )

getSupportedMemoryClocks

1
getSupportedMemoryClocks()

Returns a list of supported memory clock frequencies for the specified GPU. Units: MHz.

Source code in zeus/device/gpu.py
495
496
497
498
499
500
501
502
@_handle_amdsmi_errors
def getSupportedMemoryClocks(self) -> list[int]:
    """Returns a list of supported memory clock frequencies for the specified GPU. Units: MHz."""
    info = amdsmi.amdsmi_get_clk_freq(
        self.handle, clk_type=amdsmi.AmdSmiClkType.MEM
    )  # TODO: Figure out correct clk_type
    # The result is a dict; only the first `num_supported` entries of
    # `frequency` are valid.
    # NOTE(review): confirm the unit of the returned frequencies.
    return info["frequency"][: info["num_supported"]]

getSupportedGraphicsClocks

1
getSupportedGraphicsClocks(freq)

Returns a list of supported graphics clock frequencies for the specified GPU at a given frequency. Units: MHz.

Source code in zeus/device/gpu.py
504
505
506
507
508
509
@_handle_amdsmi_errors
def getSupportedGraphicsClocks(self, freq: int) -> list[int]:
    """Not implemented for AMD GPUs; always raises.

    Raises:
        ZeusGPUNotSupportedError: Unconditionally; `freq` is not used.
    """
    message = "Getting supported graphics clocks is not supported for AMD GPUs yet"
    raise ZeusGPUNotSupportedError(message)

getName

1
getName()

Returns the name of the specified GPU.

Source code in zeus/device/gpu.py
511
512
513
514
515
516
517
518
519
520
521
522
523
@_handle_amdsmi_errors
def getName(self) -> str:
    """Returns the name of the specified GPU."""
    # The ASIC info is a dict; unpacking it as a tuple would yield its keys
    # (or fail if the number of fields is not exactly five).
    info = amdsmi.amdsmi_get_gpu_asic_info(self.handle)
    return info["market_name"]  # TODO: Does this return correct string

setGpuLockedClocks

1
setGpuLockedClocks(minGpuClockMHz, maxGpuClockMHz)

Locks the GPU clock of the specified GPU to a range defined by the minimum and maximum GPU clock frequencies. Units: MHz.

Source code in zeus/device/gpu.py
525
526
527
528
529
530
531
532
533
@_handle_amdsmi_errors
def setGpuLockedClocks(self, minGpuClockMHz: int, maxGpuClockMHz: int) -> None:
    """Locks the GPU clock of the specified GPU to a range defined by the minimum and maximum GPU clock frequencies.  Units: MHz."""
    amdsmi.amdsmi_set_gpu_clk_range(
        self.handle,
        minGpuClockMHz,
        maxGpuClockMHz,
        # Uses the `AmdSmiClkType` enum, as `setMemoryLockedClocks` does.
        clk_type=amdsmi.AmdSmiClkType.GFX,
    )

resetMemoryLockedClocks

1
resetMemoryLockedClocks()

Resets the memory locked clocks of the specified GPU to their default values.

Source code in zeus/device/gpu.py
535
536
537
538
539
540
@_handle_amdsmi_errors
def resetMemoryLockedClocks(self) -> None:
    """Resets the memory locked clocks of the specified GPU to their default values."""
    # Targets the MEM clock (the one `setMemoryLockedClocks` locks) through the
    # `AmdSmiClkType` enum; the SYS clock type refers to the graphics clock.
    amdsmi.amdsmi_reset_gpu_clk(
        self.handle, clk_type=amdsmi.AmdSmiClkType.MEM
    )  # TODO: check docs

resetGpuLockedClocks

1
resetGpuLockedClocks()

Resets the GPU locked clocks of the specified GPU to their default values.

Source code in zeus/device/gpu.py
542
543
544
545
546
547
@_handle_amdsmi_errors
def resetGpuLockedClocks(self) -> None:
    """Resets the GPU locked clocks of the specified GPU to their default values."""
    # Uses the `AmdSmiClkType` enum, as `setMemoryLockedClocks` does.
    amdsmi.amdsmi_reset_gpu_clk(
        self.handle, clk_type=amdsmi.AmdSmiClkType.GFX
    )  # TODO: check docs

getPowerUsage

1
getPowerUsage()

Returns the power usage of the specified GPU. Units: mW.

Source code in zeus/device/gpu.py
549
550
551
552
553
554
@_handle_amdsmi_errors
def getPowerUsage(self) -> int:
    """Not implemented for AMD GPUs; always raises.

    Raises:
        ZeusGPUNotSupportedError: Unconditionally.
    """
    message = "Getting power usage is not supported for AMD GPUs yet"
    raise ZeusGPUNotSupportedError(message)

supportsGetTotalEnergyConsumption

1
supportsGetTotalEnergyConsumption()

Returns True if the specified GPU supports retrieving the total energy consumption.

Source code in zeus/device/gpu.py
556
557
558
559
560
561
@_handle_amdsmi_errors
def supportsGetTotalEnergyConsumption(self) -> bool:
    """Not implemented for AMD GPUs; always raises instead of returning a bool.

    Raises:
        ZeusGPUNotSupportedError: Unconditionally.
    """
    message = "Getting total energy consumption is not supported for AMD GPUs yet"
    raise ZeusGPUNotSupportedError(message)

getTotalEnergyConsumption

1
getTotalEnergyConsumption()

Returns the total energy consumption of the specified GPU. Units: mJ.

Source code in zeus/device/gpu.py
563
564
565
566
567
568
@_handle_amdsmi_errors
def getTotalEnergyConsumption(self) -> int:
    """Not implemented for AMD GPUs; always raises.

    Raises:
        ZeusGPUNotSupportedError: Unconditionally.
    """
    message = "Getting total energy consumption is not supported for AMD GPUs yet"
    raise ZeusGPUNotSupportedError(message)

UnprivilegedAMDGPU

Bases: AMDGPU

Control a Single AMD GPU with no SYS_ADMIN privileges.

Uses the amdsmi library to control and query the GPU. There is a 1:1 mapping between the methods in this class and the amdsmi library functions. Zeus GPU exceptions are raised when amdsmi errors occur. For computational efficiency, this class uses caching (e.g., it saves the device handle) to avoid repeated calls to amdsmi.

Source code in zeus/device/gpu.py
571
572
573
574
575
576
577
578
579
class UnprivilegedAMDGPU(AMDGPU):
    """Control a Single AMD GPU with no SYS_ADMIN privileges.

    This subclass currently adds nothing of its own; every method is inherited from `AMDGPU`.
    As in the parent, amdsmi is used to control and query the GPU, methods map 1:1 onto amdsmi library functions,
    Zeus GPU Exceptions are raised when amdsmi errors occur, and the device handle is cached to avoid repeated amdsmi calls.
    """

GPUs

Bases: ABC

An abstract base class for GPU manager object.

This class defines the essential interface and common functionality for GPU management, instantiating multiple GPU objects for each GPU being tracked. Forwards the call for a specific method to the corresponding GPU object.

Source code in zeus/device/gpu.py
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
class GPUs(abc.ABC):
    """Abstract manager for a collection of `GPU` objects.

    Subclasses provide construction, teardown and the `gpus` sequence. Every
    per-GPU method here takes an `index` into that sequence and forwards the
    call to the `GPU` object found at that position.
    """

    @abc.abstractmethod
    def __init__(self, ensure_homogeneous: bool = False) -> None:
        """Initializes the GPU management library to communicate with the GPU driver and sets up tracking for specified GPUs."""

    @abc.abstractmethod
    def __del__(self) -> None:
        """Shuts down the GPU monitoring library to release resources and clean up."""

    @property
    @abc.abstractmethod
    def gpus(self) -> Sequence[GPU]:
        """Returns a list of GPU objects being tracked."""

    def _ensure_homogeneous(self) -> None:
        """Raise `ZeusBaseGPUError` unless all tracked GPUs report the same name."""
        gpu_names = []
        for gpu in self.gpus:
            gpu_names.append(gpu.getName())
        # Zero distinct names (no GPUs found) and exactly one are both fine.
        if len(set(gpu_names)) <= 1:
            return
        raise ZeusBaseGPUError(f"Heterogeneous GPUs found: {gpu_names}")

    def getPowerManagementLimitConstraints(self, index: int) -> tuple[int, int]:
        """Forward to the GPU at `index`: minimum and maximum power management limits. Units: mW."""
        gpu = self.gpus[index]
        return gpu.getPowerManagementLimitConstraints()

    def setPersistenceMode(self, index: int, enable: bool) -> None:
        """Forward to the GPU at `index`: set persistence mode, passing `enable` through."""
        gpu = self.gpus[index]
        gpu.setPersistenceMode(enable)

    def setPowerManagementLimit(self, index: int, value: int) -> None:
        """Forward to the GPU at `index`: set the power management limit to `value`. Unit: mW."""
        gpu = self.gpus[index]
        gpu.setPowerManagementLimit(value)

    def resetPowerManagementLimit(self, index: int) -> None:
        """Forward to the GPU at `index`: reset the power management limit to its default."""
        gpu = self.gpus[index]
        gpu.resetPowerManagementLimit()

    def setMemoryLockedClocks(
        self, index: int, minMemClockMHz: int, maxMemClockMHz: int
    ) -> None:
        """Forward to the GPU at `index`: lock the memory clock to the given range. Units: MHz."""
        gpu = self.gpus[index]
        gpu.setMemoryLockedClocks(minMemClockMHz, maxMemClockMHz)

    def getSupportedMemoryClocks(self, index: int) -> list[int]:
        """Forward to the GPU at `index`: supported memory clock frequencies. Units: MHz."""
        gpu = self.gpus[index]
        return gpu.getSupportedMemoryClocks()

    def getSupportedGraphicsClocks(self, index: int, freq: int) -> list[int]:
        """Forward to the GPU at `index`: supported graphics clock frequencies for `freq`. Units: MHz."""
        gpu = self.gpus[index]
        return gpu.getSupportedGraphicsClocks(freq)

    def getName(self, index: int) -> str:
        """Forward to the GPU at `index`: the GPU's name."""
        gpu = self.gpus[index]
        return gpu.getName()

    def setGpuLockedClocks(
        self, index: int, minGpuClockMHz: int, maxGpuClockMHz: int
    ) -> None:
        """Forward to the GPU at `index`: lock the GPU clock to the given range. Units: MHz."""
        gpu = self.gpus[index]
        gpu.setGpuLockedClocks(minGpuClockMHz, maxGpuClockMHz)

    def resetMemoryLockedClocks(self, index: int) -> None:
        """Forward to the GPU at `index`: reset the memory locked clocks to their defaults."""
        gpu = self.gpus[index]
        gpu.resetMemoryLockedClocks()

    def resetGpuLockedClocks(self, index: int) -> None:
        """Forward to the GPU at `index`: reset the GPU locked clocks to their defaults."""
        gpu = self.gpus[index]
        gpu.resetGpuLockedClocks()

    def getPowerUsage(self, index: int) -> int:
        """Forward to the GPU at `index`: current power usage. Units: mW."""
        gpu = self.gpus[index]
        return gpu.getPowerUsage()

    def supportsGetTotalEnergyConsumption(self, index: int) -> bool:
        """Forward to the GPU at `index`: whether total energy consumption can be retrieved."""
        gpu = self.gpus[index]
        return gpu.supportsGetTotalEnergyConsumption()

    def getTotalEnergyConsumption(self, index: int) -> int:
        """Forward to the GPU at `index`: total energy consumption. Units: mJ."""
        gpu = self.gpus[index]
        return gpu.getTotalEnergyConsumption()

    def __len__(self) -> int:
        """Returns the number of GPUs being tracked."""
        tracked = self.gpus
        return len(tracked)

gpus abstractmethod property

1
gpus

Returns a list of GPU objects being tracked.

__init__ abstractmethod

1
__init__(ensure_homogeneous=False)
Source code in zeus/device/gpu.py
589
590
591
592
@abc.abstractmethod
def __init__(self, ensure_homogeneous: bool = False) -> None:
    """Abstract constructor: subclasses initialize the GPU management library and set up tracking for the GPUs.

    Args:
        ensure_homogeneous: Accepted for subclasses to act on; this base
            implementation has an empty body and does not use it.
    """

__del__ abstractmethod

1
__del__()

Shuts down the GPU monitoring library to release resources and clean up.

Source code in zeus/device/gpu.py
594
595
596
597
@abc.abstractmethod
def __del__(self) -> None:
    """Abstract finalizer: subclasses shut down the GPU monitoring library to release resources.

    This base implementation has an empty body.
    """

_ensure_homogeneous

1
_ensure_homogeneous()

Ensures that all tracked GPUs are homogeneous in terms of name.

Source code in zeus/device/gpu.py
605
606
607
608
609
610
def _ensure_homogeneous(self) -> None:
    """Raise `ZeusBaseGPUError` unless all tracked GPUs report the same name."""
    gpu_names = []
    for gpu in self.gpus:
        gpu_names.append(gpu.getName())
    # Zero distinct names (no GPUs found) and exactly one are both fine.
    if len(set(gpu_names)) <= 1:
        return
    raise ZeusBaseGPUError(f"Heterogeneous GPUs found: {gpu_names}")

getPowerManagementLimitConstraints

1
getPowerManagementLimitConstraints(index)

Returns the minimum and maximum power management limits for the specified GPU. Units: mW.

Source code in zeus/device/gpu.py
612
613
614
def getPowerManagementLimitConstraints(self, index: int) -> tuple[int, int]:
    """Forward to the GPU at `index`: minimum and maximum power management limits. Units: mW."""
    gpu = self.gpus[index]
    return gpu.getPowerManagementLimitConstraints()

setPersistenceMode

1
setPersistenceMode(index, enable)

Enables persistence mode for the specified GPU.

Source code in zeus/device/gpu.py
616
617
618
def setPersistenceMode(self, index: int, enable: bool) -> None:
    """Forward to the GPU at `index`: set persistence mode, passing `enable` through."""
    gpu = self.gpus[index]
    gpu.setPersistenceMode(enable)

setPowerManagementLimit

1
setPowerManagementLimit(index, value)

Sets the power management limit for the specified GPU to the given value. Unit: mW.

Source code in zeus/device/gpu.py
620
621
622
def setPowerManagementLimit(self, index: int, value: int) -> None:
    """Forward to the GPU at `index`: set the power management limit to `value`. Unit: mW."""
    gpu = self.gpus[index]
    gpu.setPowerManagementLimit(value)

resetPowerManagementLimit

1
resetPowerManagementLimit(index)

Resets the power management limit for the specified GPU to the default value.

Source code in zeus/device/gpu.py
624
625
626
def resetPowerManagementLimit(self, index: int) -> None:
    """Forward to the GPU at `index`: reset the power management limit to its default."""
    gpu = self.gpus[index]
    gpu.resetPowerManagementLimit()

setMemoryLockedClocks

1
2
3
setMemoryLockedClocks(
    index, minMemClockMHz, maxMemClockMHz
)

Locks the memory clock of the specified GPU to a range defined by the minimum and maximum memory clock frequencies. Units: MHz.

Source code in zeus/device/gpu.py
628
629
630
631
632
def setMemoryLockedClocks(
    self, index: int, minMemClockMHz: int, maxMemClockMHz: int
) -> None:
    """Forward to the GPU at `index`: lock the memory clock to the given range. Units: MHz."""
    gpu = self.gpus[index]
    gpu.setMemoryLockedClocks(minMemClockMHz, maxMemClockMHz)

getSupportedMemoryClocks

1
getSupportedMemoryClocks(index)

Returns a list of supported memory clock frequencies for the specified GPU. Units: MHz.

Source code in zeus/device/gpu.py
634
635
636
def getSupportedMemoryClocks(self, index: int) -> list[int]:
    """Forward to the GPU at `index`: supported memory clock frequencies. Units: MHz."""
    gpu = self.gpus[index]
    return gpu.getSupportedMemoryClocks()

getSupportedGraphicsClocks

1
getSupportedGraphicsClocks(index, freq)

Returns a list of supported graphics clock frequencies for the specified GPU at a given frequency. Units: MHz.

Source code in zeus/device/gpu.py
638
639
640
def getSupportedGraphicsClocks(self, index: int, freq: int) -> list[int]:
    """Forward to the GPU at `index`: supported graphics clock frequencies for `freq`. Units: MHz."""
    gpu = self.gpus[index]
    return gpu.getSupportedGraphicsClocks(freq)

getName

1
getName(index)

Returns the name of the specified GPU.

Source code in zeus/device/gpu.py
642
643
644
def getName(self, index: int) -> str:
    """Forward to the GPU at `index`: the GPU's name."""
    gpu = self.gpus[index]
    return gpu.getName()

setGpuLockedClocks

1
setGpuLockedClocks(index, minGpuClockMHz, maxGpuClockMHz)

Locks the GPU clock of the specified GPU to a range defined by the minimum and maximum GPU clock frequencies. Units: MHz.

Source code in zeus/device/gpu.py
646
647
648
649
650
def setGpuLockedClocks(
    self, index: int, minGpuClockMHz: int, maxGpuClockMHz: int
) -> None:
    """Forward to the GPU at `index`: lock the GPU clock to the given range. Units: MHz."""
    gpu = self.gpus[index]
    gpu.setGpuLockedClocks(minGpuClockMHz, maxGpuClockMHz)

resetMemoryLockedClocks

1
resetMemoryLockedClocks(index)

Resets the memory locked clocks of the specified GPU to their default values.

Source code in zeus/device/gpu.py
652
653
654
def resetMemoryLockedClocks(self, index: int) -> None:
    """Forward to the GPU at `index`: reset the memory locked clocks to their defaults."""
    gpu = self.gpus[index]
    gpu.resetMemoryLockedClocks()

resetGpuLockedClocks

1
resetGpuLockedClocks(index)

Resets the GPU locked clocks of the specified GPU to their default values.

Source code in zeus/device/gpu.py
656
657
658
def resetGpuLockedClocks(self, index: int) -> None:
    """Forward to the GPU at `index`: reset the GPU locked clocks to their defaults."""
    gpu = self.gpus[index]
    gpu.resetGpuLockedClocks()

getPowerUsage

1
getPowerUsage(index)

Returns the power usage of the specified GPU. Units: mW.

Source code in zeus/device/gpu.py
660
661
662
def getPowerUsage(self, index: int) -> int:
    """Forward to the GPU at `index`: current power usage. Units: mW."""
    gpu = self.gpus[index]
    return gpu.getPowerUsage()

supportsGetTotalEnergyConsumption

1
supportsGetTotalEnergyConsumption(index)

Returns True if the specified GPU supports retrieving the total energy consumption.

Source code in zeus/device/gpu.py
664
665
666
def supportsGetTotalEnergyConsumption(self, index: int) -> bool:
    """Forward to the GPU at `index`: whether total energy consumption can be retrieved."""
    gpu = self.gpus[index]
    return gpu.supportsGetTotalEnergyConsumption()

getTotalEnergyConsumption

1
getTotalEnergyConsumption(index)

Returns the total energy consumption of the specified GPU. Units: mJ.

Source code in zeus/device/gpu.py
668
669
670
def getTotalEnergyConsumption(self, index: int) -> int:
    """Forward to the GPU at `index`: total energy consumption. Units: mJ."""
    gpu = self.gpus[index]
    return gpu.getTotalEnergyConsumption()

__len__

1
__len__()

Returns the number of GPUs being tracked.

Source code in zeus/device/gpu.py
672
673
674
def __len__(self) -> int:
    """Count the GPU objects exposed through `self.gpus`."""
    tracked = self.gpus
    return len(tracked)

NVIDIAGPUs

Bases: GPUs

NVIDIA GPU Manager object, containing individual NVIDIAGPU objects, abstracting pyNVML calls and handling related exceptions.

This class provides a high-level interface to interact with NVIDIA GPUs. CUDA_VISIBLE_DEVICES environment variable is respected if set. For example, if there are 4 GPUs and CUDA_VISIBLE_DEVICES=0,2, only GPUs 0 and 2 are instantiated. In this case, to access GPU of CUDA index 0, use the index 0, and for CUDA index 2, use the index 1.

This class provides a 1:1 mapping between the methods and NVML library functions. For example, if you want to do the following:

1
2
handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(handle)

You can now do:

1
2
gpus = get_gpus() # returns a NVIDIAGPUs object
constraints =  gpus.getPowerManagementLimitConstraints(gpu_index)

Note: This class instantiates (grabs the handle, by calling pynvml.nvmlDeviceGetHandleByIndex) all GPUs that are visible to the system, as determined by the CUDA_VISIBLE_DEVICES environment variable if set.

Source code in zeus/device/gpu.py
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
class NVIDIAGPUs(GPUs):
    """NVIDIA GPU Manager object, containing individual NVIDIAGPU objects, abstracting pyNVML calls and handling related exceptions.

    This class provides a high-level interface to interact with NVIDIA GPUs. `CUDA_VISIBLE_DEVICES` environment variable is respected if set. For example, if there are
    4 GPUs and `CUDA_VISIBLE_DEVICES=0,2`, only GPUs 0 and 2 are instantiated. In this case, to access
    GPU of CUDA index 0, use the index 0, and for CUDA index 2, use the index 1.

    This class provides a 1:1 mapping between the methods and NVML library functions. For example, if you want to do the following:

    ```python
    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
    constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(handle)
    ```

    You can now do:
    ```python
    gpus = get_gpus() # returns an NVIDIAGPUs object
    constraints = gpus.getPowerManagementLimitConstraints(gpu_index)
    ```

    Note: This class instantiates (grabs the handle, by calling `pynvml.nvmlDeviceGetHandleByIndex`) all GPUs that are visible to the system, as determined by the `CUDA_VISIBLE_DEVICES` environment variable if set.
    """

    def __init__(self, ensure_homogeneous: bool = False) -> None:
        """Instantiates NVIDIAGPUs object, setting up tracking for specified NVIDIA GPUs.

        Args:
            ensure_homogeneous (bool): If True, ensures that all tracked GPUs have the same name (return value of `nvmlDeviceGetName`). False by default.

        Raises:
            ZeusBaseGPUError: Or the subclass registered in `NVIDIAGPU._exception_map`
                for the NVML error code, when any NVML call made during setup fails.
        """
        try:
            pynvml.nvmlInit()
            self._init_gpus()
            if ensure_homogeneous:
                self._ensure_homogeneous()
        except pynvml.NVMLError as e:
            exception_class = NVIDIAGPU._exception_map.get(e.value, ZeusBaseGPUError)
            # `pynvml.NVMLError` carries `value` but does not define `msg`;
            # its `__str__` produces the human-readable NVML error string.
            raise exception_class(str(e)) from e

    @property
    def gpus(self) -> Sequence[GPU]:
        """Returns a list of NVIDIAGPU objects being tracked."""
        return self._gpus

    def _init_gpus(self) -> None:
        """Resolve the device indices to track and create one NVIDIAGPU per index."""
        # Must respect `CUDA_VISIBLE_DEVICES` if set
        visible_device = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible_device is None:
            self.visible_indices = list(range(pynvml.nvmlDeviceGetCount()))
        elif not visible_device.strip():
            # An empty value lists no device indices; `int("")` would raise ValueError.
            self.visible_indices = []
        else:
            self.visible_indices = [int(idx) for idx in visible_device.split(",")]

        # initialize all GPUs
        self._gpus = [NVIDIAGPU(gpu_num) for gpu_num in self.visible_indices]

    def __del__(self) -> None:
        """Shuts down the NVIDIA GPU monitoring library to release resources and clean up."""
        # NVML errors during shutdown are deliberately ignored.
        with contextlib.suppress(pynvml.NVMLError):
            pynvml.nvmlShutdown()

gpus property

1
gpus

Returns a list of NVIDIAGPU objects being tracked.

__init__

1
__init__(ensure_homogeneous=False)

Parameters:

Name Type Description Default
ensure_homogeneous bool

If True, ensures that all tracked GPUs have the same name (return value of nvmlDeviceGetName). False by default.

False
Source code in zeus/device/gpu.py
700
701
702
703
704
705
706
707
708
709
710
711
712
713
def __init__(self, ensure_homogeneous: bool = False) -> None:
    """Instantiates NVIDIAGPUs object, setting up tracking for specified NVIDIA GPUs.

    Args:
        ensure_homogeneous (bool): If True, ensures that all tracked GPUs have the same name (return value of `nvmlDeviceGetName`). False by default.

    Raises:
        ZeusBaseGPUError: Or the subclass registered in `NVIDIAGPU._exception_map`
            for the NVML error code, when any NVML call made during setup fails.
    """
    try:
        pynvml.nvmlInit()
        self._init_gpus()
        if ensure_homogeneous:
            self._ensure_homogeneous()
    except pynvml.NVMLError as e:
        exception_class = NVIDIAGPU._exception_map.get(e.value, ZeusBaseGPUError)
        # `pynvml.NVMLError` carries `value` but does not define `msg`;
        # its `__str__` produces the human-readable NVML error string.
        raise exception_class(str(e)) from e

__del__

1
__del__()

Shuts down the NVIDIA GPU monitoring library to release resources and clean up.

Source code in zeus/device/gpu.py
732
733
734
735
def __del__(self) -> None:
    """Call `pynvml.nvmlShutdown()` when this manager is finalized, ignoring NVML errors."""
    ignore_nvml_errors = contextlib.suppress(pynvml.NVMLError)
    with ignore_nvml_errors:
        pynvml.nvmlShutdown()

AMDGPUs

Bases: GPUs

AMD GPU Manager object, containing individual AMDGPU objects, abstracting amdsmi calls and handling related exceptions.

This class provides a high-level interface to interact with AMD GPUs. ROCR_VISIBLE_DEVICES environment variable is respected if set. For example, if there are 4 GPUs and ROCR_VISIBLE_DEVICES=0,2, only GPUs 0 and 2 are instantiated. In this case, to access GPU of ROCR index 0, use the index 0, and for ROCR index 2, use the index 1.

This class provides a 1:1 mapping between the methods and AMDSMI library functions. For example, if you want to do the following:

1
2
3
handle = amdsmi.amdsmi_get_processor_handles()[gpu_index]
info = amdsmi.amdsmi_get_power_cap_info(handle)
constraints = (info.min_power_cap, info.max_power_cap)

You can now do:

1
2
gpus = get_gpus() # returns a AMDGPUs object
constraints =  gpus.getPowerManagementLimitConstraints(gpu_index)

Note: This class instantiates (grabs the handle, by calling amdsmi.amdsmi_get_processor_handles()) all GPUs that are visible to the system, as determined by the ROCR_VISIBLE_DEVICES environment variable if set.

Source code in zeus/device/gpu.py
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
class AMDGPUs(GPUs):
    """AMD GPU Manager object, containing individual AMDGPU objects, abstracting amdsmi calls and handling related exceptions.

    This class provides a high-level interface to interact with AMD GPUs. `ROCR_VISIBLE_DEVICES` environment variable is respected if set. For example, if there are
    4 GPUs and `ROCR_VISIBLE_DEVICES=0,2`, only GPUs 0 and 2 are instantiated. In this case, to access
    GPU of ROCR index 0, use the index 0, and for ROCR index 2, use the index 1.

    This class provides a 1:1 mapping between the methods and AMDSMI library functions. For example, if you want to do the following:

    ```python
    handle = amdsmi.amdsmi_get_processor_handles()[gpu_index]
    info = amdsmi.amdsmi_get_power_cap_info(handle)
    constraints = (info.min_power_cap, info.max_power_cap)
    ```

    You can now do:
    ```python
    gpus = get_gpus() # returns an AMDGPUs object
    constraints = gpus.getPowerManagementLimitConstraints(gpu_index)
    ```

    Note: This class instantiates (grabs the handle, by calling `amdsmi.amdsmi_get_processor_handles()`) all GPUs that are visible to the system, as determined by the `ROCR_VISIBLE_DEVICES` environment variable if set.

    """

    def __init__(self, ensure_homogeneous: bool = False) -> None:
        """Instantiates AMDGPUs object, setting up tracking for specified AMD GPUs.

        Args:
            ensure_homogeneous (bool, optional): If True, ensures that all tracked GPUs have the same name (return value of amdsmi.amdsmi_get_gpu_asic_info(handle).market_name). False by default.

        Raises:
            ZeusBaseGPUError: Or the subclass registered in `AMDGPU._exception_map`
                for the amdsmi error code, when any amdsmi call made during setup fails.
        """
        try:
            amdsmi.amdsmi_init()
            self._init_gpus()
            if ensure_homogeneous:
                self._ensure_homogeneous()
        except amdsmi.AmdSmiException as e:
            # amdsmi exceptions are not guaranteed to expose `value`/`msg`.
            # Prefer `value` when present, otherwise the `err_code` that
            # `AmdSmiLibraryException` sets; an unknown code maps to the base error.
            error_code = getattr(e, "value", getattr(e, "err_code", None))
            exception_class = AMDGPU._exception_map.get(error_code, ZeusBaseGPUError)
            raise exception_class(str(e)) from e

    def _init_gpus(self) -> None:
        """Resolve the device indices to track and store them in `self.visible_indices`."""
        # Must respect `ROCR_VISIBLE_DEVICES` if set
        visible_device = os.environ.get("ROCR_VISIBLE_DEVICES")
        if visible_device is None:
            self.visible_indices = list(
                range(len(amdsmi.amdsmi_get_processor_handles()))
            )
        elif not visible_device.strip():
            # An empty value lists no device indices; `int("")` would raise ValueError.
            self.visible_indices = []
        else:
            self.visible_indices = [int(idx) for idx in visible_device.split(",")]

    def __del__(self) -> None:
        """Shuts down the AMD GPU monitoring library to release resources and clean up."""
        # Ignore errors on shutdown. Necessary for proper cleanup and test functionality.
        with contextlib.suppress(amdsmi.AmdSmiException):
            amdsmi.amdsmi_shut_down()

    @property
    def gpus(self) -> Sequence[GPU]:
        """Returns a list of AMDGPU objects being tracked.

        Raises:
            NotImplementedError: Always; per-GPU tracking is not implemented for AMD yet.
        """
        raise NotImplementedError("AMDGPUs.gpus is not implemented yet.")

gpus property

1
gpus

Returns a list of AMDGPU objects being tracked.

__init__

1
__init__(ensure_homogeneous=False)

Parameters:

Name Type Description Default
ensure_homogeneous bool

If True, ensures that all tracked GPUs have the same name (return value of amdsmi.amdsmi_get_gpu_asic_info(handle).market_name). False by default.

False
Source code in zeus/device/gpu.py
763
764
765
766
767
768
769
770
771
772
773
774
775
776
def __init__(self, ensure_homogeneous: bool = False) -> None:
    """Instantiates AMDGPUs object, setting up tracking for specified AMD GPUs.

    Args:
        ensure_homogeneous (bool, optional): If True, ensures that all tracked GPUs have the same name (return value of amdsmi.amdsmi_get_gpu_asic_info(handle).market_name). False by default.

    Raises:
        ZeusBaseGPUError: Or the subclass registered in `AMDGPU._exception_map`
            for the amdsmi error code, when any amdsmi call made during setup fails.
    """
    try:
        amdsmi.amdsmi_init()
        self._init_gpus()
        if ensure_homogeneous:
            self._ensure_homogeneous()
    except amdsmi.AmdSmiException as e:
        # amdsmi exceptions are not guaranteed to expose `value`/`msg`.
        # Prefer `value` when present, otherwise the `err_code` that
        # `AmdSmiLibraryException` sets; an unknown code maps to the base error.
        error_code = getattr(e, "value", getattr(e, "err_code", None))
        exception_class = AMDGPU._exception_map.get(error_code, ZeusBaseGPUError)
        raise exception_class(str(e)) from e

__del__

1
__del__()

Shuts down the AMD GPU monitoring library to release resources and clean up.

Source code in zeus/device/gpu.py
787
788
789
790
def __del__(self) -> None:
    """Call `amdsmi.amdsmi_shut_down()` when this manager is finalized, ignoring amdsmi errors."""
    # Shutdown errors are ignored on purpose: needed for proper cleanup and test functionality.
    ignore_amdsmi_errors = contextlib.suppress(amdsmi.AmdSmiException)
    with ignore_amdsmi_errors:
        amdsmi.amdsmi_shut_down()

get_gpus

1
get_gpus(ensure_homogeneous=False)

Initialize and return a singleton GPU monitoring object for NVIDIA or AMD GPUs.

The function returns a GPU management object that aims to abstract the underlying GPU monitoring libraries (pynvml for NVIDIA GPUs and amdsmi for AMD GPUs), and provides a 1:1 mapping between the methods in the object and related library functions.

This function attempts to initialize GPU monitoring using the pynvml library for NVIDIA GPUs first. If pynvml is not available or fails to initialize, it then tries to use the amdsmi library for AMD GPUs. If both attempts fail, it raises a ZeusGPUInitError exception.

Parameters:

Name Type Description Default
ensure_homogeneous bool

If True, ensures that all tracked GPUs have the same name. False by default.

False
Source code in zeus/device/gpu.py
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
def get_gpus(ensure_homogeneous: bool = False) -> GPUs:
    """Initialize and return a singleton GPU monitoring object for NVIDIA or AMD GPUs.

    The function returns a GPU management object that aims to abstract the underlying GPU monitoring libraries
    (pynvml for NVIDIA GPUs and amdsmi for AMD GPUs), and provides a 1:1 mapping between the methods in the object and related library functions.

    This function attempts to initialize GPU monitoring using the pynvml library for NVIDIA GPUs
    first. If pynvml is not available or fails to initialize, it then tries to use the amdsmi
    library for AMD GPUs. If both attempts fail, it raises a `ZeusGPUInitError` exception.

    The created object is cached in the module-level `_gpus`; later calls return the
    cached object as-is, so `ensure_homogeneous` only has an effect on the call that
    actually creates it.

    Args:
        ensure_homogeneous (bool, optional): If True, ensures that all tracked GPUs have the same name. False by default.

    Returns:
        The cached `NVIDIAGPUs` or `AMDGPUs` manager object.

    Raises:
        ZeusGPUInitError: If neither NVML nor AMDSMI is available.
    """
    global _gpus
    if _gpus is not None:
        return _gpus

    # NVML is probed first; AMDSMI is only tried when NVML is unavailable.
    if nvml_is_available():
        _gpus = NVIDIAGPUs(ensure_homogeneous)
    elif amdsmi_is_available():
        _gpus = AMDGPUs(ensure_homogeneous)
    else:
        raise ZeusGPUInitError(
            "NVML and AMDSMI unavailable. Failed to initialize GPU management library."
        )
    return _gpus

nvml_is_available

1
nvml_is_available()

Check if PyNVML is available.

Source code in zeus/device/gpu.py
829
830
831
832
833
834
835
836
837
838
839
840
841
def nvml_is_available() -> bool:
    """Report whether PyNVML can be imported and `pynvml.nvmlInit()` succeeds.

    Returns:
        True when both the import and the initialization call succeed; False
        (after logging the reason at INFO level) otherwise.
    """
    try:
        import pynvml
    except ImportError:
        logger.info("PyNVML is not available.")
        return False

    initialized = True
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        logger.info("PyNVML is available but could not initialize.")
        initialized = False
    return initialized

amdsmi_is_available

1
amdsmi_is_available()

Check if amdsmi is available.

Source code in zeus/device/gpu.py
844
845
846
847
848
849
850
851
852
853
854
855
856
def amdsmi_is_available() -> bool:
    """Report whether amdsmi can be imported and `amdsmi.amdsmi_init()` succeeds.

    Returns:
        True when both the import and the initialization call succeed; False
        (after logging the reason at INFO level) otherwise.
    """
    try:
        import amdsmi
    except ImportError:
        logger.info("amdsmi is not available.")
        return False

    initialized = True
    try:
        amdsmi.amdsmi_init()
    except amdsmi.AmdSmiLibraryException:
        logger.info("amdsmi is available but could not initialize.")
        initialized = False
    return initialized