env

zeus.util.env

Tools related to environment variables.

get_env

get_env(name, valtype, default=None)

Fetch an environment variable and cast it to the given type.

Source code in zeus/util/env.py
def get_env(name: str, valtype: Type[T], default: T | None = None) -> T:
    """Fetch an environment variable and cast it to the given type."""
    try:
        if valtype == bool:
            val = os.environ[name].lower()
            if val not in ["true", "false"]:
                raise ValueError(f"Strange boolean environment variable value '{val}'")
            return cast(T, val == "true")
        return valtype(os.environ[name])
    except KeyError:
        if default is not None:
            return default
        raise ValueError(f"Missing environment variable '{name}'") from None

resolve_gpu_indices

resolve_gpu_indices(requested_gpu_indices)

Resolve GPU indices considering CUDA_VISIBLE_DEVICES.

Parameters:

Name: requested_gpu_indices
Type: list[int] | None
Description: A list of user-specified GPU indices. If None, assume the user wants all GPUs visible under CUDA_VISIBLE_DEVICES.
Default: required

Returns:

Type: tuple[list[int], list[int]]
Description: A tuple of two GPU index lists, where the former contains the CUDA indices under the illusion of CUDA_VISIBLE_DEVICES and the latter contains the actual CUDA indices that NVML understands. The order of the two lists is the same.

Source code in zeus/util/env.py
def resolve_gpu_indices(
    requested_gpu_indices: list[int] | None,
) -> tuple[list[int], list[int]]:
    """Resolve GPU indices considering `CUDA_VISIBLE_DEVICES`.

    Args:
        requested_gpu_indices: A list of user-specified GPU indices. If `None`,
            assume the user wants all GPUs visible under `CUDA_VISIBLE_DEVICES`.

    Returns:
        A tuple of GPU index lists, where the former is CUDA indices under the
            illusion of `CUDA_VISIBLE_DEVICES` and the latter is the actual CUDA
            indices that NVML understands. The order of the two lists is the same.
    """
    # Initialize NVML.
    pynvml.nvmlInit()

    # Sanity check.
    if requested_gpu_indices is not None and not requested_gpu_indices:
        raise ValueError("`requested_gpu_indices` must be None or non-empty.")

    # Find the NVML GPU indices visible to CUDA, respecting `CUDA_VISIBLE_DEVICES`.
    if (cuda_visible_device := os.environ.get("CUDA_VISIBLE_DEVICES")) is not None:
        nvml_visible_indices = [int(idx) for idx in cuda_visible_device.split(",")]
    else:
        nvml_visible_indices = list(range(pynvml.nvmlDeviceGetCount()))

    # NVML GPU indices and CUDA GPU indices can differ when `CUDA_VISIBLE_DEVICES` is set.
    # We always use CUDA GPU indices when communicating with the outside world,
    # but when dealing with NVML, we use the NVML GPU indices.
    if requested_gpu_indices is None:
        nvml_gpu_indices = nvml_visible_indices
        cuda_gpu_indices = list(range(len(nvml_visible_indices)))
    else:
        nvml_gpu_indices = [nvml_visible_indices[idx] for idx in requested_gpu_indices]
        cuda_gpu_indices = requested_gpu_indices

    # Deinitialize NVML.
    pynvml.nvmlShutdown()

    return cuda_gpu_indices, nvml_gpu_indices
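
A minimal usage sketch, assuming an NVIDIA driver and pynvml are available (the function initializes and shuts down NVML internally); the CUDA_VISIBLE_DEVICES value below is hypothetical.

import os

from zeus.util.env import resolve_gpu_indices

# Hypothetical setting: only physical GPUs 2 and 3 are visible to CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

# Request the second visible GPU. The first list holds CUDA indices under the
# illusion of CUDA_VISIBLE_DEVICES; the second holds the NVML (physical) indices.
cuda_indices, nvml_indices = resolve_gpu_indices([1])
print(cuda_indices, nvml_indices)  # [1] [3]

# Passing None selects every GPU visible under CUDA_VISIBLE_DEVICES.
cuda_indices, nvml_indices = resolve_gpu_indices(None)
print(cuda_indices, nvml_indices)  # [0, 1] [2, 3]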