# Source code for zero.hardware

"""Tools related to devices, memory, etc."""

__all__ = ['free_memory', 'get_gpu_info']

import gc
import math
from typing import Any, Dict, List

import torch
from pynvml import NVMLError_LibraryNotFound
from pynvml.smi import nvidia_smi

_GPU_INFO_QUERY = 'memory.total, memory.used, memory.free, utilization.gpu'


def free_memory() -> None:
    """Free GPU-memory occupied by `torch` and run the garbage collector.

    Collects garbage first (so tensors with no remaining references actually
    release their CUDA blocks), then asks `torch` to return cached memory to
    the driver.

    Warning:
        There is a small chunk of GPU-memory (occupied by drivers) that is
        impossible to free. It is a `torch` "limitation", so the function
        inherits this property.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Fix: the original checked `torch.cuda.is_available()` twice; the
        # result cannot change mid-call, so one guard covers all CUDA work.
        # torch has wrong .pyi
        torch.cuda.synchronize()  # type: ignore
        # Collect again after synchronize so freshly-completed kernels'
        # tensors are released before the cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()
def get_gpu_info(precise: bool = False) -> List[Dict[str, Any]]:
    """Get statistics about GPU devices.

    Includes information about memory (total, free and used) and utilization.
    Some figures are represented in two ways: with raw units and with
    percentage.

    Args:
        precise: if False, all data is rounded (to Mb for memory, to % for
            percentages)
    Returns:
        Information about GPU devices.
    Raises:
        RuntimeError: if necessary cuda-related libraries are not found.
            Usually, it means that the function is run on a machine without
            GPU.

    Examples:
        .. code-block::

            print(get_gpu_info())

        Output example (formatted for convenience):

        .. code-block:: none

            [
                {
                    'util%': 0,
                    'total': 11019,
                    'used': 0,
                    'free': 11019,
                    'used%': 0,
                    'free%': 100,
                },
                {
                    'util%': 0,
                    'total': 11016,
                    'used': 0,
                    'free': 11016,
                    'used%': 0,
                    'free%': 100,
                },
            ]

    Warning:
        The function directly collects information using the :code:`pynvml`
        library, hence, settings like :code:`CUDA_VISIBLE_DEVICES` don't
        affect the result.
    """
    try:
        smi = nvidia_smi.getInstance()
    except NVMLError_LibraryNotFound as err:
        raise RuntimeError(
            'Failed to get information about GPU memory. '
            'Make sure that you actually have GPU and all relevant software installed.'
        ) from err

    # Either keep exact floats or floor everything to whole Mb / whole %.
    round_ = float if precise else math.floor

    def unpack(raw_device: Dict[str, Any]) -> Dict[str, Any]:
        # Per-device record: utilization is reported as-is, memory figures
        # are rounded per `precise`, and used/free are also derived as
        # percentages of the total.
        memory = raw_device['fb_memory_usage']
        info: Dict[str, Any] = {'util%': raw_device['utilization']['gpu_util']}
        for key in ('total', 'used', 'free'):
            info[key] = round_(memory[key])
        for key in ('used', 'free'):
            info[key + '%'] = round_(info[key] / info['total'] * 100)
        return info

    raw_info = smi.DeviceQuery(_GPU_INFO_QUERY)
    return [unpack(device) for device in raw_info['gpu']]