# Source code for sbu.parse_yaml

"""
sbu.parse_yaml
==============

A module for parsing and validating the .yaml input.

Index
-----
.. currentmodule:: sbu.parse_yaml
.. autosummary::
    yaml_to_pandas
    validate_usernames

API
---
.. autofunction:: yaml_to_pandas
.. autofunction:: validate_usernames

"""

from subprocess import check_output

from typing import (Tuple, Hashable, Any, Dict, Optional)

import yaml
import numpy as np
import pandas as pd

from sbu.globvar import ACTIVE, NAME, PROJECT, SBU_REQUESTED, TMP

__all__ = ['yaml_to_pandas', 'validate_usernames']


def yaml_to_pandas(filename: str) -> Tuple[pd.DataFrame, Optional[str]]:
    """Create a Pandas DataFrame out of a .yaml file.

    Examples
    --------
    Example yaml input:

    .. code-block:: yaml

        __project__: BlaBla
        A:
            description: Example project
            PI: Walt Disney
            SBU requested: 1000
            users:
                user1: Donald Duck
                user2: Scrooge McDuck
                user3: Mickey Mouse

    Example output:

    .. code-block:: python

        >>> df, project = yaml_to_pandas(filename)

        >>> print(df)
                    info                  ...
                 project            name  ... SBU requested           PI
        username                          ...
        user1          A     Donald Duck  ...        1000.0  Walt Disney
        user2          A  Scrooge McDuck  ...        1000.0  Walt Disney
        user3          A    Mickey Mouse  ...        1000.0  Walt Disney

        >>> print(project)
        BlaBla

    Parameters
    ----------
    filename : :class:`str`
        The path+filename to the .yaml file.

    Returns
    -------
    :class:`pandas.DataFrame` & :class:`str`, optional
        A Pandas DataFrame and project name constructed from **filename**.
        Columns and rows are instances of :class:`pandas.MultiIndex` and
        :class:`pandas.Index`, respectively.
        All retrieved .yaml data is stored under the ``"info"`` super-column.
        The project name will be :data:`None` if the ``__project__`` key is absent
        from the .yaml file.

    Raises
    ------
    ValueError
        Raised if the top-level content of the .yaml file is not a mapping
        (*e.g.* because the file is empty) or if :func:`validate_usernames` detects
        a mismatch between the users in the .yaml file and the ``accinfo`` output.

    """
    # Read the yaml file; YAML streams are UTF-8 by default,
    # so do not fall back to the locale-dependent encoding
    with open(filename, 'r', encoding='utf-8') as f:
        dict_ = yaml.load(f, Loader=yaml.SafeLoader)

    # An empty file is loaded as None; a top-level list or scalar is not usable either
    if not isinstance(dict_, dict):
        raise ValueError(f"Expected a mapping as top-level content of {filename!r}; "
                         f"observed type: {dict_.__class__.__name__!r}")
    project = dict_.pop("__project__", None)

    # Convert the yaml dictionary into a dataframe:
    # one row per user, holding the settings of the project the user belongs to
    data: Dict[str, Dict[Tuple[Hashable, Hashable], Any]] = {}
    for k1, v1 in dict_.items():
        for k2, v2 in v1['users'].items():
            # NOTE(review): a username listed under multiple projects is silently
            # overwritten by its last occurrence -- confirm this is intended
            data[k2] = {('info', k): v for k, v in v1.items() if k != 'users'}
            data[k2][NAME] = v2
            data[k2][PROJECT] = k1
    df = pd.DataFrame(data).T

    # Format, sort and return the dataframe
    df.index.name = 'username'
    df[SBU_REQUESTED] = df[SBU_REQUESTED].astype(float)

    # Temporarily copy the index into a column so the rows can be sorted by
    # project first and by username second
    df[TMP] = df.index
    df.sort_values(by=[PROJECT, TMP], inplace=True)
    df.sort_index(axis=1, inplace=True, ascending=False)
    del df[TMP]
    df[ACTIVE] = False

    # Raises a ValueError if the users do not match those reported by `accinfo`
    validate_usernames(df)
    return df, project


def validate_usernames(df: pd.DataFrame, *, accinfo_output: Optional[str] = None) -> None:
    """Validate that the users belonging to an account match the users in the .yaml input file.

    The account's users are extracted from the output of the ``accinfo`` command:
    every non-empty line following the ``"# Users linked to this account"`` line
    is treated as a username.

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        A DataFrame, produced by :func:`.yaml_to_pandas`, containing user accounts.
        :attr:`pandas.DataFrame.columns` and :attr:`pandas.DataFrame.index`
        should be instances of :class:`pandas.MultiIndex` and :class:`pandas.Index`, respectively.
        User accounts are expected to be stored in :attr:`pandas.DataFrame.index`.

    accinfo_output : :class:`str`, optional
        The already captured output of the ``accinfo`` command.
        If :data:`None` (the default), the ``accinfo`` command is executed to obtain it.

    Raises
    ------
    ValueError
        Raised if the ``"# Users linked to this account"`` line cannot be found in the
        ``accinfo`` output, or if one or more users reported by the ``accinfo`` command
        are absent from **df** or *vice versa*.

    """
    if accinfo_output is None:
        accinfo_output = check_output(['accinfo'], encoding='utf8')

    header = "# Users linked to this account"

    # Skip empty lines and advance the iterator until the header has been consumed;
    # whatever is left in the iterator afterwards are the usernames
    lines = filter(None, accinfo_output.splitlines())
    for line in lines:
        if line == header:
            accinfo_users = list(lines)
            break
    else:
        raise ValueError("Failed to parse the output of the `accinfo` command: "
                         f"the {header!r} line is missing")

    yaml_users = list(df.index)
    yaml_set = set(yaml_users)
    accinfo_set = set(accinfo_users)

    # "-" marks users known to `accinfo` but absent from the .yaml file;
    # "+" marks users in the .yaml file which are unknown to `accinfo`
    name_diff = "".join(f"\n- {name}" for name in accinfo_users if name not in yaml_set)
    name_diff += "".join(f"\n+ {name}" for name in yaml_users if name not in accinfo_set)
    if name_diff:
        raise ValueError(f"User mismatch between .yaml file and `accinfo` output:{name_diff}")