Source code for sbu.parse_yaml

"""
sbu.parse_yaml
==============

A module for parsing and validating the .yaml input.

Index
-----
.. currentmodule:: sbu.parse_yaml
.. autosummary::
    yaml_to_pandas
    validate_usernames

API
---
.. autofunction:: yaml_to_pandas
.. autofunction:: validate_usernames

"""

from subprocess import check_output

from typing import (Tuple, Hashable, Any, Dict, Optional)

import yaml
import numpy as np
import pandas as pd

from sbu.globvar import ACTIVE, NAME, PROJECT, SBU_REQUESTED, TMP

__all__ = ['yaml_to_pandas', 'validate_usernames']


[docs]def yaml_to_pandas(filename: str) -> Tuple[pd.DataFrame, Optional[str]]: """Create a Pandas DataFrame out of a .yaml file. Examples -------- Example yaml input: .. code-block:: yaml __project__: BlaBla A: description: Example project PI: Walt Disney SBU requested: 1000 users: user1: Donald Duck user2: Scrooge McDuck user3: Mickey Mouse Example output: .. code-block:: python >>> df, project = yaml_to_pandas(filename) >>> print(df) info ... project name ... SBU requested PI username ... user1 A Donald Duck ... 1000.0 Walt Disney user2 A Scrooge McDuck ... 1000.0 Walt Disney user3 A Mickey Mouse ... 1000.0 Walt Disney >>> print(project) BlaBla Parameters ---------- filename : :class:`str` The path+filename to the .yaml file. Returns ------- :class:`pandas.DataFrame` & :class:`str`, optional A Pandas DataFrame and project name constructed from **filename**. Columns and rows are instances of :class:`pandas.MultiIndex` and :class:`pandas.Index`, respectively. All retrieved .yaml data is stored under the ``"info"`` super-column. The project name will be :data:`None` if the ``__project__`` key is absent from the .yaml file """ # Read the yaml file with open(filename, 'r') as f: dict_ = yaml.load(f, Loader=yaml.SafeLoader) project = dict_.pop("__project__", None) # Convert the yaml dictionary into a dataframe data: Dict[str, Dict[Tuple[Hashable, Hashable], Any]] = {} for k1, v1 in dict_.items(): for k2, v2 in v1['users'].items(): data[k2] = {('info', k): v for k, v in v1.items() if k != 'users'} data[k2][NAME] = v2 data[k2][PROJECT] = k1 df = pd.DataFrame(data).T # Fortmat, sort and return the dataframe df.index.name = 'username' df[SBU_REQUESTED] = df[SBU_REQUESTED].astype(float) df[TMP] = df.index df.sort_values(by=[PROJECT, TMP], inplace=True) df.sort_index(axis=1, inplace=True, ascending=False) del df[TMP] df[ACTIVE] = False validate_usernames(df) return df, project
[docs]def validate_usernames(df: pd.DataFrame) -> None: """Validate that all users belonging to an account are available in the .yaml input file. Raises a KeyError If one or more usernames printed by the ``accinfo`` comand are absent from **df**. Parameters ---------- df : :class:`pandas.DataFrame` A DataFrame, produced by :func:`.yaml_to_pandas`, containing user accounts. :attr:`pandas.DataFrame.columns` and :attr:`pandas.DataFrame.index` should be instances of :class:`pandas.MultiIndex` and :class:`pandas.Index`, respectively. User accounts are expected to be stored in :attr:`pandas.DataFrame.index`. Raises ------ ValueError Raised if one or more users reported by the ``accinfo`` command are absent from **df** or *vice versa*. """ _usage = check_output(['accinfo'], encoding='utf8') iterator = filter(None, _usage.splitlines()) for i in iterator: if i == "# Users linked to this account": usage = np.array(list(iterator), dtype=np.str_) break else: raise ValueError("Failed to parse the passed .yaml file") bool_ar1 = np.isin(usage, df.index) bool_ar2 = np.isin(df.index, usage) name_diff = "" name_diff += "".join(f"\n- {name}" for name in usage[~bool_ar1]) name_diff += "".join(f"\n+ {name}" for name in df.index[~bool_ar2].values) if name_diff: raise ValueError(f"User mismatch between .yaml file and `accinfo` output:{name_diff}")