Source code for sbu.dataframe

"""
sbu.dataframe
=============

A module which handles data parsing and DataFrame construction.

Index
-----
.. currentmodule:: sbu.dataframe
.. autosummary::
    get_sbu
    parse_accuse
    get_date_range
    construct_filename
    _get_datetimeindex
    _parse_date
    _get_total_sbu_requested

API
---
.. autofunction:: get_sbu
.. autofunction:: parse_accuse
.. autofunction:: get_date_range
.. autofunction:: construct_filename
.. autofunction:: _get_datetimeindex
.. autofunction:: _parse_date
.. autofunction:: _get_total_sbu_requested

"""

import re
import datetime
from subprocess import check_output
from typing import Tuple, Optional, Union

import numpy as np
import pandas as pd

from sbu.globvar import ACTIVE, PROJECT, SBU_REQUESTED

# Public API of this module: the names exported by ``from sbu.dataframe import *``.
# The underscore-prefixed helpers defined below are deliberately left out.
__all__ = [
    'get_date_range', 'construct_filename', 'get_sbu', 'parse_accuse'
]


def get_sbu(
    df: pd.DataFrame,
    project: str,
    start: Union[None, str, int] = None,
    end: Union[None, str, int] = None,
) -> None:
    """Acquire the SBU usage for each account in the :attr:`pandas.DataFrame.index`.

    The start and end of the reported interval can, optionally, be altered with **start**
    and **end**.
    Performs an inplace update of **df**, adding new columns to hold the SBU usage per month under
    the ``"Month"`` super-column.
    In addition, a single row and column is added (``"sum"``) with SBU usage summed over
    the entire interval and over all users, respectively.

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        A Pandas DataFrame with usernames and information, constructed by :func:`yaml_to_pandas`.
        :attr:`pandas.DataFrame.columns` and :attr:`pandas.DataFrame.index` should
        be instances of :class:`pandas.MultiIndex` and :class:`pandas.Index`, respectively.
        User accounts are expected to be stored in :attr:`pandas.DataFrame.index`.
        SBU usage (including the sum) is stored in the ``"Month"`` super-column.

    project : :class:`str`
        The project code of the project of interest.
        Only SBUs expended under this project are gathered (see :func:`parse_accuse`).

    start : :class:`int` or :class:`str`, optional
        The start of the interval; forwarded to :func:`get_date_range`.
        Defaults to the first of January of the current year if ``None``.

    end : :class:`str` or :class:`int`, optional
        The end of the interval; forwarded to :func:`get_date_range`.
        Defaults to the last day of the current month if ``None``.

    Returns
    -------
    ``None``
        **df** is modified inplace.

    """
    # Construct new columns in **df**: one ("Month", "YYYY-MM") column per month of the interval
    sy, ey = get_date_range(start, end)
    date_range = _get_datetimeindex(sy, ey)
    for i in date_range:
        df[('Month', str(i)[:7])] = np.nan

    # Fill the new columns; DataFrame.update() aligns on index and columns and
    # only copies the non-NaN values of **df_tmp** into **df**
    df_tmp = parse_accuse(project, sy, ey)
    df.update(df_tmp)

    # Calculate SBU sums
    # First the per-user total (new column), then the per-month total (new row);
    # the order matters as the "sum" row should also cover the "sum" column
    SUM = ('Month', 'sum')
    df[SUM] = df['Month'].sum(axis=1)
    nan_template = {k: np.nan for k in df.columns}
    nan_template['info', 'active'] = False
    df.loc['sum'] = nan_template
    df.loc['sum', 'Month'] = df['Month'].sum(axis=0).values
    df.at['sum', PROJECT] = 'sum'
    df.at['sum', SBU_REQUESTED] = _get_total_sbu_requested(df)

    # Mark all active users: every row (the "sum" row included) whose
    # total usage over the interval exceeds 1.0 SBU
    df[ACTIVE] = False
    df.loc[df[SUM] > 1.0, ACTIVE] = True


# Pattern for a date token: one or more digits, a dash and two digits, optionally followed
# by a dash and/or two more digits (*e.g.* ``2019-05`` or ``2019-05-17``).
# :func:`parse_accuse` applies it with ``fullmatch`` to the first field of each output line
# in order to tell data rows apart from headers and other text.
DATE_PATTERN = re.compile("([0-9]+)-([0-9][0-9])-?([0-9][0-9])?")


def parse_accuse(
    project: str,
    start: Optional[str] = None,
    end: Optional[str] = None,
) -> pd.DataFrame:
    """Gather the SBU usage of all users of a specific project.

    The bash command ``accuse`` is used for gathering SBU usage along an interval defined
    by **start** and **end**.
    Results are collected and returned in a Pandas DataFrame.

    Parameters
    ----------
    project : :class:`str`
        The project code of the project of interest (passed to ``accuse -a``).

    start : :class:`str`, optional
        The starting date of the interval; passed verbatim to ``accuse -s``.
        The option is omitted if ``None``.

    end : :class:`str`, optional
        The final date of the interval; passed verbatim to ``accuse -e``.
        The option is omitted if ``None``.

    Returns
    -------
    :class:`pandas.DataFrame`
        The SBU usage (in hours, restituted SBUs subtracted) per user and per month.
        The index (``"username"``) holds the users; the columns are a two-level
        :class:`pandas.MultiIndex` with ``"Month"`` as super-column and the months
        reported by ``accuse`` as sub-columns.
        Combinations of user and month which are not reported are set to ``nan``.

    Raises
    ------
    :exc:`subprocess.CalledProcessError`
        Raised by :func:`subprocess.check_output` if ``accuse`` exits with a non-zero status.

    """
    # Acquire SBU usage
    arg = ['accuse', '-a', project]
    if start is not None:
        arg += ["-s", start]
    if end is not None:
        arg += ["-e", end]
    usage = check_output(arg).decode('utf-8')

    # Only keep the data rows, i.e. non-empty lines whose first field is a date
    usage_list = []
    for line in usage.splitlines():
        fields = line.split()
        if fields and DATE_PATTERN.fullmatch(fields[0]):
            usage_list.append(tuple(fields))

    df = pd.DataFrame(usage_list, columns=["Month", "Account", "User", "SBUs", "Restituted"])
    df.set_index("User", inplace=True)

    # Convert the durations into float seconds with total_seconds() rather than
    # astype("m8[s]"): the latter only yields floats in pandas < 2.0, in later versions
    # the result remains a timedelta and the conversion to hours silently breaks
    used = pd.to_timedelta(df["SBUs"]).dt.total_seconds()
    restituted = pd.to_timedelta(df["Restituted"]).dt.total_seconds()
    df["SBUs"] = (used - restituted) / 60**2  # seconds to hours

    # Construct an empty users x months table and fill it row by row
    index = pd.Index(sorted(set(df.index)), name="username")
    columns = pd.MultiIndex.from_product([
        ["Month"],
        sorted(set(df["Month"]), key=lambda i: np.datetime64(i, "M")),
    ])

    ret = pd.DataFrame(np.nan, index=index, columns=columns)
    for name, sbu, month in zip(df.index, df["SBUs"], df["Month"]):
        ret.loc[name, ("Month", month)] = sbu
    return ret


def _get_last_day_of_month(any_day: datetime.date) -> str:
    # The day 28 exists in every month. 4 days later, it's always next month
    next_month = any_day.replace(day=28) + datetime.timedelta(days=4)
    # subtracting the number of the current day brings us back one month
    ret = next_month - datetime.timedelta(days=next_month.day)
    return ret.strftime('%d')


def get_date_range(start: Optional[Union[str, int]] = None,
                   end: Optional[Union[str, int]] = None) -> Tuple[str, str]:
    """Return a starting and ending date as two strings.

    Parameters
    ----------
    start : :class:`int` or :class:`str`, optional
        The start of the interval.
        Accepts dates formatted as YYYY, MM-YYYY or DD-MM-YYYY.
        Missing days and months default to ``"01"``.
        Defaults to the first of January of the current year if ``None``.

    end : :class:`str` or :class:`int`, optional
        The end of the interval.
        Accepts dates formatted as YYYY, MM-YYYY or DD-MM-YYYY.
        If a string without a day is provided then the day defaults to the last day
        of the respective month; a missing month defaults to the current month.
        An integer is interpreted as the first of January of that year
        (see :func:`_parse_date`).
        Defaults to the last day of the current month if ``None``.

    Returns
    -------
    :class:`tuple` [:class:`str`, :class:`str`]
        A tuple with the start and end date, formatted as strings.
        Dates are formatted as DD-MM-YYYY.

    """
    today = datetime.date.today()
    month = today.strftime('%m')
    year = today.strftime('%Y')
    last_day = _get_last_day_of_month(today)

    start_ = _parse_date(start, default_month='01', default_year=year)
    end_ = _parse_date(end, default_day=last_day, default_month=month, default_year=year)

    # **last_day** belongs to the *current* month; if the user supplied a day-less
    # string the month and/or year can differ, which may yield a non-existent date
    # (e.g. "31-02-2019") or cut the month short. Recompute the day for the actual month.
    if isinstance(end, str) and end.count('-') < 2:
        _, mm, yyyy = end_.split('-')
        try:
            first_of_month = datetime.date(int(yyyy), int(mm), 1)
        except ValueError:
            pass  # Not a valid month/year; leave the string untouched
        else:
            end_ = f'{_get_last_day_of_month(first_of_month)}-{mm}-{yyyy}'

    return start_, end_


def construct_filename(prefix: str, suffix: Optional[str] = '.csv') -> str:
    """Construct a filename containing the current date.

    Examples
    --------
    .. code:: python

        >>> filename = construct_filename('my_file', '.txt')
        >>> print(filename)
        my_file_31_May_2019.txt

    Parameters
    ----------
    prefix : :class:`str`
        A prefix for the to-be returned filename.
        The current date will be appended to this prefix.

    suffix : :class:`str`, optional
        An optional suffix of the to-be returned filename.
        No suffix will be attached if ``None`` (or an empty string).

    Returns
    -------
    :class:`str`
        A filename consisting of **prefix**, the current date and **suffix**.

    """
    # The date is formatted as "_DD_Mon_YYYY", e.g. "_31_May_2019"
    date_stamp = datetime.date.today().strftime('_%d_%b_%Y')
    return prefix + date_stamp + (suffix or '')


[docs]def _get_datetimeindex(start: str, end: str) -> pd.DatetimeIndex:
    """Create a Pandas DatetimeIndex from a start and end date.

    Parameters
    ----------
    start : :class:`str`
        The start of the interval.
        Accepts dates formatted as DD-MM-YYYY.

    end : :class:`str`
        The end of the interval.
        Accepts dates formatted as DD-MM-YYYY.

    Returns
    -------
    :class:`pandas.DatetimeIndex`
        A DatetimeIndex starting from **sy** and ending on **ey**.

    """
    _, mm, yyyy = start.split('-')
    start_ = f'{yyyy}-{mm}-01'

    _, mm, yyyy = end.split('-')
    end_ = f'{yyyy}-{mm}-01'

    return pd.date_range(start_, end_, freq=pd.offsets.MonthBegin(), name='Month')


[docs]def _parse_date(input_date: Union[str, int, None],
                default_day: str = '01',
                default_month: str = '01',
                default_year: Optional[str] = None) -> str:
    """Parse any dates supplied to :func:`.get_date_range`.

    Parameters
    ----------
    input_date : :class:`str`, :class:`int` or ``None``
        The to-be parsed date.
        Allowed types and values are:

        * ``None``: Defaults to the first day of the current year and month.
        * :class:`int`: A year (*e.g.* ``2019``).
        * :class:`str`: A date in YYYY, MM-YYYY or DD-MM-YYYY format (*e.g.* ``"22-10-2018"``).

    default_month : :class:`str`
        The default month if a month is not provided in **input_date**.
        Expects a month in MM format.

    default_year : :class:`str`, optional
        Optional: The default year if a year is not provided in **input_date**.
        Expects a year in YYYY format.
        Defaults to the current year if ``None``.

    Returns
    -------
    :class:`str`
        A string, constructed from **input_date**, representing a date in DD-MM-YYYY format.

    Raises
    ------
    ValueError
        Raised if **input_date** is provided as string and contains more than 2 dashes.

    TypeError
        Raised if **input_date** is neither ``None``, a string nor an integer.

    """
    if default_year is None:
        default_year = datetime.date.today().strftime('%Y')

    if input_date is None:
        return f'{default_day}-{default_month}-{default_year}'
    elif isinstance(input_date, int):
        return f'01-01-{input_date}'
    elif isinstance(input_date, str):
        dash_count = input_date.count('-')
        if dash_count == 0:
            return f'{default_day}-{default_month}-{input_date}'
        elif dash_count == 1:
            return f'{default_day}-{input_date}'
        elif dash_count == 2:
            return input_date
        else:
            raise ValueError(f"'input_date': '{input_date}'")

    type_name = input_date.__class__.__name__
    raise TypeError(f"The 'input_data' parameter is of invalid type: '{type_name}'")


def _get_total_sbu_requested(df: pd.DataFrame) -> float:
    """Return the total number of requested SBUs.

    The last row of **df** is skipped and each project is only counted once:
    if multiple rows share the same project then only the first one contributes to the total.

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        A DataFrame containing the ``SBU_REQUESTED`` and ``PROJECT`` columns.
        Its last row is expected to be the ``"sum"`` row created by :func:`get_sbu`.

    Returns
    -------
    :class:`float`
        The requested SBUs summed over all unique projects.

    """
    # Drop the last row (the "sum" row when called from get_sbu())
    slice_ = df[[SBU_REQUESTED, PROJECT]].iloc[:-1]

    # Users of the same project share a single SBU budget; keep one row per project
    slice_ = slice_.set_index(PROJECT, inplace=False)
    slice_ = slice_.loc[~slice_.index.duplicated(keep='first')]
    return slice_[SBU_REQUESTED].sum()