# Source code for leaspy.io.data.abstract_dataframe_data_reader

import warnings
from abc import abstractmethod

import numpy as np
import pandas as pd

from leaspy.exceptions import LeaspyDataInputError
from leaspy.utils.typing import IDType

from .individual_data import IndividualData

__all__ = ["AbstractDataframeDataReader"]



class AbstractDataframeDataReader:
    """
    Methods to convert :class:`pandas.DataFrame` to `Leaspy`-compliant data containers.

    This base class implements the validation and cleaning steps shared by all
    dataframe readers (index cleaning, `ID` checks, numeric checks) and leaves the
    format-specific steps (headers check, index setting, dataframe cleaning and
    loading of individuals) to its subclasses.

    Raises
    ------
    :exc:`.LeaspyDataInputError`
        If the input dataframe does not fulfil the requirements checked by the reader.
    """

    # NOTE(review): not used in the methods of this class; presumably the number of
    # decimals that subclasses use when rounding `TIME` values -- confirm in subclasses.
    time_rounding_digits = 6

    def __init__(self):
        self.individuals: dict[IDType, IndividualData] = {}
        self.iter_to_idx: dict[int, IDType] = {}
        self.n_individuals: int = 0

    ######################################################
    #               COMMON METHODS
    ######################################################

    @staticmethod
    def _check_numeric_type(dtype) -> bool:
        """
        Check if the type of the pandas data is numeric or not

        Parameters
        ----------
        s: pandas.Series.dtype
            pandas type of the data

        Returns
        -------
        : :obj:`bool`
            True if the type is a numeric type
        """
        return pd.api.types.is_numeric_dtype(
            dtype
        ) and not pd.api.types.is_complex_dtype(dtype)

    @classmethod
    def _check_ID(cls, s: pd.Series) -> None:
        """
        Check requirements on subjects identifiers.

        Parameters
        ----------
        s: pd.Series
            Identifiers of the patients

        Raises
        ------
        :exc:`.LeaspyModelInputError` :
            - If the :s:`pd.Series` is not a string, integer or categories
            - If the :s:`pd.Series` contains Nan
            - If the :s:`pd.Series` is integer and contain negative values
            - If the :s:`pd.Series` is string and contain empty strings
        """
        # TODO? enforce strings? (for compatibility for downstream requirements especially in IndividualParameters)
        valid_dtypes = ["string", "integer", "categorical"]
        inferred_dtype = pd.api.types.infer_dtype(s)
        if inferred_dtype not in valid_dtypes:
            raise LeaspyDataInputError(
                "The `ID` column should identify individuals as string, integer or categories, "
                f"not {inferred_dtype} ({s.dtype})."
            )

        if s.isna().any():
            # NOTE: as soon as a np.nan or np.inf, inferred_dtype cannot be 'integer'
            # but custom pandas dtype can still contain pd.NA
            raise LeaspyDataInputError(
                f"The `ID` column should NOT contain any nan ({s.isna().sum()} found)."
            )

        if inferred_dtype == "integer":
            if (s < 0).any():
                raise LeaspyDataInputError(
                    "All `ID` should be >= 0 when subjects are identified as integers, "
                    "use string identifiers if you need more flexibility."
                )
        elif inferred_dtype == "string":
            if (s.str.len() == 0).any():
                raise LeaspyDataInputError(
                    "No `ID` should be empty when subjects are identified as strings."
                )

    def _clean_index(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Check requirements on subjects identifiers:
            - ID represents patient index,
            - TIME represents the "age" of the patient at visit for visit indexing.

        Parameters
        ----------
        df: pd.DataFrame
            The whole dataframe with patient information.

        Returns
        -------
        df: pd.DataFrame
            The whole dataframe with patient information with a clean, set index
        """

        df = df.copy(deep=True)

        # Check columns headers
        columns = df.columns.tolist()

        # Try to read the raw dataframe
        try:
            self._check_headers(columns)

        # If we do not find 'ID' and 'TIME' columns, check the Index
        except LeaspyDataInputError:
            df.reset_index(inplace=True)
            columns = df.columns.tolist()
            self._check_headers(columns)

        # Check patient ID common to every format
        self._check_ID(df["ID"])

        df = self._set_index(df)
        if not df.index.is_unique:
            # get lines number as well as ID & TIME for duplicates (original line numbers)
            df_dup = df[[]].reset_index().duplicated(keep=False)
            df_dup = df_dup[df_dup]
            raise LeaspyDataInputError(f"Some raw are duplicated:\n{df_dup}")

        return df

    def _clean_numeric_data(
        self, df: pd.DataFrame, drop_full_nan: bool, warn_empty_column: bool
    ) -> pd.DataFrame:
        """
        Dataframe with patient information should only contain numeric data. This method check that this is the
        case, clean nans and empty columns.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe with patient information.

        drop_full_nan: bool
            If set to True, raw full of nan are dropped.

        warn_empty_column: bool
            If set to True, a warning is raise for columns full of nan.

        Returns
        -------
        df: pd.DataFrame
            Dataframe with clean numeric information.
        """
        # Check and clean numerical values
        types_nok = {
            ft: dtype
            for ft, dtype in df.dtypes.items()
            if not self._check_numeric_type(dtype)
        }
        if types_nok:
            raise LeaspyDataInputError(
                f"All columns should be of numerical type, which is not the case for {types_nok}."
            )

        try:
            # it is needed so to always use numpy.nan as nans even if pd.NA were used originally
            df = df.astype(float)
        except Exception as e:
            raise LeaspyDataInputError(
                "Cannot safely convert dataframe to float type."
            ) from e

        # warn if some columns are full of nans
        full_of_nans = df.isna().all(axis=0)
        full_of_nans = full_of_nans[full_of_nans].index.tolist()
        if warn_empty_column and full_of_nans:
            warnings.warn(f"These columns only contain nans: {full_of_nans}.")

        # check that no 'inf' are present in dataframe
        df_inf = np.isinf(df)  # numpy.nan are considered finite :)
        df_inf_rows_and_cols = (
            df.where(df_inf)
            .dropna(how="all", axis=0)
            .dropna(how="all", axis=1)
            .fillna("")
        )
        if len(df_inf_rows_and_cols) != 0:
            raise LeaspyDataInputError(
                f"Values may be nan but not infinite, double check your data:\n{df_inf_rows_and_cols}"
            )

        # Drop visits full of nans so to get a correct number of total visits
        if drop_full_nan:
            df = df.dropna(how="all")

        return df

    @abstractmethod
    def _check_headers(self, columns: list[str]) -> None:
        """
        Check mandatory dataframe headers.

        `_clean_index` calls this method on the raw column names and, when it raises
        :exc:`.LeaspyDataInputError`, once more after the index of the dataframe has
        been moved back into its columns.

        Parameters
        ----------
        columns: List[str]
            Names of the columns headers of the dataframe that contains patients information
        """

    @abstractmethod
    def _set_index(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Set the index suited for the type of information contained in the dataframe.

        `_clean_index` calls this method once the headers and the `ID` column have
        been validated, and then requires the resulting index to be unique.

        NOTE(review): `read` later groups rows with ``groupby(level="ID")``, so the
        index is presumably expected to hold a level named `ID` -- confirm in subclasses.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe with patient information

        Returns
        -------
        df: pd.DataFrame
            Dataframe with the right index
        """

    @abstractmethod
    def _clean_dataframe(
        self, df: pd.DataFrame, *, drop_full_nan: bool, warn_empty_column: bool
    ) -> pd.DataFrame:
        """
        Clean the dataframe that contains patient information.

        This method depends on the data type that is analysed: repeated measures,
        events or both. `read` calls it after `_clean_index` and
        `_clean_numeric_data`, and forwards its own options as keyword arguments.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe with patient information.

        drop_full_nan: bool
            If set to True, rows full of nan are dropped.

        warn_empty_column: bool
            If set to True, a warning is raised for columns full of nan.


        Returns
        -------
        df: pd.DataFrame
            Dataframe with clean information.
        """

    @abstractmethod
    def _load_individuals_data(
        self, subj: IndividualData, df_subj: pd.DataFrame
    ) -> None:
        """
        Convert information stored in a dataframe to information stored into IndividualData.

        `read` calls this method once per `ID` group, with a freshly created
        :class:`IndividualData` and the rows of that group. Nothing is returned,
        so implementations are presumably expected to fill `subj` in place.

        Parameters
        ----------
        subj: IndividualData
            One patient with her/his information, potentially empty

        df_subj: pd.DataFrame
            One patient with her/his information
        """

    ######################################################
    #               MAIN METHOD
    ######################################################

    def read(
        self,
        df: pd.DataFrame,
        *,
        drop_full_nan: bool = True,
        sort_index: bool = False,
        warn_empty_column: bool = True,
    ) -> None:
        """
        The method that effectively reads the input dataframe (automatically called in __init__).

        Parameters
        ----------
        df : :class:`pandas.DataFrame`
            The dataframe to read.
        drop_full_nan : bool
            Should we drop rows full of nans? (except index)
        sort_index : bool
            Should we lexsort index?
            (Keep False as default so not to break many of the downstream tests that check order...)
        warn_empty_column : bool
            Should we warn when there are empty columns?
        """
        if not isinstance(df, pd.DataFrame):
            # TODO? accept series? (for univariate dataset, with index already set)
            raise LeaspyDataInputError(
                "Input should be a pandas.DataFrame not anything else."
            )

        df = df.copy(deep=True)  # No modification on the input dataframe !
        df = self._clean_index(df)
        df = self._clean_numeric_data(df, drop_full_nan, warn_empty_column)

        # Clean data
        df = self._clean_dataframe(
            df, drop_full_nan=drop_full_nan, warn_empty_column=warn_empty_column
        )

        # sort after duplicate checks and full of nans dropped
        if sort_index:
            df.sort_index(inplace=True)

        # Create individuals to store
        for idx_subj, df_subj in df.groupby(level="ID", sort=False):
            self.individuals[idx_subj] = IndividualData(idx_subj)
            self._load_individuals_data(self.individuals[idx_subj], df_subj)

            self.iter_to_idx[self.n_individuals] = idx_subj
            self.n_individuals += 1