# Source code for leaspy.io.data.covariate_dataframe_data_reader

import warnings
from typing import Optional

import numpy as np
import pandas as pd

from leaspy.exceptions import LeaspyDataInputError
from leaspy.utils.typing import FeatureType

from .abstract_dataframe_data_reader import AbstractDataframeDataReader
from .individual_data import IndividualData
from .visit_dataframe_data_reader import VisitDataframeDataReader

__all__ = ["CovariateDataframeDataReader"]



# [docs]
class CovariateDataframeDataReader(AbstractDataframeDataReader):
    """
    Methods to convert :class:`pandas.DataFrame` to `Leaspy`-compliant data containers for longitudinal data with covariates.

    Everything related to visits (headers, index, cleaning, loading) is delegated to an
    internal :class:`.VisitDataframeDataReader`; this class adds the validation and the
    loading of the covariate columns on top of it.

    Parameters
    ----------
    covariate_names: List[str]
        Names of the columns in dataframe that contain the covariates

    Raises
    ------
    :exc:`.LeaspyDataInputError`
        If no covariate name is provided.
    """

    def __init__(
        self,
        *,
        covariate_names: list[str],
    ):
        super().__init__()
        if not covariate_names:
            raise LeaspyDataInputError("You must provide at least one covariate name.")
        self.covariate_names = covariate_names
        # All visit-related logic is delegated to this reader.
        self.visit_reader = VisitDataframeDataReader()

    @property
    def long_outcome_names(self) -> list[FeatureType]:
        """Name of the longitudinal outcomes in dataset (as exposed by the visit reader)"""
        return self.visit_reader.long_outcome_names

    @property
    def n_visits(self) -> int:
        """Number of visits in the dataset (as exposed by the visit reader)"""
        return self.visit_reader.n_visits

    ######################################################
    #               COVARIATE METHODS
    ######################################################

    def _check_headers(self, columns: list[str]) -> None:
        """
        Check mandatory dataframe headers

        The check is delegated to the visit reader.

        Parameters
        ----------
        columns: List[str]
            Names of the columns headers of the dataframe that contains patients information
        """
        self.visit_reader._check_headers(columns)

    def _set_index(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Set the index suited for the type of information contained in the dataframe

        The indexing is delegated to the visit reader.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe with patient information

        Returns
        -------
        df: pd.DataFrame
            Dataframe with the right index
        """

        return self.visit_reader._set_index(df)

    def _clean_dataframe_covariates(
        self, df: pd.DataFrame, *, drop_full_nan: bool, warn_empty_column: bool
    ) -> pd.DataFrame:
        """
        Validate the covariate columns and reduce them to one row per patient

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe indexed by "ID" whose columns are exactly the covariates
            (same names and same order as ``covariate_names``)

        drop_full_nan: bool
            Not used by this method (kept so that the signature mirrors `_clean_dataframe`)

        warn_empty_column: bool
            Not used by this method (kept so that the signature mirrors `_clean_dataframe`)


        Returns
        -------
        df: pd.DataFrame
            Dataframe with one row per "ID" and the covariates cast to integers

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the columns differ from the expected covariate names, if a covariate contains
            missing or non-integer values, if a patient has several values for a covariate,
            if the dataframe is empty, or if a covariate has less than two distinct values
            across patients.
        """

        df_covariate = df.copy(deep=True)

        # Compare plain lists: the element-wise `Index == list` comparison can raise a bare
        # ValueError when the lengths differ, instead of the intended input error.
        if list(df_covariate.columns) != list(self.covariate_names):
            raise LeaspyDataInputError(
                f"The covariate column names {df_covariate.columns} are "
                f"different from the provided covariate names {self.covariate_names}."
            )

        # Missing values are checked for every covariate before any integer cast,
        # since NaN cannot be cast to int.
        for covariate in self.covariate_names:
            if df_covariate[covariate].isna().any():
                raise LeaspyDataInputError(
                    f"Covariate '{covariate}' contains missing values (NaN). "
                    "Please ensure that values are provided for each visit."
                )

        for covariate in self.covariate_names:
            values = df_covariate[covariate]
            try:
                int_values = values.astype(int)
            except (ValueError, TypeError, OverflowError) as err:
                # Values that cannot be cast at all (e.g. arbitrary strings, infinities)
                # are reported like any other non-integer value.
                raise LeaspyDataInputError(
                    f"Covariate '{covariate}' must contain only integer values."
                ) from err
            # The cast must be lossless (e.g. 1.0 is accepted, 1.5 is rejected).
            if not np.array_equal(values, int_values):
                raise LeaspyDataInputError(
                    f"Covariate '{covariate}' must contain only integer values."
                )
            df_covariate[covariate] = int_values

        # Assert one unique covariate per patient and group to drop duplicates
        n_values_per_patient = df_covariate.groupby("ID").nunique()[
            self.covariate_names
        ]
        if not n_values_per_patient.eq(1).all().all():
            raise LeaspyDataInputError(
                "There must be only an unique covariate value per patient."
            )
        df_covariate = df_covariate.groupby("ID").first()

        if len(df_covariate) == 0:
            raise LeaspyDataInputError("Dataframe should have at least 1 covariate")

        # Assert at least 2 different values per covariate
        for covariate in self.covariate_names:
            if (n_value := df_covariate[covariate].nunique(dropna=False)) < 2:
                raise LeaspyDataInputError(
                    f"The covariate '{covariate}' has only {n_value} unique value. "
                    "Each covariate must have at least two distinct values across patients"
                )

        return df_covariate

    def _clean_dataframe(
        self, df: pd.DataFrame, *, drop_full_nan: bool, warn_empty_column: bool
    ) -> pd.DataFrame:
        """
        Clean the dataframe that contains patient information

        The visit part (all columns but the covariates) is cleaned by the visit reader, the
        covariate part by `_clean_dataframe_covariates`, and both results are joined.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe with patient information

        drop_full_nan: bool
            Forwarded to the visit reader cleaning
            (presumably: if set to True, rows full of nan are dropped)

        warn_empty_column: bool
            Forwarded to the visit reader cleaning
            (presumably: if set to True, a warning is raised for columns full of nan)


        Returns
        -------
        df: pd.DataFrame
            Cleaned visit dataframe joined with the per-patient covariates

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If a covariate column is missing, if the covariates are invalid, or if the
            patients found in the visits and in the covariates do not match.
        """

        # Fail with an explicit input error rather than a KeyError from `DataFrame.drop`.
        missing_covariates = [
            covariate
            for covariate in self.covariate_names
            if covariate not in df.columns
        ]
        if missing_covariates:
            raise LeaspyDataInputError(
                f"The covariate columns {missing_covariates} are missing from the dataframe."
            )

        df_visit = self.visit_reader._clean_dataframe(
            df.drop(columns=self.covariate_names),
            drop_full_nan=drop_full_nan,
            warn_empty_column=warn_empty_column,
        )

        # NOTE: `long_outcome_names` is read from the visit reader only after its cleaning
        # step above (presumably it is populated there); keep this order.
        df_covariate = self._clean_dataframe_covariates(
            df.reset_index()
            .drop(self.long_outcome_names + ["TIME"], axis=1)
            .set_index("ID"),
            drop_full_nan=drop_full_nan,
            warn_empty_column=warn_empty_column,
        )

        # Both parts must describe exactly the same set of patients.
        if (
            not df_covariate.groupby("ID")
            .first()
            .index.equals(df_visit.groupby("ID").first().index)
        ):
            raise LeaspyDataInputError(
                "All patients must have at least one visit and one covariate"
            )

        df = df_visit.join(df_covariate)

        return df

    def _load_individuals_data_covariates(
        self, subj: IndividualData, df_subj: pd.DataFrame
    ) -> None:
        """
        Add the covariates stored in a dataframe to an IndividualData

        Only the first row of ``df_subj`` is read for the covariate values.

        Parameters
        ----------
        subj: IndividualData
            One patient with her/his information, potentially empty

        df_subj: pd.DataFrame
            One patient with her/his information
        """
        subj.add_covariates(
            covariates=df_subj[self.covariate_names].iloc[0].values.tolist()
        )

    def _load_individuals_data(
        self, subj: IndividualData, df_subj: pd.DataFrame
    ) -> None:
        """
        Convert information stored in a dataframe to information stored into IndividualData

        Visit information is loaded first (through the visit reader), then the covariates.

        Parameters
        ----------
        subj: IndividualData
            One patient with her/his information, potentially empty

        df_subj: pd.DataFrame
            One patient with her/his information
        """
        self.visit_reader._load_individuals_data(subj, df_subj)
        self._load_individuals_data_covariates(subj, df_subj)