Source code for leaspy.io.data.covariate_dataframe_data_reader

import warnings
from typing import Optional

import numpy as np
import pandas as pd

from leaspy.exceptions import LeaspyDataInputError
from leaspy.utils.typing import FeatureType

from .abstract_dataframe_data_reader import AbstractDataframeDataReader
from .individual_data import IndividualData
from .visit_dataframe_data_reader import VisitDataframeDataReader

__all__ = ["CovariateDataframeDataReader"]


[docs] class CovariateDataframeDataReader(AbstractDataframeDataReader): """ Methods to convert :class:`pandas.DataFrame` to `Leaspy`-compliant data containers for longitudinal data with covariates. Parameters ---------- covariate_names: List[str] Names of the columns in dataframe that contains the covariates Raises ------ :exc:`.LeaspyDataInputError` """ def __init__( self, *, covariate_names: list[str], ): super().__init__() if not covariate_names: raise LeaspyDataInputError("You must prrovide at least one covariate name.") self.covariate_names = covariate_names self.visit_reader = VisitDataframeDataReader() @property def long_outcome_names(self) -> list[FeatureType]: """Name of the longitudinal outcomes in dataset""" return self.visit_reader.long_outcome_names @property def n_visits(self) -> int: """Number of visit in the dataset""" return self.visit_reader.n_visits ###################################################### # COVARIATE METHODS ###################################################### def _check_headers(self, columns: list[str]) -> None: """ Check mendatory dataframe headers Parameters ---------- columns: List[str] Names of the columns headers of the dataframe that contains patients information """ self.visit_reader._check_headers(columns) def _set_index(self, df: pd.DataFrame) -> pd.DataFrame: """ Set the index suited for the type of information contained in the dataframe Parameters ---------- df: pd.DataFrame Dataframe with patient information Returns ------- df: pd.DataFrame Dataframe with the right index """ return self.visit_reader._set_index(df) def _clean_dataframe_covariates( self, df: pd.DataFrame, *, drop_full_nan: bool, warn_empty_column: bool ) -> pd.DataFrame: """ Clean the dataframe that contains patient information Parameters ---------- df: pd.DataFrame Dataframe with patient information drop_full_nan: bool If set to True, raw full of nan are dropped warn_empty_column: bool If set to True, a warning is raise for columns full of nan Returns ------- df: pd.DataFrame Dataframe with clean information """ df_covariate = df.copy(deep=True) if not (df_covariate.columns == self.covariate_names).all(): raise LeaspyDataInputError( f"The covariate column names {df_covariate.columns} are " f"different from the provided covariate names {self.covariate_names}." ) for covariate in self.covariate_names: if df_covariate[covariate].isna().any(): raise LeaspyDataInputError( f"Covariate '{covariate}' contains missing values (NaN)." "Please ensure that values are provided for each visit." ) for covariate in self.covariate_names: if not np.array_equal( df_covariate[covariate], df_covariate[covariate].astype(int) ): raise LeaspyDataInputError( f"Covariate '{covariate}' must contain only integer values." ) df_covariate[covariate] = df_covariate[covariate].astype(int) # Assert one unique covariate per patient and group to drop duplicates if ( not (df_covariate.groupby("ID").nunique()[self.covariate_names].eq(1)) .all() .all() ): raise LeaspyDataInputError( "There must be only an unique covariate value per patient." ) df_covariate = df_covariate.groupby("ID").first() if len(df_covariate) == 0: raise LeaspyDataInputError("Dataframe should have at least 1 covariate") # Assert at least 2 different values per covariate for covariate in self.covariate_names: if (n_value := df_covariate[covariate].nunique(dropna=False)) < 2: raise LeaspyDataInputError( f"The covariate '{covariate}' has only {n_value} unique value." "Each covariate must have at least two distinct values across patients" ) return df_covariate def _clean_dataframe( self, df: pd.DataFrame, *, drop_full_nan: bool, warn_empty_column: bool ) -> pd.DataFrame: """ Clean the dataframe that contains patient information Parameters ---------- df: pd.DataFrame Dataframe with patient information drop_full_nan: bool If set to True, raw full of nan are dropped warn_empty_column: bool If set to True, a warning is raise for columns full of nan Returns ------- df: pd.DataFrame Dataframe with clean information """ df_visit = self.visit_reader._clean_dataframe( df.drop(columns=self.covariate_names), drop_full_nan=drop_full_nan, warn_empty_column=warn_empty_column, ) df_covariate = self._clean_dataframe_covariates( df.reset_index() .drop(self.long_outcome_names + ["TIME"], axis=1) .set_index("ID"), drop_full_nan=drop_full_nan, warn_empty_column=warn_empty_column, ) if ( not df_covariate.groupby("ID") .first() .index.equals(df_visit.groupby("ID").first().index) ): raise LeaspyDataInputError( "All patients must have at least one visit and one covariate" ) df = df_visit.join(df_covariate) return df def _load_individuals_data_covariates( self, subj: IndividualData, df_subj: pd.DataFrame ) -> None: """ Convert information stored in a dataframe to information stored into IndividualData Parameters ---------- subj: IndividualData One patient with her/his information, potentially empty df_subj: pd.DataFrame One patient with her/his information """ subj.add_covariates( covariates=df_subj[self.covariate_names].iloc[0].values.tolist() ) def _load_individuals_data( self, subj: IndividualData, df_subj: pd.DataFrame ) -> None: """ Convert information stored in a dataframe to information stored into IndividualData Parameters ---------- subj: IndividualData One patient with her/his information, potentially empty df_subj: pd.DataFrame One patient with her/his information """ self.visit_reader._load_individuals_data(subj, df_subj) self._load_individuals_data_covariates(subj, df_subj)