Source code for leaspy.io.data.visit_dataframe_data_reader

from typing import Optional

import numpy as np
import pandas as pd

from leaspy.exceptions import LeaspyDataInputError

from .abstract_dataframe_data_reader import AbstractDataframeDataReader
from .individual_data import IndividualData

__all__ = ["VisitDataframeDataReader"]


class VisitDataframeDataReader(AbstractDataframeDataReader):
    """
    Methods to convert :class:`pandas.DataFrame` to `Leaspy`-compliant data
    containers for longitudinal data only.

    Raises
    ------
    :exc:`.LeaspyDataInputError`
    """

    def __init__(self):
        super().__init__()

    @property
    def dimension(self) -> Optional[int]:
        """
        Number of longitudinal outcomes in dataset.

        Returns
        -------
        : :obj:`int` or None
            Number of longitudinal outcomes in dataset, or ``None`` as long as
            ``long_outcome_names`` is ``None``.
        """
        if self.long_outcome_names is None:
            return None
        return len(self.long_outcome_names)

    @classmethod
    def _check_TIME(cls, s: pd.Series) -> None:
        """
        Check requirements on patient's visits indexing: only numeric values
        are tolerated, and neither missing nor infinite values are allowed.

        The input series is left untouched by this check.

        Parameters
        ----------
        s: pd.Series
            Pandas series that contains the time at visits of each patient.
            It must be groupable by ``"ID"`` (e.g. an index level named
            ``"ID"``), which is used to report the faulty individuals.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            - If the series is not of a numeric type
            - If the series contains any nan or +/- inf
        """
        if not cls._check_numeric_type(s):
            raise LeaspyDataInputError(
                f"The `TIME` column should contain numeric values (not {s.dtype})."
            )

        # Treat +/- inf like nan, on a new series so that the caller's data is
        # never modified by a mere validation step.
        is_bad_timepoint = s.replace([np.inf, -np.inf], np.nan).isna()
        if is_bad_timepoint.any():
            has_bad_timepoint = is_bad_timepoint.groupby("ID").any()
            individuals_with_at_least_1_bad_tpt = has_bad_timepoint[
                has_bad_timepoint
            ].index.tolist()
            raise LeaspyDataInputError(
                "The `TIME` column should NOT contain any nan nor inf, "
                f"please double check these individuals:\n{individuals_with_at_least_1_bad_tpt}."
            )

    def _check_headers(self, columns: list[str]) -> None:
        """
        Check mandatory dataframe headers.

        Parameters
        ----------
        columns: List[str]
            Names of the columns headers of the dataframe that contains
            patients information

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the ``"ID"`` and/or ``"TIME"`` columns are missing.
        """
        missing_mandatory_columns = [
            name for name in ("ID", "TIME") if name not in columns
        ]
        if missing_mandatory_columns:
            raise LeaspyDataInputError(
                f"Your dataframe must have {missing_mandatory_columns} columns"
            )

    def _set_index(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Set the index suited for the type of information contained in the
        dataframe.

        The dataframe is modified in place (rounded ``TIME`` column, new
        ``["ID", "TIME"]`` index) and the same object is returned.

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe with patient information

        Returns
        -------
        df: pd.DataFrame
            Dataframe with the right index

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the ``TIME`` column is not numeric or contains nan / inf.
        """
        # Check visit times, indexed by individual so that faulty IDs can be reported
        self._check_TIME(df.set_index("ID")["TIME"])

        # Round times to avoid missing duplicates due to rounding errors
        df["TIME"] = df["TIME"].round(self.time_rounding_digits)

        # Set the (ID, TIME) index; NOTE(review): uniqueness of the index is
        # not verified in this method - presumably checked by the caller.
        df.set_index(["ID", "TIME"], inplace=True)

        return df

    def _clean_dataframe(
        self, df: pd.DataFrame, *, drop_full_nan: bool, warn_empty_column: bool
    ) -> pd.DataFrame:
        """
        Validate the dataframe that contains repeated measures information for
        each visit, and record its number of visits and its outcome names.

        Sets ``self.n_visits`` (number of rows) and
        ``self.long_outcome_names`` (list of column names).

        Parameters
        ----------
        df: pd.DataFrame
            Dataframe with patient information.
        drop_full_nan: bool
            Whether rows full of nan should be dropped.
            Not read by this implementation.
            NOTE(review): presumably applied by the caller - confirm.
        warn_empty_column: bool
            Whether a warning should be raised for columns full of nan.
            Not read by this implementation.
            NOTE(review): presumably applied by the caller - confirm.

        Returns
        -------
        df: pd.DataFrame
            The input dataframe, unchanged.

        Raises
        ------
        :exc:`.LeaspyDataInputError` :
            - If the :df:`pd.DataFrame` is empty in terms of rows
            - If the :df:`pd.DataFrame` is empty in terms of columns
        """
        self.n_visits = len(df)
        if self.n_visits == 0:
            raise LeaspyDataInputError(
                "Dataframe should have at least 1 row (not full of nans)..."
            )

        self.long_outcome_names = df.columns.tolist()
        if self.dimension < 1:
            raise LeaspyDataInputError("Dataframe should have at least 1 feature...")

        return df

    def _load_individuals_data(
        self, subj: IndividualData, df_subj: pd.DataFrame
    ) -> None:
        """
        Convert information stored in a dataframe to information stored into
        IndividualData.

        Parameters
        ----------
        subj: IndividualData
            One patient with her/his information, potentially empty
        df_subj: pd.DataFrame
            One patient with her/his information; its index must have a
            ``"TIME"`` level and it must contain the longitudinal outcome
            columns.
        """
        timepoints = df_subj.index.get_level_values("TIME").to_list()
        observations = df_subj[self.long_outcome_names].values.tolist()
        subj.add_observations(timepoints=timepoints, observations=observations)