Source code for leaspy.io.data.joint_dataframe_data_reader
import warnings
from typing import Optional
import pandas as pd
from leaspy.exceptions import LeaspyDataInputError
from leaspy.utils.typing import FeatureType
from .abstract_dataframe_data_reader import AbstractDataframeDataReader
from .event_dataframe_data_reader import EventDataframeDataReader
from .individual_data import IndividualData
from .visit_dataframe_data_reader import VisitDataframeDataReader
__all__ = ["JointDataframeDataReader"]
[docs]
class JointDataframeDataReader(AbstractDataframeDataReader):
"""
Methods to convert :class:`pandas.DataFrame` to `Leaspy`-compliant data containers for event data and longitudinal data.
Parameters
----------
event_time_name: str
Name of the columns in dataframe that contains the time of event
event_bool_name: str
Name of the columns in dataframe that contains if the event is censored of not
Raises
------
:exc:`.LeaspyDataInputError`
"""
tol_diff = 0.001
def __init__(
self,
*,
event_time_name: str = "EVENT_TIME",
event_bool_name: str = "EVENT_BOOL",
nb_events: Optional[int] = None,
):
super().__init__()
self.visit_reader = VisitDataframeDataReader()
self.event_reader = EventDataframeDataReader(
event_time_name=event_time_name,
event_bool_name=event_bool_name,
nb_events=nb_events,
)
######################################################
# JOINT METHODS
######################################################
@property
def event_time_name(self) -> str:
"""Name of the event time column in dataset"""
return self.event_reader.event_time_name
@property
def event_bool_name(self) -> str:
"""Name of the event bool column in dataset"""
return self.event_reader.event_bool_name
@property
def dimension(self) -> Optional[int]:
"""Number of longitudinal outcomes in dataset."""
return self.visit_reader.dimension
@property
def long_outcome_names(self) -> list[FeatureType]:
"""Name of the longitudinal outcomes in dataset"""
return self.visit_reader.long_outcome_names
@property
def n_visits(self) -> int:
"""Number of visit in the dataset"""
return self.visit_reader.n_visits
def _check_headers(self, columns: list[str]) -> None:
"""
Check mendatory dataframe headers
Parameters
----------
columns: List[str]
Names of the columns headers of the dataframe that contains patients information
"""
self.visit_reader._check_headers(columns)
self.event_reader._check_headers(columns)
def _set_index(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Set the index suited for the type of information contained in the dataframe
Parameters
----------
df: pd.DataFrame
Dataframe with patient information
Returns
-------
df: pd.DataFrame
Dataframe with the right index
"""
return self.visit_reader._set_index(df)
def _clean_dataframe(
self, df: pd.DataFrame, *, drop_full_nan: bool, warn_empty_column: bool
) -> pd.DataFrame:
"""
Clean the dataframe that contains patient information which are repeated measures and events
Parameters
----------
df: pd.DataFrame
Dataframe with patient information.
drop_full_nan: bool
If set to True, raw full of nan are dropped.
warn_empty_column: bool
If set to True, a warning is raise for columns full of nan.
Returns
-------
df: pd.DataFrame
Dataframe with clean information.
"""
# Check visits
df_visit = self.visit_reader._clean_dataframe(
df.drop([self.event_time_name, self.event_reader.event_bool_name], axis=1),
drop_full_nan=drop_full_nan,
warn_empty_column=warn_empty_column,
)
# Check events
df_event = self.event_reader._clean_dataframe(
df.reset_index()
.drop(self.long_outcome_names + ["TIME"], axis=1)
.set_index("ID"),
drop_full_nan=drop_full_nan,
warn_empty_column=warn_empty_column,
)
# [SPECIFIC] prepare_clean_output
if (
not df_event.groupby("ID")
.first()
.index.equals(df_visit.groupby("ID").first().index)
):
raise LeaspyDataInputError(
"All patients must have at least one visit and one event"
)
df = df_visit.join(df_event)
# Additional crossed check
df_test = df.reset_index().groupby("ID").max()
if not (
df_test[self.event_time_name] - df_test["TIME"] >= -self.tol_diff
).all():
df_before = df_test[
~(df_test[self.event_time_name] - df_test["TIME"] >= -self.tol_diff)
]
if df_before[self.event_bool_name].sum() == 0:
warnings.warn(
"You have event censored before the last available visits, you should be in a prediction set-up"
)
else:
raise LeaspyDataInputError(
f"Event should happen after or at the last visit "
f"for {df_before.index.tolist()} patients"
)
return df
def _load_individuals_data(
self, subj: IndividualData, df_subj: pd.DataFrame
) -> None:
"""
Convert information stored in a dataframe to information stored into IndividualData
Parameters
----------
subj: IndividualData
One patient with her/his information, potentially empty
df_subj: pd.DataFrame
One patient with her/his information
"""
self.visit_reader._load_individuals_data(subj, df_subj)
self.event_reader._load_individuals_data(subj, df_subj)