# Source code for leaspy.io.data.event_dataframe_data_reader
import warnings
from typing import Optional
import numpy as np
import pandas as pd
from leaspy.exceptions import LeaspyDataInputError
from .abstract_dataframe_data_reader import AbstractDataframeDataReader
from .individual_data import IndividualData
__all__ = ["EventDataframeDataReader"]
# [docs]
class EventDataframeDataReader(AbstractDataframeDataReader):
    """
    Methods to convert :class:`pandas.DataFrame` to `Leaspy`-compliant data containers for event data only.

    Parameters
    ----------
    event_time_name : str
        Name of the column in the dataframe that contains the time of event.
    event_bool_name : str
        Name of the column in the dataframe that contains the event indicator:
        0 means the event is censored, any other value ``k`` is used as the
        1-based index of the event that occurred.
    nb_events : int, optional
        Number of events expected in the data. When left to None (or 0), it is
        inferred while cleaning the dataframe as the maximum value found in the
        event indicator column.

    Raises
    ------
    :exc:`.LeaspyDataInputError`
    """

    def __init__(
        self,
        *,
        event_time_name: str = "EVENT_TIME",
        event_bool_name: str = "EVENT_BOOL",
        nb_events: Optional[int] = None,
    ):
        super().__init__()
        self.event_time_name = event_time_name
        self.event_bool_name = event_bool_name
        self.nb_events = nb_events

    @staticmethod
    def _check_headers(columns: list[str]) -> None:
        """
        Check mandatory dataframe headers.

        Parameters
        ----------
        columns : list[str]
            Names of the column headers of the dataframe that contains patients information.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the "ID" column is missing.
        """
        missing_mandatory_columns = ["ID"] if "ID" not in columns else []
        if missing_mandatory_columns:
            raise LeaspyDataInputError(
                f"Your dataframe must have {missing_mandatory_columns} columns"
            )

    def _set_index(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Set the index suited for the type of information contained in the dataframe.

        Here, events are only indexed by patient ID.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe with patient information.

        Returns
        -------
        pd.DataFrame
            Dataframe indexed by "ID".
        """
        return df.set_index(["ID"])

    def _clean_dataframe(
        self, df: pd.DataFrame, *, drop_full_nan: bool, warn_empty_column: bool
    ) -> pd.DataFrame:
        """
        Clean the dataframe that contains patient events information.

        The event times are rounded, both event columns are validated, and the
        dataframe is reduced to a single row per patient. As a side effect,
        ``self.nb_events`` is inferred from the data when it was not provided.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe with patient information. Its columns must be exactly
            ``[event_time_name, event_bool_name]``, in this order.
        drop_full_nan : bool
            Not used by this reader (never read in this method).
        warn_empty_column : bool
            Not used by this reader (never read in this method).

        Returns
        -------
        pd.DataFrame
            Dataframe with clean information (one row per patient).

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the columns are not the expected ones, if an event time is not
            strictly positive, if the event indicator is not integer-valued,
            if a patient has several distinct events, if there is no data left,
            or if the number of events is inconsistent with ``self.nb_events``.
        """
        # Work on a deep copy so that the input dataframe is left untouched.
        df_event = df.copy(deep=True)

        # The event columns must be the only ones available. This is an explicit
        # check rather than an `assert`, which would be stripped under `python -O`
        # and would fail with an unrelated pandas error on a column-count mismatch.
        expected_columns = [self.event_time_name, self.event_bool_name]
        actual_columns = list(df_event.columns)
        if actual_columns != expected_columns:
            raise LeaspyDataInputError(
                f"Your dataframe must contain exactly the columns {expected_columns} "
                f"(in this order), but it contains {actual_columns}"
            )

        # Round event times to avoid missing duplicates due to rounding errors.
        # NOTE(review): `time_rounding_digits` is not defined in this class;
        # presumably it is provided by the parent reader - confirm.
        df_event[self.event_time_name] = df_event[self.event_time_name].round(
            self.time_rounding_digits
        )

        # Check event time data good format (NaN times also fail this comparison).
        if not (df_event[self.event_time_name] > 0).all():
            raise LeaspyDataInputError("Events must be above 0")

        # Check event bool good format: casting to int must not change any value.
        event_indicator = df_event[self.event_bool_name]
        event_indicator_as_int = event_indicator.astype(int)
        if not np.array_equal(event_indicator, event_indicator_as_int):
            raise LeaspyDataInputError(
                "Events must be stored in type int, with 0 equal to censored event"
            )
        df_event[self.event_bool_name] = event_indicator_as_int

        # Ensure one unique event per patient, then collapse duplicated rows.
        events_by_patient = df_event.groupby("ID")
        has_single_value = events_by_patient.nunique()[expected_columns].eq(1)
        if not has_single_value.all().all():
            raise LeaspyDataInputError(
                "There must be only an unique event_time and an unique event_bool per patient"
            )
        df_event = events_by_patient.first()

        # An empty dataframe cannot be used: raise an error.
        if len(df_event) == 0:
            raise LeaspyDataInputError(
                "Dataframe should have at least 1 feature or an event"
            )

        # Cast to a builtin int so that `self.nb_events` matches its annotation
        # instead of holding a numpy scalar.
        nb_events_in_data = int(df_event[self.event_bool_name].max())
        if not self.nb_events:
            # Number of events not provided: infer it from the data.
            if nb_events_in_data == 0:
                raise LeaspyDataInputError(
                    "There are no event, please check your data or put the number of events"
                )
            self.nb_events = nb_events_in_data
        elif self.nb_events != nb_events_in_data:
            if nb_events_in_data == 0:
                # Fully censored dataset: tolerated, but worth a warning.
                warnings.warn(
                    "There were no event in the dataset but you try to predict one"
                )
            else:
                raise LeaspyDataInputError(
                    "The number of events you provided is different from the number of events available in the data"
                )
        return df_event

    def _load_individuals_data(
        self, subj: IndividualData, df_subj: pd.DataFrame
    ) -> None:
        """
        Convert information stored in a dataframe to information stored into IndividualData.

        The event time of the patient is repeated once per event, and the
        boolean list flags which event (if any) occurred.

        Parameters
        ----------
        subj : IndividualData
            One patient with her/his information, potentially empty.
        df_subj : pd.DataFrame
            One patient with her/his information.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the number of events is unknown or lower than 1, or if the event
            indicator of the patient is not between 0 and the number of events.
        """
        # `nb_events` is None until it is provided by the user or inferred from the data.
        if self.nb_events is None or self.nb_events < 1:
            raise LeaspyDataInputError(
                "The number of event should be equal or greater than 1"
            )

        event_time = df_subj[self.event_time_name].unique()[0]
        event_subj = df_subj[self.event_bool_name].unique()[0]

        # Guard the indexing below: a negative indicator would silently wrap
        # around the list and a too large one would raise a bare IndexError.
        if not 0 <= event_subj <= self.nb_events:
            raise LeaspyDataInputError(
                f"The event indicator must be between 0 and {self.nb_events}, "
                f"but got {event_subj}"
            )

        time_at_event = [event_time] * self.nb_events
        bool_at_event = [False] * self.nb_events
        if event_subj != 0:
            # Event indicators are 1-based, 0 being reserved for censoring.
            bool_at_event[event_subj - 1] = True
        subj.add_event(time_at_event, bool_at_event)