Source code for leaspy.io.data.abstract_dataframe_data_reader

import warnings
from abc import abstractmethod

import numpy as np
import pandas as pd

from leaspy.exceptions import LeaspyDataInputError
from leaspy.utils.typing import IDType

from .individual_data import IndividualData

__all__ = ["AbstractDataframeDataReader"]


class AbstractDataframeDataReader:
    """
    Base reader converting a :class:`pandas.DataFrame` into `Leaspy`-compliant data containers.

    The generic validation steps (subject identifiers, numeric content, duplicated
    index entries) live here; the format-specific steps are delegated to the hooks
    :meth:`_check_headers`, :meth:`_set_index`, :meth:`_clean_dataframe` and
    :meth:`_load_individuals_data`.

    Attributes
    ----------
    individuals : dict[IDType, IndividualData]
        Individuals loaded so far, keyed by their identifier.
    iter_to_idx : dict[int, IDType]
        Loading rank of each individual (0, 1, 2, ...) mapped to its identifier.
    n_individuals : int
        Counter incremented once per individual loaded by :meth:`read`.

    Raises
    ------
    :exc:`.LeaspyDataInputError`
    """

    # NOTE(review): the class does not derive from `abc.ABC`, so the `@abstractmethod`
    # markers below are not enforced at instantiation time -- confirm whether this is intended.

    # Not used by the methods of this base class; presumably consumed by subclasses -- TODO confirm.
    time_rounding_digits = 6

    def __init__(self):
        self.n_individuals: int = 0
        self.iter_to_idx: dict[int, IDType] = {}
        self.individuals: dict[IDType, IndividualData] = {}

    ######################################################
    #               COMMON METHODS
    ######################################################

    @staticmethod
    def _check_numeric_type(dtype) -> bool:
        """
        Tell whether a pandas dtype is numeric and not complex.

        Parameters
        ----------
        dtype : pandas.Series.dtype
            The pandas dtype to test.

        Returns
        -------
        :obj:`bool`
            True when `dtype` is a numeric dtype that is not a complex dtype.
        """
        api_types = pd.api.types
        if not api_types.is_numeric_dtype(dtype):
            return False
        return not api_types.is_complex_dtype(dtype)

    @classmethod
    def _check_ID(cls, s: pd.Series) -> None:
        """
        Validate the subjects identifiers.

        Parameters
        ----------
        s : pd.Series
            Identifiers of the patients.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            - If the inferred type of `s` is not string, integer or categorical
            - If `s` contains missing values
            - If `s` is inferred as integer and contains negative values
            - If `s` is inferred as string and contains empty strings
        """
        # TODO? enforce strings? (for compatibility for downstream requirements especially in IndividualParameters)
        accepted_kinds = ("string", "integer", "categorical")
        inferred_dtype = pd.api.types.infer_dtype(s)

        if inferred_dtype not in accepted_kinds:
            raise LeaspyDataInputError(
                "The `ID` column should identify individuals as string, integer or categories, "
                f"not {inferred_dtype} ({s.dtype})."
            )

        # NOTE: as soon as there is a np.nan or np.inf, inferred_dtype cannot be 'integer',
        # but a custom pandas dtype can still contain pd.NA
        missing = s.isna()
        if missing.any():
            n_missing = missing.sum()
            raise LeaspyDataInputError(
                f"The `ID` column should NOT contain any nan ({n_missing} found)."
            )

        if inferred_dtype == "integer" and (s < 0).any():
            raise LeaspyDataInputError(
                "All `ID` should be >= 0 when subjects are identified as integers, "
                "use string identifiers if you need more flexibility."
            )

        if inferred_dtype == "string" and (s.str.len() == 0).any():
            raise LeaspyDataInputError(
                "No `ID` should be empty when subjects are identified as strings."
            )

    def _clean_index(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Validate the headers and identifiers, then set the dataframe index.

        Headers are first checked on the columns as given; if that check fails, the
        current index is moved back into the columns and the check is run once more.
        The `ID` column is then validated, the index is set through :meth:`_set_index`
        and duplicated index entries are rejected.

        Parameters
        ----------
        df : pd.DataFrame
            The whole dataframe with patient information (it is copied, not modified).

        Returns
        -------
        pd.DataFrame
            A copy of the input dataframe, with a checked and unique index.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the headers or the identifiers are invalid, or if some index entries
            are duplicated.
        """
        frame = df.copy(deep=True)

        try:
            # First attempt: the expected headers are regular columns
            self._check_headers(frame.columns.tolist())
        except LeaspyDataInputError:
            # Second attempt: the expected headers may be stored in the index
            frame.reset_index(inplace=True)
            self._check_headers(frame.columns.tolist())

        # The patient ID is common to every format
        self._check_ID(frame["ID"])

        frame = self._set_index(frame)
        if frame.index.is_unique:
            return frame

        # Report the positions and the index values of all duplicated entries
        index_only = frame[[]].reset_index()
        is_duplicated = index_only.duplicated(keep=False)
        df_dup = is_duplicated[is_duplicated]
        raise LeaspyDataInputError(f"Some raw are duplicated:\n{df_dup}")

    def _clean_numeric_data(
        self, df: pd.DataFrame, drop_full_nan: bool, warn_empty_column: bool
    ) -> pd.DataFrame:
        """
        Validate that the dataframe is purely numeric and normalize its missing values.

        All columns must have a numeric, non-complex dtype. The data is cast to float,
        infinite values are rejected, columns full of nans optionally trigger a warning
        and rows full of nans are optionally dropped.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe with patient information.
        drop_full_nan : bool
            If set to True, rows full of nans are dropped.
        warn_empty_column : bool
            If set to True, a warning is raised for columns full of nans.

        Returns
        -------
        pd.DataFrame
            Dataframe with clean numeric (float) information.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If a column is not numeric, if the cast to float fails, or if infinite
            values are present.
        """
        # Collect every column whose dtype is not acceptable
        types_nok = {}
        for column, dtype in df.dtypes.items():
            if not self._check_numeric_type(dtype):
                types_nok[column] = dtype
        if types_nok:
            raise LeaspyDataInputError(
                f"All columns should be of numerical type, which is not the case for {types_nok}."
            )

        # Casting to float guarantees that numpy.nan is the only missing-value marker,
        # even when pd.NA was used originally
        try:
            df = df.astype(float)
        except Exception as e:
            raise LeaspyDataInputError(
                "Cannot safely convert dataframe to float type."
            ) from e

        # Columns that only contain nans
        column_is_empty = df.isna().all(axis=0)
        full_of_nans = column_is_empty[column_is_empty].index.tolist()
        if warn_empty_column and full_of_nans:
            warnings.warn(f"These columns only contain nans: {full_of_nans}.")

        # Infinite values are forbidden (numpy.nan is not flagged by `isinf`)
        is_infinite = np.isinf(df)
        only_infinite = df.where(is_infinite)
        only_infinite = only_infinite.dropna(how="all", axis=0)
        only_infinite = only_infinite.dropna(how="all", axis=1)
        df_inf_rows_and_cols = only_infinite.fillna("")
        if len(df_inf_rows_and_cols):
            raise LeaspyDataInputError(
                f"Values may be nan but not infinite, double check your data:\n{df_inf_rows_and_cols}"
            )

        # Dropping rows full of nans yields a correct total number of visits
        return df.dropna(how="all") if drop_full_nan else df

    @abstractmethod
    def _check_headers(self, columns: list[str]) -> None:
        """
        Check the mandatory dataframe headers.

        :meth:`_clean_index` expects this hook to raise
        :exc:`.LeaspyDataInputError` when the headers are not acceptable.

        Parameters
        ----------
        columns : list[str]
            Names of the columns of the dataframe that contains patients information.
        """

    @abstractmethod
    def _set_index(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Set the index suited to the kind of information held by the dataframe.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe with patient information.

        Returns
        -------
        pd.DataFrame
            Dataframe with the right index.
        """

    @abstractmethod
    def _clean_dataframe(
        self, df: pd.DataFrame, *, drop_full_nan: bool, warn_empty_column: bool
    ) -> pd.DataFrame:
        """
        Clean the dataframe that contains patient information.

        The cleaning depends on the kind of data analysed: repeated measures,
        events or both.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe with patient information.
        drop_full_nan : bool
            If set to True, rows full of nans are dropped.
        warn_empty_column : bool
            If set to True, a warning is raised for columns full of nans.

        Returns
        -------
        pd.DataFrame
            Dataframe with clean information.
        """

    @abstractmethod
    def _load_individuals_data(
        self, subj: IndividualData, df_subj: pd.DataFrame
    ) -> None:
        """
        Transfer the information of one patient from a dataframe into an IndividualData.

        Parameters
        ----------
        subj : IndividualData
            The container of one patient, potentially empty, to be filled.
        df_subj : pd.DataFrame
            The rows of the dataframe that belong to this patient.
        """

    ######################################################
    #               MAIN METHOD
    ######################################################

    def read(
        self,
        df: pd.DataFrame,
        *,
        drop_full_nan: bool = True,
        sort_index: bool = False,
        warn_empty_column: bool = True,
    ) -> None:
        """
        Read the input dataframe and store one :class:`IndividualData` per subject.

        The `__init__` of this base class does not call this method.

        Parameters
        ----------
        df : :class:`pandas.DataFrame`
            The dataframe to read (it is copied, not modified).
        drop_full_nan : bool
            Should we drop rows full of nans? (except index)
        sort_index : bool
            Should we lexsort index?
            (Keep False as default so not to break many of the downstream tests that check order...)
        warn_empty_column : bool
            Should we warn when there are empty columns?

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If `df` is not a :class:`pandas.DataFrame` or if it fails validation.
        """
        # NOTE(review): results are added to `individuals`, `iter_to_idx` and `n_individuals`
        # without resetting them, so repeated calls accumulate -- confirm this is intended.
        if not isinstance(df, pd.DataFrame):
            # TODO? accept series? (for univariate dataset, with index already set)
            raise LeaspyDataInputError(
                "Input should be a pandas.DataFrame not anything else."
            )

        # Work on a private copy: the input dataframe must never be modified
        df = df.copy(deep=True)

        # Generic cleaning, then format-specific cleaning
        df = self._clean_index(df)
        df = self._clean_numeric_data(df, drop_full_nan, warn_empty_column)
        df = self._clean_dataframe(
            df, drop_full_nan=drop_full_nan, warn_empty_column=warn_empty_column
        )

        # Sorting happens after the duplicate checks and after rows full of nans are dropped
        if sort_index:
            df.sort_index(inplace=True)

        # Build and register one container per subject, in order of appearance
        for subject_id, subject_df in df.groupby(level="ID", sort=False):
            individual = IndividualData(subject_id)
            self.individuals[subject_id] = individual
            self._load_individuals_data(individual, subject_df)
            self.iter_to_idx[self.n_individuals] = subject_id
            self.n_individuals += 1