Source code for leaspy.io.data.data

from __future__ import annotations

import warnings
from collections.abc import Iterable, Iterator
from typing import Optional, Union

import pandas as pd

from leaspy.exceptions import LeaspyDataInputError, LeaspyTypeError
from leaspy.utils.typing import FeatureType, IDType

from .factory import dataframe_data_reader_factory
from .individual_data import IndividualData

__all__ = ["Data"]



[docs]
class Data(Iterable):
    """
    Main data container for a collection of individuals

    It can be iterated over and sliced, both of these operations being
    applied to the underlying `individuals` attribute.

    Attributes
    ----------
    individuals : :class:`~leaspy.utils.typing.Dict` [:class:`~leaspy.utils.typing.IDType` , :class:`~leaspy.individual_data.IndividualData`]
        Included individuals and their associated data
    iter_to_idx : :class:`~leaspy.utils.typing.Dict` [:obj:`int`, :class:`~leaspy.utils.typing.IDType`]
        Maps an integer index to the associated individual ID
    headers : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.FeatureType`]
        Feature names
    dimension : :obj:`int`
        Number of features
    n_individuals : :obj:`int`
        Number of individuals
    n_visits : :obj:`int`
        Total number of visits
    cofactors : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.FeatureType`]
        Feature names corresponding to cofactors
    event_time_name : :obj:`str`
        Name of the header that stores the time at event in the original dataframe
    event_bool_name : :obj:`str`
        Name of the header that stores the event indicator (censored or observed) in the original dataframe
    covariate_names : :class:`~leaspy.utils.typing.List` [:obj:`str`]
        Names of the covariates
    """

    def __init__(self):
        """
        Initialize the Data object
        """
        # Patients information
        self.individuals: dict[IDType, IndividualData] = {}
        self.iter_to_idx: dict[int, IDType] = {}

        # Longitudinal outcomes information
        self.headers: Optional[list[FeatureType]] = None

        # Event information
        self.event_time_name: Optional[str] = None
        self.event_bool_name: Optional[str] = None

        # Covariate information
        self.covariate_names: Optional[list[str]] = None

    @property
    def dimension(self) -> Optional[int]:
        """
        Number of features

        Returns
        -------
        :obj:`int` or None:
            Number of features in the dataset. If no features are present, returns None.
        """
        if self.headers is None:
            return None
        return len(self.headers)

    @property
    def n_individuals(self) -> int:
        """
        Number of individuals

        Returns
        -------
        :obj:`int`:
            Number of individuals in the dataset.
        """
        return len(self.individuals)

    @property
    def n_visits(self) -> int:
        """
        Total number of visits

        Returns
        -------
        :obj:`int`:
            Total number of visits in the dataset.
        """
        if self.dimension:
            return sum(len(indiv.timepoints) for indiv in self.individuals.values())

    @property
    def cofactors(self) -> list[FeatureType]:
        """
        Feature names corresponding to cofactors

        Returns
        -------
        :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.FeatureType`]:
            List of feature names corresponding to cofactors.
        """
        if len(self.individuals) == 0:
            return []
        # Consistency checks are in place to ensure that cofactors are the same
        # for all individuals, so they can be retrieved from any one
        indiv = next(x for x in self.individuals.values())
        return list(indiv.cofactors.keys())

    def __getitem__(
        self, key: Union[int, IDType, slice, list[int], list[IDType]]
    ) -> Union[IndividualData, Data]:
        """
        Access the individuals in the Data object using their ID or integer index.

        Parameters
        ----------
        key : :obj:`int` or :class:`~leaspy.utils.typing.IDType` or :obj:`slice` or :class:`~leaspy.utils.typing.List` [:obj:`int`] or :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.IDType`]
            The key(s) to access the individuals.
            Can be an integer index, an ID, a slice object or a list of integers or IDs.

        Returns
        -------
        :class:`~leaspy.individual_data.IndividualData` or :class:`~leaspy.utils.typing.Data`:
            The individual data corresponding to the key(s).
            If a single key is provided, returns the corresponding `IndividualData` object.
            If a slice or list of keys is provided, returns a new `Data` object
            containing the selected individuals.

        Raises
        ------
        :exc:`.LeaspyTypeError`
            If the key is not of a valid type or if the list of keys contains mixed types.
        """
        if isinstance(key, int):
            return self.individuals[self.iter_to_idx[key]]

        elif isinstance(key, IDType):
            return self.individuals[key]

        elif isinstance(key, (slice, list)):
            if isinstance(key, slice):
                slice_iter = range(self.n_individuals)[key]
                individual_indices = [self.iter_to_idx[i] for i in slice_iter]
            else:
                if all(isinstance(value, int) for value in key):
                    individual_indices = [self.iter_to_idx[i] for i in key]
                elif all(isinstance(value, IDType) for value in key):
                    individual_indices = key
                else:
                    raise LeaspyTypeError(
                        "Cannot access a Data object using " "a list of this type"
                    )

            individuals = [self.individuals[i] for i in individual_indices]
            return Data.from_individuals(
                individuals,
                self.headers,
                self.event_time_name,
                self.event_bool_name,
                self.covariate_names,
            )

        raise LeaspyTypeError("Cannot access a Data object this way")

    def __iter__(self) -> Iterator:
        """
        Iterate over the individuals in the Data object.

        Returns
        -------
        :class:`~Iterator`:
            An iterator over the individuals in the Data object.
        """

        # Ordering the index list first ensures that the order used by the
        # iterator is consistent with integer indexing  of individual data,
        # e.g. when using `enumerate`
        ordered_idx_list = [
            self.iter_to_idx[k] for k in sorted(self.iter_to_idx.keys())
        ]
        return iter([self.individuals[it] for it in ordered_idx_list])

    def __contains__(self, key: IDType) -> bool:
        """
        Tell whether an individual with the given ID is stored.

        Parameters
        ----------
        key : :class:`~leaspy.utils.typing.IDType`
            The ID to look for.

        Returns
        -------
        :obj:`bool`:
            True if `key` is one of the stored individual IDs, False otherwise.

        Raises
        ------
        :exc:`.LeaspyTypeError`
            If `key` is not an instance of `IDType`.
        """
        if not isinstance(key, IDType):
            raise LeaspyTypeError(
                "Cannot test Data membership for an element of this type"
            )
        return key in self.individuals


[docs]
    def load_cofactors(
        self, df: pd.DataFrame, *, cofactors: Optional[list[FeatureType]] = None
    ) -> None:
        """
        Attach cofactors read from a `pandas.DataFrame` to the individuals

        Parameters
        ----------
        df : :obj:`pandas.DataFrame`
            Dataframe holding the cofactors.
            It must be indexed by subject ID (index named 'ID'), with exactly
            one row per individual.
        cofactors : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.FeatureType`], optional
            Column(s) of `df` to load as cofactors.
            If None, every column of `df` is loaded.
            Default: None

        """
        # Validate the dataframe before touching any individual
        _check_cofactor_index(df)
        self._check_cofactor_index_is_consistent_with_data_index(df)
        self._check_no_individual_missing(df)

        selected_columns = df.columns.tolist() if cofactors is None else cofactors
        known_ids = pd.Index(self.iter_to_idx.values())

        # One {column: value} mapping per known individual
        cofactors_by_subject = df.loc[known_ids, selected_columns].to_dict(
            orient="index"
        )
        for subject_id, subject_values in cofactors_by_subject.items():
            self.individuals[subject_id].add_cofactors(subject_values)


    def _check_cofactor_index_is_consistent_with_data_index(self, df: pd.DataFrame):
        """
        Check that the index of the dataframe is consistent with the
        index of the Data object.

        Parameters
        ----------
        df : :obj:`pandas.DataFrame`
            The dataframe where the cofactors are stored.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the index of the dataframe is not consistent with the
            index of the Data object.
        """
        if (cofactors_dtype_indices := pd.api.types.infer_dtype(df.index)) != (
            internal_dtype_indices := pd.api.types.infer_dtype(
                self.iter_to_idx.values()
            )
        ):
            raise LeaspyDataInputError(
                f"The ID type in your cofactors ({cofactors_dtype_indices}) "
                f"is inconsistent with the ID type in Data ({internal_dtype_indices}):\n{df.index}"
            )

    def _check_no_individual_missing(self, df: pd.DataFrame):
        """
        Check that the individuals in the Data object are present in the dataframe.

        Parameters
        ----------
        df : :obj:`pandas.DataFrame`
            The dataframe where the cofactors are stored.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If some individuals are missing in the dataframe.
        """
        internal_indices = pd.Index(self.iter_to_idx.values())
        if len(missing_individuals := internal_indices.difference(df.index)):
            raise LeaspyDataInputError(
                f"These individuals are missing: {missing_individuals}"
            )
        if len(unknown_individuals := df.index.difference(internal_indices)):
            warnings.warn(
                f"These individuals with cofactors are not part of your Data: {unknown_individuals}"
            )


[docs]
    @staticmethod
    def from_csv_file(
        path: str,
        data_type: str = "visit",
        *,
        pd_read_csv_kws: Optional[dict] = None,
        facto_kws: Optional[dict] = None,
        **df_reader_kws,
    ) -> Data:
        """
        Create a `Data` object from a CSV file.

        Parameters
        ----------
        path : :obj:`str`
            Path to the CSV file to load (with extension)
        data_type : :obj:`str`
            Type of data to read, forwarded to :func:`dataframe_data_reader_factory`
            (e.g. 'visit' or 'event').
        pd_read_csv_kws : :obj:`dict`, optional
            Keyword arguments that are sent to :func:`pandas.read_csv`.
            They are merged over the default ``{"dtype": {"ID": str}}``.
            Default: None (no extra argument)
        facto_kws : :obj:`dict`, optional
            Keyword arguments that are sent to :func:`dataframe_data_reader_factory`.
            Default: None (no extra argument)
        **df_reader_kws :
            Keyword arguments that are sent to the ``read`` method of the
            reader returned by :func:`dataframe_data_reader_factory`

        Returns
        -------
        :class:`~leaspy.utils.typing.Data`:
            A Data object containing the data from the CSV file.
        """
        # `None` defaults avoid sharing one mutable dict across all calls
        user_read_csv_kws = pd_read_csv_kws or {}
        factory_kws = facto_kws or {}

        # enforce ID to be interpreted as string as default (can be overwritten)
        read_csv_kws = {"dtype": {"ID": str}, **user_read_csv_kws}
        df = pd.read_csv(path, **read_csv_kws)

        reader = dataframe_data_reader_factory(data_type, **factory_kws)
        reader.read(df=df, **df_reader_kws)
        return Data._from_reader(reader)



[docs]
    def to_dataframe(
        self,
        *,
        cofactors: Optional[Union[list[FeatureType], str]] = None,
        reset_index: bool = True,
    ) -> pd.DataFrame:
        """
        Export the content of the Data object as a :obj:`pandas.DataFrame`

        Parameters
        ----------
        cofactors : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.FeatureType`] or :obj:`str`, optional
            Cofactors to add as extra columns.
            If None (default), no cofactors are included.
            If "all", all the available cofactors are included.
            Default: None

        reset_index : :obj:`bool`, optional
            Whether to reset index levels in output.
            Default: True

        Returns
        -------
        :obj:`pandas.DataFrame`:
            Concatenation of the frames produced by each individual, with one
            extra column per requested cofactor.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If `cofactors` is a string other than "all", or names cofactors
            that are not part of the Data object.
        :exc:`.LeaspyTypeError`
            If the cofactors argument is not of a valid type.
        """
        requested_cofactors = self._validate_cofactors_input(cofactors)

        per_individual_frames = [
            individual.to_frame(
                self.headers,
                self.event_time_name,
                self.event_bool_name,
                self.covariate_names,
            )
            for individual in self.individuals.values()
        ]
        df = pd.concat(per_individual_frames)

        # Broadcast each cofactor value on all the rows of its individual
        for cofactor_name in requested_cofactors:
            for individual in self.individuals.values():
                rows_of_individual = pd.IndexSlice[individual.idx, :]
                df.loc[rows_of_individual, cofactor_name] = individual.cofactors[
                    cofactor_name
                ]

        return df.reset_index() if reset_index else df


    def _validate_cofactors_input(
        self, cofactors: Optional[Union[list[FeatureType], str]] = None
    ) -> list[FeatureType]:
        """
        Validate the cofactors input for the to_dataframe method.

        Parameters
        ----------
        cofactors : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.FeatureType`] or :obj:`int`, optional
            Cofactors to include in the DataFrame.
            If None (default), no cofactors are included.
            If "all", all the available cofactors are included.
            Default: None

        Returns
        -------
        :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.FeatureType`]:
            A list of the validated cofactors.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the Data object does not contain given cofactors.
        :exc:`.LeaspyTypeError`
            If the cofactors argument is not of a valid type.
        """
        if cofactors is None:
            return []
        if isinstance(cofactors, str):
            if cofactors == "all":
                return self.cofactors
            raise LeaspyDataInputError("Invalid `cofactors` argument value")
        if not (
            isinstance(cofactors, list) and all(isinstance(c, str) for c in cofactors)
        ):
            raise LeaspyTypeError("Invalid `cofactors` argument type")
        if len(unknown_cofactors := list(set(cofactors) - set(self.cofactors))):
            raise LeaspyDataInputError(
                f"These cofactors are not part of your Data: {unknown_cofactors}"
            )
        return cofactors


[docs]
    @staticmethod
    def from_dataframe(
        df: pd.DataFrame,
        data_type: str = "visit",
        factory_kws: Optional[dict] = None,
        **kws,
    ) -> Data:
        """
        Create a `Data` object from a :class:`~pandas.DataFrame`.

        Parameters
        ----------
        df : :obj:`pandas.DataFrame`
            Dataframe containing ID, TIME and features.
        data_type : :obj:`str`
            Type of data to read. Can be 'visit', 'event', 'joint'
        factory_kws : :class:`~leaspy.utils.typing.Dict`, optional
            Keyword arguments that are sent to :func:`.dataframe_data_reader_factory`.
            Default: None (no extra argument)
        **kws
            Keyword arguments that are sent to the ``read`` method of the
            reader returned by :func:`.dataframe_data_reader_factory`

        Returns
        -------
        :class:`~leaspy.utils.typing.Data`
            A Data object containing the data read from the dataframe.
        """
        # `None` default avoids sharing one mutable dict across all calls
        reader = dataframe_data_reader_factory(data_type, **(factory_kws or {}))
        reader.read(df, **kws)
        return Data._from_reader(reader)


    @staticmethod
    def _from_reader(reader) -> Data:
        """
        Create a Data object from a reader

        Parameters
        ----------
        reader : :class:`~AbstractDataframeDataReader`
            Reader object containing the data

        Returns
        -------
        :class:`~leaspy.utils.typing.Data`
            A Data object containing the data from the reader.

        """
        data = Data()
        data.individuals = reader.individuals
        data.iter_to_idx = reader.iter_to_idx
        if hasattr(reader, "long_outcome_names"):
            data.headers = reader.long_outcome_names
        if hasattr(reader, "event_time_name"):
            data.event_time_name = reader.event_time_name
            data.event_bool_name = reader.event_bool_name
        if hasattr(reader, "covariate_names"):
            data.covariate_names = reader.covariate_names
        return data


[docs]
    @staticmethod
    def from_individual_values(
        indices: list[IDType],
        timepoints: Optional[list[list[float]]] = None,
        values: Optional[list[list[list[float]]]] = None,
        headers: Optional[list[FeatureType]] = None,
        event_time_name: Optional[str] = None,
        event_bool_name: Optional[str] = None,
        event_time: Optional[list[list[float]]] = None,
        event_bool: Optional[list[list[int]]] = None,
        covariate_names: Optional[list[str]] = None,
        covariates: Optional[list[list[int]]] = None,
    ) -> Data:
        """
        Construct `Data` from a collection of individual data points

        Parameters
        ----------
        indices : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.IDType`]
            List of the individuals' unique ID
        timepoints : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.List` [:obj:`float`]]
            For each individual ``i``, list of timepoints associated
            with the observations.
            The number of such timepoints is noted ``n_timepoints_i``
        values : :class:`~leaspy.utils.typing.List` [:obj:`array-like` [:obj:`float`, :obj:`2D`]]
            For each individual ``i``, two-dimensional array-like object
            containing observed data points.
            Its expected shape is ``(n_timepoints_i, n_features)``
        headers : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.FeatureType`]
            Feature names.
            The number of features is noted ``n_features``
        event_time_name : :obj:`str`, optional
            Name of the header that stores the time at event
        event_bool_name : :obj:`str`, optional
            Name of the header that stores the event indicator
        event_time : :class:`~leaspy.utils.typing.List`, optional
            For each individual ``i``, the event time(s) passed to
            :meth:`IndividualData.add_event`
        event_bool : :class:`~leaspy.utils.typing.List`, optional
            For each individual ``i``, the event indicator(s) passed to
            :meth:`IndividualData.add_event`
        covariate_names : :class:`~leaspy.utils.typing.List` [:obj:`str`], optional
            Names of the covariates
        covariates : :class:`~leaspy.utils.typing.List`, optional
            For each individual ``i``, the covariates passed to
            :meth:`IndividualData.add_covariates`

        Returns
        -------
        :class:`~leaspy.utils.typing.Data`:
            A Data object containing the individuals and their data.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the longitudinal inputs (`headers`, `timepoints`, `values`) or
            the event inputs (`event_time_name`, `event_bool_name`,
            `event_time`, `event_bool`) are only partially provided.
        :exc:`ValueError`
            If only one of `covariate_names` and `covariates` is provided.
        """

        # Longitudinal input checks: all provided, or none of them
        if not headers:
            if timepoints or values:
                raise LeaspyDataInputError("Not coherent inputs for longitudinal data")
        else:
            if not timepoints or not values:
                raise LeaspyDataInputError("Not coherent inputs for longitudinal data")

        # Event input checks: all provided, or none of them
        if not event_time_name:
            if event_bool_name or event_time or event_bool:
                raise LeaspyDataInputError("Not coherent inputs for event data")
        else:
            if not event_bool_name or not event_time or not event_bool:
                raise LeaspyDataInputError("Not coherent inputs for event data")

        # Covariates input checks
        if (covariate_names is None) != (covariates is None):
            raise ValueError(
                "Not coherent inputs for covariate data: \n "
                f"covariate_names = {covariate_names} and \n "
                f"covariates = {covariates}."
            )

        individuals = []
        for i, idx in enumerate(indices):
            indiv = IndividualData(idx)
            if headers:
                indiv.add_observations(timepoints[i], values[i])
            if event_time_name:
                indiv.add_event(event_time[i], event_bool[i])
            if covariate_names:
                indiv.add_covariates(covariates[i])
            individuals.append(indiv)

        # Forward the covariate names too, otherwise they are lost on the
        # resulting Data object even though individuals carry covariates
        return Data.from_individuals(
            individuals,
            headers,
            event_time_name,
            event_bool_name,
            covariate_names,
        )



[docs]
    @staticmethod
    def from_individuals(
        individuals: list[IndividualData],
        headers: Optional[list[FeatureType]] = None,
        event_time_name: Optional[str] = None,
        event_bool_name: Optional[str] = None,
        covariate_names: Optional[list[str]] = None,
    ) -> Data:
        """
        Construct `Data` from a list of individuals

        Parameters
        ----------
        individuals : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.individual_data.IndividualData`]
            List of individuals
        headers : :class:`~leaspy.utils.typing.List` [:class:`~leaspy.utils.typing.FeatureType`]
            List of feature names

        Returns
        -------
        :class:`~leaspy.utils.typing.Data`:
            A Data object containing the individuals and their data.
        """

        data = Data()

        if headers:
            data.headers = headers
            n_features = len(headers)

        if event_time_name and event_bool_name:
            data.event_time_name = event_time_name
            data.event_bool_name = event_bool_name

        if covariate_names:
            data.covariate_names = covariate_names

        for indiv in individuals:
            idx = indiv.idx
            _, n_features_i = indiv.observations.shape
            if n_features_i != n_features:
                raise LeaspyDataInputError(
                    f"Inconsistent number of features for individual {idx}:\n"
                    f"Expected {n_features}, received {n_features_i}"
                )

            data.individuals[idx] = indiv
            data.iter_to_idx[data.n_individuals - 1] = idx

        return data



[docs]
    def extract_longitudinal_only(self) -> Data:
        """
        Build a new Data object keeping only the longitudinal part

        Each individual is rebuilt from its ID, timepoints and observations
        only.

        Returns
        -------
        :class:`~leaspy.utils.typing.Data`:
            A Data object containing only longitudinal data.

        Raises
        ------
        :exc:`.LeaspyDataInputError`
            If the Data object does not contain any longitudinal data.
        """

        if not self.headers:
            raise LeaspyDataInputError(
                "You can't extract longitudinal data from data that have none"
            )

        longitudinal_individuals = []
        for individual_id, source_individual in self.individuals.items():
            longitudinal_individual = IndividualData(individual_id)
            longitudinal_individual.add_observations(
                source_individual.timepoints, source_individual.observations
            )
            longitudinal_individuals.append(longitudinal_individual)

        return Data.from_individuals(longitudinal_individuals, self.headers)




def _check_cofactor_index(df: pd.DataFrame):
    """
    Ensure the dataframe can be used as a source of cofactors

    The input must be a dataframe whose index is named 'ID', holds no null
    value and has no duplicate.

    Parameters
    ----------
    df : :obj:`pandas.DataFrame`
        The dataframe where the cofactors are stored.

    Raises
    ------
    :exc:`.LeaspyDataInputError`
        If the index of the dataframe is not a valid index for cofactors.
    """
    index_is_valid = (
        isinstance(df, pd.DataFrame)
        and isinstance(df.index, pd.Index)
        and df.index.names == ["ID"]
        and df.index.notnull().all()
        and df.index.is_unique
    )
    if index_is_valid:
        return
    raise LeaspyDataInputError(
        "You should pass a dataframe whose index ('ID') should "
        "not contain any NaN nor any duplicate."
    )