"""This module defines the `OutputsSettings` and `AlgorithmSettings` classes."""

import json
import os
import shutil
import warnings
from pathlib import Path
from typing import Optional, Union

import torch

from leaspy.exceptions import LeaspyAlgoInputError
from leaspy.utils.typing import KwargsType

algo_default_data_dir = Path(__file__).parent.parent / "algo" / "data"

__all__ = [
    "OutputsSettings",
    "AlgorithmSettings",
    "algo_default_data_dir",
]


class OutputsSettings:
    """
    Used to create the `logs` folder to monitor the convergence of the fit algorithm.

    Parameters
    ----------
    settings : :obj:`dict` [:obj:`str`, Any]
        * path : :obj:`str` or None
            Path used to store the logs. If None, the default path "./_outputs/" is used.
        * print_periodicity : :obj:`int` >= 1 or None
            Print information every N iterations.
        * save_periodicity : :obj:`int` >= 1, optional
            Save convergence data every N iterations. Default=50.
        * plot_periodicity : :obj:`int` >= 1 or None
            Plot convergence data every N iterations. If None, no plots are saved.
            Note that plotting requires saving, so `plot_periodicity` must be a
            multiple of `save_periodicity`.
        * plot_patient_periodicity : :obj:`int` >= 1 or None
            Plot the patients' reconstructions every N iterations.
        * plot_sourcewise : :obj:`bool`
            Flag to plot source-based multidimensional parameters (such as
            `mixing_matrix`) for each source. Otherwise, they are plotted along
            the other dimension (such as feature). Default=False.
        * overwrite_logs_folder : :obj:`bool`
            Flag to remove all previous logs if existing. Default=False.
        * nb_of_patients_to_plot : :obj:`int` >= 1
            Number of patients to plot. Default=5.

    Raises
    ------
    :exc:`.LeaspyAlgoInputError`
    """

    DEFAULT_LOGS_DIR = "_outputs"

    def __init__(self, settings):
        self.print_periodicity = None
        self.plot_periodicity = None
        self.save_periodicity = None
        self.plot_patient_periodicity = None
        self.plot_sourcewise = False
        self.nb_of_patients_to_plot = 5

        self.root_path = None
        self.parameter_convergence_path = None
        self.plot_path = None
        self.patients_plot_path = None

        self._set_print_periodicity(settings)
        self._set_save_periodicity(settings)
        self._set_plot_periodicity(settings)
        self._set_nb_of_patients_to_plot(settings)
        self._set_plot_sourcewise(settings)
        self._set_plot_patient_periodicity(settings)

        # Only create folders if the user wants to save data or plots and provided a valid path!
        self._create_root_folder(settings)

    def _set_param_as_int_or_ignore(self, settings: dict, param: str):
        """In-place set of parameter (as int) from settings."""
        if param not in settings:
            return
        val = settings[param]
        if val is not None:
            # Try to cast as an integer >= 1; ignore the value otherwise.
            try:
                val = int(val)
                assert val >= 1
            except Exception:
                warnings.warn(
                    f"The '{param}' parameter you provided is not castable to an int > 0. "
                    "Ignoring its value.",
                    UserWarning,
                )
                return
        # Update the attribute of self in-place
        setattr(self, param, val)

    def _set_plot_sourcewise(self, settings: dict):
        self.plot_sourcewise = settings["plot_sourcewise"]

    def _set_nb_of_patients_to_plot(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "nb_of_patients_to_plot")

    def _set_print_periodicity(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "print_periodicity")

    def _set_save_periodicity(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "save_periodicity")

    def _set_plot_periodicity(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "plot_periodicity")

    def _set_plot_patient_periodicity(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "plot_patient_periodicity")
        # This setter is called last, so both periodicities are known here and
        # their cross-consistency can be validated.
        if self.plot_periodicity is not None:
            if self.save_periodicity is None:
                raise LeaspyAlgoInputError(
                    "You cannot define a `plot_periodicity` without defining `save_periodicity`. "
                    "Note that `plot_periodicity` should be a multiple of `save_periodicity`."
                )
            if self.plot_periodicity % self.save_periodicity != 0:
                raise LeaspyAlgoInputError(
                    "The `plot_periodicity` should be a multiple of `save_periodicity`."
                )

    def _create_root_folder(self, settings: dict):
        # Get the path where the outputs will be stored
        path = settings.get("path", None)
        if path is None:
            if self.save_periodicity:
                warnings.warn(
                    f"Outputs will be saved in '{self.DEFAULT_LOGS_DIR}' relative to the current working directory",
                    stacklevel=2,
                )
                path = Path.cwd() / self.DEFAULT_LOGS_DIR
                if path.exists():
                    self._clean_folder(path)
            else:
                return
        else:
            path = Path.cwd() / path
        settings["path"] = str(path)

        # If the folder does not exist, warn that it (and its parents) will be created
        if not path.exists():
            warnings.warn(
                f"The logs path you provided ({settings['path']}) does not exist. "
                "Needed paths will be created (and their parents if needed).",
                stacklevel=2,
            )
        elif settings.get("overwrite_logs_folder", False):
            warnings.warn(f"Overwriting '{path}' folder...")
            self._clean_folder(path)

        all_ok = self._check_needed_folders_are_empty_or_create_them(path)
        if not all_ok:
            raise LeaspyAlgoInputError(
                f"The logs folder '{path}' already exists and is not empty! "
                "Give another path or use keyword argument `overwrite_logs_folder=True`."
            )

    @staticmethod
    def _check_folder_is_empty_or_create_it(path_folder: Path) -> bool:
        if path_folder.exists():
            if (
                os.path.islink(path_folder)
                or not path_folder.is_dir()
                or any(path_folder.iterdir())
            ):
                return False
        else:
            path_folder.mkdir(parents=True, exist_ok=True)
        return True

    @staticmethod
    def _clean_folder(path: Path):
        shutil.rmtree(path)
        path.mkdir(exist_ok=True, parents=True)

    def _check_needed_folders_are_empty_or_create_them(self, path: Path) -> bool:
        self.root_path = path
        self.parameter_convergence_path = path / "parameter_convergence"
        self.plot_path = path / "plots"
        self.patients_plot_path = self.plot_path / "patients"

        all_ok = self._check_folder_is_empty_or_create_it(self.parameter_convergence_path)
        all_ok &= self._check_folder_is_empty_or_create_it(self.plot_path)
        all_ok &= self._check_folder_is_empty_or_create_it(self.patients_plot_path)

        return all_ok
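# Illustrative sketch (not part of the library's examples): `OutputsSettings` is
# normally built indirectly through `AlgorithmSettings.set_logs`, but it can also
# be instantiated directly from a plain settings dict. All keys below are read by
# the constructor (`plot_sourcewise` in particular is read unconditionally), and
# a relative path is resolved against the current working directory:
#
#     logs = OutputsSettings({
#         "path": "my_logs",
#         "print_periodicity": 100,
#         "save_periodicity": 50,
#         "plot_periodicity": 1000,          # must be a multiple of save_periodicity
#         "plot_patient_periodicity": None,
#         "plot_sourcewise": False,
#         "overwrite_logs_folder": True,     # wipe any previous content of my_logs/
#         "nb_of_patients_to_plot": 5,
#     })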
class AlgorithmSettings:
    """
    Used to set the algorithms' settings.

    All parameters, except the algorithm name, have default values which can be
    overwritten by the user.

    Parameters
    ----------
    name : :obj:`str`
        The algorithm's name. Must be one of:
            * For `fit` algorithms:
                * ``'mcmc_saem'``
                * ``'lme_fit'`` (for LME model only)
            * For `personalize` algorithms:
                * ``'scipy_minimize'``
                * ``'mean_real'``
                * ``'mode_real'``
                * ``'constant_prediction'`` (for constant model only)
                * ``'lme_personalize'`` (for LME model only)
            * For `simulate` algorithms:
                * ``'simulation'``

    **kwargs : any
        Depending on the algorithm, various parameters are possible:
            * seed : :obj:`int`, optional, default None
                Used for stochastic algorithms.
            * algorithm_initialization_method : :obj:`str`, optional
                Personalize the algorithm initialization method, according to those
                possible for the given algorithm (refer to its documentation in
                :mod:`leaspy.algo`).
            * n_iter : :obj:`int`, optional
                Number of iterations. Note that there is no stopping criterion for
                MCMC SAEM algorithms.
            * n_burn_in_iter : :obj:`int`, optional
                Number of iterations of the burn-in phase, used for the MCMC SAEM
                algorithms.
            * use_jacobian : :obj:`bool`, optional, default True
                Used in the ``scipy_minimize`` algorithm to perform an `L-BFGS`
                instead of a `Powell` algorithm.
            * n_jobs : :obj:`int`, optional, default 1
                Used in the ``scipy_minimize`` algorithm to accelerate computation
                with parallel derivation using joblib.
            * progress_bar : :obj:`bool`, optional, default True
                Used to display a progress bar during computation.
            * device : :obj:`str` or :class:`torch.device`, optional
                Specifies on which device the algorithm will run. Only 'cpu' and
                'cuda' are supported. Only the ``'mcmc_saem'``, ``'mean_real'`` and
                ``'mode_real'`` algorithms support this setting.

        For the complete list of the available parameters for a given algorithm,
        please refer directly to its documentation.

    Attributes
    ----------
    name : :obj:`str`
        The algorithm's name.
    algorithm_initialization_method : :obj:`str`, optional
        Personalize the algorithm initialization method, according to those possible
        for the given algorithm (refer to its documentation in :mod:`leaspy.algo`).
    seed : :obj:`int`, optional, default None
        Used for stochastic algorithms.
    parameters : :obj:`dict`
        Contains the other parameters: `n_iter`, `n_burn_in_iter`, `use_jacobian`,
        `n_jobs` & `progress_bar`.
    logs : :class:`.OutputsSettings`, optional
        Used to create a ``logs`` folder containing convergence information during
        model fitting.
    device : :obj:`str` (or :class:`torch.device`), optional, default 'cpu'
        Specifies the computation device. Only 'cpu' and 'cuda' are supported.
        Note that specifying an indexed CUDA device (such as 'cuda:1') is not
        supported. To select a precise CUDA device index, use the
        `CUDA_VISIBLE_DEVICES` environment variable.

    Raises
    ------
    :exc:`.LeaspyAlgoInputError`

    Notes
    -----
    Developers can use `_dynamic_default_parameters` to define settings that depend
    on other parameters when not explicitly specified by the user.
    """

    # TODO: should this be in each algo class directly?
    _dynamic_default_parameters = {
        "lme_fit": [
            (
                lambda kw: "force_independent_random_effects" in kw
                and kw["force_independent_random_effects"],
                {
                    ("method",): lambda kw: [
                        "lbfgs",
                        "bfgs",
                    ]  # Powell & Nelder-Mead methods cannot ensure respect of "free"
                },
            )
        ]
    }

    # Known keys for all algorithms (<!> not all of them are mandatory!)
    _known_keys = [
        "name",
        "seed",
        "algorithm_initialization_method",
        "parameters",
        "device",
    ]  # 'logs' is not handled in exported files

    def __init__(self, name: str, **kwargs):
        from leaspy.algo import AlgorithmName

        self.name: AlgorithmName = AlgorithmName(name)
        self.parameters: Optional[KwargsType] = None
        self.seed: Optional[int] = None
        self.algorithm_initialization_method: Optional[str] = None
        self.logs: Optional[OutputsSettings] = None

        default_algo_settings_path = (
            algo_default_data_dir / f"default_{self.name.value}.json"
        )
        if default_algo_settings_path.is_file():
            self._load_default_values(default_algo_settings_path)
        else:
            raise LeaspyAlgoInputError(
                f"The algorithm name '{self.name.value}' you provided does not exist"
            )

        self._manage_kwargs(kwargs)
        self.check_consistency()
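    # Illustrative sketch: keyword arguments that are not among the special keys
    # (`seed`, `algorithm_initialization_method`, `device`) are merged into
    # `parameters` on top of the algorithm's default JSON file. A parameter name
    # such as `n_iter` is assumed to exist in the defaults of the chosen algorithm:
    #
    #     settings = AlgorithmSettings("mcmc_saem", seed=0, n_iter=2000)
    #
    # Dynamic defaults are resolved the same way; for instance with `lme_fit`,
    # requesting independent random effects restricts the optimization method
    # unless `method` was explicitly given (see `_dynamic_default_parameters` above):
    #
    #     lme_settings = AlgorithmSettings("lme_fit", force_independent_random_effects=True)
    #     # lme_settings.parameters["method"] == ["lbfgs", "bfgs"]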
    def check_consistency(self) -> None:
        """
        Check the internal consistency of the algorithm settings and warn, or raise
        a `LeaspyAlgoInputError`, if they are inconsistent.
        """
        from .algo_with_device import AlgorithmWithDeviceMixin
        from .base import get_algorithm_class

        algo_class = get_algorithm_class(self.name)
        if self.seed is not None and algo_class.deterministic:
            warnings.warn(
                f"You can skip defining `seed` since the algorithm {self.name} is deterministic."
            )
        if hasattr(self, "device") and not issubclass(
            algo_class, AlgorithmWithDeviceMixin
        ):
            warnings.warn(
                f'The algorithm "{self.name}" does not support user-specified devices (this '
                "is supported only for specific algorithms) and will use the default device (CPU)."
            )
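    # Illustrative sketch: `check_consistency` runs automatically at the end of
    # `__init__`, so simply constructing inconsistent settings triggers a warning.
    # Whether a given algorithm is deterministic or device-aware depends on its
    # class, so the exact outcome below is an assumption:
    #
    #     AlgorithmSettings("constant_prediction", seed=42)
    #     # -> warns that `seed` can be skipped if this algorithm is deterministic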
    @classmethod
    def _recursive_merge_dict_warn_extra_keys(
        cls, ref: dict, new: dict, *, prefix_keys: str = ""
    ):
        """Merge dictionary `ref` in-place with the values from `new`; for dict values, the merge is recursive."""
        extra_keys = [prefix_keys + k for k in new if k not in ref]
        if extra_keys:
            warnings.warn(
                f"The parameters {extra_keys} were not present by default and are likely to be unsupported."
            )
        for k, v in new.items():
            if k not in ref or not isinstance(ref[k], dict):
                ref[k] = v
            else:
                if not isinstance(v, dict):
                    raise LeaspyAlgoInputError(
                        f"Algorithm parameter `{prefix_keys + k}` should be a dictionary, not '{v}' of type {type(v)}."
                    )
                cls._recursive_merge_dict_warn_extra_keys(
                    ref[k], v, prefix_keys=f"{prefix_keys}{k}."
                )
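    # Illustrative sketch of the merge semantics: nested defaults that the user
    # did not override are kept, and keys absent from the defaults trigger a warning:
    #
    #     ref = {"a": 1, "nested": {"x": 10, "y": 20}}
    #     AlgorithmSettings._recursive_merge_dict_warn_extra_keys(ref, {"nested": {"y": 99}})
    #     # ref == {"a": 1, "nested": {"x": 10, "y": 99}}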
    @classmethod
    def load(cls, path_to_algorithm_settings: Union[str, Path]):
        """Instantiate an AlgorithmSettings object from a json file.

        Parameters
        ----------
        path_to_algorithm_settings : :obj:`str` or :class:`Path`
            Path of the json file.

        Returns
        -------
        :class:`.AlgorithmSettings`
            An instance of AlgorithmSettings with the specified parameters.

        Raises
        ------
        :exc:`.LeaspyAlgoInputError`
            If anything is invalid in the algorithm settings.

        Examples
        --------
        >>> from leaspy.algo import AlgorithmSettings
        >>> leaspy_univariate = AlgorithmSettings.load('outputs/leaspy-univariate_model-settings.json')
        """
        with open(path_to_algorithm_settings) as fp:
            settings = json.load(fp)

        if "name" not in settings.keys():
            raise LeaspyAlgoInputError("Your json file must contain a 'name' attribute!")

        algorithm_settings = cls(settings["name"])

        if "parameters" in settings.keys():
            print("You overwrote the algorithm default parameters")
            cls._recursive_merge_dict_warn_extra_keys(
                algorithm_settings.parameters, cls._get_parameters(settings)
            )

        if "seed" in settings.keys():
            print("You overwrote the algorithm default seed")
            algorithm_settings.seed = cls._get_seed(settings)

        if "algorithm_initialization_method" in settings.keys():
            print("You overwrote the algorithm default initialization method")
            algorithm_settings.algorithm_initialization_method = (
                cls._get_algorithm_initialization_method(settings)
            )

        if "device" in settings.keys():
            print("You overwrote the algorithm default device")
            algorithm_settings.device = cls._get_device(settings)

        if "loss" in settings.keys():
            raise LeaspyAlgoInputError(
                "`loss` keyword for AlgorithmSettings is not supported anymore. "
                "Please define `noise_model` directly in your Leaspy model."
            )

        # TODO: this class should really be refactored so as not to duplicate the
        # same logic in 3 methods (_manage_kwargs, load & _check_default_settings)
        unknown_keys = set(settings.keys()).difference(cls._known_keys)
        if unknown_keys:
            raise LeaspyAlgoInputError(
                f"Unexpected keys {unknown_keys} in algorithm settings."
            )

        algorithm_settings.check_consistency()

        return algorithm_settings
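    # Illustrative sketch: a minimal JSON file accepted by `load`. Only "name" is
    # mandatory; any other known key overrides the algorithm's defaults (the
    # parameter below assumes `use_jacobian` is among scipy_minimize's defaults,
    # as its class docstring suggests):
    #
    #     {
    #       "name": "scipy_minimize",
    #       "parameters": {"use_jacobian": false}
    #     }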
    def save(self, path: Union[str, Path], **kwargs):
        """
        Save an AlgorithmSettings object to a json file.

        TODO? save the leaspy version as well, for retro/forward-compatibility?

        Parameters
        ----------
        path : :obj:`str` or :class:`Path`
            Path where the AlgorithmSettings will be stored.
        **kwargs
            Keyword arguments for the json.dump method. Default: dict(indent=2).

        Examples
        --------
        >>> from leaspy.algo import AlgorithmSettings
        >>> settings = AlgorithmSettings("scipy_minimize", seed=42)
        >>> settings.save("outputs/scipy_minimize-settings.json")
        """
        json_settings = {
            "name": self.name,
            "seed": self.seed,
            "algorithm_initialization_method": self.algorithm_initialization_method,
        }
        if hasattr(self, "device"):
            json_settings["device"] = self.device

        # TODO: save the logging config as well (OutputsSettings needs to be JSON serializable...)
        # if self.logs is not None:
        #     json_settings['logs'] = self.logs

        # Append the "parameters" key after the hyperparameters
        json_settings["parameters"] = self.parameters

        # Default json.dump kwargs:
        kwargs = {"indent": 2, **kwargs}

        with open(path, "w") as json_file:
            json.dump(json_settings, json_file, **kwargs)
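    # Illustrative sketch: settings are meant to survive a save/load roundtrip
    # (assuming the target directory exists):
    #
    #     settings = AlgorithmSettings("mcmc_saem", seed=0)
    #     settings.save("outputs/mcmc_saem-settings.json")
    #     reloaded = AlgorithmSettings.load("outputs/mcmc_saem-settings.json")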
    def set_logs(self, **kwargs):
        """
        Use this method to monitor the convergence of a model fit.

        This method creates CSV files and plots to track the evolution of the
        population parameters (i.e., fixed effects) during the fit.

        Parameters
        ----------
        **kwargs
            * path : :obj:`str`, optional
                The path of the folder where graphs and csv files will be saved.
                If None, `DEFAULT_LOGS_DIR` will be used.
            * print_periodicity : :obj:`int`, optional, default 100
                Prints every N iterations.
            * save_periodicity : :obj:`int`, optional, default 50
                Saves the values in csv files every N iterations.
            * plot_periodicity : :obj:`int`, optional, default 1000
                Generates plots from saved values every N iterations.
                Notes:
                    * Must be a multiple of `save_periodicity`.
                    * Setting this value too low may significantly slow down the fit.
            * plot_patient_periodicity : :obj:`int`, optional
                Frequency (in iterations) at which the patients' reconstructions are plotted.
            * plot_sourcewise : :obj:`bool`, optional, default False
                Set this to True to plot the source-based parameters sourcewise.
            * overwrite_logs_folder : :obj:`bool`, optional, default False
                Set it to ``True`` to overwrite the content of the folder in ``path``.
            * nb_of_patients_to_plot : :obj:`int`, optional, default 5
                Number of patients to plot.

        Raises
        ------
        :exc:`.LeaspyAlgoInputError`
            If the folder given in ``path`` already exists and ``overwrite_logs_folder``
            is set to ``False``.

        Notes
        -----
        By default, if the folder given in ``path`` already exists, the method will
        raise an error. To overwrite the content of the folder, set
        ``overwrite_logs_folder`` to ``True``.
        """
        # TODO: all this logic should be delegated to the dedicated OutputsSettings class...!
        default_settings = {
            "path": None,
            "print_periodicity": None,
            "save_periodicity": None,
            "plot_periodicity": None,
            "plot_patient_periodicity": None,
            "plot_sourcewise": False,
            "overwrite_logs_folder": False,
            "nb_of_patients_to_plot": 5,
        }

        settings = default_settings.copy()

        for k, v in kwargs.items():
            if k in (
                "print_periodicity",
                "plot_periodicity",
                "save_periodicity",
                "plot_patient_periodicity",
                "nb_of_patients_to_plot",
            ):
                if v is not None and not isinstance(v, int):
                    raise LeaspyAlgoInputError(
                        f"You must provide an integer for the input <{k}>! "
                        f"You provided {v} of type {type(v)}."
                    )
                settings[k] = v
            elif k in ("plot_sourcewise", "overwrite_logs_folder"):
                if not isinstance(v, bool):
                    raise LeaspyAlgoInputError(
                        f"You must provide a boolean for the input <{k}>! "
                        f"You provided {v} of type {type(v)}."
                    )
                settings[k] = v
            elif k == "path":
                if v is not None and not isinstance(v, (str, Path)):
                    raise LeaspyAlgoInputError(
                        f"You must provide a string or Path for the input <{k}>! "
                        f"You provided {v} of type {type(v)}."
                    )
                settings[k] = v

        if settings != default_settings:
            self.logs = OutputsSettings(settings)
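    # Illustrative sketch: enable convergence logs saved every 50 iterations, with
    # plots every 1000 iterations (a multiple of `save_periodicity`, as required),
    # overwriting any previous logs folder. The path "logs/fit" is a hypothetical example:
    #
    #     settings = AlgorithmSettings("mcmc_saem")
    #     settings.set_logs(
    #         path="logs/fit",
    #         save_periodicity=50,
    #         plot_periodicity=1000,
    #         overwrite_logs_folder=True,
    #     )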
    def _manage_kwargs(self, kwargs):
        _special_kwargs = {
            "seed": self._get_seed,
            "algorithm_initialization_method": self._get_algorithm_initialization_method,
            "device": self._get_device,
        }

        for k, v in kwargs.items():
            if k in _special_kwargs:
                k_getter = _special_kwargs[k]
                setattr(self, k, k_getter(kwargs))

        kwargs_interpreted_as_parameters = {
            k: v for k, v in kwargs.items() if k not in _special_kwargs
        }
        self._recursive_merge_dict_warn_extra_keys(
            self.parameters, kwargs_interpreted_as_parameters
        )

        # Dynamic default parameters
        if self.name in self._dynamic_default_parameters:
            for func_condition, associated_defaults in self._dynamic_default_parameters[
                self.name
            ]:
                if not func_condition(kwargs):
                    continue
                # Loop on the dynamic defaults
                for nested_levels, val_getter in associated_defaults.items():
                    # Check that the dynamic default we want to set was not already overwritten
                    if self._get_nested_dict(kwargs, nested_levels) is None:
                        self._set_nested_dict(
                            self.parameters, nested_levels, val_getter(kwargs)
                        )

    @staticmethod
    def _get_nested_dict(nested_dict: dict, nested_levels, default=None):
        """
        Get a nested key of a dict, or `default` if any intermediate level is missing.

        Examples
        --------
        >>> _get_nested_dict(d, ('a','b'), -1) == ...
        * -1 if 'a' not in d
        * -1 if 'b' not in d['a']
        * d['a']['b'] otherwise

        >>> _get_nested_dict(d, (), ...) == d
        """
        it_levels = iter(nested_levels)
        while isinstance(nested_dict, dict):
            try:
                next_lvl = next(it_levels)
            except StopIteration:
                break
            # Go down to the next-level dict
            nested_dict = nested_dict.get(next_lvl, default)
        return nested_dict

    @classmethod
    def _set_nested_dict(cls, nested_dict: dict, nested_levels, val):
        """
        Set a nested key of a dict.

        Precondition: all intermediate levels must exist.
        """
        *nested_top_levels, last_level = nested_levels
        dict_to_set = cls._get_nested_dict(nested_dict, nested_top_levels, default=None)
        assert isinstance(dict_to_set, dict)
        dict_to_set[last_level] = val  # in-place

    def _load_default_values(self, path_to_algorithm_settings: Path):
        from leaspy.algo import get_algorithm_class

        with open(path_to_algorithm_settings) as fp:
            settings = json.load(fp)

        self._check_default_settings(settings)
        # TODO: Urgent => the following logic should in fact be algorithm-name specific!! (as for the constant prediction)
        # Etienne: I'd advocate for putting all non-generic / parametric stuff in special methods / attributes
        # of the corresponding algos... so that everything is generic here
        # Igor: Agreed. This class became a real mess.

        self.name = self._get_name(settings)
        self.parameters = self._get_parameters(settings)
        self.algorithm_initialization_method = (
            self._get_algorithm_initialization_method(settings)
        )

        # Optional hyperparameters depending on the type of algorithm
        algo_class = get_algorithm_class(self.name)
        if not algo_class.deterministic:
            self.seed = self._get_seed(settings)
        if "device" in settings:
            self.device = self._get_device(settings)

    @classmethod
    def _check_default_settings(cls, settings: dict):
        from leaspy.algo import get_algorithm_class

        unknown_keys = set(settings.keys()).difference(cls._known_keys)
        if unknown_keys:
            raise LeaspyAlgoInputError(
                f"Unexpected keys {unknown_keys} in algorithm settings."
            )

        error_tpl = "The '{}' key is missing in the algorithm settings (JSON file) you are loading."

        for mandatory_key in ("name", "parameters"):
            if mandatory_key not in settings.keys():
                raise LeaspyAlgoInputError(error_tpl.format(mandatory_key))

        algo_class = get_algorithm_class(settings["name"])
        if not algo_class.deterministic and "seed" not in settings:
            raise LeaspyAlgoInputError(error_tpl.format("seed"))
        if "algorithm_initialization_method" not in settings:
            raise LeaspyAlgoInputError(
                error_tpl.format("algorithm_initialization_method")
            )

    @staticmethod
    def _get_name(settings: dict) -> str:
        return settings["name"].lower()

    @staticmethod
    def _get_parameters(settings: dict) -> dict:
        return settings["parameters"]

    @staticmethod
    def _get_seed(settings: dict) -> Optional[int]:
        if settings["seed"] is None:
            return None
        try:
            return int(settings["seed"])
        except Exception:
            warnings.warn(
                f"The 'seed' parameter you provided ({settings['seed']}) cannot be converted to int, using None instead."
            )
            return None

    @staticmethod
    def _get_algorithm_initialization_method(settings: dict) -> Optional[str]:
        if settings["algorithm_initialization_method"] is None:
            return None
        # TODO: there should be a list of the possible initialization methods.
        # It could also depend on the algorithm's name.
        return settings["algorithm_initialization_method"]

    @staticmethod
    def _get_device(settings: dict):
        # If a torch.device object was used, convert it to the corresponding
        # string (torch.device('cuda') is converted into 'cuda') so that the
        # AlgorithmSettings can be saved into json files if needed
        if isinstance(settings["device"], torch.device):
            return settings["device"].type
        # Getting the type of torch.device(...) converts 'cuda:2' into 'cuda',
        # which prevents potential issues when using torch.set_default_tensor_type
        return torch.device(settings["device"]).type
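# Illustrative sketch: `_get_device` normalizes device specifications down to the
# bare device type, so indexed CUDA devices lose their index:
#
#     AlgorithmSettings._get_device({"device": torch.device("cuda:1")})  # -> "cuda"
#     AlgorithmSettings._get_device({"device": "cpu"})                   # -> "cpu"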