"""This module defines the `OutputsSettings` and `AlgorithmSettings` classes."""

import json
import os
import shutil
import warnings
from pathlib import Path
from typing import Optional, Union

import torch

from leaspy.exceptions import LeaspyAlgoInputError
from leaspy.utils.typing import KwargsType

algo_default_data_dir = Path(__file__).parent.parent / "algo" / "data"

__all__ = [
    "OutputsSettings",
    "AlgorithmSettings",
    "algo_default_data_dir",
]


class OutputsSettings:
    """
    Used to create the `logs` folder to monitor the convergence of the fit algorithm.

    Parameters
    ----------
    settings : :obj:`dict` [:obj:`str`, Any]
        * path : :obj:`str` or None
            Path used to store the logs. If None, the default path "./_outputs/" is used.
        * print_periodicity : :obj:`int` >= 1 or None
            Print information every N iterations.
        * save_periodicity : :obj:`int` >= 1, optional
            Save convergence data every N iterations. Default=50.
        * plot_periodicity : :obj:`int` >= 1 or None
            Plot convergence data every N iterations. If None, no plots are saved.
            Note that plotting requires saving, so `plot_periodicity` must be a
            multiple of `save_periodicity`.
        * plot_patient_periodicity : :obj:`int` >= 1 or None
            Plot the patients' reconstructions every N iterations.
        * plot_sourcewise : :obj:`bool`
            Flag to plot source-based multidimensional parameters (such as
            `mixing_matrix`) for each source. Otherwise, they are plotted along
            the other dimension (such as feature). Default=False.
        * overwrite_logs_folder : :obj:`bool`
            Flag to remove all previous logs if existing. Default=False.
        * nb_of_patients_to_plot : :obj:`int` >= 1
            Number of patients to plot. Default=5.

    Raises
    ------
    :exc:`.LeaspyAlgoInputError`
    """

    DEFAULT_LOGS_DIR = "_outputs"

    def __init__(self, settings):
        self.print_periodicity = None
        self.plot_periodicity = None
        self.save_periodicity = None
        self.plot_patient_periodicity = None
        self.plot_sourcewise = False
        self.nb_of_patients_to_plot = 5

        self.root_path = None
        self.parameter_convergence_path = None
        self.plot_path = None
        self.patients_plot_path = None

        self._set_print_periodicity(settings)
        self._set_save_periodicity(settings)
        self._set_plot_periodicity(settings)
        self._set_nb_of_patients_to_plot(settings)
        self._set_plot_sourcewise(settings)
        self._set_plot_patient_periodicity(settings)

        # Only create folders if the user wants to save data or plots and provided a valid path!
        self._create_root_folder(settings)

    def _set_param_as_int_or_ignore(self, settings: dict, param: str):
        """In-place set of parameter (as int) from settings."""
        if param not in settings:
            return
        val = settings[param]
        if val is not None:
            # Try to cast as an integer >= 1; ignore the value otherwise.
            try:
                val = int(val)
                assert val >= 1
            except Exception:
                warnings.warn(
                    f"The '{param}' parameter you provided is not castable to an int > 0. "
                    "Ignoring its value.",
                    UserWarning,
                )
                return
        # Update the attribute of self in-place
        setattr(self, param, val)

    def _set_plot_sourcewise(self, settings: dict):
        self.plot_sourcewise = settings["plot_sourcewise"]

    def _set_nb_of_patients_to_plot(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "nb_of_patients_to_plot")

    def _set_print_periodicity(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "print_periodicity")

    def _set_save_periodicity(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "save_periodicity")

    def _set_plot_periodicity(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "plot_periodicity")

    def _set_plot_patient_periodicity(self, settings: dict):
        self._set_param_as_int_or_ignore(settings, "plot_patient_periodicity")
        # This setter is called last, so both periodicities are known here and
        # their cross-consistency can be validated.
        if self.plot_periodicity is not None:
            if self.save_periodicity is None:
                raise LeaspyAlgoInputError(
                    "You cannot define a `plot_periodicity` without defining `save_periodicity`. "
                    "Note that `plot_periodicity` should be a multiple of `save_periodicity`."
                )
            if self.plot_periodicity % self.save_periodicity != 0:
                raise LeaspyAlgoInputError(
                    "The `plot_periodicity` should be a multiple of `save_periodicity`."
                )

    def _create_root_folder(self, settings: dict):
        # Get the path where the outputs will be stored
        path = settings.get("path", None)
        if path is None:
            if self.save_periodicity:
                warnings.warn(
                    f"Outputs will be saved in '{self.DEFAULT_LOGS_DIR}' relative to the current working directory",
                    stacklevel=2,
                )
                path = Path.cwd() / self.DEFAULT_LOGS_DIR
                if path.exists():
                    self._clean_folder(path)
            else:
                return
        else:
            path = Path.cwd() / path
        settings["path"] = str(path)

        # If the folder does not exist, warn that it (and its parents) will be created
        if not path.exists():
            warnings.warn(
                f"The logs path you provided ({settings['path']}) does not exist. "
                "Needed paths will be created (and their parents if needed).",
                stacklevel=2,
            )
        elif settings.get("overwrite_logs_folder", False):
            warnings.warn(f"Overwriting '{path}' folder...")
            self._clean_folder(path)

        all_ok = self._check_needed_folders_are_empty_or_create_them(path)
        if not all_ok:
            raise LeaspyAlgoInputError(
                f"The logs folder '{path}' already exists and is not empty! "
                "Give another path or use keyword argument `overwrite_logs_folder=True`."
            )

    @staticmethod
    def _check_folder_is_empty_or_create_it(path_folder: Path) -> bool:
        if path_folder.exists():
            if (
                os.path.islink(path_folder)
                or not path_folder.is_dir()
                or any(path_folder.iterdir())
            ):
                return False
        else:
            path_folder.mkdir(parents=True, exist_ok=True)
        return True

    @staticmethod
    def _clean_folder(path: Path):
        shutil.rmtree(path)
        path.mkdir(exist_ok=True, parents=True)

    def _check_needed_folders_are_empty_or_create_them(self, path: Path) -> bool:
        self.root_path = path
        self.parameter_convergence_path = path / "parameter_convergence"
        self.plot_path = path / "plots"
        self.patients_plot_path = self.plot_path / "patients"

        all_ok = self._check_folder_is_empty_or_create_it(self.parameter_convergence_path)
        all_ok &= self._check_folder_is_empty_or_create_it(self.plot_path)
        all_ok &= self._check_folder_is_empty_or_create_it(self.patients_plot_path)

        return all_ok
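# Illustrative sketch (not part of the library's examples): `OutputsSettings` is
# normally built indirectly through `AlgorithmSettings.set_logs`, but it can also
# be instantiated directly from a plain settings dict. All keys below are read by
# the constructor (`plot_sourcewise` in particular is read unconditionally), and
# a relative path is resolved against the current working directory:
#
#     logs = OutputsSettings({
#         "path": "my_logs",
#         "print_periodicity": 100,
#         "save_periodicity": 50,
#         "plot_periodicity": 1000,          # must be a multiple of save_periodicity
#         "plot_patient_periodicity": None,
#         "plot_sourcewise": False,
#         "overwrite_logs_folder": True,     # wipe any previous content of my_logs/
#         "nb_of_patients_to_plot": 5,
#     })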
class AlgorithmSettings:
    """
    Used to set the algorithms' settings.

    All parameters, except the algorithm name, have default values which can be
    overwritten by the user.

    Parameters
    ----------
    name : :obj:`str`
        The algorithm's name. Must be one of:
            * For `fit` algorithms:
                * ``'mcmc_saem'``
                * ``'lme_fit'`` (for LME model only)
            * For `personalize` algorithms:
                * ``'scipy_minimize'``
                * ``'mean_real'``
                * ``'mode_real'``
                * ``'constant_prediction'`` (for constant model only)
                * ``'lme_personalize'`` (for LME model only)
            * For `simulate` algorithms:
                * ``'simulation'``

    **kwargs : any
        Depending on the algorithm, various parameters are possible:
            * seed : :obj:`int`, optional, default None
                Used for stochastic algorithms.
            * algorithm_initialization_method : :obj:`str`, optional
                Personalize the algorithm initialization method, according to those
                possible for the given algorithm (refer to its documentation in
                :mod:`leaspy.algo`).
            * n_iter : :obj:`int`, optional
                Number of iterations. Note that there is no stopping criterion for
                MCMC SAEM algorithms.
            * n_burn_in_iter : :obj:`int`, optional
                Number of iterations of the burn-in phase, used for the MCMC SAEM
                algorithms.
            * use_jacobian : :obj:`bool`, optional, default True
                Used in the ``scipy_minimize`` algorithm to perform an `L-BFGS`
                instead of a `Powell` algorithm.
            * n_jobs : :obj:`int`, optional, default 1
                Used in the ``scipy_minimize`` algorithm to accelerate computation
                with parallel derivation using joblib.
            * progress_bar : :obj:`bool`, optional, default True
                Used to display a progress bar during computation.
            * device : :obj:`str` or :class:`torch.device`, optional
                Specifies on which device the algorithm will run. Only 'cpu' and
                'cuda' are supported. Only the ``'mcmc_saem'``, ``'mean_real'`` and
                ``'mode_real'`` algorithms support this setting.

        For the complete list of the available parameters for a given algorithm,
        please refer directly to its documentation.

    Attributes
    ----------
    name : :obj:`str`
        The algorithm's name.
    algorithm_initialization_method : :obj:`str`, optional
        Personalize the algorithm initialization method, according to those possible
        for the given algorithm (refer to its documentation in :mod:`leaspy.algo`).
    seed : :obj:`int`, optional, default None
        Used for stochastic algorithms.
    parameters : :obj:`dict`
        Contains the other parameters: `n_iter`, `n_burn_in_iter`, `use_jacobian`,
        `n_jobs` & `progress_bar`.
    logs : :class:`.OutputsSettings`, optional
        Used to create a ``logs`` folder containing convergence information during
        model fitting.
    device : :obj:`str` (or :class:`torch.device`), optional, default 'cpu'
        Specifies the computation device. Only 'cpu' and 'cuda' are supported.
        Note that specifying an indexed CUDA device (such as 'cuda:1') is not
        supported. To select a precise CUDA device index, use the
        `CUDA_VISIBLE_DEVICES` environment variable.

    Raises
    ------
    :exc:`.LeaspyAlgoInputError`

    Notes
    -----
    Developers can use `_dynamic_default_parameters` to define settings that depend
    on other parameters when not explicitly specified by the user.
    """

    # TODO: should this be in each algo class directly?
    _dynamic_default_parameters = {
        "lme_fit": [
            (
                lambda kw: "force_independent_random_effects" in kw
                and kw["force_independent_random_effects"],
                {
                    ("method",): lambda kw: [
                        "lbfgs",
                        "bfgs",
                    ]  # Powell & Nelder-Mead methods cannot ensure respect of "free"
                },
            )
        ]
    }

    # Known keys for all algorithms (<!> not all of them are mandatory!)
    _known_keys = [
        "name",
        "seed",
        "algorithm_initialization_method",
        "parameters",
        "device",
    ]  # 'logs' is not handled in exported files

    def __init__(self, name: str, **kwargs):
        from leaspy.algo import AlgorithmName

        self.name: AlgorithmName = AlgorithmName(name)
        self.parameters: Optional[KwargsType] = None
        self.seed: Optional[int] = None
        self.algorithm_initialization_method: Optional[str] = None
        self.logs: Optional[OutputsSettings] = None

        default_algo_settings_path = (
            algo_default_data_dir / f"default_{self.name.value}.json"
        )
        if default_algo_settings_path.is_file():
            self._load_default_values(default_algo_settings_path)
        else:
            raise LeaspyAlgoInputError(
                f"The algorithm name '{self.name.value}' you provided does not exist"
            )

        self._manage_kwargs(kwargs)
        self.check_consistency()
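    # Illustrative sketch: keyword arguments that are not among the special keys
    # (`seed`, `algorithm_initialization_method`, `device`) are merged into
    # `parameters` on top of the algorithm's default JSON file. A parameter name
    # such as `n_iter` is assumed to exist in the defaults of the chosen algorithm:
    #
    #     settings = AlgorithmSettings("mcmc_saem", seed=0, n_iter=2000)
    #
    # Dynamic defaults are resolved the same way; for instance with `lme_fit`,
    # requesting independent random effects restricts the optimization method
    # unless `method` was explicitly given (see `_dynamic_default_parameters` above):
    #
    #     lme_settings = AlgorithmSettings("lme_fit", force_independent_random_effects=True)
    #     # lme_settings.parameters["method"] == ["lbfgs", "bfgs"]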
    def check_consistency(self) -> None:
        """
        Check the internal consistency of the algorithm settings and warn, or raise
        a `LeaspyAlgoInputError`, if they are inconsistent.
        """
        from .algo_with_device import AlgorithmWithDeviceMixin
        from .base import get_algorithm_class

        algo_class = get_algorithm_class(self.name)
        if self.seed is not None and algo_class.deterministic:
            warnings.warn(
                f"You can skip defining `seed` since the algorithm {self.name} is deterministic."
            )
        if hasattr(self, "device") and not issubclass(
            algo_class, AlgorithmWithDeviceMixin
        ):
            warnings.warn(
                f'The algorithm "{self.name}" does not support user-specified devices (this '
                "is supported only for specific algorithms) and will use the default device (CPU)."
            )
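    # Illustrative sketch: `check_consistency` runs automatically at the end of
    # `__init__`, so simply constructing inconsistent settings triggers a warning.
    # Whether a given algorithm is deterministic or device-aware depends on its
    # class, so the exact outcome below is an assumption:
    #
    #     AlgorithmSettings("constant_prediction", seed=42)
    #     # -> warns that `seed` can be skipped if this algorithm is deterministic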
    @classmethod
    def _recursive_merge_dict_warn_extra_keys(
        cls, ref: dict, new: dict, *, prefix_keys: str = ""
    ):
        """Merge dictionary `ref` in-place with the values from `new`; for dict values, the merge is recursive."""
        extra_keys = [prefix_keys + k for k in new if k not in ref]
        if extra_keys:
            warnings.warn(
                f"The parameters {extra_keys} were not present by default and are likely to be unsupported."
            )
        for k, v in new.items():
            if k not in ref or not isinstance(ref[k], dict):
                ref[k] = v
            else:
                if not isinstance(v, dict):
                    raise LeaspyAlgoInputError(
                        f"Algorithm parameter `{prefix_keys + k}` should be a dictionary, not '{v}' of type {type(v)}."
                    )
                cls._recursive_merge_dict_warn_extra_keys(
                    ref[k], v, prefix_keys=f"{prefix_keys}{k}."
                )
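    # Illustrative sketch of the merge semantics: nested defaults that the user
    # did not override are kept, and keys absent from the defaults trigger a warning:
    #
    #     ref = {"a": 1, "nested": {"x": 10, "y": 20}}
    #     AlgorithmSettings._recursive_merge_dict_warn_extra_keys(ref, {"nested": {"y": 99}})
    #     # ref == {"a": 1, "nested": {"x": 10, "y": 99}}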
    @classmethod
    def load(cls, path_to_algorithm_settings: Union[str, Path]):
        """Instantiate an AlgorithmSettings object from a json file.

        Parameters
        ----------
        path_to_algorithm_settings : :obj:`str` or :class:`Path`
            Path of the json file.

        Returns
        -------
        :class:`.AlgorithmSettings`
            An instance of AlgorithmSettings with the specified parameters.

        Raises
        ------
        :exc:`.LeaspyAlgoInputError`
            If anything is invalid in the algorithm settings.

        Examples
        --------
        >>> from leaspy.algo import AlgorithmSettings
        >>> leaspy_univariate = AlgorithmSettings.load('outputs/leaspy-univariate_model-settings.json')
        """
        with open(path_to_algorithm_settings) as fp:
            settings = json.load(fp)

        if "name" not in settings.keys():
            raise LeaspyAlgoInputError("Your json file must contain a 'name' attribute!")

        algorithm_settings = cls(settings["name"])

        if "parameters" in settings.keys():
            print("You overwrote the algorithm default parameters")
            cls._recursive_merge_dict_warn_extra_keys(
                algorithm_settings.parameters, cls._get_parameters(settings)
            )

        if "seed" in settings.keys():
            print("You overwrote the algorithm default seed")
            algorithm_settings.seed = cls._get_seed(settings)

        if "algorithm_initialization_method" in settings.keys():
            print("You overwrote the algorithm default initialization method")
            algorithm_settings.algorithm_initialization_method = (
                cls._get_algorithm_initialization_method(settings)
            )

        if "device" in settings.keys():
            print("You overwrote the algorithm default device")
            algorithm_settings.device = cls._get_device(settings)

        if "loss" in settings.keys():
            raise LeaspyAlgoInputError(
                "`loss` keyword for AlgorithmSettings is not supported anymore. "
                "Please define `noise_model` directly in your Leaspy model."
            )

        # TODO: this class should really be refactored so as not to duplicate the
        # same logic in 3 methods (_manage_kwargs, load & _check_default_settings)
        unknown_keys = set(settings.keys()).difference(cls._known_keys)
        if unknown_keys:
            raise LeaspyAlgoInputError(
                f"Unexpected keys {unknown_keys} in algorithm settings."
            )

        algorithm_settings.check_consistency()

        return algorithm_settings
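    # Illustrative sketch: a minimal JSON file accepted by `load`. Only "name" is
    # mandatory; any other known key overrides the algorithm's defaults (the
    # parameter below assumes `use_jacobian` is among scipy_minimize's defaults,
    # as its class docstring suggests):
    #
    #     {
    #       "name": "scipy_minimize",
    #       "parameters": {"use_jacobian": false}
    #     }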
    def save(self, path: Union[str, Path], **kwargs):
        """
        Save an AlgorithmSettings object to a json file.

        TODO? save the leaspy version as well, for retro/forward-compatibility?

        Parameters
        ----------
        path : :obj:`str` or :class:`Path`
            Path where the AlgorithmSettings will be stored.
        **kwargs
            Keyword arguments for the json.dump method. Default: dict(indent=2).

        Examples
        --------
        >>> from leaspy.algo import AlgorithmSettings
        >>> settings = AlgorithmSettings("scipy_minimize", seed=42)
        >>> settings.save("outputs/scipy_minimize-settings.json")
        """
        json_settings = {
            "name": self.name,
            "seed": self.seed,
            "algorithm_initialization_method": self.algorithm_initialization_method,
        }
        if hasattr(self, "device"):
            json_settings["device"] = self.device

        # TODO: save the logging config as well (OutputsSettings needs to be JSON serializable...)
        # if self.logs is not None:
        #     json_settings['logs'] = self.logs

        # Append the "parameters" key after the hyperparameters
        json_settings["parameters"] = self.parameters

        # Default json.dump kwargs:
        kwargs = {"indent": 2, **kwargs}

        with open(path, "w") as json_file:
            json.dump(json_settings, json_file, **kwargs)
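    # Illustrative sketch: settings are meant to survive a save/load roundtrip
    # (assuming the target directory exists):
    #
    #     settings = AlgorithmSettings("mcmc_saem", seed=0)
    #     settings.save("outputs/mcmc_saem-settings.json")
    #     reloaded = AlgorithmSettings.load("outputs/mcmc_saem-settings.json")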
    def set_logs(self, **kwargs):
        """
        Use this method to monitor the convergence of a model fit.

        This method creates CSV files and plots to track the evolution of the
        population parameters (i.e., fixed effects) during the fit.

        Parameters
        ----------
        **kwargs
            * path : :obj:`str`, optional
                The path of the folder where graphs and csv files will be saved.
                If None, `DEFAULT_LOGS_DIR` will be used.
            * print_periodicity : :obj:`int`, optional, default 100
                Prints every N iterations.
            * save_periodicity : :obj:`int`, optional, default 50
                Saves the values in csv files every N iterations.
            * plot_periodicity : :obj:`int`, optional, default 1000
                Generates plots from saved values every N iterations.
                Notes:
                    * Must be a multiple of `save_periodicity`.
                    * Setting this value too low may significantly slow down the fit.
            * plot_patient_periodicity : :obj:`int`, optional
                Frequency (in iterations) at which the patients' reconstructions are plotted.
            * plot_sourcewise : :obj:`bool`, optional, default False
                Set this to True to plot the source-based parameters sourcewise.
            * overwrite_logs_folder : :obj:`bool`, optional, default False
                Set it to ``True`` to overwrite the content of the folder in ``path``.
            * nb_of_patients_to_plot : :obj:`int`, optional, default 5
                Number of patients to plot.

        Raises
        ------
        :exc:`.LeaspyAlgoInputError`
            If the folder given in ``path`` already exists and ``overwrite_logs_folder``
            is set to ``False``.

        Notes
        -----
        By default, if the folder given in ``path`` already exists, the method will
        raise an error. To overwrite the content of the folder, set
        ``overwrite_logs_folder`` to ``True``.
        """
        # TODO: all this logic should be delegated to the dedicated OutputsSettings class...!
        default_settings = {
            "path": None,
            "print_periodicity": None,
            "save_periodicity": None,
            "plot_periodicity": None,
            "plot_patient_periodicity": None,
            "plot_sourcewise": False,
            "overwrite_logs_folder": False,
            "nb_of_patients_to_plot": 5,
        }

        settings = default_settings.copy()

        for k, v in kwargs.items():
            if k in (
                "print_periodicity",
                "plot_periodicity",
                "save_periodicity",
                "plot_patient_periodicity",
                "nb_of_patients_to_plot",
            ):
                if v is not None and not isinstance(v, int):
                    raise LeaspyAlgoInputError(
                        f"You must provide an integer for the input <{k}>! "
                        f"You provided {v} of type {type(v)}."
                    )
                settings[k] = v
            elif k in ("plot_sourcewise", "overwrite_logs_folder"):
                if not isinstance(v, bool):
                    raise LeaspyAlgoInputError(
                        f"You must provide a boolean for the input <{k}>! "
                        f"You provided {v} of type {type(v)}."
                    )
                settings[k] = v
            elif k == "path":
                if v is not None and not isinstance(v, (str, Path)):
                    raise LeaspyAlgoInputError(
                        f"You must provide a string or Path for the input <{k}>! "
                        f"You provided {v} of type {type(v)}."
                    )
                settings[k] = v

        if settings != default_settings:
            self.logs = OutputsSettings(settings)
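    # Illustrative sketch: enable convergence logs saved every 50 iterations, with
    # plots every 1000 iterations (a multiple of `save_periodicity`, as required),
    # overwriting any previous logs folder. The path "logs/fit" is a hypothetical example:
    #
    #     settings = AlgorithmSettings("mcmc_saem")
    #     settings.set_logs(
    #         path="logs/fit",
    #         save_periodicity=50,
    #         plot_periodicity=1000,
    #         overwrite_logs_folder=True,
    #     )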
    def _manage_kwargs(self, kwargs):
        _special_kwargs = {
            "seed": self._get_seed,
            "algorithm_initialization_method": self._get_algorithm_initialization_method,
            "device": self._get_device,
        }

        for k, v in kwargs.items():
            if k in _special_kwargs:
                k_getter = _special_kwargs[k]
                setattr(self, k, k_getter(kwargs))

        kwargs_interpreted_as_parameters = {
            k: v for k, v in kwargs.items() if k not in _special_kwargs
        }
        self._recursive_merge_dict_warn_extra_keys(
            self.parameters, kwargs_interpreted_as_parameters
        )

        # Dynamic default parameters
        if self.name in self._dynamic_default_parameters:
            for func_condition, associated_defaults in self._dynamic_default_parameters[
                self.name
            ]:
                if not func_condition(kwargs):
                    continue
                # Loop on the dynamic defaults
                for nested_levels, val_getter in associated_defaults.items():
                    # Check that the dynamic default we want to set was not already overwritten
                    if self._get_nested_dict(kwargs, nested_levels) is None:
                        self._set_nested_dict(
                            self.parameters, nested_levels, val_getter(kwargs)
                        )

    @staticmethod
    def _get_nested_dict(nested_dict: dict, nested_levels, default=None):
        """
        Get a nested key of a dict, or `default` if any intermediate level is missing.

        Examples
        --------
        >>> _get_nested_dict(d, ('a','b'), -1) == ...
        * -1 if 'a' not in d
        * -1 if 'b' not in d['a']
        * d['a']['b'] otherwise

        >>> _get_nested_dict(d, (), ...) == d
        """
        it_levels = iter(nested_levels)
        while isinstance(nested_dict, dict):
            try:
                next_lvl = next(it_levels)
            except StopIteration:
                break
            # Go down to the next-level dict
            nested_dict = nested_dict.get(next_lvl, default)
        return nested_dict

    @classmethod
    def _set_nested_dict(cls, nested_dict: dict, nested_levels, val):
        """
        Set a nested key of a dict.

        Precondition: all intermediate levels must exist.
        """
        *nested_top_levels, last_level = nested_levels
        dict_to_set = cls._get_nested_dict(nested_dict, nested_top_levels, default=None)
        assert isinstance(dict_to_set, dict)
        dict_to_set[last_level] = val  # in-place

    def _load_default_values(self, path_to_algorithm_settings: Path):
        from leaspy.algo import get_algorithm_class

        with open(path_to_algorithm_settings) as fp:
            settings = json.load(fp)

        self._check_default_settings(settings)
        # TODO: Urgent => the following logic should in fact be algorithm-name specific!! (as for the constant prediction)
        # Etienne: I'd advocate for putting all non-generic / parametric stuff in special methods / attributes
        # of the corresponding algos... so that everything is generic here
        # Igor: Agreed. This class became a real mess.

        self.name = self._get_name(settings)
        self.parameters = self._get_parameters(settings)
        self.algorithm_initialization_method = (
            self._get_algorithm_initialization_method(settings)
        )

        # Optional hyperparameters depending on the type of algorithm
        algo_class = get_algorithm_class(self.name)
        if not algo_class.deterministic:
            self.seed = self._get_seed(settings)
        if "device" in settings:
            self.device = self._get_device(settings)

    @classmethod
    def _check_default_settings(cls, settings: dict):
        from leaspy.algo import get_algorithm_class

        unknown_keys = set(settings.keys()).difference(cls._known_keys)
        if unknown_keys:
            raise LeaspyAlgoInputError(
                f"Unexpected keys {unknown_keys} in algorithm settings."
            )

        error_tpl = "The '{}' key is missing in the algorithm settings (JSON file) you are loading."

        for mandatory_key in ("name", "parameters"):
            if mandatory_key not in settings.keys():
                raise LeaspyAlgoInputError(error_tpl.format(mandatory_key))

        algo_class = get_algorithm_class(settings["name"])
        if not algo_class.deterministic and "seed" not in settings:
            raise LeaspyAlgoInputError(error_tpl.format("seed"))
        if "algorithm_initialization_method" not in settings:
            raise LeaspyAlgoInputError(
                error_tpl.format("algorithm_initialization_method")
            )

    @staticmethod
    def _get_name(settings: dict) -> str:
        return settings["name"].lower()

    @staticmethod
    def _get_parameters(settings: dict) -> dict:
        return settings["parameters"]

    @staticmethod
    def _get_seed(settings: dict) -> Optional[int]:
        if settings["seed"] is None:
            return None
        try:
            return int(settings["seed"])
        except Exception:
            warnings.warn(
                f"The 'seed' parameter you provided ({settings['seed']}) cannot be converted to int, using None instead."
            )
            return None

    @staticmethod
    def _get_algorithm_initialization_method(settings: dict) -> Optional[str]:
        if settings["algorithm_initialization_method"] is None:
            return None
        # TODO: there should be a list of the possible initialization methods.
        # It could also depend on the algorithm's name.
        return settings["algorithm_initialization_method"]

    @staticmethod
    def _get_device(settings: dict):
        # If a torch.device object was used, convert it to the corresponding
        # string (torch.device('cuda') is converted into 'cuda') so that the
        # AlgorithmSettings can be saved into json files if needed
        if isinstance(settings["device"], torch.device):
            return settings["device"].type
        # Getting the type of torch.device(...) converts 'cuda:2' into 'cuda',
        # which prevents potential issues when using torch.set_default_tensor_type
        return torch.device(settings["device"]).type
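# Illustrative sketch: `_get_device` normalizes device specifications down to the
# bare device type, so indexed CUDA devices lose their index:
#
#     AlgorithmSettings._get_device({"device": torch.device("cuda:1")})  # -> "cuda"
#     AlgorithmSettings._get_device({"device": "cpu"})                   # -> "cpu"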