Source code for simod.settings.common_settings

from dataclasses import field
from enum import Enum
from pathlib import Path
from typing import List, Optional, Union

from pix_framework.io.event_log import PROSIMOS_LOG_IDS, EventLogIDs
from pydantic import BaseModel, Field

from ..utilities import get_project_dir

# XML namespace URI of the QBP simulator schema (version tag "201212" in the URI).
# NOTE(review): presumably used to locate simulation parameters embedded in BPMN
# files -- confirm against the modules that import this constant.
QBP_NAMESPACE_URI = "http://www.qbp-simulator.com/Schema201212"
# XML namespace URI of the OMG BPMN model schema (2010-05-24 revision, per the URI).
BPMN_NAMESPACE_URI = "http://www.omg.org/spec/BPMN/20100524/MODEL"
# Value returned by ``get_project_dir()``; computed once, when this module is imported.
PROJECT_DIR = get_project_dir()


# Alternative spellings accepted by ``Metric.from_str``, mapped to the canonical
# enum value. Kept at module level on purpose: a plain (non-descriptor) attribute
# assigned inside an ``Enum`` body would itself be turned into an enum member.
_METRIC_ALIASES = {
    "2_gram_distance": "two_gram_distance",
    "n_gram": "three_gram_distance",
    "n_gram_distance": "three_gram_distance",
    "3_gram_distance": "three_gram_distance",
    "circadian_emd": "circadian_event_distribution",
    "workforce_emd": "circadian_workforce_distribution",
    "circadian_workforce": "circadian_workforce_distribution",
    "arrival_emd": "arrival_event_distribution",
    "relative_emd": "relative_event_distribution",
    "absolute_hourly_emd": "absolute_event_distribution",
    "absolute_hour_emd": "absolute_event_distribution",
    "abs_hourly_emd": "absolute_event_distribution",
    "abs_hour_emd": "absolute_event_distribution",
    "cycle_time_emd": "cycle_time_distribution",
}


class Metric(str, Enum):
    """
    Enum class storing the metrics used to evaluate the quality of a BPS model.

    Attributes
    ----------
    DL : str
        Control-flow Log Distance metric based on the Damerau-Levenshtein distance.
    TWO_GRAM_DISTANCE : str
        Two-gram distance metric.
    THREE_GRAM_DISTANCE : str
        Three-gram distance metric.
    CIRCADIAN_EMD : str
        Earth Mover's Distance (EMD) for circadian event distribution.
    CIRCADIAN_WORKFORCE_EMD : str
        EMD for circadian workforce distribution.
    ARRIVAL_EMD : str
        EMD for arrival event distribution.
    RELATIVE_EMD : str
        EMD for relative event distribution.
    ABSOLUTE_EMD : str
        EMD for absolute event distribution.
    CYCLE_TIME_EMD : str
        EMD for cycle time distribution.
    """

    DL = "dl"
    TWO_GRAM_DISTANCE = "two_gram_distance"
    THREE_GRAM_DISTANCE = "three_gram_distance"
    CIRCADIAN_EMD = "circadian_event_distribution"
    CIRCADIAN_WORKFORCE_EMD = "circadian_workforce_distribution"
    ARRIVAL_EMD = "arrival_event_distribution"
    RELATIVE_EMD = "relative_event_distribution"
    ABSOLUTE_EMD = "absolute_event_distribution"
    CYCLE_TIME_EMD = "cycle_time_distribution"

    @classmethod
    def from_str(cls, value: Union[str, List[str]]) -> "Union[Metric, List[Metric]]":
        """
        Converts a string (or list of strings) representing metric names into an instance
        (or list of instances) of the :class:`Metric` enum.

        Matching is case-insensitive and accepts, besides the canonical enum values,
        the alternative spellings listed in ``_METRIC_ALIASES``.

        Parameters
        ----------
        value : Union[str, List[str]]
            A string representing a metric name or a list (or tuple) of metric names.

        Returns
        -------
        Union[:class:`Metric`, List[:class:`Metric`]]
            An instance of :class:`Metric` if a single string is provided, or a list of
            :class:`Metric` instances if a list/tuple of strings is provided.

        Raises
        ------
        ValueError
            If a provided string does not match any metric name.
        TypeError
            If ``value`` is neither a string nor a list/tuple of strings.
        """
        # ``Metric`` members are ``str`` instances too, so they take this branch
        # and are mapped back onto themselves.
        if isinstance(value, str):
            return cls._from_str(value)
        if isinstance(value, (list, tuple)):
            return [cls._from_str(item) for item in value]
        # Fail loudly instead of silently handing ``None`` back to the caller.
        raise TypeError(
            f"Expected a metric name or a list of metric names, got {type(value).__name__}"
        )

    @classmethod
    def _from_str(cls, value: str) -> "Metric":
        """Translate one (case-insensitive) metric name or alias into a :class:`Metric`."""
        key = value.lower()
        try:
            # Aliases resolve to the canonical value; canonical values pass through as-is.
            return cls(_METRIC_ALIASES.get(key, key))
        except ValueError:
            raise ValueError(f"Unknown value {value}") from None

    def __str__(self) -> str:
        """Return the upper-cased enum value (e.g. ``"CIRCADIAN_EVENT_DISTRIBUTION"``)."""
        return self.value.upper()
def _resolve_against(base_dir: Path, raw_path) -> Path:
    """Return ``raw_path`` as a :class:`~pathlib.Path`, prefixed with ``base_dir`` when it is relative."""
    path = Path(raw_path)
    return path if path.is_absolute() else base_dir / path


class CommonSettings(BaseModel):
    """
    General configuration parameters of SIMOD and parameters common to all pipeline stages

    Attributes
    ----------
    train_log_path : :class:`~pathlib.Path`
        Path to the training log (the one used to discover the BPS model).
    log_ids : :class:`EventLogIDs`
        Dataclass storing the mapping between the column names in the CSV and their role (case_id, activity, etc.).
    test_log_path : :class:`~pathlib.Path`, optional
        Path to the event log to perform the final evaluation of the discovered BPS model (if desired).
    process_model_path : :class:`~pathlib.Path`, optional
        Path to the BPMN model for the control-flow (skip its discovery and use this one).
    perform_final_evaluation : bool
        Boolean indicating whether to perform the final evaluation of the discovered BPS model. If true, either use
        the event log in [test_log_path] if specified, or split the training log to obtain a testing set.
    num_final_evaluations : int
        Number of replications of the final evaluation to perform.
    evaluation_metrics : list
        List of :class:`Metric` evaluation metrics to use in the final evaluation.
    use_observed_arrival_distribution : bool
        Boolean indicating whether to use the distribution of observed case arrival times (true), or to discover a
        probability distribution function to model them (false).
    clean_intermediate_files : bool
        Boolean indicating whether to delete all intermediate created files.
    discover_data_attributes : bool
        Boolean indicating whether to discover data attributes and their creation/update rules.
    """

    # Log & Model parameters
    train_log_path: Path = Path("default_path.csv")
    log_ids: EventLogIDs = PROSIMOS_LOG_IDS
    test_log_path: Optional[Path] = None
    process_model_path: Optional[Path] = None
    # Final evaluation parameters
    perform_final_evaluation: bool = False
    num_final_evaluations: int = 10
    # Must be pydantic's ``Field``: ``dataclasses.field`` is not interpreted by
    # ``BaseModel`` and would leave a ``dataclasses.Field`` object as the default.
    evaluation_metrics: List[Metric] = Field(default_factory=list)
    # Common config
    use_observed_arrival_distribution: bool = False
    clean_intermediate_files: bool = True
    discover_data_attributes: bool = False

    @staticmethod
    def from_dict(config: dict, config_dir: Optional[Path] = None) -> "CommonSettings":
        """
        Instantiates the SIMOD common configuration from a dictionary.

        Parameters
        ----------
        config : dict
            Dictionary with the configuration values for the SIMOD common parameters.
            ``"train_log_path"`` is mandatory; every other key falls back to a default.
        config_dir : :class:`~pathlib.Path`, optional
            If the path to the event log(s) is specified in a relative manner, ``[config_dir]`` is used to complete
            such paths. If ``None``, relative paths are complemented with the current directory.

        Returns
        -------
        :class:`CommonSettings`
            Instance of the SIMOD common configuration for the specified dictionary values.

        Raises
        ------
        KeyError
            If ``config`` has no ``"train_log_path"`` entry.
        ValueError
            If one of the names in ``"evaluation_metrics"`` is not a known metric.
        """
        base_files_dir = config_dir or Path.cwd()

        # Training log path (mandatory)
        train_log_path = _resolve_against(base_files_dir, config["train_log_path"])

        # Log IDs
        if "log_ids" in config:
            log_ids = EventLogIDs.from_dict(config["log_ids"])
        else:
            log_ids = PROSIMOS_LOG_IDS

        # Test log and process model paths (optional; a missing key and an explicit None are equivalent)
        raw_test_log = config.get("test_log_path")
        test_log_path = None if raw_test_log is None else _resolve_against(base_files_dir, raw_test_log)
        raw_model = config.get("process_model_path")
        process_model_path = None if raw_model is None else _resolve_against(base_files_dir, raw_model)

        # Flag to perform final evaluation (forced to true if there is a test log)
        if test_log_path is not None:
            perform_final_evaluation = True
        else:
            perform_final_evaluation = config.get("perform_final_evaluation", False)

        # Number of final evaluations & metrics to evaluate
        if perform_final_evaluation:
            num_final_evaluations = config.get("num_final_evaluations", 10)
            raw_metrics = config.get("evaluation_metrics")
            if raw_metrics is None:
                # Nothing specified: evaluate with every available metric
                metrics = [
                    Metric.DL,
                    Metric.TWO_GRAM_DISTANCE,
                    Metric.THREE_GRAM_DISTANCE,
                    Metric.CIRCADIAN_EMD,
                    Metric.CIRCADIAN_WORKFORCE_EMD,
                    Metric.ARRIVAL_EMD,
                    Metric.RELATIVE_EMD,
                    Metric.ABSOLUTE_EMD,
                    Metric.CYCLE_TIME_EMD,
                ]
            else:
                # A single metric given as a bare string must not be iterated character by character
                if isinstance(raw_metrics, str):
                    raw_metrics = [raw_metrics]
                metrics = [Metric.from_str(metric) for metric in raw_metrics]
        else:
            num_final_evaluations = 0
            metrics = []

        # Quality check
        if perform_final_evaluation and num_final_evaluations == 0:
            print(
                "Wrong configuration! perform_final_evaluation=True but "
                "num_final_evaluations=0. Setting to 10 by default."
            )
            num_final_evaluations = 10

        return CommonSettings(
            train_log_path=train_log_path,
            log_ids=log_ids,
            test_log_path=test_log_path,
            process_model_path=process_model_path,
            perform_final_evaluation=perform_final_evaluation,
            num_final_evaluations=num_final_evaluations,
            evaluation_metrics=metrics,
            use_observed_arrival_distribution=config.get("use_observed_arrival_distribution", False),
            clean_intermediate_files=config.get("clean_intermediate_files", True),
            discover_data_attributes=config.get("discover_data_attributes", False),
        )

    def to_dict(self) -> dict:
        """
        Translate the common configuration stored in this instance into a dictionary.

        Paths are converted to strings (or kept as ``None``), and metrics to their string
        representation, so the result can be fed back to :meth:`from_dict`.

        Returns
        -------
        dict
            Python dictionary storing this configuration.
        """
        return {
            "train_log_path": str(self.train_log_path),
            "test_log_path": str(self.test_log_path) if self.test_log_path is not None else None,
            "log_ids": self.log_ids.to_dict(),
            "process_model_path": str(self.process_model_path) if self.process_model_path is not None else None,
            # Exported so that from_dict() keeps the evaluation setup when there is no test log
            "perform_final_evaluation": self.perform_final_evaluation,
            "num_final_evaluations": self.num_final_evaluations,
            "evaluation_metrics": [str(metric) for metric in self.evaluation_metrics],
            "use_observed_arrival_distribution": self.use_observed_arrival_distribution,
            "clean_intermediate_files": self.clean_intermediate_files,
            "discover_data_attributes": self.discover_data_attributes,
        }