nirdizati_light.log.common
import random
import math
import logging

import pm4py
import pandas as pd

logger = logging.getLogger(__name__)


def get_log(filepath: str, separator: str = ';'):
    """
    Reads an XES or CSV log.

    For CSV logs, the standard column names must be used: 'case:concept:name' for the trace id,
    'concept:name' for the activity, and 'time:timestamp' for the timestamp.

    Args:
        filepath (str): Path to the log.
        separator (str): For CSV logs, the separator character used in the file.

    Returns:
        A pm4py EventLog object.
    """
    if filepath.endswith('.xes'):
        log = pm4py.read_xes(filepath)
    elif filepath.endswith('.csv'):
        log = pd.read_csv(filepath, sep=separator)
        log['time:timestamp'] = pd.to_datetime(log['time:timestamp'])
    else:
        raise ValueError("Unsupported file extension")

    # ensure the case id column is of type str
    log['case:concept:name'] = log['case:concept:name'].astype(str)

    return pm4py.convert_to_event_log(log, case_id_key='case:concept:name')


def split_train_val_test(
    log: pd.DataFrame,
    train_perc: float,
    val_perc: float,
    test_perc: float,
    shuffle: bool = False,
    seed: int = 42
):
    """
    Splits a DataFrame containing event log data into training, validation, and test sets.

    This function divides the DataFrame based on unique case identifiers (trace_id) into
    the specified proportions for training, validation, and testing. The cases can
    optionally be shuffled before splitting to obtain a random distribution.

    Args:
        log (pd.DataFrame): The input DataFrame containing the event log data.
        train_perc (float): The proportion of the data to be used for the training set.
        val_perc (float): The proportion of the data to be used for the validation set.
        test_perc (float): The proportion of the data to be used for the test set. This value
            is not used directly when slicing, but it is checked so that the three proportions sum to 1.
        shuffle (bool): If True, the cases are shuffled before splitting. Defaults to False.
        seed (int): The seed for the random number generator when shuffling. Defaults to 42.

    Returns:
        tuple: A tuple containing three pd.DataFrame objects for the training, validation, and test sets respectively.

    Raises:
        AssertionError: If the sum of the train, validation, and test percentage splits does not equal 1.
    """
    assert math.isclose(train_perc + val_perc + test_perc, 1, rel_tol=1e-9), \
        "The sum of train_perc, val_perc, and test_perc should be equal to 1"

    cases = list(log['trace_id'].unique())

    if shuffle:
        random.seed(seed)
        random.shuffle(cases)

    # number of cases per split; the test split receives the remaining cases
    train_size = int(train_perc * len(cases))
    val_size = int(val_perc * len(cases))

    train_cases = cases[:train_size]
    val_cases = cases[train_size:train_size + val_size]
    test_cases = cases[train_size + val_size:]

    assert len(train_cases) + len(val_cases) + len(test_cases) == len(cases)

    train_df = log[log['trace_id'].isin(train_cases)]
    val_df = log[log['trace_id'].isin(val_cases)]
    test_df = log[log['trace_id'].isin(test_cases)]

    return train_df, val_df, test_df
logger = <Logger nirdizati_light.log.common (WARNING)>
def get_log(filepath: str, separator: str = ';'):
def get_log(filepath: str, separator: str = ';'):
    """
    Reads an XES or CSV log.

    For CSV logs, the standard column names must be used: 'case:concept:name' for the trace id,
    'concept:name' for the activity, and 'time:timestamp' for the timestamp.

    Args:
        filepath (str): Path to the log.
        separator (str): For CSV logs, the separator character used in the file.

    Returns:
        A pm4py EventLog object.
    """
    if filepath.endswith('.xes'):
        log = pm4py.read_xes(filepath)
    elif filepath.endswith('.csv'):
        log = pd.read_csv(filepath, sep=separator)
        log['time:timestamp'] = pd.to_datetime(log['time:timestamp'])
    else:
        raise ValueError("Unsupported file extension")

    # ensure the case id column is of type str
    log['case:concept:name'] = log['case:concept:name'].astype(str)

    return pm4py.convert_to_event_log(log, case_id_key='case:concept:name')
Reads an XES or CSV log.
For CSV logs, the standard column names must be used: 'case:concept:name' for the trace id, 'concept:name' for the activity, and 'time:timestamp' for the timestamp.
Arguments:
- filepath (str): Path to the log.
- separator (str): For CSV logs, the separator character used in the file.
Returns:
A pm4py EventLog object.
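As a quick illustration, a minimal usage sketch: the file names below are hypothetical, and a CSV log is assumed to use the default ';' separator together with the standard column names listed above.

from nirdizati_light.log.common import get_log

# hypothetical file names; any XES file, or a CSV with 'case:concept:name',
# 'concept:name', and 'time:timestamp' columns, would work the same way
xes_log = get_log('event_log.xes')
csv_log = get_log('event_log.csv', separator=';')

print(len(csv_log))  # number of traces in the returned pm4py EventLog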
def split_train_val_test(log: pd.DataFrame, train_perc: float, val_perc: float, test_perc: float, shuffle: bool = False, seed: int = 42):
def split_train_val_test(
    log: pd.DataFrame,
    train_perc: float,
    val_perc: float,
    test_perc: float,
    shuffle: bool = False,
    seed: int = 42
):
    """
    Splits a DataFrame containing event log data into training, validation, and test sets.

    This function divides the DataFrame based on unique case identifiers (trace_id) into
    the specified proportions for training, validation, and testing. The cases can
    optionally be shuffled before splitting to obtain a random distribution.

    Args:
        log (pd.DataFrame): The input DataFrame containing the event log data.
        train_perc (float): The proportion of the data to be used for the training set.
        val_perc (float): The proportion of the data to be used for the validation set.
        test_perc (float): The proportion of the data to be used for the test set. This value
            is not used directly when slicing, but it is checked so that the three proportions sum to 1.
        shuffle (bool): If True, the cases are shuffled before splitting. Defaults to False.
        seed (int): The seed for the random number generator when shuffling. Defaults to 42.

    Returns:
        tuple: A tuple containing three pd.DataFrame objects for the training, validation, and test sets respectively.

    Raises:
        AssertionError: If the sum of the train, validation, and test percentage splits does not equal 1.
    """
    assert math.isclose(train_perc + val_perc + test_perc, 1, rel_tol=1e-9), \
        "The sum of train_perc, val_perc, and test_perc should be equal to 1"

    cases = list(log['trace_id'].unique())

    if shuffle:
        random.seed(seed)
        random.shuffle(cases)

    # number of cases per split; the test split receives the remaining cases
    train_size = int(train_perc * len(cases))
    val_size = int(val_perc * len(cases))

    train_cases = cases[:train_size]
    val_cases = cases[train_size:train_size + val_size]
    test_cases = cases[train_size + val_size:]

    assert len(train_cases) + len(val_cases) + len(test_cases) == len(cases)

    train_df = log[log['trace_id'].isin(train_cases)]
    val_df = log[log['trace_id'].isin(val_cases)]
    test_df = log[log['trace_id'].isin(test_cases)]

    return train_df, val_df, test_df
Splits a DataFrame containing event log data into training, validation, and test sets.
This function divides the DataFrame based on unique case identifiers (trace_id) into the specified proportions for training, validation, and testing. The cases can optionally be shuffled before splitting to obtain a random distribution.
Arguments:
- log (pd.DataFrame): The input DataFrame containing the event log data.
- train_perc (float): The proportion of the data to be used for the training set.
- val_perc (float): The proportion of the data to be used for the validation set.
- test_perc (float): The proportion of the data to be used for the test set. This value is not used directly when slicing, but it is checked so that the three proportions sum to 1.
- shuffle (bool): If True, the cases are shuffled before splitting. Defaults to False.
- seed (int): The seed for the random number generator when shuffling. Defaults to 42.
Returns:
tuple: A tuple containing three pd.DataFrame objects for the training, validation, and test sets respectively.
Raises:
- AssertionError: If the sum of the train, validation, and test percentage splits does not equal 1.
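A small usage sketch, assuming the input DataFrame identifies cases through a 'trace_id' column as the source above expects (note this differs from the 'case:concept:name' column produced by get_log); the toy DataFrame below is made up for illustration.

import pandas as pd
from nirdizati_light.log.common import split_train_val_test

# hypothetical encoded log: one row per event, cases identified by 'trace_id'
df = pd.DataFrame({
    'trace_id': ['c1', 'c1', 'c2', 'c3', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10'],
    'activity': ['A', 'B', 'A', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'B', 'C'],
})

# 70% of the cases for training, 10% for validation, 20% for testing
train_df, val_df, test_df = split_train_val_test(df, 0.7, 0.1, 0.2, shuffle=True, seed=42)

print(train_df['trace_id'].nunique(), val_df['trace_id'].nunique(), test_df['trace_id'].nunique())
# with 10 cases this yields 7, 1, and 2 cases respectively

Because the split is performed at the case level, all events of a given case land in the same partition, and the test set simply receives whatever cases remain after the integer-sized train and validation slices.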