Skip to content

SimpleIndexEncoder Module API Reference

SimpleIndexEncoder

Bases: BaseEncoder

Source code in src/enc4ppm/simple_index_encoder.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
class SimpleIndexEncoder(BaseEncoder):
    """
    Encoder producing one row per case prefix, with one column per event position.

    Each event column holds the activity found at that position of the prefix;
    positions beyond the end of the prefix are filled with `PADDING_CAT_VAL`.
    Optionally, the latest payload columns produced by the base class are appended
    and categorical columns are expanded to one-hot vectors.
    """

    def __init__(
        self,
        *,
        include_latest_payload: bool = False,

        labeling_type: LabelingType = LabelingType.NEXT_ACTIVITY,
        attributes: list[str] | str | None = None,
        categorical_encoding: CategoricalEncoding = CategoricalEncoding.STRING,
        numerical_scaling: NumericalScaling = NumericalScaling.NONE,
        prefix_length: int | None = None,
        prefix_strategy: PrefixStrategy = PrefixStrategy.UP_TO_SPECIFIED,
        add_time_features: bool = False,
        timestamp_format: str | None = None,
        case_id_key: str = 'case:concept:name',
        activity_key: str = 'concept:name',
        timestamp_key: str = 'time:timestamp',
        outcome_key: str = 'outcome',
    ) -> None:
        """
        Initialize the SimpleIndexEncoder.

        Args:
            include_latest_payload: Whether to include (True) or not (False) the latest values of trace and event attributes. The attributes to consider can be specified through the `attributes` parameter.
            labeling_type: Label type to apply to examples.
            attributes: Which attributes to consider. Can be a list of the attributes to consider or the string 'all' (all attributes found in the log will be encoded). If omitted (None), an empty list is used, i.e. no attributes.
            categorical_encoding: How to encode categorical features. They can either remain strings (CategoricalEncoding.STRING) or be converted to one-hot vectors split across multiple columns (CategoricalEncoding.ONE_HOT).
            numerical_scaling: How to scale numerical features. They can be standardized (NumericalScaling.STANDARDIZATION) or left as-is (NumericalScaling.NONE).
            prefix_length: Maximum prefix length to consider: longer prefixes will be discarded, shorter prefixes may be discarded depending on prefix_strategy parameter. If not provided, defaults to maximum prefix length found in log. If provided, it must be a non-zero positive int number.
            prefix_strategy: Whether to consider prefix lengths from 1 to prefix_length (PrefixStrategy.UP_TO_SPECIFIED) or only the specified prefix_length (PrefixStrategy.ONLY_SPECIFIED).
            add_time_features: Whether to add time features (time since case start and time since last event) to the encoding.
            timestamp_format: Format of the timestamps in the log. If not provided, formatting will be inferred from the data.
            case_id_key: Column name for case identifiers.
            activity_key: Column name for activity names.
            timestamp_key: Column name for timestamps.
            outcome_key: Column name for outcome prediction.
        """
        # A fresh list per instance: a `[]` default in the signature would be a
        # single object shared by every encoder created without `attributes`.
        if attributes is None:
            attributes = []

        # Arguments are forwarded positionally, in the order the base class expects.
        super().__init__(
            labeling_type,
            attributes,
            categorical_encoding,
            numerical_scaling,
            prefix_length,
            prefix_strategy,
            add_time_features,
            timestamp_format,
            case_id_key,
            activity_key,
            timestamp_key,
            outcome_key,
        )

        self.include_latest_payload = include_latest_payload


    def encode(
        self,
        df: pd.DataFrame,
        *,
        freeze: bool = False,
    ) -> pd.DataFrame:
        """
        Encode the provided DataFrame with simple-index encoding and apply the specified labeling.

        Args:
            df: DataFrame to encode.
            freeze: Freeze encoder with provided parameters. Usually set to True when encoding the train log, False otherwise. Required if you want to later save the encoder to a file.

        Returns:
            The encoded DataFrame.
        """
        return super()._encode_template(df, freeze=freeze)


    def _encode(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the simple-index representation of every prefix of every case in `df`.

        For each case (events ordered by `timestamp_key`) and each prefix of that
        case, one row is emitted containing the case id, the timestamp and the
        original index label of the prefix's last event, and `self.prefix_length`
        event columns. Event columns past the end of the prefix hold
        `PADDING_CAT_VAL`; prefixes longer than `self.prefix_length` contribute only
        their first `self.prefix_length` activities.

        Args:
            df: Event log to encode.

        Returns:
            The encoded DataFrame, with latest payload columns and one-hot expansion
            applied when the encoder is configured for them.
        """
        rows = []

        for case_id, case_events in df.groupby(self.case_id_key):
            case_events = case_events.sort_values(self.timestamp_key)

            # Read the original labels straight from the index instead of from the
            # column created by reset_index(): that column is only called 'index'
            # when the index is unnamed, and reset_index() fails outright when the
            # log already has a column with that name.
            original_indices = case_events.index
            timestamps = case_events[self.timestamp_key]
            # Extracted once per case so the nested loop below does plain list
            # indexing rather than a DataFrame scalar lookup per cell.
            activities = case_events[self.activity_key].tolist()

            for current_length in range(1, len(activities) + 1):
                last_event = current_length - 1
                row = {
                    self.case_id_key: case_id,
                    self.timestamp_key: timestamps.iloc[last_event],
                    self.ORIGINAL_INDEX_KEY: original_indices[last_event],
                }

                for position in range(1, self.prefix_length + 1):
                    column = f'{self.EVENT_COL_PREFIX_NAME}_{position}'
                    if position <= current_length:
                        row[column] = self._get_activity_value(activities[position - 1])
                    else:
                        row[column] = self.PADDING_CAT_VAL

                rows.append(row)

        encoded_df = pd.DataFrame(rows)

        if self.include_latest_payload:
            encoded_df = super()._include_latest_payload(encoded_df)

        # Transform to one-hot if requested
        if self.categorical_encoding == CategoricalEncoding.ONE_HOT:
            # Activity columns: every event position shares the same set of values
            categorical_columns = [
                f'{self.EVENT_COL_PREFIX_NAME}_{position}'
                for position in range(1, self.prefix_length + 1)
            ]
            categorical_columns_possible_values = [self.log_activities] * len(categorical_columns)

            # Latest payload columns
            if self.include_latest_payload:
                for attribute_name, attribute in self.log_attributes.items():
                    if attribute['type'] != 'categorical':
                        continue

                    # For latest payload do not consider PADDING value
                    attribute_possible_values = [
                        attribute_value
                        for attribute_value in attribute['values']
                        if attribute_value != self.PADDING_CAT_VAL
                    ]

                    categorical_columns.append(f'{attribute_name}_{self.LATEST_PAYLOAD_COL_SUFFIX_NAME}')
                    categorical_columns_possible_values.append(attribute_possible_values)

            encoded_df = one_hot(
                encoded_df,
                columns=categorical_columns,
                columns_possible_values=categorical_columns_possible_values,
                unknown_value=self.UNKNOWN_VAL,
            )

        return encoded_df

__init__(*, include_latest_payload=False, labeling_type=LabelingType.NEXT_ACTIVITY, attributes=[], categorical_encoding=CategoricalEncoding.STRING, numerical_scaling=NumericalScaling.NONE, prefix_length=None, prefix_strategy=PrefixStrategy.UP_TO_SPECIFIED, add_time_features=False, timestamp_format=None, case_id_key='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp', outcome_key='outcome')

Initialize the SimpleIndexEncoder.

Parameters:

Name Type Description Default
include_latest_payload bool

Whether to include (True) or not (False) the latest values of trace and event attributes. The attributes to consider can be specified through the attributes parameter.

False
labeling_type LabelingType

Label type to apply to examples.

NEXT_ACTIVITY
attributes list[str] | str

Which attributes to consider. Can be a list of the attributes to consider or the string 'all' (all attributes found in the log will be encoded).

[]
categorical_encoding CategoricalEncoding

How to encode categorical features. They can either remain strings (CategoricalEncoding.STRING) or be converted to one-hot vectors split across multiple columns (CategoricalEncoding.ONE_HOT).

STRING
numerical_scaling NumericalScaling

How to scale numerical features. They can be standardized (NumericalScaling.STANDARDIZATION) or left as-is (NumericalScaling.NONE).

NONE
prefix_length int

Maximum prefix length to consider: longer prefixes will be discarded, shorter prefixes may be discarded depending on prefix_strategy parameter. If not provided, defaults to maximum prefix length found in log. If provided, it must be a non-zero positive int number.

None
prefix_strategy PrefixStrategy

Whether to consider prefix lengths from 1 to prefix_length (PrefixStrategy.UP_TO_SPECIFIED) or only the specified prefix_length (PrefixStrategy.ONLY_SPECIFIED).

UP_TO_SPECIFIED
add_time_features bool

Whether to add time features (time since case start and time since last event) to the encoding.

False
timestamp_format str

Format of the timestamps in the log. If not provided, formatting will be inferred from the data.

None
case_id_key str

Column name for case identifiers.

'case:concept:name'
activity_key str

Column name for activity names.

'concept:name'
timestamp_key str

Column name for timestamps.

'time:timestamp'
outcome_key str

Column name for outcome prediction.

'outcome'
Source code in src/enc4ppm/simple_index_encoder.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(
    self,
    *,
    include_latest_payload: bool = False,

    labeling_type: LabelingType = LabelingType.NEXT_ACTIVITY,
    attributes: list[str] | str | None = None,
    categorical_encoding: CategoricalEncoding = CategoricalEncoding.STRING,
    numerical_scaling: NumericalScaling = NumericalScaling.NONE,
    prefix_length: int | None = None,
    prefix_strategy: PrefixStrategy = PrefixStrategy.UP_TO_SPECIFIED,
    add_time_features: bool = False,
    timestamp_format: str | None = None,
    case_id_key: str = 'case:concept:name',
    activity_key: str = 'concept:name',
    timestamp_key: str = 'time:timestamp',
    outcome_key: str = 'outcome',
) -> None:
    """
    Initialize the SimpleIndexEncoder.

    Args:
        include_latest_payload: Whether to include (True) or not (False) the latest values of trace and event attributes. The attributes to consider can be specified through the `attributes` parameter.
        labeling_type: Label type to apply to examples.
        attributes: Which attributes to consider. Can be a list of the attributes to consider or the string 'all' (all attributes found in the log will be encoded). If omitted (None), an empty list is used, i.e. no attributes.
        categorical_encoding: How to encode categorical features. They can either remain strings (CategoricalEncoding.STRING) or be converted to one-hot vectors split across multiple columns (CategoricalEncoding.ONE_HOT).
        numerical_scaling: How to scale numerical features. They can be standardized (NumericalScaling.STANDARDIZATION) or left as-is (NumericalScaling.NONE).
        prefix_length: Maximum prefix length to consider: longer prefixes will be discarded, shorter prefixes may be discarded depending on prefix_strategy parameter. If not provided, defaults to maximum prefix length found in log. If provided, it must be a non-zero positive int number.
        prefix_strategy: Whether to consider prefix lengths from 1 to prefix_length (PrefixStrategy.UP_TO_SPECIFIED) or only the specified prefix_length (PrefixStrategy.ONLY_SPECIFIED).
        add_time_features: Whether to add time features (time since case start and time since last event) to the encoding.
        timestamp_format: Format of the timestamps in the log. If not provided, formatting will be inferred from the data.
        case_id_key: Column name for case identifiers.
        activity_key: Column name for activity names.
        timestamp_key: Column name for timestamps.
        outcome_key: Column name for outcome prediction.
    """
    # A fresh list per instance: a `[]` default in the signature would be a
    # single object shared by every encoder created without `attributes`.
    if attributes is None:
        attributes = []

    # Arguments are forwarded positionally, in the order the base class expects.
    super().__init__(
        labeling_type,
        attributes,
        categorical_encoding,
        numerical_scaling,
        prefix_length,
        prefix_strategy,
        add_time_features,
        timestamp_format,
        case_id_key,
        activity_key,
        timestamp_key,
        outcome_key,
    )

    self.include_latest_payload = include_latest_payload

encode(df, *, freeze=False)

Encode the provided DataFrame with simple-index encoding and apply the specified labeling.

Parameters:

Name Type Description Default
df DataFrame

DataFrame to encode.

required
freeze bool

Freeze encoder with provided parameters. Usually set to True when encoding the train log, False otherwise. Required if you want to later save the encoder to a file.

False

Returns:

Type Description
DataFrame

The encoded DataFrame.

Source code in src/enc4ppm/simple_index_encoder.py
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def encode(
    self,
    df: pd.DataFrame,
    *,
    freeze: bool = False,
) -> pd.DataFrame:
    """
    Run simple-index encoding on an event log and attach the configured labels.

    The work is delegated to the base class `_encode_template` method.

    Args:
        df: Event log to encode.
        freeze: Whether to freeze the encoder with the provided parameters. Usually True for the train log and False otherwise. Must be True if the encoder is to be saved to a file later.

    Returns:
        The encoded DataFrame.
    """
    encoded_df = super()._encode_template(df, freeze=freeze)
    return encoded_df