nirdizati_light.encoding.common
"""Encode an event log into a DataFrame using a selectable feature encoding."""
import logging
from enum import Enum
from typing import Optional

from pandas import DataFrame
from pm4py.objects.log.obj import EventLog

from nirdizati_light.encoding.constants import PrefixLengthStrategy, TaskGenerationType
from nirdizati_light.encoding.data_encoder import Encoder
from nirdizati_light.encoding.feature_encoder.complex_features import complex_features
from nirdizati_light.encoding.feature_encoder.frequency_features import frequency_features
from nirdizati_light.encoding.feature_encoder.loreley_complex_features import loreley_complex_features
from nirdizati_light.encoding.feature_encoder.loreley_features import loreley_features
from nirdizati_light.encoding.feature_encoder.simple_features import simple_features
from nirdizati_light.encoding.feature_encoder.binary_features import binary_features
from nirdizati_light.encoding.feature_encoder.simple_trace_features import simple_trace_features
from nirdizati_light.encoding.time_encoding import TimeEncodingType, time_encoding
from nirdizati_light.labeling.common import LabelTypes

logger = logging.getLogger(__name__)


class EncodingType(Enum):
    """
    Available trace encoding types
    """
    SIMPLE = 'simple'
    FREQUENCY = 'frequency'
    COMPLEX = 'complex'
    LORELEY = 'loreley'
    LORELEY_COMPLEX = 'loreley_complex'
    SIMPLE_TRACE = 'simple_trace'
    BINARY = 'binary'


class EncodingTypeAttribute(Enum):
    """
    Available trace attributes encoding types
    """
    LABEL = 'label'
    ONEHOT = 'onehot'


# Dispatch table from the *string value* of an EncodingType member to the
# function that builds the feature DataFrame for that encoding.
ENCODE_LOG = {
    EncodingType.SIMPLE.value: simple_features,
    EncodingType.FREQUENCY.value: frequency_features,
    EncodingType.COMPLEX.value: complex_features,
    EncodingType.LORELEY.value: loreley_features,
    EncodingType.LORELEY_COMPLEX.value: loreley_complex_features,
    EncodingType.SIMPLE_TRACE.value: simple_trace_features,
    EncodingType.BINARY.value: binary_features,
}


def get_encoded_df(
    log: EventLog,
    encoder: Optional[Encoder] = None,
    feature_encoding_type: EncodingType = EncodingType.SIMPLE.value,
    prefix_length: int = 10,
    prefix_length_strategy: PrefixLengthStrategy = PrefixLengthStrategy.FIXED.value,
    time_encoding_type: TimeEncodingType = TimeEncodingType.NONE.value,
    attribute_encoding: EncodingTypeAttribute = EncodingTypeAttribute.LABEL.value,
    padding: bool = True,
    labeling_type: LabelTypes = LabelTypes.ATTRIBUTE_STRING.value,
    task_generation_type: TaskGenerationType = TaskGenerationType.ONLY_THIS.value,
    target_event: Optional[str] = None,
    train_cols: Optional[DataFrame] = None,
    train_df: Optional[DataFrame] = None,
) -> tuple[Encoder, DataFrame]:
    """
    Encodes an event log into a DataFrame using specified encoding configurations.

    The log is first turned into a feature DataFrame by the function registered in
    ``ENCODE_LOG`` for ``feature_encoding_type``, then passed through ``time_encoding``,
    optionally aligned to the columns of ``train_df``, and finally handed to the
    encoder's ``encode`` method.

    Args:
        log (EventLog): The event log to be encoded.
        encoder (Optional[Encoder]): The encoder to be used. If None, a new Encoder is created from the encoded DataFrame using ``attribute_encoding`` and ``prefix_length``.
        feature_encoding_type (EncodingType): The type of feature encoding to use, given either as an EncodingType member or as its string value. Defaults to the value of EncodingType.SIMPLE.
        prefix_length (int): The length of the prefix to consider for each case. Defaults to 10.
        prefix_length_strategy (PrefixLengthStrategy): The strategy to use for prefix length (e.g., fixed, percentage). Defaults to the value of PrefixLengthStrategy.FIXED.
        time_encoding_type (TimeEncodingType): The type of time encoding to use. Defaults to the value of TimeEncodingType.NONE.
        attribute_encoding (EncodingTypeAttribute): The type of attribute encoding to use. Only used when a new Encoder is created. Defaults to the value of EncodingTypeAttribute.LABEL.
        padding (bool): Whether to pad sequences to a fixed length. Defaults to True.
        labeling_type (LabelTypes): The type of labeling to use for the encoded log. Defaults to the value of LabelTypes.ATTRIBUTE_STRING.
        task_generation_type (TaskGenerationType): The type of task generation to use. Defaults to the value of TaskGenerationType.ONLY_THIS.
        target_event (Optional[str]): The target event to consider for encoding. Defaults to None.
        train_cols (Optional[DataFrame]): The training columns, forwarded to the feature encoder as ``feature_list``. Defaults to None.
        train_df (Optional[DataFrame]): The training DataFrame. If given, the encoded DataFrame's columns are aligned to it (left join on the column axis). Defaults to None.

    Returns:
        Tuple[Encoder, DataFrame]: A tuple containing the encoder and the encoded DataFrame.

    Raises:
        KeyError: If ``feature_encoding_type`` does not correspond to a key of ``ENCODE_LOG``.
    """
    # ENCODE_LOG is keyed by the enum values (plain strings), so an EncodingType
    # member has to be reduced to its value before the lookup.
    if isinstance(feature_encoding_type, EncodingType):
        feature_encoding_type = feature_encoding_type.value

    logger.debug('Features encoding (%s)', feature_encoding_type)
    df = ENCODE_LOG[feature_encoding_type](
        log,
        prefix_length=prefix_length,
        padding=padding,
        prefix_length_strategy=prefix_length_strategy,
        labeling_type=labeling_type,
        generation_type=task_generation_type,
        feature_list=train_cols,
        target_event=target_event,
    )

    logger.debug('Time encoding (%s)', time_encoding_type)
    df = time_encoding(df, time_encoding_type)

    logger.debug('Dataframe alignment')
    if train_df is not None:
        # join='left' keeps train_df's columns; only the re-aligned df is needed.
        _, df = train_df.align(df, join='left', axis=1)

    # Identity check: an encoder supplied by the caller is always reused.
    if encoder is None:
        logger.debug('Encoder initialization')
        encoder = Encoder(df=df, attribute_encoding=attribute_encoding, prefix_length=prefix_length)

    logger.debug('Encoding')
    # NOTE(review): the return value is ignored, so encode() presumably works on
    # df in place -- confirm against Encoder.encode.
    encoder.encode(df=df)

    return encoder, df
logger =
<Logger nirdizati_light.encoding.common (WARNING)>
class
EncodingType(enum.Enum):
class EncodingType(Enum):
    """Trace encoding types that can be selected.

    Every member carries the string identifier of the encoding as its value.
    """

    SIMPLE = "simple"
    FREQUENCY = "frequency"
    COMPLEX = "complex"
    LORELEY = "loreley"
    LORELEY_COMPLEX = "loreley_complex"
    SIMPLE_TRACE = "simple_trace"
    BINARY = "binary"
Available trace encoding types
SIMPLE =
<EncodingType.SIMPLE: 'simple'>
FREQUENCY =
<EncodingType.FREQUENCY: 'frequency'>
COMPLEX =
<EncodingType.COMPLEX: 'complex'>
LORELEY =
<EncodingType.LORELEY: 'loreley'>
LORELEY_COMPLEX =
<EncodingType.LORELEY_COMPLEX: 'loreley_complex'>
SIMPLE_TRACE =
<EncodingType.SIMPLE_TRACE: 'simple_trace'>
BINARY =
<EncodingType.BINARY: 'binary'>
Inherited Members
- enum.Enum
- name
- value
class
EncodingTypeAttribute(enum.Enum):
class EncodingTypeAttribute(Enum):
    """Trace attribute encoding types that can be selected.

    Every member carries the string identifier of the encoding as its value.
    """

    LABEL = "label"
    ONEHOT = "onehot"
Available trace attributes encoding types
LABEL =
<EncodingTypeAttribute.LABEL: 'label'>
ONEHOT =
<EncodingTypeAttribute.ONEHOT: 'onehot'>
Inherited Members
- enum.Enum
- name
- value
ENCODE_LOG =
{'simple': <function simple_features>, 'frequency': <function frequency_features>, 'complex': <function complex_features>, 'loreley': <function loreley_features>, 'loreley_complex': <function loreley_complex_features>, 'simple_trace': <function simple_trace_features>, 'binary': <function binary_features>}
def
get_encoded_df( log: pm4py.objects.log.obj.EventLog, encoder: Optional[nirdizati_light.encoding.data_encoder.Encoder] = None, feature_encoding_type: EncodingType = 'simple', prefix_length: int = 10, prefix_length_strategy: nirdizati_light.encoding.constants.PrefixLengthStrategy = 'fixed', time_encoding_type: nirdizati_light.encoding.time_encoding.TimeEncodingType = 'none', attribute_encoding: EncodingTypeAttribute = 'label', padding: bool = True, labeling_type: nirdizati_light.labeling.common.LabelTypes = 'label_attribute_string', task_generation_type: nirdizati_light.encoding.constants.TaskGenerationType = 'only_this', target_event: Optional[str] = None, train_cols: Optional[pandas.core.frame.DataFrame] = None, train_df: Optional[pandas.core.frame.DataFrame] = None) -> tuple[nirdizati_light.encoding.data_encoder.Encoder, pandas.core.frame.DataFrame]:
def get_encoded_df(
    log: EventLog,
    encoder: Optional[Encoder] = None,
    feature_encoding_type: EncodingType = EncodingType.SIMPLE.value,
    prefix_length: int = 10,
    prefix_length_strategy: PrefixLengthStrategy = PrefixLengthStrategy.FIXED.value,
    time_encoding_type: TimeEncodingType = TimeEncodingType.NONE.value,
    attribute_encoding: EncodingTypeAttribute = EncodingTypeAttribute.LABEL.value,
    padding: bool = True,
    labeling_type: LabelTypes = LabelTypes.ATTRIBUTE_STRING.value,
    task_generation_type: TaskGenerationType = TaskGenerationType.ONLY_THIS.value,
    target_event: Optional[str] = None,
    train_cols: Optional[DataFrame] = None,
    train_df: Optional[DataFrame] = None,
) -> tuple[Encoder, DataFrame]:
    """
    Encodes an event log into a DataFrame using specified encoding configurations.

    The log is first turned into a feature DataFrame by the function registered in
    ``ENCODE_LOG`` for ``feature_encoding_type``, then passed through ``time_encoding``,
    optionally aligned to the columns of ``train_df``, and finally handed to the
    encoder's ``encode`` method.

    Args:
        log (EventLog): The event log to be encoded.
        encoder (Optional[Encoder]): The encoder to be used. If None, a new Encoder is created from the encoded DataFrame using ``attribute_encoding`` and ``prefix_length``.
        feature_encoding_type (EncodingType): The type of feature encoding to use, given either as an EncodingType member or as its string value. Defaults to the value of EncodingType.SIMPLE.
        prefix_length (int): The length of the prefix to consider for each case. Defaults to 10.
        prefix_length_strategy (PrefixLengthStrategy): The strategy to use for prefix length (e.g., fixed, percentage). Defaults to the value of PrefixLengthStrategy.FIXED.
        time_encoding_type (TimeEncodingType): The type of time encoding to use. Defaults to the value of TimeEncodingType.NONE.
        attribute_encoding (EncodingTypeAttribute): The type of attribute encoding to use. Only used when a new Encoder is created. Defaults to the value of EncodingTypeAttribute.LABEL.
        padding (bool): Whether to pad sequences to a fixed length. Defaults to True.
        labeling_type (LabelTypes): The type of labeling to use for the encoded log. Defaults to the value of LabelTypes.ATTRIBUTE_STRING.
        task_generation_type (TaskGenerationType): The type of task generation to use. Defaults to the value of TaskGenerationType.ONLY_THIS.
        target_event (Optional[str]): The target event to consider for encoding. Defaults to None.
        train_cols (Optional[DataFrame]): The training columns, forwarded to the feature encoder as ``feature_list``. Defaults to None.
        train_df (Optional[DataFrame]): The training DataFrame. If given, the encoded DataFrame's columns are aligned to it (left join on the column axis). Defaults to None.

    Returns:
        Tuple[Encoder, DataFrame]: A tuple containing the encoder and the encoded DataFrame.

    Raises:
        KeyError: If ``feature_encoding_type`` does not correspond to a key of ``ENCODE_LOG``.
    """
    # ENCODE_LOG is keyed by the enum values (plain strings), so an EncodingType
    # member has to be reduced to its value before the lookup.
    if isinstance(feature_encoding_type, EncodingType):
        feature_encoding_type = feature_encoding_type.value

    logger.debug('Features encoding (%s)', feature_encoding_type)
    df = ENCODE_LOG[feature_encoding_type](
        log,
        prefix_length=prefix_length,
        padding=padding,
        prefix_length_strategy=prefix_length_strategy,
        labeling_type=labeling_type,
        generation_type=task_generation_type,
        feature_list=train_cols,
        target_event=target_event,
    )

    logger.debug('Time encoding (%s)', time_encoding_type)
    df = time_encoding(df, time_encoding_type)

    logger.debug('Dataframe alignment')
    if train_df is not None:
        # join='left' keeps train_df's columns; only the re-aligned df is needed.
        _, df = train_df.align(df, join='left', axis=1)

    # Identity check: an encoder supplied by the caller is always reused.
    if encoder is None:
        logger.debug('Encoder initialization')
        encoder = Encoder(df=df, attribute_encoding=attribute_encoding, prefix_length=prefix_length)

    logger.debug('Encoding')
    # NOTE(review): the return value is ignored, so encode() presumably works on
    # df in place -- confirm against Encoder.encode.
    encoder.encode(df=df)

    return encoder, df
Encodes an event log into a DataFrame using specified encoding configurations.
This method allows for the customization of the encoding process through various parameters, including the type of feature encoding, prefix length, time encoding, and more.
The method returns a tuple containing the encoder used and the resulting DataFrame.
Arguments:
- log (EventLog): The event log to be encoded.
- encoder (Optional[Encoder]): The encoder to be used. If None, a new Encoder is created from the encoded DataFrame using the given attribute_encoding and prefix_length.
- feature_encoding_type (EncodingType): The type of feature encoding to use. Defaults to EncodingType.SIMPLE.
- prefix_length (int): The length of the prefix to consider for each case. Defaults to 10.
- prefix_length_strategy (PrefixLengthStrategy): The strategy to use for prefix length (e.g., fixed, percentage). Defaults to PrefixLengthStrategy.FIXED.
- time_encoding_type (TimeEncodingType): The type of time encoding to use. Defaults to TimeEncodingType.NONE.
- attribute_encoding (EncodingTypeAttribute): The type of attribute encoding to use. Defaults to EncodingTypeAttribute.LABEL.
- padding (bool): Whether to pad sequences to a fixed length. Defaults to True.
- labeling_type (LabelTypes): The type of labeling to use for the encoded log. Defaults to LabelTypes.ATTRIBUTE_STRING.
- task_generation_type (TaskGenerationType): The type of task generation to use. Defaults to TaskGenerationType.ONLY_THIS.
- target_event (Optional[str]): The target event to consider for encoding. Defaults to None.
- train_cols (Optional[DataFrame]): The training columns, forwarded to the feature encoding function as its feature_list argument. Defaults to None.
- train_df (Optional[DataFrame]): The training DataFrame. If given, the columns of the encoded DataFrame are aligned to it (left join on the column axis). Defaults to None.
Returns:
Tuple[Encoder, DataFrame]: A tuple containing the encoder and the encoded DataFrame.