nirdizati_light.encoding.common

  1import logging
  2from enum import Enum
  3from typing import Optional
  4
  5from pandas import DataFrame
  6from pm4py.objects.log.obj import EventLog
  7
  8from nirdizati_light.encoding.constants import PrefixLengthStrategy, TaskGenerationType
  9from nirdizati_light.encoding.data_encoder import Encoder
 10from nirdizati_light.encoding.feature_encoder.complex_features import complex_features
 11from nirdizati_light.encoding.feature_encoder.frequency_features import frequency_features
 12from nirdizati_light.encoding.feature_encoder.loreley_complex_features import loreley_complex_features
 13from nirdizati_light.encoding.feature_encoder.loreley_features import loreley_features
 14from nirdizati_light.encoding.feature_encoder.simple_features import simple_features
 15from nirdizati_light.encoding.feature_encoder.binary_features import binary_features
 16from nirdizati_light.encoding.feature_encoder.simple_trace_features import simple_trace_features
 17from nirdizati_light.encoding.time_encoding import TimeEncodingType, time_encoding
 18from nirdizati_light.labeling.common import LabelTypes
 19
 20logger = logging.getLogger(__name__)
 21
 22
class EncodingType(Enum):
    """
    Available trace encoding types.

    Each member's value is the string key under which the matching
    feature-encoding function is registered in ``ENCODE_LOG``.
    """
    SIMPLE = 'simple'  # registered to simple_features
    FREQUENCY = 'frequency'  # registered to frequency_features
    COMPLEX = 'complex'  # registered to complex_features
    LORELEY = 'loreley'  # registered to loreley_features
    LORELEY_COMPLEX = 'loreley_complex'  # registered to loreley_complex_features
    SIMPLE_TRACE = 'simple_trace'  # registered to simple_trace_features
    BINARY = 'binary'  # registered to binary_features
 34
class EncodingTypeAttribute(Enum):
    """
    Available trace attributes encoding types.

    ``get_encoded_df`` forwards the selected value to ``Encoder`` as its
    ``attribute_encoding`` argument.
    """
    LABEL = 'label'  # value used as the default by get_encoded_df
    ONEHOT = 'onehot'
 41
 42ENCODE_LOG = {
 43    EncodingType.SIMPLE.value : simple_features,
 44    EncodingType.FREQUENCY.value : frequency_features,
 45    EncodingType.COMPLEX.value : complex_features,
 46    EncodingType.LORELEY.value: loreley_features,
 47    EncodingType.LORELEY_COMPLEX.value: loreley_complex_features,
 48    EncodingType.SIMPLE_TRACE.value: simple_trace_features,
 49    EncodingType.BINARY.value: binary_features,
 50
 51}
 52
 53def get_encoded_df(
 54    log: EventLog,
 55    encoder: Optional[Encoder] = None,
 56    feature_encoding_type: EncodingType = EncodingType.SIMPLE.value,
 57    prefix_length: int = 10,
 58    prefix_length_strategy: PrefixLengthStrategy = PrefixLengthStrategy.FIXED.value,
 59    time_encoding_type: TimeEncodingType = TimeEncodingType.NONE.value,
 60    attribute_encoding: EncodingTypeAttribute = EncodingTypeAttribute.LABEL.value,
 61    padding: bool = True,
 62    labeling_type: LabelTypes = LabelTypes.ATTRIBUTE_STRING.value,
 63    task_generation_type: TaskGenerationType = TaskGenerationType.ONLY_THIS.value,
 64    target_event: Optional[str] = None,
 65    train_cols: Optional[DataFrame] = None,
 66    train_df: Optional[DataFrame] = None,
 67) -> tuple[Encoder, DataFrame]:
 68    """
 69    Encodes an event log into a DataFrame using specified encoding configurations.
 70
 71    This method allows for the customization of the encoding process through various parameters, including the type of feature encoding, prefix length, time encoding, and more.
 72
 73    The method returns a tuple containing the encoder used and the resulting DataFrame.
 74
 75    Args:
 76        log (EventLog): The event log to be encoded.
 77        encoder (Optional[Encoder]): The encoder to be used. If None, a default encoder based on the feature encoding type will be used.
 78        feature_encoding_type (EncodingType): The type of feature encoding to use. Defaults to EncodingType.SIMPLE.
 79        prefix_length (int): The length of the prefix to consider for each case. Defaults to 10.
 80        prefix_length_strategy (PrefixLengthStrategy): The strategy to use for prefix length (e.g., fixed, percentage). Defaults to PrefixLengthStrategy.FIXED.
 81        time_encoding_type (TimeEncodingType): The type of time encoding to use. Defaults to TimeEncodingType.NONE.
 82        attribute_encoding (EncodingTypeAttribute): The type of attribute encoding to use. Defaults to EncodingTypeAttribute.LABEL.
 83        padding (bool): Whether to pad sequences to a fixed length. Defaults to True.
 84        labeling_type (LabelTypes): The type of labeling to use for the encoded log. Defaults to LabelTypes.ATTRIBUTE_STRING.
 85        task_generation_type (TaskGenerationType): The type of task generation to use. Defaults to TaskGenerationType.ONLY_THIS.
 86        target_event (Optional[str]): The target event to consider for encoding. Defaults to None.
 87        train_cols (Optional[DataFrame]): The DataFrame containing the training columns. Defaults to None.
 88        train_df (Optional[DataFrame]): The training DataFrame. Defaults to None.
 89
 90    Returns:
 91        Tuple[Encoder, DataFrame]: A tuple containing the encoder and the encoded DataFrame.
 92    """
 93
 94    logger.debug(f'Features encoding ({feature_encoding_type})')
 95    df = ENCODE_LOG[feature_encoding_type](
 96        log,
 97        prefix_length=prefix_length,
 98        padding=padding,
 99        prefix_length_strategy=prefix_length_strategy,
100        labeling_type=labeling_type,
101        generation_type=task_generation_type,
102        feature_list=train_cols,
103        target_event=target_event,
104    )
105
106    logger.debug(f'Time encoding ({time_encoding_type})')
107    df = time_encoding(df, time_encoding_type)
108
109    logger.debug('Dataframe alignment')
110    if train_df is not None:
111        _, df = train_df.align(df, join='left', axis=1)
112
113    if not encoder:
114        logger.debug('Encoder initialization')
115        encoder = Encoder(df=df, attribute_encoding=attribute_encoding, prefix_length=prefix_length)
116
117    logger.debug('Encoding')
118    encoder.encode(df=df)
119
120    return encoder, df
logger = <Logger nirdizati_light.encoding.common (WARNING)>
class EncodingType(enum.Enum):
class EncodingType(Enum):
    """
    Available trace encoding types.

    Each member's value is the string key under which the matching
    feature-encoding function is registered in ``ENCODE_LOG``.
    """
    SIMPLE = 'simple'  # registered to simple_features
    FREQUENCY = 'frequency'  # registered to frequency_features
    COMPLEX = 'complex'  # registered to complex_features
    LORELEY = 'loreley'  # registered to loreley_features
    LORELEY_COMPLEX = 'loreley_complex'  # registered to loreley_complex_features
    SIMPLE_TRACE = 'simple_trace'  # registered to simple_trace_features
    BINARY = 'binary'  # registered to binary_features

Available trace encoding types

SIMPLE = <EncodingType.SIMPLE: 'simple'>
FREQUENCY = <EncodingType.FREQUENCY: 'frequency'>
COMPLEX = <EncodingType.COMPLEX: 'complex'>
LORELEY = <EncodingType.LORELEY: 'loreley'>
LORELEY_COMPLEX = <EncodingType.LORELEY_COMPLEX: 'loreley_complex'>
SIMPLE_TRACE = <EncodingType.SIMPLE_TRACE: 'simple_trace'>
BINARY = <EncodingType.BINARY: 'binary'>
Inherited Members
enum.Enum
name
value
class EncodingTypeAttribute(enum.Enum):
class EncodingTypeAttribute(Enum):
    """
    Available trace attributes encoding types.

    ``get_encoded_df`` forwards the selected value to ``Encoder`` as its
    ``attribute_encoding`` argument.
    """
    LABEL = 'label'  # value used as the default by get_encoded_df
    ONEHOT = 'onehot'

Available trace attributes encoding types

LABEL = <EncodingTypeAttribute.LABEL: 'label'>
ONEHOT = <EncodingTypeAttribute.ONEHOT: 'onehot'>
Inherited Members
enum.Enum
name
value
ENCODE_LOG = {'simple': <function simple_features>, 'frequency': <function frequency_features>, 'complex': <function complex_features>, 'loreley': <function loreley_features>, 'loreley_complex': <function loreley_complex_features>, 'simple_trace': <function simple_trace_features>, 'binary': <function binary_features>}
def get_encoded_df( log: pm4py.objects.log.obj.EventLog, encoder: Optional[nirdizati_light.encoding.data_encoder.Encoder] = None, feature_encoding_type: EncodingType = 'simple', prefix_length: int = 10, prefix_length_strategy: nirdizati_light.encoding.constants.PrefixLengthStrategy = 'fixed', time_encoding_type: nirdizati_light.encoding.time_encoding.TimeEncodingType = 'none', attribute_encoding: EncodingTypeAttribute = 'label', padding: bool = True, labeling_type: nirdizati_light.labeling.common.LabelTypes = 'label_attribute_string', task_generation_type: nirdizati_light.encoding.constants.TaskGenerationType = 'only_this', target_event: Optional[str] = None, train_cols: Optional[pandas.core.frame.DataFrame] = None, train_df: Optional[pandas.core.frame.DataFrame] = None) -> tuple[nirdizati_light.encoding.data_encoder.Encoder, pandas.core.frame.DataFrame]:
 54def get_encoded_df(
 55    log: EventLog,
 56    encoder: Optional[Encoder] = None,
 57    feature_encoding_type: EncodingType = EncodingType.SIMPLE.value,
 58    prefix_length: int = 10,
 59    prefix_length_strategy: PrefixLengthStrategy = PrefixLengthStrategy.FIXED.value,
 60    time_encoding_type: TimeEncodingType = TimeEncodingType.NONE.value,
 61    attribute_encoding: EncodingTypeAttribute = EncodingTypeAttribute.LABEL.value,
 62    padding: bool = True,
 63    labeling_type: LabelTypes = LabelTypes.ATTRIBUTE_STRING.value,
 64    task_generation_type: TaskGenerationType = TaskGenerationType.ONLY_THIS.value,
 65    target_event: Optional[str] = None,
 66    train_cols: Optional[DataFrame] = None,
 67    train_df: Optional[DataFrame] = None,
 68) -> tuple[Encoder, DataFrame]:
 69    """
 70    Encodes an event log into a DataFrame using specified encoding configurations.
 71
 72    This method allows for the customization of the encoding process through various parameters, including the type of feature encoding, prefix length, time encoding, and more.
 73
 74    The method returns a tuple containing the encoder used and the resulting DataFrame.
 75
 76    Args:
 77        log (EventLog): The event log to be encoded.
 78        encoder (Optional[Encoder]): The encoder to be used. If None, a default encoder based on the feature encoding type will be used.
 79        feature_encoding_type (EncodingType): The type of feature encoding to use. Defaults to EncodingType.SIMPLE.
 80        prefix_length (int): The length of the prefix to consider for each case. Defaults to 10.
 81        prefix_length_strategy (PrefixLengthStrategy): The strategy to use for prefix length (e.g., fixed, percentage). Defaults to PrefixLengthStrategy.FIXED.
 82        time_encoding_type (TimeEncodingType): The type of time encoding to use. Defaults to TimeEncodingType.NONE.
 83        attribute_encoding (EncodingTypeAttribute): The type of attribute encoding to use. Defaults to EncodingTypeAttribute.LABEL.
 84        padding (bool): Whether to pad sequences to a fixed length. Defaults to True.
 85        labeling_type (LabelTypes): The type of labeling to use for the encoded log. Defaults to LabelTypes.ATTRIBUTE_STRING.
 86        task_generation_type (TaskGenerationType): The type of task generation to use. Defaults to TaskGenerationType.ONLY_THIS.
 87        target_event (Optional[str]): The target event to consider for encoding. Defaults to None.
 88        train_cols (Optional[DataFrame]): The DataFrame containing the training columns. Defaults to None.
 89        train_df (Optional[DataFrame]): The training DataFrame. Defaults to None.
 90
 91    Returns:
 92        Tuple[Encoder, DataFrame]: A tuple containing the encoder and the encoded DataFrame.
 93    """
 94
 95    logger.debug(f'Features encoding ({feature_encoding_type})')
 96    df = ENCODE_LOG[feature_encoding_type](
 97        log,
 98        prefix_length=prefix_length,
 99        padding=padding,
100        prefix_length_strategy=prefix_length_strategy,
101        labeling_type=labeling_type,
102        generation_type=task_generation_type,
103        feature_list=train_cols,
104        target_event=target_event,
105    )
106
107    logger.debug(f'Time encoding ({time_encoding_type})')
108    df = time_encoding(df, time_encoding_type)
109
110    logger.debug('Dataframe alignment')
111    if train_df is not None:
112        _, df = train_df.align(df, join='left', axis=1)
113
114    if not encoder:
115        logger.debug('Encoder initialization')
116        encoder = Encoder(df=df, attribute_encoding=attribute_encoding, prefix_length=prefix_length)
117
118    logger.debug('Encoding')
119    encoder.encode(df=df)
120
121    return encoder, df

Encodes an event log into a DataFrame using specified encoding configurations.

This function allows the encoding process to be customized through various parameters, including the type of feature encoding, prefix length, time encoding, and more.

The method returns a tuple containing the encoder used and the resulting DataFrame.

Arguments:
  • log (EventLog): The event log to be encoded.
  • encoder (Optional[Encoder]): The encoder to be used. If None, a new Encoder is built from the encoded DataFrame, the attribute encoding and the prefix length.
  • feature_encoding_type (EncodingType): The type of feature encoding to use. Defaults to EncodingType.SIMPLE.
  • prefix_length (int): The length of the prefix to consider for each case. Defaults to 10.
  • prefix_length_strategy (PrefixLengthStrategy): The strategy to use for prefix length (e.g., fixed, percentage). Defaults to PrefixLengthStrategy.FIXED.
  • time_encoding_type (TimeEncodingType): The type of time encoding to use. Defaults to TimeEncodingType.NONE.
  • attribute_encoding (EncodingTypeAttribute): The type of attribute encoding to use. Defaults to EncodingTypeAttribute.LABEL.
  • padding (bool): Whether to pad sequences to a fixed length. Defaults to True.
  • labeling_type (LabelTypes): The type of labeling to use for the encoded log. Defaults to LabelTypes.ATTRIBUTE_STRING.
  • task_generation_type (TaskGenerationType): The type of task generation to use. Defaults to TaskGenerationType.ONLY_THIS.
  • target_event (Optional[str]): The target event to consider for encoding. Defaults to None.
  • train_cols (Optional[DataFrame]): The DataFrame containing the training columns. Defaults to None.
  • train_df (Optional[DataFrame]): The training DataFrame. Defaults to None.
Returns:

Tuple[Encoder, DataFrame]: A tuple containing the encoder and the encoded DataFrame.