nirdizati_light.encoding.feature_encoder.complex_features
from functools import reduce

from pandas import DataFrame
from pm4py.objects.log.obj import Trace, EventLog

from nirdizati_light.encoding.constants import get_max_prefix_length, get_prefix_length, TaskGenerationType, PrefixLengthStrategy
from nirdizati_light.labeling.common import add_label_column

ATTRIBUTE_CLASSIFIER = None

# Prefix of the per-event activity columns: ``prefix_1``, ``prefix_2``, ...
PREFIX_ = 'prefix_'


def complex_features(log: EventLog, prefix_length, padding, prefix_length_strategy: str, labeling_type, generation_type, feature_list: list = None, target_event: str = None) -> DataFrame:
    """Encode the traces of *log* as rows of a DataFrame.

    Each row holds the trace id, the trace attributes shared by all traces,
    and, for every encoded event, its ``concept:name`` followed by the event
    attributes shared by all events; the last column is the label.

    :param log: event log whose traces are encoded.
    :param prefix_length: prefix length, forwarded to ``get_max_prefix_length``
        and ``get_prefix_length`` together with the strategy.
    :param padding: when truthy, traces shorter than the prefix are kept and
        their rows are filled with ``0`` up to the label column.
    :param prefix_length_strategy: compared against
        ``PrefixLengthStrategy.PERCENTAGE.value`` to decide on row filling.
    :param labeling_type: forwarded to ``add_label_column``.
    :param generation_type: when equal to
        ``TaskGenerationType.ALL_IN_ONE.value`` one row is produced for every
        prefix length from 1 up to the trace's prefix length; otherwise a
        single row per trace is produced.
    :param feature_list: optional expected column list; it must equal the
        computed columns.
    :param target_event: forwarded to the prefix-length helpers.
    :return: DataFrame with one row per encoded prefix.
    """
    max_prefix_length = get_max_prefix_length(log, prefix_length, prefix_length_strategy, target_event)
    columns, additional_columns = _columns_complex(log, max_prefix_length, feature_list)
    encoded_data = []
    for trace in log:
        trace_prefix_length = get_prefix_length(trace, prefix_length, prefix_length_strategy, target_event)
        if len(trace) <= prefix_length - 1 and not padding:
            # trace too short and no zero padding
            continue
        if generation_type == TaskGenerationType.ALL_IN_ONE.value:
            # One row for each prefix of the trace, never longer than the trace itself.
            for event_index in range(1, min(trace_prefix_length + 1, len(trace) + 1)):
                encoded_data.append(_trace_to_row(trace, event_index, additional_columns, prefix_length_strategy, padding, columns, labeling_type))
        else:
            encoded_data.append(_trace_to_row(trace, trace_prefix_length, additional_columns, prefix_length_strategy, padding, columns, labeling_type))
    # TODO: change prefix_i to prefix and update feature list
    return DataFrame(columns=columns, data=encoded_data)


def _common_keys(key_collections) -> set:
    """Return the keys present in every collection of *key_collections*.

    An empty input yields an empty set; a bare ``reduce`` without an initial
    value would raise ``TypeError`` in that case.
    """
    remaining = iter(key_collections)
    first = next(remaining, None)
    if first is None:
        return set()
    return reduce(set.intersection, remaining, set(first))


def _get_global_trace_attributes(log: EventLog):
    """Return the sorted trace attribute names shared by all traces.

    ``concept:name``, ``time:timestamp`` and ``label`` are excluded.
    """
    shared = _common_keys(trace.attributes.keys() for trace in log)
    excluded = ("concept:name", "time:timestamp", "label")
    return sorted(attr for attr in shared if attr not in excluded)


def _get_global_event_attributes(log):
    """Return the sorted event attribute names shared by all events.

    Only ``concept:name`` is excluded (it gets its own ``prefix_i`` column);
    every other shared attribute, timestamps included, is kept.
    """
    # NOTE(review): reads the event's private ``_dict`` as the original did;
    # confirm before switching to the public mapping interface.
    shared = _common_keys(event._dict.keys() for trace in log for event in trace)
    return sorted(attr for attr in shared if attr != "concept:name")


def _compute_additional_columns(log) -> dict:
    """Collect the shared trace and event attribute names of *log*."""
    return {'trace_attributes': _get_global_trace_attributes(log),
            'event_attributes': _get_global_event_attributes(log)}


def _columns_complex(log, prefix_length: int, feature_list: list = None) -> tuple:
    """Build the column names of the encoding.

    Layout: ``trace_id``, the trace attributes, then for each position
    ``i`` in ``1..prefix_length`` the column ``prefix_i`` followed by
    ``<event attribute>_i``, and finally ``label``.

    :return: ``(columns, additional_columns)``.
    """
    additional_columns = _compute_additional_columns(log)
    columns = ['trace_id']
    columns += additional_columns['trace_attributes']
    for i in range(1, prefix_length + 1):
        columns.append(PREFIX_ + str(i))
        for additional_column in additional_columns['event_attributes']:
            columns.append(additional_column + "_" + str(i))
    columns += ['label']
    if feature_list is not None:
        assert list(feature_list) == columns, "feature_list does not match the computed columns"
    return columns, additional_columns


def _data_complex(trace: Trace, prefix_length: int, additional_columns: dict) -> list:
    """Return the feature values of *trace* for its first *prefix_length* events.

    The list starts with the trace attribute values (``0`` when missing),
    followed, per event, by its ``concept:name`` and the event attribute
    values (``'0'`` when missing).
    """
    data = [trace.attributes.get(att, 0) for att in additional_columns['trace_attributes']]
    for idx, event in enumerate(trace):
        if idx == prefix_length:
            break
        data.append(event["concept:name"])
        data.extend(event.get(att, '0') for att in additional_columns['event_attributes'])
    return data


def _trace_to_row(trace: Trace, prefix_length: int, additional_columns, prefix_length_strategy: str, padding, columns: list, labeling_type) -> list:
    """Encode one prefix of *trace* as a row: trace id, features, label.

    When *padding* is truthy or the strategy is the percentage one, the row
    is filled with ``0`` so that the label lands in the last column.
    Otherwise the row is not extended and its length depends on the number
    of encoded events.
    """
    trace_row = [trace.attributes["concept:name"]]
    trace_row += _data_complex(trace, prefix_length, additional_columns)
    if padding or prefix_length_strategy == PrefixLengthStrategy.PERCENTAGE.value:
        # Fill every column between the encoded values and the label.
        trace_row += [0 for _ in range(len(trace_row), len(columns) - 1)]
    trace_row += [add_label_column(trace, labeling_type, prefix_length)]
    return trace_row

# TODO: a `_row_to_trace(df, ...)` counterpart of `_trace_to_row`, iterating
# over `df.iterrows()`, was sketched here but never implemented.
# Not referenced by any code visible in this module.
ATTRIBUTE_CLASSIFIER = None
# Prefix of the per-event activity columns: ``prefix_1``, ``prefix_2``, ...
PREFIX_ = 'prefix_'
def complex_features(log: EventLog, prefix_length, padding, prefix_length_strategy: str, labeling_type, generation_type, feature_list: list = None, target_event: str = None) -> DataFrame:
    """Encode the traces of *log* as rows of a DataFrame.

    :param log: event log whose traces are encoded.
    :param prefix_length: prefix length, forwarded to ``get_max_prefix_length``
        and ``get_prefix_length`` together with the strategy.
    :param padding: when falsy, traces with ``len(trace) <= prefix_length - 1``
        are skipped; also forwarded to ``_trace_to_row``.
    :param prefix_length_strategy: forwarded to the prefix-length helpers and
        to ``_trace_to_row``.
    :param labeling_type: forwarded to ``_trace_to_row``.
    :param generation_type: when equal to
        ``TaskGenerationType.ALL_IN_ONE.value`` one row is produced for every
        prefix length from 1 up to the trace's prefix length; otherwise a
        single row per trace is produced.
    :param feature_list: optional column list, forwarded to
        ``_columns_complex``.
    :param target_event: forwarded to the prefix-length helpers.
    :return: DataFrame with one row per encoded prefix.
    """
    max_prefix_length = get_max_prefix_length(log, prefix_length, prefix_length_strategy, target_event)
    columns, additional_columns = _columns_complex(log, max_prefix_length, feature_list)
    encoded_data = []
    for trace in log:
        trace_prefix_length = get_prefix_length(trace, prefix_length, prefix_length_strategy, target_event)
        if len(trace) <= prefix_length - 1 and not padding:
            # trace too short and no zero padding
            continue
        if generation_type == TaskGenerationType.ALL_IN_ONE.value:
            # One row for each prefix of the trace, never longer than the trace itself.
            for event_index in range(1, min(trace_prefix_length + 1, len(trace) + 1)):
                encoded_data.append(_trace_to_row(trace, event_index, additional_columns, prefix_length_strategy, padding, columns, labeling_type))
        else:
            encoded_data.append(_trace_to_row(trace, trace_prefix_length, additional_columns, prefix_length_strategy, padding, columns, labeling_type))
    # TODO: change prefix_i to prefix and update feature list
    return DataFrame(columns=columns, data=encoded_data)