nirdizati_light.encoding.feature_encoder.frequency_features
from collections import Counter
from datetime import timedelta

from pandas import DataFrame
from pm4py.objects.log.obj import EventLog, Trace, Event

from nirdizati_light.encoding.constants import get_max_prefix_length, get_prefix_length, TaskGenerationType
from nirdizati_light.labeling.common import add_label_column

PREFIX_ = 'prefix_'


def frequency_features(log: EventLog, prefix_length, padding, prefix_length_strategy: str, labeling_type, generation_type, feature_list: list = None, target_event: str = None) -> DataFrame:
    """Encode every trace of ``log`` as a row of activity-occurrence counts.

    Each row holds the trace id, one count per activity column and a label
    (see ``_trace_to_row``).

    :param log: event log whose traces are encoded
    :param prefix_length: requested prefix length, forwarded to
        ``get_max_prefix_length`` / ``get_prefix_length`` and also used
        directly for the "trace too short" check below
    :param padding: when falsy, traces with fewer than ``prefix_length``
        events are skipped; when truthy, a ``'0'`` column is added to the
        computed columns
    :param prefix_length_strategy: forwarded to the prefix-length helpers
    :param labeling_type: forwarded to ``add_label_column``
    :param generation_type: when equal to
        ``TaskGenerationType.ALL_IN_ONE.value`` one row is produced for every
        prefix length from 1 up to the trace's prefix length (capped at the
        trace length); otherwise a single row per trace is produced
    :param feature_list: column names to use; computed from ``log`` when None
    :param target_event: forwarded to the prefix-length helpers
    :return: DataFrame with ``feature_list`` as columns, one row per encoded
        prefix
    """
    if feature_list is None:
        max_prefix_length = get_max_prefix_length(log, prefix_length, prefix_length_strategy, target_event)
        feature_list = _compute_columns(log, max_prefix_length, padding)

    encoded_data = []
    for trace in log:
        trace_prefix_length = get_prefix_length(trace, prefix_length, prefix_length_strategy, target_event)
        if len(trace) <= prefix_length - 1 and not padding:
            # trace too short and no zero padding
            continue
        if generation_type == TaskGenerationType.ALL_IN_ONE.value:
            # One row per prefix: lengths 1..min(trace_prefix_length, len(trace)).
            last_prefix = min(trace_prefix_length, len(trace))
            encoded_data.extend(
                _trace_to_row(trace, event_index, feature_list, padding, labeling_type)
                for event_index in range(1, last_prefix + 1)
            )
        else:
            encoded_data.append(_trace_to_row(trace, trace_prefix_length, feature_list, padding, labeling_type))

    return DataFrame(columns=feature_list, data=encoded_data)


def _compute_columns(log: EventLog, prefix_length: int, padding: bool) -> list:
    """Build the column list: trace_id, activity names, optional '0', label.

    The activity names are the distinct ``'concept:name'`` values found in the
    first ``prefix_length`` events of any trace, in sorted order. The ``'0'``
    column is appended only when ``padding`` is truthy.
    """
    activity_names = {
        event['concept:name']
        for trace in log
        for event in trace[:prefix_length]
    }
    columns = ["trace_id"]
    columns += sorted(activity_names)
    columns += ['0'] if padding else []
    columns += ['label']
    return columns


def _trace_to_row(trace: Trace, prefix_length: int, columns: list, padding: bool = True, labeling_type: str = None) -> list:
    """Build one data-frame row for the first ``prefix_length`` events of ``trace``.

    The row is: the trace's ``'concept:name'`` attribute, then for every name
    in ``columns[1:-1]`` the number of prefix events with that
    ``'concept:name'``, then the value returned by ``add_label_column``.

    The input trace is never modified.

    :raises IndexError: if the trace has fewer than ``prefix_length`` events
        and ``padding`` is falsy. The previous implementation failed in this
        situation as well (it indexed one element past the end of the trace
        while trying to append filler events); the condition is now reported
        explicitly instead.
    """
    trace_row = [trace.attributes['concept:name']]

    if len(trace) <= prefix_length - 1 and not padding:
        raise IndexError(
            "trace has {} events, fewer than the requested prefix length {}, "
            "and padding is disabled".format(len(trace), prefix_length)
        )

    occurrences = Counter(
        event['concept:name']
        for event in trace[:prefix_length]
    )
    # First column is the trace id and last column is the label; everything
    # in between is an activity (or the '0' padding column) to be counted.
    counted_columns = columns[1:-1]
    trace_row += [occurrences[col] for col in counted_columns]
    trace_row += [add_label_column(trace, labeling_type, prefix_length)]
    return trace_row
PREFIX_ = 'prefix_'
def frequency_features(log: pm4py.objects.log.obj.EventLog, prefix_length, padding, prefix_length_strategy: str, labeling_type, generation_type, feature_list: list = None, target_event: str = None) -> pandas.core.frame.DataFrame:
def frequency_features(log: EventLog, prefix_length, padding, prefix_length_strategy: str, labeling_type, generation_type, feature_list: list = None, target_event: str = None) -> DataFrame:
    """Encode every trace of ``log`` as a row of activity-occurrence counts.

    :param log: event log whose traces are encoded
    :param prefix_length: requested prefix length, forwarded to
        ``get_max_prefix_length`` / ``get_prefix_length`` and also used
        directly for the "trace too short" check below
    :param padding: when falsy, traces with fewer than ``prefix_length``
        events are skipped
    :param prefix_length_strategy: forwarded to the prefix-length helpers
    :param labeling_type: forwarded to ``_trace_to_row``
    :param generation_type: when equal to
        ``TaskGenerationType.ALL_IN_ONE.value`` one row is produced for every
        prefix length from 1 up to the trace's prefix length (capped at the
        trace length); otherwise a single row per trace is produced
    :param feature_list: column names to use; computed from ``log`` via
        ``_compute_columns`` when None
    :param target_event: forwarded to the prefix-length helpers
    :return: DataFrame with ``feature_list`` as columns, one row per encoded
        prefix
    """
    if feature_list is None:
        max_prefix_length = get_max_prefix_length(log, prefix_length, prefix_length_strategy, target_event)
        feature_list = _compute_columns(log, max_prefix_length, padding)

    rows = []
    all_in_one = TaskGenerationType.ALL_IN_ONE.value
    for trace in log:
        trace_prefix_length = get_prefix_length(trace, prefix_length, prefix_length_strategy, target_event)
        if len(trace) <= prefix_length - 1 and not padding:
            # trace too short and no zero padding
            continue
        if generation_type == all_in_one:
            # One row per prefix: lengths 1..min(trace_prefix_length, len(trace)).
            stop = min(trace_prefix_length + 1, len(trace) + 1)
            for event_index in range(1, stop):
                rows.append(_trace_to_row(trace, event_index, feature_list, padding, labeling_type))
        else:
            rows.append(_trace_to_row(trace, trace_prefix_length, feature_list, padding, labeling_type))

    return DataFrame(columns=feature_list, data=rows)