nirdizati_light.explanation.wrappers.dice_wrapper

import warnings
import os
from datetime import datetime
import dice_ml
import numpy as np
import pandas as pd
import pm4py
from scipy.spatial.distance import _validate_vector
from scipy.spatial.distance import cdist, pdist
from scipy.stats import median_abs_deviation
from pm4py import convert_to_event_log
from declare4py.declare4py import Declare4Py
from declare4py.enums import TraceState
from nirdizati_light.encoding.common import get_encoded_df, EncodingType
from nirdizati_light.predictive_model.common import ClassificationMethods

warnings.filterwarnings("ignore", category=UserWarning)

single_prefix = ['loreley', 'loreley_complex']

def dice_explain(CONF, predictive_model, encoder, df, query_instances, method, optimization, heuristic, support,
                 timestamp_col_name, model_path, random_seed=None, adapted=None, filtering=None):
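    """Generate DiCE counterfactuals for the query instances and log the results.

    For each query instance, counterfactuals are generated with the selected DiCE
    method, scored with the metrics from `evaluate_cf_list` plus a DECLARE
    conformance score, and appended to CSV files under ../experiments/ (the exact
    folder depends on method, adapted, filtering and optimization).
    """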
    features_names = df.columns.values[:-1]
    feature_selection = CONF['feature_selection']
    dataset = CONF['data'].rpartition('/')[0].rpartition('/')[-1]

    # Normalised dataset name for the known benchmarks (not used further below)
    if 'BPIC15' in dataset:
        dataset_created = dataset.replace('_f2', '')
    elif 'bpic2012' in dataset:
        dataset_created = dataset.replace('-COMPLETE', '').replace('bpic2012', 'BPIC12')
    elif 'sepsis' in dataset:
        dataset_created = dataset.replace('_cases', '')
    black_box = predictive_model.model_type
    categorical_features, continuous_features, cat_feature_index, cont_feature_index = split_features(df.iloc[:, :-1], encoder)
    if CONF['feature_selection'] == 'loreley':
        query_instances = query_instances[query_instances['prefix'] != 0]
    if CONF['feature_selection'] == 'frequency':
        ratio_cont = 1
    else:
        ratio_cont = len(continuous_features) / len(categorical_features)
    time_start = datetime.now()
    query_instances_for_cf = query_instances.iloc[:2, :-1]  # only the first two query instances are explained
    d = dice_ml.Data(dataframe=df, continuous_features=continuous_features, outcome_name='label')
    m = dice_model(predictive_model)
    dice_query_instance = dice_ml.Dice(d, m, method, encoder)
    time_train = (datetime.now() - time_start).total_seconds()
    index_test_instances = range(len(query_instances_for_cf))
    #model_path = model_path +'_' + str(support) + '/'
    extended_loss = False
    try:
        if not os.path.exists(model_path):
            os.makedirs(model_path)
            print("Directory '%s' created successfully" % model_path)
    except OSError as error:
        print("Directory '%s' can not be created" % model_path)

    d4py = Declare4Py()
    model_discovery(CONF, encoder, df, dataset, features_names, d4py, model_path, support, timestamp_col_name)

    cols = df.columns[:-1].values

    # Default output paths (overridden per method below); use `and` rather than `&`
    # so that the default adapted=None / filtering=None values short-circuit safely
    path_results = '../experiments/cf_results_supp_%s/%s/' % (support, 'single_objective_new')
    path_cf = path_results
    if adapted and (not filtering) and (method == 'multi_objective_genetic'):
        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_new')
        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_new')
    elif adapted and filtering and (method == 'multi_objective_genetic'):
        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_filtering_new')
        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_filtering_new')
    elif (not adapted) and (method == 'genetic_conformance'):
        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'single_objective_new')
        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'single_objective_new')
    elif adapted and (method == 'genetic_conformance') and (optimization == 'baseline'):
        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_loss_no_conformance')
        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_loss_no_conformance')
    elif adapted and (method == 'genetic_conformance') and (optimization != 'baseline'):
        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_loss_conformance_large')
        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_loss_conformance_large')
    elif method == 'genetic':
        path_results = '../experiments/cf_results_supp_%s/%s/' % (support, 'single_objective_new')
        path_cf = '../experiments/cf_results_supp_%s/%s/' % (support, 'single_objective_new')
    elif (not adapted) and (method == 'multi_objective_genetic') and extended_loss:
        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'mixed_ga_5obj')
        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'mixed_ga_5obj')
    elif (not adapted) and (method == 'multi_objective_genetic'):
        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'baseline_new')
        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'baseline_new')

    for test_id, i in enumerate(index_test_instances):
        print(datetime.now(), dataset, black_box, test_id, len(index_test_instances),
              '%.2f' % ((test_id + 1) / len(index_test_instances)))
        cf_list_all = list()
        x_eval_list = list()
        desired_cfs_all = list()
        x = query_instances_for_cf.iloc[[i]]

        for k in [5]:
            time_start_i = datetime.now()
            # genetic_conformance and multi_objective_genetic take the same extra arguments
            if method in ('genetic_conformance', 'multi_objective_genetic'):
                dice_result = dice_query_instance.generate_counterfactuals(x, encoder=encoder, desired_class='opposite',
                                                                           verbose=False,
                                                                           posthoc_sparsity_algorithm='linear',
                                                                           total_CFs=k, dataset=dataset + '_' + str(CONF['prefix_length']),
                                                                           model_path=model_path, random_seed=random_seed, adapted=adapted)
            else:
                dice_result = dice_query_instance.generate_counterfactuals(x, encoder=encoder, desired_class='opposite',
                                                                           verbose=False,
                                                                           posthoc_sparsity_algorithm='linear',
                                                                           total_CFs=k, dataset=dataset + '_' + str(CONF['prefix_length']))
            # decode the generated counterfactuals before adding them to the list
            generated_cfs = dice_result.cf_examples_list[0].final_cfs_df
            cf_list = np.array(generated_cfs).astype('float64')
            y_pred = predictive_model.model.predict(x.values.reshape(1, -1))[0]
            time_test = (datetime.now() - time_start_i).total_seconds()
            x_eval = evaluate_cf_list(cf_list, x.values.reshape(1, -1), cont_feature_index, cat_feature_index, df=df,
                                      nr_of_cfs=k, y_pred=y_pred, predictive_model=predictive_model,
                                      query_instances=query_instances, continuous_features=continuous_features,
                                      categorical_features=categorical_features, ratio_cont=ratio_cont)

            x_eval['dataset'] = dataset
            x_eval['idx'] = test_id + 1
            x_eval['model'] = predictive_model.model_type
            x_eval['desired_nr_of_cfs'] = k
            x_eval['time_train'] = time_train
            x_eval['time_test'] = time_test
            x_eval['runtime'] = time_train + time_test
            #  x_eval['generated_cfs'] = x_eval['nbr_cf']
            x_eval['method'] = method
            x_eval['explainer'] = CONF['explanator']
            x_eval['prefix_length'] = CONF['prefix_length']
            x_eval['heuristic'] = heuristic
            x_eval['optimization'] = optimization
            x_eval_list.append(x_eval)
            if cf_list.size > 4:
                if method in ('random', 'genetic', 'genetic_conformance', 'multi_objective_genetic'):
                    cf_list = cf_list[:, :-1]  # drop the label column returned by DiCE
                df_conf = pd.DataFrame(data=cf_list, columns=features_names)

                sat_score = conformance_score(CONF, encoder, df=df_conf, dataset=dataset, features_names=features_names,
                                              d4py=d4py, query_instance=x, model_path=model_path,
                                              timestamp_col_name=timestamp_col_name)
                x_eval['sat_score'] = sat_score
                cf_list_all.extend(cf_list[:5])
                desired_cfs = [float(k) * np.ones_like(cf_list[:5, 0])]
                desired_cfs_all.extend(*desired_cfs)
        try:
            if not os.path.exists(path_results + '_' + str(support) + '/'):
                os.makedirs(path_results + '_' + str(support) + '/')
                print("Directory '%s' created successfully" % (path_results + '_' + str(support) + '/'))
        except OSError as error:
            print("Directory '%s' can not be created" % path_results)
        filename_results = path_results + 'cfeval_%s_%s_dice_%s.csv' % (dataset, black_box, feature_selection)
        if len(cf_list_all) > 0:
            df_cf = pd.DataFrame(data=cf_list_all, columns=features_names)
            encoder.decode(df_cf)
            if CONF['feature_selection'] in single_prefix:
                if all(df_cf['prefix'] == '0'):
                    cols = ['prefix_' + str(i + 1) for i in range(CONF['prefix_length'])]
                    df_cf[cols] = 0
                else:
                    df_cf = pd.concat([df_cf, pd.DataFrame(
                        df_cf['prefix'].str.split(",", expand=True).fillna(value='0')).rename(
                        columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
                    df_cf = df_cf.replace(r'\[', '', regex=True)
                    df_cf = df_cf.replace(']', '', regex=True)
                df_cf = df_cf.drop(columns=['prefix'])
            df_cf['desired_cfs'] = desired_cfs_all
            df_cf['idx'] = [test_id + 1] * len(cf_list_all)
            df_cf['method'] = method
            df_cf['test_id'] = np.arange(0, len(cf_list_all))
            df_cf['dataset'] = [dataset] * len(cf_list_all)
            df_cf['black_box'] = [black_box] * len(cf_list_all)
            try:
                if not os.path.exists(path_cf):
                    os.makedirs(path_cf)
                    print("Directory '%s' created successfully" % path_cf)
            except OSError as error:
                print("Directory '%s' can not be created" % path_cf)
            if optimization != 'baseline':
                filename_cf = path_cf + 'cf_%s_%s_dice_%s_%s_%s_%s.csv' % (dataset, black_box, feature_selection, method, optimization,
                                                                           CONF['prefix_length'])
            else:
                filename_cf = path_cf + 'cf_%s_%s_dice_%s_%s_%s.csv' % (dataset, black_box, feature_selection, method,
                                                                        CONF['prefix_length'])
            if not os.path.isfile(filename_cf):
                df_cf.to_csv(filename_cf, index=False)
            else:
                df_cf.to_csv(filename_cf, mode='a', index=False, header=False)
        else:
            x_eval['sat_score'] = 0
        result_dataframe = pd.DataFrame(data=x_eval_list)
        result_dataframe = result_dataframe[columns]  # canonical column order defined at the bottom of the module
        if not os.path.isfile(filename_results):
            result_dataframe.to_csv(filename_results, index=False)
        else:
            result_dataframe.to_csv(filename_results, mode='a', index=False, header=False)
    return dice_result

def dice_model(predictive_model):
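    """Wrap the fitted predictive model in a `dice_ml.Model`.

    All sklearn-style classifiers share the 'sklearn' backend; any other model
    type is assumed to be a PyTorch model and gets the 'PYT' backend.
    """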
    sklearn_models = (ClassificationMethods.RANDOM_FOREST.value,
                      ClassificationMethods.PERCEPTRON.value,
                      ClassificationMethods.MLP.value,
                      ClassificationMethods.XGBOOST.value,
                      ClassificationMethods.SGDCLASSIFIER.value,
                      ClassificationMethods.SVM.value,
                      ClassificationMethods.KNN.value)
    # compare model-type strings by equality, not identity
    if predictive_model.model_type in sklearn_models:
        m = dice_ml.Model(model=predictive_model.model, backend='sklearn')
    else:
        m = dice_ml.Model(model=predictive_model.model, backend='PYT')
    return m

def split_features(df, encoder):
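    """Split the columns of `df` into categorical and continuous features.

    Relies on the encoder's internal `_label_dict` (categorical) and
    `_numeric_encoder` (continuous) mappings; returns both the column names and
    their positional indices.
    """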
    categorical_features = [col for col in df.columns if col in list(encoder._label_dict.keys())]
    cat_feature_index = [df.columns.get_loc(c) for c in categorical_features if c in df]
    continuous_features = [col for col in df.columns if col in list(encoder._numeric_encoder.keys())]
    cont_feature_index = [df.columns.get_loc(c) for c in continuous_features if c in df]
    return categorical_features, continuous_features, cat_feature_index, cont_feature_index

def evaluate_cf_list(cf_list, query_instance, cont_feature_index, cat_feature_index, df, y_pred, nr_of_cfs,
                     predictive_model, query_instances, continuous_features, categorical_features, ratio_cont):
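    """Compute distance, diversity, change-count and plausibility metrics for a set of CFs.

    Returns a dict keyed consistently with the module-level `columns` list; when
    too few counterfactuals were generated (cf_list.size <= 4), the metrics are
    NaN/zero placeholders.
    """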
    nbr_features = query_instance.shape[1]
    if cf_list.size > 4:
        nbr_cf_ = len(cf_list)
        nbr_features = cf_list.shape[1]
        plausibility_sum = plausibility(query_instance, predictive_model, cf_list, nr_of_cfs, query_instances, y_pred,
                                        cont_feature_index, cat_feature_index, df, ratio_cont)
        plausibility_max_nbr_cf_ = plausibility_sum / nr_of_cfs
        plausibility_nbr_cf_ = plausibility_sum / nbr_cf_
        distance_l2_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df)
        distance_mad_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df)
        distance_j_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard')
        distance_h_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming')
        distance_l2j_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index)
        distance_l1j_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index)
        distance_mh_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, df)

        distance_l2_min_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df,
                                               agg='min')
        distance_mad_min_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df, agg='min')
        distance_j_min_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg='min')
        distance_h_min_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg='min')
        distance_l2j_min_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='min')
        distance_l1j_min_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='min')
        distance_mh_min_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, df, agg='min')

        distance_l2_max_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df, agg='max')
        distance_mad_max_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df, agg='max')
        distance_j_max_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg='max')
        distance_h_max_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg='max')
        distance_l2j_max_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='max')
        distance_l1j_max_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='max')
        distance_mh_max_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, X=df, agg='max')

        # nbr_changes_per_cf compares features by column position, so pass the indices
        avg_nbr_changes_per_cf_ = avg_nbr_changes_per_cf(query_instance, cf_list, cont_feature_index)
        avg_nbr_changes_ = avg_nbr_changes(query_instance, cf_list, nbr_features, cont_feature_index)
        if len(cf_list) > 1:
            diversity_l2_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=df)
            diversity_mad_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df)
            diversity_j_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard')
            diversity_h_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming')
            diversity_l2j_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index)
            diversity_mh_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df)

            diversity_l2_min_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=df, agg='min')
            diversity_mad_min_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df, agg='min')
            diversity_j_min_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg='min')
            diversity_h_min_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg='min')
            diversity_l2j_min_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index, agg='min')
            diversity_mh_min_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df, agg='min')

            diversity_l2_max_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg='max')
            diversity_mad_max_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df, agg='max')
            diversity_j_max_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg='max')
            diversity_h_max_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg='max')
            diversity_l2j_max_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index, agg='max')
            diversity_mh_max_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df, agg='max')
        else:
            diversity_l2_ = 0.0
            diversity_mad_ = 0.0
            diversity_j_ = 0.0
            diversity_h_ = 0.0
            diversity_l2j_ = 0.0
            diversity_mh_ = 0.0

            diversity_l2_min_ = 0.0
            diversity_mad_min_ = 0.0
            diversity_j_min_ = 0.0
            diversity_h_min_ = 0.0
            diversity_l2j_min_ = 0.0
            diversity_mh_min_ = 0.0

            diversity_l2_max_ = 0.0
            diversity_mad_max_ = 0.0
            diversity_j_max_ = 0.0
            diversity_h_max_ = 0.0
            diversity_l2j_max_ = 0.0
            diversity_mh_max_ = 0.0

        count_diversity_cont_ = count_diversity(cf_list, cont_feature_index, nbr_features, cont_feature_index)
        count_diversity_cate_ = count_diversity(cf_list, cat_feature_index, nbr_features, cont_feature_index)
        count_diversity_all_ = count_diversity_all(cf_list, nbr_features, cont_feature_index)
        res = {'generated_cfs': nr_of_cfs,
               'implausibility_sum': plausibility_sum,
               'implausibility_max_nbr_cf': plausibility_max_nbr_cf_,
               'implausibility_nbr_cf': plausibility_nbr_cf_,
               'distance_l2': distance_l2_,
               'distance_mad': distance_mad_,
               'distance_j': distance_j_,
               'distance_h': distance_h_,
               'distance_l2j': distance_l2j_,
               'distance_l1j': distance_l1j_,
               'distance_mh': distance_mh_,

               'distance_l2_min': distance_l2_min_,
               'distance_mad_min': distance_mad_min_,
               'distance_j_min': distance_j_min_,
               'distance_h_min': distance_h_min_,
               'distance_l2j_min': distance_l2j_min_,
               'distance_l1j_min': distance_l1j_min_,
               'distance_mh_min': distance_mh_min_,

               'distance_l2_max': distance_l2_max_,
               'distance_mad_max': distance_mad_max_,
               'distance_j_max': distance_j_max_,
               'distance_h_max': distance_h_max_,
               'distance_l2j_max': distance_l2j_max_,
               'distance_l1j_max': distance_l1j_max_,
               'distance_mh_max': distance_mh_max_,

               'diversity_l2': diversity_l2_,
               'diversity_mad': diversity_mad_,
               'diversity_j': diversity_j_,
               'diversity_h': diversity_h_,
               'diversity_l2j': diversity_l2j_,
               'diversity_mh': diversity_mh_,

               'diversity_l2_min': diversity_l2_min_,
               'diversity_mad_min': diversity_mad_min_,
               'diversity_j_min': diversity_j_min_,
               'diversity_h_min': diversity_h_min_,
               'diversity_l2j_min': diversity_l2j_min_,
               'diversity_mh_min': diversity_mh_min_,

               'diversity_l2_max': diversity_l2_max_,
               'diversity_mad_max': diversity_mad_max_,
               'diversity_j_max': diversity_j_max_,
               'diversity_h_max': diversity_h_max_,
               'diversity_l2j_max': diversity_l2j_max_,
               'diversity_mh_max': diversity_mh_max_,

               'count_diversity_cont': count_diversity_cont_,
               'count_diversity_cate': count_diversity_cate_,
               'count_diversity_all': count_diversity_all_,
               'avg_nbr_changes_per_cf': avg_nbr_changes_per_cf_,
               'avg_nbr_changes': avg_nbr_changes_}
    else:
        res = {
            'generated_cfs': 0,
            'distance_l2': np.nan,
            'distance_mad': np.nan,
            'distance_j': np.nan,
            'distance_h': np.nan,
            'distance_l2j': np.nan,
            'distance_l1j': np.nan,
            'distance_mh': np.nan,
            'distance_l2_min': np.nan,
            'distance_mad_min': np.nan,
            'distance_j_min': np.nan,
            'distance_h_min': np.nan,
            'distance_l2j_min': np.nan,
            'distance_l1j_min': np.nan,
            'distance_mh_min': np.nan,
            'distance_l2_max': np.nan,
            'distance_mad_max': np.nan,
            'distance_j_max': np.nan,
            'distance_h_max': np.nan,
            'distance_l2j_max': np.nan,
            'distance_l1j_max': np.nan,
            'distance_mh_max': np.nan,
            'avg_nbr_changes_per_cf': np.nan,
            'avg_nbr_changes': np.nan,
            'diversity_l2': np.nan,
            'diversity_mad': np.nan,
            'diversity_j': np.nan,
            'diversity_h': np.nan,
            'diversity_l2j': np.nan,
            'diversity_mh': np.nan,
            'diversity_l2_min': np.nan,
            'diversity_mad_min': np.nan,
            'diversity_j_min': np.nan,
            'diversity_h_min': np.nan,
            'diversity_l2j_min': np.nan,
            'diversity_mh_min': np.nan,
            'diversity_l2_max': np.nan,
            'diversity_mad_max': np.nan,
            'diversity_j_max': np.nan,
            'diversity_h_max': np.nan,
            'diversity_l2j_max': np.nan,
            'diversity_mh_max': np.nan,
            'count_diversity_cont': np.nan,
            'count_diversity_cate': np.nan,
            'count_diversity_all': np.nan,

            'implausibility_sum': 0.0,
            'implausibility_max_nbr_cf': 0.0,
            'implausibility_nbr_cf': 0.0,
            'sat_score': 0.0
        }
    return res

def continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
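    """Aggregate pairwise distance among counterfactuals over the continuous features.

    With metric='mad', features are scaled by their median absolute deviation in
    `X` (zero MADs are replaced with 1.0 to avoid division by zero).
    """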
    if metric == 'mad':
        mad = median_abs_deviation(X.iloc[:, cont_feature_index], axis=0)
        mad = np.array([v if v != 0 else 1.0 for v in mad])

        def _mad_cityblock(u, v):
            return mad_cityblock(u, v, mad)
        dist = pdist(cf_list[:, cont_feature_index], metric=_mad_cityblock)
    else:
        dist = pdist(cf_list[:, cont_feature_index], metric=metric)

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)

def categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg=None):
    dist = pdist(cf_list[:, cat_feature_index], metric=metric)

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)

def diversity_mh(cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=X, agg=agg)
    dist_cate = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def count_diversity(cf_list, features, nbr_features, cont_feature_index):
    nbr_cf = cf_list.shape[0]
    nbr_changes = 0
    for i in range(nbr_cf):
        for j in range(i + 1, nbr_cf):
            for k in features:
                if cf_list[i][k] != cf_list[j][k]:
                    # k indexes the feature: continuous changes count 1, categorical 0.5
                    nbr_changes += 1 if k in cont_feature_index else 0.5
    return nbr_changes / (nbr_cf * nbr_cf * nbr_features)


# Higher is better: counts the variety among the counterfactuals
def count_diversity_all(cf_list, nbr_features, cont_feature_index):
    return count_diversity(cf_list, range(cf_list.shape[1]), nbr_features, cont_feature_index)

def continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
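    """Aggregate distance between the query instance and each counterfactual over the continuous features."""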
    if metric == 'mad':
        mad = median_abs_deviation(X.iloc[:, cont_feature_index], axis=0)
        mad = np.array([v if v != 0 else 1.0 for v in mad])

        def _mad_cityblock(u, v):
            return mad_cityblock(u, v, mad)
        dist = cdist(query_instance[:, cont_feature_index], cf_list[:, cont_feature_index], metric=_mad_cityblock)
    else:
        dist = cdist(query_instance[:, cont_feature_index], cf_list[:, cont_feature_index], metric=metric)

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)

def mad_cityblock(u, v, mad):
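    """Cityblock (L1) distance with each dimension scaled by its median absolute deviation."""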
    u = _validate_vector(u)
    v = _validate_vector(v)
    l1_diff = abs(u - v)
    l1_diff_mad = l1_diff / mad
    return l1_diff_mad.sum()

def categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=None):
    try:
        dist = cdist(query_instance.reshape(1, -1)[:, cat_feature_index], cf_list[:, cat_feature_index], metric=metric)
    except Exception:
        # a bare failure would leave `dist` undefined; report and propagate NaN instead
        print('Problem with categorical distance')
        return np.nan
    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)

def euclidean_jaccard(query_instance, A, cont_feature_index, cat_feature_index, ratio_cont=None):
    nbr_features = A.shape[1]
    dist_cont = cdist(query_instance.reshape(1, -1)[:, cont_feature_index], A[:, cont_feature_index], metric='euclidean')
    dist_cate = cdist(query_instance.reshape(1, -1)[:, cat_feature_index], A[:, cat_feature_index], metric='jaccard')
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
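    """Mix of Euclidean distance on continuous features and Jaccard distance on categorical ones.

    The two components are weighted by the fraction of continuous/categorical
    features unless an explicit `ratio_cont` is given.
    """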
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=None, agg=agg)
    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='cityblock', X=None, agg=agg)
    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
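    """Mix of MAD-scaled cityblock distance (continuous) and Hamming distance (categorical)."""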
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=X, agg=agg)
    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def diversity_l2j(cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg=agg)
    dist_cate = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def nbr_changes_per_cf(x, cf_list, cont_feature_index):
    x = x.ravel()
    nbr_features = cf_list.shape[1] - 1  # exclude the label column
    nbr_changes = np.zeros(len(cf_list))
    for i, cf in enumerate(cf_list):
        cf = cf[:-1]
        for j in range(nbr_features):
            if cf[j] != x[j]:
                # continuous changes count 1, categorical changes 0.5
                nbr_changes[i] += 1 if j in cont_feature_index else 0.5
    return nbr_changes

def avg_nbr_changes_per_cf(x, cf_list, cont_feature_index):
    return np.mean(nbr_changes_per_cf(x, cf_list, cont_feature_index))

def avg_nbr_changes(x, cf_list, nbr_features, cont_feature_index):
    val = np.sum(nbr_changes_per_cf(x, cf_list, cont_feature_index))
    nbr_cf, _ = cf_list.shape
    return val / (nbr_cf * nbr_features)

def plausibility(query_instance, predictive_model, cf_list, nr_of_cfs, query_instances,
                 y_pred, cont_feature_index, cat_feature_index, df, ratio_cont):
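    """Sum, over the counterfactuals, of the distance to the closest real instance.

    The reference population is the concatenation of the query instances and the
    training data; the value is reported under the 'implausibility_*' keys, so
    lower values indicate more plausible counterfactuals.
    """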
    sum_dist = 0.0
    full_df = pd.concat([query_instances, df], ignore_index=False)
    for cf in cf_list:
        # X_y = full_df[full_df['label'] == y_label]
        X_y = full_df
        neigh_dist = distance_mh(query_instance.reshape(1, -1), X_y.to_numpy(), cont_feature_index,
                                 cat_feature_index, df, ratio_cont)
        idx_neigh = np.argsort(neigh_dist)[0]
        closest = X_y.to_numpy()[idx_neigh]
        d = distance_mh(cf.reshape(1, -1), closest.reshape(1, -1), cont_feature_index,
                        cat_feature_index, df, ratio_cont)
        sum_dist += d
    return sum_dist

def conformance_score(CONF, encoder, df, dataset, features_names, d4py, query_instance, model_path, timestamp_col_name):
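    """Score counterfactuals against the DECLARE constraints satisfied by the query trace.

    Both the counterfactuals and the query instance are decoded, reshaped into
    event logs with synthetic timestamps, and conformance-checked with Declare4Py;
    the returned value is the average fraction of query-satisfied constraints
    that each counterfactual does not violate.
    """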
    d4py.parse_decl_model(model_path=os.path.join(model_path, dataset + '_' + str(CONF['prefix_length']) + '.decl'))

    df = pd.DataFrame(df, columns=features_names)
    try:
        query_instance_to_decode = pd.DataFrame(np.array(query_instance, dtype=float),
                                                columns=features_names)
    except Exception:
        query_instance_to_decode = pd.DataFrame(np.array(query_instance, dtype=str),
                                                columns=features_names)
    encoder.decode(query_instance_to_decode)
    encoder.decode(df)
    # one synthetic case per row
    df.insert(loc=0, column='Case ID', value=np.divmod(np.arange(len(df)), 1)[0] + 1)
    df.insert(loc=1, column='label', value=1)
    query_instance_to_decode.insert(loc=0, column='Case ID',
                                    value=np.divmod(np.arange(len(query_instance_to_decode)), 1)[0] + 1)
    query_instance_to_decode.insert(loc=1, column='label', value=1)
    if CONF['feature_selection'] in single_prefix:
        if all(df['prefix'] == '0'):
            cols = ['prefix_' + str(i + 1) for i in range(CONF['prefix_length'])]
            df[cols] = 0
            query_instance_to_decode[cols] = 0
        else:
            df = pd.concat([df, pd.DataFrame(
                df['prefix'].str.split(",", expand=True).fillna(value='0')).rename(
                columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
            df = df.replace(r'\[', '', regex=True)
            df = df.replace(']', '', regex=True)
            query_instance_to_decode = pd.concat([query_instance_to_decode, pd.DataFrame(
                query_instance_to_decode['prefix'].str.split(",", expand=True).fillna(value='0')).rename(
                columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
            query_instance_to_decode = query_instance_to_decode.replace(r'\[', '', regex=True)
            query_instance_to_decode = query_instance_to_decode.replace(']', '', regex=True)
        df = df.drop(columns=['prefix'])
        query_instance_to_decode = query_instance_to_decode.drop(columns=['prefix'])
    long_data = pd.wide_to_long(df, stubnames=['prefix'], i='Case ID',
                                j='order', sep='_', suffix=r'\w+')
    long_query_instance = pd.wide_to_long(query_instance_to_decode, stubnames=['prefix'], i='Case ID',
                                          j='order', sep='_', suffix=r'\w+')
    long_query_instance_sorted = long_query_instance.sort_values(['Case ID', 'order']).reset_index(drop=False)
    timestamps = pd.date_range('1/1/2011', periods=len(long_data), freq='H')
    long_data_sorted = long_data.sort_values(['Case ID', 'order']).reset_index(drop=False)
    long_data_sorted[timestamp_col_name] = timestamps
    long_data_sorted['label'].replace({1: 'regular'}, inplace=True)
    long_data_sorted.drop(columns=['order'], inplace=True)
    columns_to_rename = {'Case ID': 'case:concept:name'}
    columns_to_rename.update({'prefix': 'concept:name'})
    long_data_sorted.rename(columns=columns_to_rename, inplace=True)
    long_data_sorted['label'].replace({'regular': 'false', 'deviant': 'true'}, inplace=True)
    long_data_sorted.replace('0', 'other', inplace=True)
    timestamps_query = pd.date_range('1/1/2011', periods=len(long_query_instance), freq='H')
    long_query_instance_sorted[timestamp_col_name] = timestamps_query
    long_query_instance_sorted.rename(columns=columns_to_rename, inplace=True)
    long_query_instance_sorted['label'].replace({'regular': 'false', 'deviant': 'true'}, inplace=True)
    long_query_instance_sorted.replace('0', 'other', inplace=True)
    long_query_instance_sorted['case:concept:name'] = long_query_instance_sorted['case:concept:name'].astype(str)
    long_data_sorted['case:concept:name'] = long_data_sorted['case:concept:name'].astype(str)
    event_log = convert_to_event_log(long_data_sorted)
    query_log = convert_to_event_log(long_query_instance_sorted)
    d4py.load_xes_log(event_log)
    model_check_res = d4py.conformance_checking(consider_vacuity=False)
    d4py.load_xes_log(query_log)
    model_check_query = d4py.conformance_checking(consider_vacuity=False)
    # constraints satisfied by the query trace
    query_patterns = {
        constraint
        for trace, patts in model_check_query.items()
        for constraint, checker in patts.items()
        if checker.state == TraceState.SATISFIED
    }

    # keep, for each counterfactual trace, the query constraints that are not violated
    model_check_res = {
        k: {
            constraint: checker
            for constraint, checker in v.items()
            if checker.state != TraceState.VIOLATED and constraint in query_patterns
        }
        for k, v in model_check_res.items()
    }

    conformance_score = [len(v) / len(query_patterns) for v in model_check_res.values()]
    avg_conformance = np.mean(conformance_score)
    print('Average conformance score', avg_conformance)
    return avg_conformance

def model_discovery(CONF, encoder, df, dataset, features_names, d4py, model_path, support, timestamp_col_name):
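    """Discover a DECLARE model from the encoded log and save it as a .decl file.

    Decodes the prefixes, reshapes them into an event log (one case per row) with
    synthetic hourly timestamps, and runs Declare4Py discovery at the given
    minimum support.
    """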
    df = pd.DataFrame(df, columns=features_names)
    encoder.decode(df)
    df.insert(loc=0, column='Case ID', value=np.divmod(np.arange(len(df)), 1)[0] + 1)
    df.insert(loc=1, column='label', value=1)
    long_data = pd.wide_to_long(df, stubnames=['prefix'], i='Case ID',
                                j='order', sep='_', suffix=r'\w+')
    timestamps = pd.date_range('1/1/2011', periods=len(long_data), freq='H')
    long_data_sorted = long_data.sort_values(['Case ID', 'order']).reset_index(drop=False)
    long_data_sorted[timestamp_col_name] = timestamps
    long_data_sorted['label'].replace({1: 'regular'}, inplace=True)
    long_data_sorted.drop(columns=['order'], inplace=True)
    columns_to_rename = {'Case ID': 'case:concept:name'}
    columns_to_rename.update({'prefix': 'concept:name'})
    long_data_sorted.rename(columns=columns_to_rename, inplace=True)
    long_data_sorted['label'].replace({'regular': 'false', 'deviant': 'true'}, inplace=True)
    long_data_sorted.replace('0', 'other', inplace=True)
    long_data_sorted.replace(0.0, 'other', inplace=True)
    long_data_sorted.replace(0, 'other', inplace=True)
    long_data_sorted['case:concept:name'] = long_data_sorted['case:concept:name'].astype(str)
    event_log = convert_to_event_log(long_data_sorted)
    d4py.load_xes_log(event_log)
    d4py.compute_frequent_itemsets(min_support=support, len_itemset=2)
    d4py.discovery(consider_vacuity=False, max_declare_cardinality=2)
    discovered = d4py.filter_discovery(min_support=support, output_path=os.path.join(model_path, dataset + '_' + str(CONF['prefix_length']) + '.decl'))

    #pm4py.filter_trace_attribute_values()

def perform_model_analysis(model_path, dataset, CONF, encoder, full_df, support, log, dataset_confs):
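    """Conformance-check an event log against the discovered DECLARE model.

    Runs model discovery first if the .decl file does not exist yet; returns the
    conformant trace ids, the number of constraints, and the ratio of conformant
    traces in the log.
    """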
    try:
        if not os.path.exists(model_path):
            os.makedirs(model_path)
            print("Directory '%s' created successfully" % model_path)
    except OSError as error:
        print("Directory '%s' can not be created" % model_path)

    d4py = Declare4Py()

    try:
        decl_model_path = model_path + dataset + '_' + str(CONF['prefix_length']) + '.decl'
        if not os.path.exists(decl_model_path):
            print('Do model discovery')
            features_names = full_df.columns.values[:-1]
            model_discovery(CONF, encoder, full_df.iloc[:, 1:], dataset, features_names,
                            d4py, model_path, support, [*dataset_confs.timestamp_col.values()][0])
    except OSError as error:
        print("File '%s' can not be created" % decl_model_path)

    d4py.parse_decl_model(model_path=decl_model_path)

    d4py.load_xes_log(log)
    conformance_check = d4py.conformance_checking(consider_vacuity=False)

    # keep, per trace, only the constraints that are not violated
    model_check_res = {
        k: {
            constraint: checker
            for constraint, checker in v.items()
            if checker.state != TraceState.VIOLATED
        }
        for k, v in conformance_check.items()
    }

    conformant_traces = [trace_id[1] for trace_id, results in model_check_res.items() if
                         len(results) == len(d4py.model.constraints)]
    number_of_constraints = len(d4py.model.constraints)
    conformant_traces_ratio = len(conformant_traces) / len(log)

    return conformant_traces, number_of_constraints, conformant_traces_ratio


# Canonical column order for the evaluation CSVs written by dice_explain
columns = ['dataset', 'heuristic', 'model', 'method', 'optimization', 'prefix_length', 'idx', 'desired_nr_of_cfs', 'generated_cfs', 'time_train', 'time_test',
           'runtime', 'distance_l2', 'distance_mad', 'distance_j', 'distance_h', 'distance_l1j', 'distance_l2j', 'distance_mh',
           'distance_l2_min', 'distance_mad_min', 'distance_j_min', 'distance_h_min', 'distance_l1j_min', 'distance_l2j_min',
           'distance_mh_min', 'distance_l2_max', 'distance_mad_max', 'distance_j_max', 'distance_h_max',
           'distance_l1j_max', 'distance_l2j_max', 'distance_mh_max', 'diversity_l2',
           'diversity_mad', 'diversity_j', 'diversity_h', 'diversity_l2j', 'diversity_mh', 'diversity_l2_min',
           'diversity_mad_min', 'diversity_j_min', 'diversity_h_min', 'diversity_l2j_min', 'diversity_mh_min',
           'diversity_l2_max', 'diversity_mad_max', 'diversity_j_max', 'diversity_h_max', 'diversity_l2j_max',
           'diversity_mh_max', 'count_diversity_cont', 'count_diversity_cate', 'count_diversity_all',
           'avg_nbr_changes_per_cf', 'avg_nbr_changes', 'implausibility_sum',
           'implausibility_max_nbr_cf', 'implausibility_nbr_cf', 'sat_score']
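

# Hypothetical usage sketch (not part of the module): the argument values below
# are illustrative assumptions, not defaults shipped with nirdizati_light.
#
# cf_result = dice_explain(CONF, predictive_model, encoder, full_df, test_df,
#                          method='multi_objective_genetic', optimization='baseline',
#                          heuristic='heuristic_2', support=0.9,
#                          timestamp_col_name='time:timestamp',
#                          model_path='../experiments/process_models/',
#                          random_seed=42, adapted=True, filtering=False)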
single_prefix = ['loreley', 'loreley_complex']
def dice_explain( CONF, predictive_model, encoder, df, query_instances, method, optimization, heuristic, support, timestamp_col_name, model_path, random_seed=None, adapted=None, filtering=None):
 26def dice_explain(CONF, predictive_model, encoder, df, query_instances, method, optimization, heuristic, support,
 27                 timestamp_col_name,model_path,random_seed=None,adapted=None,filtering=None
 28                 ):
 29    features_names = df.columns.values[:-1]
 30    feature_selection = CONF['feature_selection']
 31    dataset = CONF['data'].rpartition('/')[0].rpartition('/')[-1]
 32
 33    if 'BPIC15' in dataset:
 34        dataset_created = dataset.replace('_f2','')
 35    elif 'bpic2012' in dataset:
 36        dataset_created = dataset.replace('-COMPLETE','').replace('bpic2012','BPIC12')
 37    elif 'sepsis' in dataset:
 38        dataset_created = dataset.replace('_cases','')
 39    black_box = predictive_model.model_type
 40    categorical_features,continuous_features,cat_feature_index,cont_feature_index = split_features(df.iloc[:,:-1], encoder)
 41    if CONF['feature_selection'] == 'loreley':
 42        query_instances = query_instances[query_instances['prefix'] != 0]
 43    if CONF['feature_selection'] == 'frequency':
 44        ratio_cont = 1
 45    else:
 46        ratio_cont = len(continuous_features)/len(categorical_features)
 47    time_start = datetime.now()
 48    query_instances_for_cf = query_instances.iloc[:2,:-1]
 49    d = dice_ml.Data(dataframe=df, continuous_features=continuous_features, outcome_name='label')
 50    m = dice_model(predictive_model)
 51    dice_query_instance = dice_ml.Dice(d, m, method, encoder)
 52    time_train = (datetime.now() - time_start).total_seconds()
 53    index_test_instances = range(len(query_instances_for_cf))
 54    #model_path = model_path +'_' + str(support) + '/'
 55    extended_loss = False
 56    try:
 57        if not os.path.exists(model_path):
 58            os.makedirs(model_path)
 59            print("Directory '%s' created successfully" % model_path)
 60    except OSError as error:
 61        print("Directory '%s' can not be created" % model_path)
 62    
 63    d4py = Declare4Py()
 64    model_discovery(CONF, encoder, df, dataset, features_names, d4py, model_path, support, timestamp_col_name)
 65
 66    cols = df.columns[:-1].values
 67
 68    path_results = '../experiments/cf_results_supp_%s/%s/' % (support, 'single_objective_new')
 69    if adapted & (not filtering) & (method == 'multi_objective_genetic'):
 70        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_new')
 71        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_new')
 72    elif adapted & filtering & (method == 'multi_objective_genetic'):
 73        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_filtering_new')
 74        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_filtering_new')
 75    elif (not adapted) & (method == 'genetic_conformance'):
 76        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'single_objective_new')
 77        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'single_objective_new')
 78    elif (adapted) & (method == 'genetic_conformance') & (optimization == 'baseline'):
 79        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_loss_no_conformance')
 80        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_loss_no_conformance')
 81    elif (adapted) & (method == 'genetic_conformance') & (optimization != 'baseline'):
 82        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_loss_conformance_large')
 83        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_loss_conformance_large')
 84    elif method =='genetic':
 85        path_results = '../experiments/cf_results_supp_%s/%s/' % (support,'single_objective_new')
 86        path_cf = '../experiments/cf_results_supp_%s/%s/' % (support,'single_objective_new')
 87    elif (not adapted) & (method == 'multi_objective_genetic') & (extended_loss):
 88        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'mixed_ga_5obj')
 89        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'mixed_ga_5_ob')
 90    elif (not adapted) & (method == 'multi_objective_genetic'):
 91        path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'baseline_new')
 92        path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'baseline_new')
 93
 94    for test_id,i in enumerate(index_test_instances):
 95        print(datetime.now(), dataset, black_box, test_id, len(index_test_instances),
 96              '%.2f' % (test_id+1 / len(index_test_instances)))
 97        cf_list_all = list()
 98        x_eval_list = list()
 99        desired_cfs_all = list()
100        x = query_instances_for_cf.iloc[[i]]
101        
102        for k in [5]:
103            time_start_i = datetime.now()
104            if method == 'genetic_conformance':
105                dice_result = dice_query_instance.generate_counterfactuals(x,encoder=encoder, desired_class='opposite',
106                                                                           verbose=False,
107                                                                           posthoc_sparsity_algorithm='linear',
108                                                                           total_CFs=k, dataset=dataset+'_'+str(CONF['prefix_length']),
109                                                                           model_path=model_path,random_seed=random_seed,adapted=adapted)
110            elif method == 'multi_objective_genetic':
111                dice_result = dice_query_instance.generate_counterfactuals(x,encoder=encoder, desired_class='opposite',
112                                                                           verbose=False,
113                                                                           posthoc_sparsity_algorithm='linear',
114                                                                           total_CFs=k, dataset=dataset+'_'+str(CONF['prefix_length']),
115                                                                           model_path=model_path,random_seed=random_seed,adapted=adapted)
116            else:
117                dice_result = dice_query_instance.generate_counterfactuals(x,encoder=encoder, desired_class='opposite',
118                                                                           verbose=False,
119                                                                           posthoc_sparsity_algorithm='linear',
120                                                                           total_CFs=k,dataset=dataset+'_'+str(CONF['prefix_length']),
121                                                                           )
122            # function to decode cf from train_df and show it decoded before adding to list
123            generated_cfs = dice_result.cf_examples_list[0].final_cfs_df
124            cf_list = np.array(generated_cfs).astype('float64')
125            y_pred = predictive_model.model.predict(x.values.reshape(1, -1))[0]
126            time_test = (datetime.now() - time_start_i).total_seconds()
127            x_eval = evaluate_cf_list(cf_list, x.values.reshape(1,-1), cont_feature_index, cat_feature_index, df=df,
128                                  nr_of_cfs=k,y_pred=y_pred,predictive_model=predictive_model,
129                                  query_instances=query_instances,continuous_features=continuous_features,
130                                  categorical_features=categorical_features,ratio_cont=ratio_cont
131                                  )
132
133            x_eval['dataset'] = dataset
134            x_eval['idx'] = test_id+1
135            x_eval['model'] = predictive_model.model_type
136            x_eval['desired_nr_of_cfs'] = k
137            x_eval['time_train'] = time_train
138            x_eval['time_test'] = time_test
139            x_eval['runtime'] = time_train + time_test
140          #  x_eval['generated_cfs'] = x_eval['nbr_cf']
141            x_eval['method'] = method
142            x_eval['explainer'] = CONF['explanator']
143            x_eval['prefix_length'] = CONF['prefix_length']
144            x_eval['heuristic'] = heuristic
145            x_eval['optimization']  = optimization
146            x_eval_list.append(x_eval)
147            if cf_list.size > 4:
148                if method == 'random':
149                    cf_list = cf_list[:, :-1]
150                elif method == 'genetic':
151                    cf_list = cf_list[:, :-1]
152                elif method == 'genetic_conformance':
153                    cf_list = cf_list[:, :-1]
154                elif method == 'multi_objective_genetic':
155                    cf_list = cf_list[:, :-1]
156                df_conf = pd.DataFrame(data=cf_list, columns=features_names)
157
158                sat_score = conformance_score(CONF, encoder, df=df_conf, dataset=dataset, features_names=features_names,
159                                          d4py=d4py, query_instance=x, model_path=model_path,
160                                          timestamp_col_name=timestamp_col_name)
161                x_eval['sat_score'] = sat_score
162                cf_list_all.extend(cf_list[:5])
163                # record the requested CF count once per kept counterfactual
164                desired_cfs = float(k) * np.ones_like(cf_list[:5, 0])
165                desired_cfs_all.extend(desired_cfs)
166        try:
167            if not os.path.exists(path_results+'_'+str(support)+'/'):
168                os.makedirs(path_results+'_'+str(support)+'/')
169                print("Directory '%s' created successfully" % (path_results + '_' + str(support) + '/'))
170        except OSError as error:
171            print("Directory '%s' cannot be created" % path_results)
172        filename_results = path_results + 'cfeval_%s_%s_dice_%s.csv' % (dataset, black_box,feature_selection)
173        if len(cf_list_all) > 0:
174            df_cf = pd.DataFrame(data=cf_list_all, columns=features_names)
175            encoder.decode(df_cf)
176            if CONF['feature_selection'] in single_prefix:
177                if all(df_cf['prefix'] == '0'):
178                    cols = ['prefix_' + str(i+1) for i in range(CONF['prefix_length'])]
179                    df_cf[cols] = 0
180                else:
181                    df_cf = pd.concat([df_cf, pd.DataFrame(
182                        df_cf['prefix'].str.split(",", expand=True).fillna(value='0')).rename(
183                        columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
184                    df_cf = df_cf.replace(r'\[', '', regex=True)
185                    df_cf = df_cf.replace(r'\]', '', regex=True)
186                df_cf = df_cf.drop(columns=['prefix'])
187            df_cf['desired_cfs'] = desired_cfs_all
188            df_cf['idx'] = [test_id + 1] * len(cf_list_all)  # one (test_id + 1) entry per counterfactual row
189            df_cf['method'] = method
190            df_cf['test_id'] = np.arange(0, len(cf_list_all))
191            df_cf['dataset'] = [dataset] * len(cf_list_all)
192            df_cf['black_box'] = [black_box] * len(cf_list_all)
193            try:
194                if not os.path.exists(path_cf):
195                    os.makedirs(path_cf)
196                    print("Directory '%s' created successfully" % path_cf)
197            except OSError as error:
198                print("Directory '%s' cannot be created" % path_cf)
199            if optimization != 'baseline':
200                filename_cf = path_cf + 'cf_%s_%s_dice_%s_%s_%s_%s.csv' % (dataset, black_box, feature_selection, method, optimization,
201                                                                        CONF['prefix_length'])
202            else:
203                filename_cf = path_cf + 'cf_%s_%s_dice_%s_%s_%s.csv' % (dataset, black_box,feature_selection,method,
204                                                                   CONF['prefix_length'])
205            if not os.path.isfile(filename_cf):
206                df_cf.to_csv(filename_cf, index=False)
207            else:
208                df_cf.to_csv(filename_cf, mode='a', index=False, header=False)
209        else:
210            x_eval['sat_score'] = 0
211        result_dataframe = pd.DataFrame(data=x_eval_list)
212        result_dataframe = result_dataframe[columns]
213        if not os.path.isfile(filename_results):
214            result_dataframe.to_csv(filename_results, index=False)
215        else:
216            result_dataframe.to_csv(filename_results, mode='a', index=False, header=False)
217    return dice_result
def dice_model(predictive_model):
218def dice_model(predictive_model):
219    # membership test with equality semantics: model_type is a plain string,
220    # so identity checks ('is') against the enum values are unreliable
221    sklearn_models = (ClassificationMethods.RANDOM_FOREST.value, ClassificationMethods.PERCEPTRON.value,
222                      ClassificationMethods.MLP.value, ClassificationMethods.XGBOOST.value,
223                      ClassificationMethods.SGDCLASSIFIER.value, ClassificationMethods.SVM.value,
224                      ClassificationMethods.KNN.value)
225    if predictive_model.model_type in sklearn_models:
226        m = dice_ml.Model(model=predictive_model.model, backend='sklearn')
227    else:
228        m = dice_ml.Model(model=predictive_model.model, backend='PYT')
229    return m
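
A minimal usage sketch (illustrative, not part of the module): dice_model only reads the
model and model_type attributes, so the hypothetical _StubModel below is enough to obtain
a dice_ml.Model wired to the sklearn backend.

def _demo_dice_model():
    from sklearn.ensemble import RandomForestClassifier

    class _StubModel:
        # hypothetical stand-in exposing the two attributes dice_model reads
        model = RandomForestClassifier()
        model_type = ClassificationMethods.RANDOM_FOREST.value

    return dice_model(_StubModel())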
def split_features(df, encoder):
237def split_features(df, encoder):
238    categorical_features = [col for col in df.columns if col in list(encoder._label_dict.keys())]
239    cat_feature_index = [df.columns.get_loc(c) for c in categorical_features if c in df]
240    continuous_features = [col for col in df.columns if col in list(encoder._numeric_encoder.keys())]
241    cont_feature_index = [df.columns.get_loc(c) for c in continuous_features if c in df]
242    return categorical_features,continuous_features,cat_feature_index,cont_feature_index
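
For illustration, split_features only inspects the keys of the encoder's _label_dict and
_numeric_encoder, so a lightweight stub with toy data (hypothetical names below) shows the
expected four-part split.

def _demo_split_features():
    from types import SimpleNamespace

    toy = pd.DataFrame({'prefix_1': ['A'], 'prefix_2': ['B'], 'amount': [10.0]})
    encoder_stub = SimpleNamespace(_label_dict={'prefix_1': {}, 'prefix_2': {}},
                                   _numeric_encoder={'amount': None})
    # -> (['prefix_1', 'prefix_2'], ['amount'], [0, 1], [2])
    return split_features(toy, encoder_stub)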
def evaluate_cf_list(cf_list, query_instance, cont_feature_index, cat_feature_index, df, y_pred, nr_of_cfs, predictive_model, query_instances, continuous_features, categorical_features, ratio_cont):
244def evaluate_cf_list(cf_list, query_instance, cont_feature_index,cat_feature_index,df, y_pred,nr_of_cfs,
245                     predictive_model, query_instances, continuous_features, categorical_features, ratio_cont):
246    nbr_features = query_instance.shape[1]
247    if cf_list.size > 4:
248        nbr_cf_ = len(cf_list)
249        nbr_features = cf_list.shape[1]
250        plausibility_sum = plausibility(query_instance, predictive_model, cf_list,nr_of_cfs, query_instances, y_pred,
251                                        cont_feature_index,cat_feature_index, df, ratio_cont
252                                       )
253        plausibility_max_nbr_cf_ = plausibility_sum / nr_of_cfs
254        plausibility_nbr_cf_ = plausibility_sum / nbr_cf_
255        distance_l2_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df)
256        distance_mad_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df)
257        distance_j_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard')
258        distance_h_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming')
259        distance_l2j_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index)
260        distance_l1j_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index)
261        distance_mh_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, df)
262
263        distance_l2_min_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df,
264                                               agg='min')
265        distance_mad_min_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df, agg='min')
266        distance_j_min_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg='min')
267        distance_h_min_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg='min')
268        distance_l2j_min_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index,
269                                         agg='min')
270        distance_l1j_min_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index,
271                                         agg='min')
272        distance_mh_min_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, df,agg='min')
273
274        distance_l2_max_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df, agg='max')
275        distance_mad_max_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df, agg='max')
276        distance_j_max_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg='max')
277        distance_h_max_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg='max')
278        distance_l2j_max_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='max')
279        distance_l1j_max_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='max')
280
281        distance_mh_max_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, X=df, agg='max')
282        avg_nbr_changes_per_cf_ = avg_nbr_changes_per_cf(query_instance, cf_list, cont_feature_index)
283        avg_nbr_changes_ = avg_nbr_changes(query_instance, cf_list, nbr_features, cont_feature_index)
284        if len(cf_list) > 1:
285            diversity_l2_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=df)
286            diversity_mad_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df)
287            diversity_j_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard')
288            diversity_h_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming')
289            diversity_l2j_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index)
290            diversity_mh_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df)
291
292            diversity_l2_min_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=df, agg='min')
293            diversity_mad_min_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df, agg='min')
294            diversity_j_min_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg='min')
295            diversity_h_min_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg='min')
296            diversity_l2j_min_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index, agg='min')
297            diversity_mh_min_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df, agg='min')
298
299            diversity_l2_max_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg='max')
300            diversity_mad_max_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df, agg='max')
301            diversity_j_max_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg='max')
302            diversity_h_max_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg='max')
303            diversity_l2j_max_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index, agg='max')
304            diversity_mh_max_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df, agg='max')
305
306        else:
307            diversity_l2_ = 0.0
308            diversity_mad_ = 0.0
309            diversity_j_ = 0.0
310            diversity_h_ = 0.0
311            diversity_l2j_ = 0.0
312            diversity_mh_ = 0.0
313
314            diversity_l2_min_ = 0.0
315            diversity_mad_min_ = 0.0
316            diversity_j_min_ = 0.0
317            diversity_h_min_ = 0.0
318            diversity_l2j_min_ = 0.0
319            diversity_mh_min_ = 0.0
320
321            diversity_l2_max_ = 0.0
322            diversity_mad_max_ = 0.0
323            diversity_j_max_ = 0.0
324            diversity_h_max_ = 0.0
325            diversity_l2j_max_ = 0.0
326            diversity_mh_max_ = 0.0
327
328        count_diversity_cont_ = count_diversity(cf_list, cont_feature_index, nbr_features, cont_feature_index)
329        count_diversity_cate_ = count_diversity(cf_list, cat_feature_index, nbr_features, cont_feature_index)
330        count_diversity_all_ = count_diversity_all(cf_list, nbr_features, cont_feature_index)
331        res = {  'generated_cfs': nr_of_cfs,
332                'implausibility_sum': plausibility_sum,
333                'implausibility_max_nbr_cf': plausibility_max_nbr_cf_,
334                'implausibility_nbr_cf': plausibility_nbr_cf_,
335                'distance_l2': distance_l2_,
336                'distance_mad': distance_mad_,
337                'distance_j': distance_j_,
338                'distance_h': distance_h_,
339                'distance_l2j': distance_l2j_,
340                'distance_l1j':distance_l1j_,
341                'distance_mh': distance_mh_,
342
343                'distance_l2_min': distance_l2_min_,
344                'distance_mad_min': distance_mad_min_,
345                'distance_j_min': distance_j_min_,
346                'distance_h_min': distance_h_min_,
347                'distance_l2j_min': distance_l2j_min_,
348                'distance_l1j_min': distance_l1j_min_,
349                'distance_mh_min': distance_mh_min_,
350
351                'distance_l2_max': distance_l2_max_,
352                'distance_mad_max': distance_mad_max_,
353                'distance_j_max': distance_j_max_,
354                'distance_h_max': distance_h_max_,
355                'distance_l2j_max': distance_l2j_max_,
356                'distance_l1j_max':distance_l1j_max_,
357                'distance_mh_max': distance_mh_max_,
358
359                'diversity_l2': diversity_l2_,
360                'diversity_mad': diversity_mad_,
361                'diversity_j': diversity_j_,
362                'diversity_h': diversity_h_,
363                'diversity_l2j': diversity_l2j_,
364                'diversity_mh': diversity_mh_,
365
366                'diversity_l2_min': diversity_l2_min_,
367                'diversity_mad_min': diversity_mad_min_,
368                'diversity_j_min': diversity_j_min_,
369                'diversity_h_min': diversity_h_min_,
370                'diversity_l2j_min': diversity_l2j_min_,
371                'diversity_mh_min': diversity_mh_min_,
372
373                'diversity_l2_max': diversity_l2_max_,
374                'diversity_mad_max': diversity_mad_max_,
375                'diversity_j_max': diversity_j_max_,
376                'diversity_h_max': diversity_h_max_,
377                'diversity_l2j_max': diversity_l2j_max_,
378                'diversity_mh_max': diversity_mh_max_,
379
380                'count_diversity_cont': count_diversity_cont_,
381                'count_diversity_cate': count_diversity_cate_,
382                'count_diversity_all': count_diversity_all_,
383                'avg_nbr_changes_per_cf':avg_nbr_changes_per_cf_,
384                'avg_nbr_changes': avg_nbr_changes_}
385    else:
386        res = {
387            'generated_cfs': 0,
388            'distance_l2': np.nan,
389            'distance_mad': np.nan,
390            'distance_j': np.nan,
391            'distance_h': np.nan,
392            'distance_l2j': np.nan,
393            'distance_l1j':np.nan,
394            'distance_mh': np.nan,
395            'distance_l2_min': np.nan,
396            'distance_mad_min': np.nan,
397            'distance_j_min': np.nan,
398            'distance_h_min': np.nan,
399            'distance_l2j_min': np.nan,
400            'distance_l1j_min':np.nan,
401            'distance_mh_min': np.nan,
402            'distance_l2_max': np.nan,
403            'distance_mad_max': np.nan,
404            'distance_j_max': np.nan,
405            'distance_h_max': np.nan,
406            'distance_l2j_max': np.nan,
407            'distance_l1j_max':np.nan,
408            'distance_mh_max': np.nan,
409            'avg_nbr_changes_per_cf': np.nan,
410            'avg_nbr_changes': np.nan,
411            'diversity_l2': np.nan,
412            'diversity_mad': np.nan,
413            'diversity_j': np.nan,
414            'diversity_h': np.nan,
415            'diversity_l2j': np.nan,
416            'diversity_mh': np.nan,
417            'diversity_l2_min': np.nan,
418            'diversity_mad_min': np.nan,
419            'diversity_j_min': np.nan,
420            'diversity_h_min': np.nan,
421            'diversity_l2j_min': np.nan,
422            'diversity_mh_min': np.nan,
423            'diversity_l2_max': np.nan,
424            'diversity_mad_max': np.nan,
425            'diversity_j_max': np.nan,
426            'diversity_h_max': np.nan,
427            'diversity_l2j_max': np.nan,
428            'diversity_mh_max': np.nan,
429            'count_diversity_cont': np.nan,
430            'count_diversity_cate': np.nan,
431            'count_diversity_all': np.nan,
432
433            'implausibility_sum': 0.0,
434            'implausibility_max_nbr_cf': 0.0,
435            'implausibility_nbr_cf': 0.0,
436            'sat_score': 0.0
437        }
438    return res
def continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
440def continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
441    if metric == 'mad':
442        mad = median_abs_deviation(X.iloc[:, cont_feature_index], axis=0)
443        mad = np.array([v if v != 0 else 1.0 for v in mad])
444
445        def _mad_cityblock(u, v):
446            return mad_cityblock(u, v, mad)
447        dist = pdist(cf_list[:, cont_feature_index], metric=_mad_cityblock)
448    else:
449        dist = pdist(cf_list[:, cont_feature_index], metric=metric)
450
451    if agg is None or agg == 'mean':
452        return np.mean(dist)
453
454    if agg == 'max':
455        return np.max(dist)
456
457    if agg == 'min':
458        return np.min(dist)
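
A toy example (invented numbers): with exactly two counterfactuals there is a single
pairwise Euclidean distance, so the mean, min and max aggregations coincide.

def _demo_continuous_diversity():
    cfs = np.array([[0.0, 0.0], [3.0, 4.0]])
    # pdist yields the single distance 5.0, so the default mean aggregation returns 5.0
    return continuous_diversity(cfs, cont_feature_index=[0, 1], metric='euclidean')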
def categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg=None):
596def categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg=None):
597    dist = pdist(cf_list[:, cat_feature_index], metric=metric)
598
599    if agg is None or agg == 'mean':
600        return np.mean(dist)
601
602    if agg == 'max':
603        return np.max(dist)
604
605    if agg == 'min':
606        return np.min(dist)
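
Same aggregation pattern over the categorical block; on binary-encoded rows the Jaccard
distance is the share of non-shared active features (toy values below).

def _demo_categorical_diversity():
    cfs = np.array([[1.0, 0.0], [1.0, 1.0]])
    # {f0} vs {f0, f1}: Jaccard distance 1 - 1/2 = 0.5
    return categorical_diversity(cfs, cat_feature_index=[0, 1], metric='jaccard')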
def diversity_mh(cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
621def diversity_mh(cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
622    nbr_features = cf_list.shape[1]
623    dist_cont = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=X, agg=agg)
624    dist_cate = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg=agg)
625    if ratio_cont is None:
626        ratio_continuous = len(cont_feature_index) / nbr_features
627        ratio_categorical = len(cat_feature_index) / nbr_features
628    else:
629        ratio_continuous = ratio_cont
630        ratio_categorical = 1.0 - ratio_cont
631    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
632    return dist
def count_diversity(cf_list, features, nbr_features, cont_feature_index):
486def count_diversity(cf_list, features, nbr_features, cont_feature_index):
487    nbr_cf = cf_list.shape[0]
488    nbr_changes = 0
489    for i in range(nbr_cf):
490        for j in range(i+1, nbr_cf):
491            for k in features:
492                if cf_list[i][k] != cf_list[j][k]:
493                    nbr_changes += 1 if k in cont_feature_index else 0.5  # k is the feature index; j indexes counterfactuals
494    return nbr_changes / (nbr_cf * nbr_cf * nbr_features)
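
A worked toy example (assuming the feature-index fix above): two counterfactuals that
differ only in the continuous feature contribute 1, normalised by nbr_cf^2 * nbr_features.

def _demo_count_diversity():
    cfs = np.array([[0.0, 1.0], [0.0, 2.0]])
    # one differing pair on continuous feature 1: 1 / (2 * 2 * 2) = 0.125
    return count_diversity(cfs, features=range(2), nbr_features=2, cont_feature_index=[1])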
def count_diversity_all(cf_list, nbr_features, cont_feature_index):
499def count_diversity_all(cf_list, nbr_features, cont_feature_index):
500    return count_diversity(cf_list, range(cf_list.shape[1]), nbr_features, cont_feature_index)
def continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
502def continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
503    if metric == 'mad':
504        mad = median_abs_deviation(X.iloc[:, cont_feature_index], axis=0)
505        mad = np.array([v if v != 0 else 1.0 for v in mad])
506
507        def _mad_cityblock(u, v):
508            return mad_cityblock(u, v, mad)
509        dist = cdist(query_instance[:, cont_feature_index], cf_list[:, cont_feature_index], metric=_mad_cityblock)
510    else:
511        dist = cdist(query_instance[:, cont_feature_index], cf_list[:, cont_feature_index], metric=metric)
512
513    if agg is None or agg == 'mean':
514        return np.mean(dist)
515
516    if agg == 'max':
517        return np.max(dist)
518
519    if agg == 'min':
520        return np.min(dist)
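
A toy check of the MAD-scaled variant (invented values): each continuous column is
normalised by its median absolute deviation over the reference frame X before the
cityblock distance is taken.

def _demo_continuous_distance_mad():
    X = pd.DataFrame({'a': [0.0, 1.0, 2.0, 3.0, 4.0]})  # MAD of column a is 1.0
    query = np.array([[0.0]])
    cfs = np.array([[2.0], [4.0]])
    # scaled cityblock distances [2.0, 4.0]; mean aggregation -> 3.0
    return continuous_distance(query, cfs, cont_feature_index=[0], metric='mad', X=X)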
def mad_cityblock(u, v, mad):
522def mad_cityblock(u, v, mad):
523    u = _validate_vector(u)
524    v = _validate_vector(v)
525    l1_diff = abs(u - v)
526    l1_diff_mad = l1_diff / mad
527    return l1_diff_mad.sum()
def categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=None):
529def categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=None):
530    try:
531        dist = cdist(query_instance.reshape(1, -1)[:, cat_feature_index], cf_list[:, cat_feature_index], metric=metric)
532    except Exception as e:  # dist would be unbound below, so surface the failure with context
533        raise ValueError('Problem with categorical distance') from e
534    if agg is None or agg == 'mean':
535        return np.mean(dist)
536
537    if agg == 'max':
538        return np.max(dist)
539
540    if agg == 'min':
541        return np.min(dist)
def euclidean_jaccard(query_instance, A, cont_feature_index, cat_feature_index, ratio_cont=None):
543def euclidean_jaccard(query_instance, A, cont_feature_index, cat_feature_index, ratio_cont=None):
544    nbr_features = A.shape[1]
545    dist_cont = cdist(query_instance.reshape(1, -1)[:, cont_feature_index], A[:, cont_feature_index], metric='euclidean')
546    dist_cate = cdist(query_instance.reshape(1, -1)[:, cat_feature_index], A[:, cat_feature_index], metric='jaccard')
547    if ratio_cont is None:
548        ratio_continuous = len(cont_feature_index) / nbr_features
549        ratio_categorical = len(cat_feature_index) / nbr_features
550    else:
551        ratio_continuous = ratio_cont
552        ratio_categorical = 1.0 - ratio_cont
553    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
554    return dist
def distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
557def distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
558    nbr_features = cf_list.shape[1]
559    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=None, agg=agg)
560    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=agg)
561    if ratio_cont is None:
562        ratio_continuous = len(cont_feature_index) / nbr_features
563        ratio_categorical = len(cat_feature_index) / nbr_features
564    else:
565        ratio_continuous = ratio_cont
566        ratio_categorical = 1.0 - ratio_cont
567    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
568    return dist
def distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
570def distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
571    nbr_features = cf_list.shape[1]
572    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='cityblock', X=None, agg=agg)
573    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=agg)
574    if ratio_cont is None:
575        ratio_continuous = len(cont_feature_index) / nbr_features
576        ratio_categorical = len(cat_feature_index) / nbr_features
577    else:
578        ratio_continuous = ratio_cont
579        ratio_categorical = 1.0 - ratio_cont
580    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
581    return dist
def distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
583def distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
584    nbr_features = cf_list.shape[1]
585    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=X, agg=agg)
586    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg=agg)
587    if ratio_cont is None:
588        ratio_continuous = len(cont_feature_index) / nbr_features
589        ratio_categorical = len(cat_feature_index) / nbr_features
590    else:
591        ratio_continuous = ratio_cont
592        ratio_categorical = 1.0 - ratio_cont
593    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
594    return dist
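
Illustrative check of the mixed metric (toy values): without an explicit ratio_cont, the
continuous (MAD) and categorical (Hamming) parts are weighted by their share of the
feature space.

def _demo_distance_mh():
    X = pd.DataFrame({'a': [0.0, 1.0, 2.0, 3.0, 4.0], 'b': [0, 1, 0, 1, 0]})
    query = np.array([[0.0, 0.0]])
    cfs = np.array([[2.0, 1.0]])
    # 0.5 * (MAD-scaled cityblock 2.0) + 0.5 * (Hamming 1.0) = 1.5
    return distance_mh(query, cfs, cont_feature_index=[0], cat_feature_index=[1], X=X)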
def diversity_l2j(cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
608def diversity_l2j(cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
609    nbr_features = cf_list.shape[1]
610    dist_cont = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg=agg)
611    dist_cate = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg=agg)
612    if ratio_cont is None:
613        ratio_continuous = len(cont_feature_index) / nbr_features
614        ratio_categorical = len(cat_feature_index) / nbr_features
615    else:
616        ratio_continuous = ratio_cont
617        ratio_categorical = 1.0 - ratio_cont
618    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
619    return dist
def nbr_changes_per_cf(x, cf_list, continuous_features):
634def nbr_changes_per_cf(x, cf_list, continuous_features):
635    x = x.ravel()
636    nbr_features = cf_list.shape[1] - 1  # exclude the label column
637    nbr_changes = np.zeros(len(cf_list))
638    for i, cf in enumerate(cf_list):
639        cf = cf[:-1]
640        for j in range(nbr_features):
641            if cf[j] != x[j]:
642                nbr_changes[i] += 1 if j in continuous_features else 0.5  # expects column indices, not names
643    return nbr_changes
def avg_nbr_changes_per_cf(x, cf_list, continuous_features):
645def avg_nbr_changes_per_cf(x, cf_list, continuous_features):
646    return np.mean(nbr_changes_per_cf(x, cf_list, continuous_features))
def avg_nbr_changes(x, cf_list, nbr_features, continuous_features):
648def avg_nbr_changes(x, cf_list, nbr_features, continuous_features):
649    val = np.sum(nbr_changes_per_cf(x, cf_list, continuous_features))
650    nbr_cf, _ = cf_list.shape
651    return val / (nbr_cf * nbr_features)
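
Toy sanity check for the change counters (column 0 continuous, column 1 categorical;
the counterfactuals carry a trailing label column that is stripped internally).

def _demo_change_counters():
    x = np.array([[1.0, 0.0]])             # query instance, no label column
    cfs = np.array([[2.0, 1.0, 0.0]])      # one CF with the label appended
    per_cf = nbr_changes_per_cf(x, cfs, continuous_features=[0])   # -> [1.5]
    # summed changes averaged over 1 CF and 2 features -> 0.75
    return per_cf, avg_nbr_changes(x, cfs, nbr_features=2, continuous_features=[0])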
def plausibility(query_instance, predictive_model, cf_list, nr_of_cfs, query_instances, y_pred, continuous_features, categorical_features, df, ratio_cont):
653def plausibility(query_instance, predictive_model, cf_list,nr_of_cfs, query_instances,
654                 y_pred, continuous_features, categorical_features, df, ratio_cont):
655    sum_dist = 0.0
656    full_df = pd.concat([query_instances, df], ignore_index=False)
657    for cf in cf_list:
658        # X_y = full_df[full_df['label'] == y_pred]  (label filtering disabled)
659        X_y = full_df
660        # NOTE: distance_mh aggregates to a scalar (agg=None), so argsort yields 0
661        # and 'closest' is effectively the first row of X_y.
662        neigh_dist = distance_mh(query_instance.reshape(1, -1), X_y.to_numpy(), continuous_features,
663                        categorical_features, df, ratio_cont)
664        idx_neigh = np.argsort(neigh_dist)[0]
665        closest = X_y.to_numpy()[idx_neigh]
666        d = distance_mh(cf.reshape(1, -1), closest.reshape(1, -1), continuous_features,
667                        categorical_features, df, ratio_cont)
668        sum_dist += d
669    return sum_dist
def conformance_score(CONF, encoder, df, dataset, features_names, d4py, query_instance, model_path, timestamp_col_name):
672def conformance_score(CONF, encoder, df, dataset, features_names, d4py, query_instance, model_path, timestamp_col_name):
673    d4py.parse_decl_model(model_path=os.path.join(model_path, dataset+'_'+str(CONF['prefix_length'])+'.decl'))
674
675    df = pd.DataFrame(df, columns=features_names)
676    try:
677        query_instance_to_decode = pd.DataFrame(np.array(query_instance, dtype=float),
678                                            columns=features_names)
679    except:
680        query_instance_to_decode = pd.DataFrame(np.array(query_instance, dtype=str),
681                                            columns=features_names)
682    encoder.decode(query_instance_to_decode)
683    encoder.decode(df)
684    df.insert(loc=0, column='Case ID', value=np.arange(len(df)) + 1)  # case ids 1..n
685    df.insert(loc=1, column='label', value=1)
686    query_instance_to_decode.insert(loc=0, column='Case ID',
687                                    value=np.arange(len(query_instance_to_decode)) + 1)
688    query_instance_to_decode.insert(loc=1, column='label', value=1)
689    if CONF['feature_selection'] in single_prefix:
690        if all(df['prefix'] == '0'):
691            cols = ['prefix_' + str(i + 1) for i in range(CONF['prefix_length'])]
692            df[cols] = 0
693            query_instance_to_decode[cols] = 0
694        else:
695            df = pd.concat([df, pd.DataFrame(
696                df['prefix'].str.split(",", expand=True).fillna(value='0')).rename(
697                columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
698            df = df.replace(r'\[', '', regex=True)
699            df = df.replace(r'\]', '', regex=True)
700            query_instance_to_decode = pd.concat([query_instance_to_decode, pd.DataFrame(
701                query_instance_to_decode['prefix'].str.split(",", expand=True).fillna(value='0')).rename(
702                columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
703            query_instance_to_decode = query_instance_to_decode.replace(r'\[', '', regex=True)
704            query_instance_to_decode = query_instance_to_decode.replace(r'\]', '', regex=True)
705        df = df.drop(columns=['prefix'])
706        query_instance_to_decode = query_instance_to_decode.drop(columns=['prefix'])
707    long_data = pd.wide_to_long(df, stubnames=['prefix'], i='Case ID',
708                                    j='order', sep='_', suffix=r'\w+')
709    long_query_instance = pd.wide_to_long(query_instance_to_decode, stubnames=['prefix'], i='Case ID',
710                                              j='order', sep='_', suffix=r'\w+')
711    long_query_instance_sorted = long_query_instance.sort_values(['Case ID', 'order']).reset_index(drop=False)
712    timestamps = pd.date_range('1/1/2011', periods=len(long_data), freq='H')
713    long_data_sorted = long_data.sort_values(['Case ID', 'order']).reset_index(drop=False)
714    long_data_sorted[timestamp_col_name] = timestamps
715    long_data_sorted['label'] = long_data_sorted['label'].replace({1: 'regular'})
716    long_data_sorted.drop(columns=['order'], inplace=True)
717    columns_to_rename = {'Case ID': 'case:concept:name'}
718    columns_to_rename.update({'prefix': 'concept:name'})
719    long_data_sorted.rename(columns=columns_to_rename, inplace=True)
720    long_data_sorted['label'] = long_data_sorted['label'].replace({'regular': 'false', 'deviant': 'true'})
721    long_data_sorted.replace('0', 'other', inplace=True)
722    timestamps_query = pd.date_range('1/1/2011', periods=len(long_query_instance), freq='H')
723    long_query_instance_sorted[timestamp_col_name] = timestamps_query
724    long_query_instance_sorted.rename(columns=columns_to_rename, inplace=True)
725    long_query_instance_sorted['label'] = long_query_instance_sorted['label'].replace({'regular': 'false', 'deviant': 'true'})
726    long_query_instance_sorted.replace('0', 'other', inplace=True)
727    long_query_instance_sorted['case:concept:name'] = long_query_instance_sorted['case:concept:name'].astype(str)
728    long_data_sorted['case:concept:name'] = long_data_sorted['case:concept:name'].astype(str)
729    event_log = convert_to_event_log(long_data_sorted)
730    query_log = convert_to_event_log(long_query_instance_sorted)
731    d4py.load_xes_log(event_log)
732    model_check_res = d4py.conformance_checking(consider_vacuity=False)
733    d4py.load_xes_log(query_log)
734    model_check_query = d4py.conformance_checking(consider_vacuity=False)
735    query_patterns = {
736        constraint
737        for trace, patts in model_check_query.items()
738        for constraint, checker in patts.items()
739        if checker.state == TraceState.SATISFIED
740    }
741
742    model_check_res = {
743        k: {
744            constraint: checker
745            for constraint, checker in v.items()
746            if checker.state != TraceState.VIOLATED and constraint in query_patterns
747        }
748        for k, v in model_check_res.items()
749    }
750
751    scores = [len(v) / len(query_patterns) for v in model_check_res.values()]  # local name; avoids shadowing conformance_score()
752    avg_conformance = np.mean(scores)
753    print('Average conformance score', avg_conformance)
754    return avg_conformance
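
The per-trace score is the number of satisfied constraints retained from query_patterns,
divided by the size of query_patterns. A detached toy computation (plain sets standing in
for Declare4Py checker results):

def _demo_conformance_fraction():
    query_patterns = {'Response(A, B)', 'Existence(C)'}
    trace_full = {'Response(A, B)', 'Existence(C)', 'Init(A)'}
    trace_partial = {'Existence(C)'}
    # retained query constraints over total query constraints -> [1.0, 0.5]
    return [len(t & query_patterns) / len(query_patterns)
            for t in (trace_full, trace_partial)]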
def model_discovery(CONF, encoder, df, dataset, features_names, d4py, model_path, support, timestamp_col_name):
756def model_discovery(CONF, encoder, df, dataset, features_names, d4py, model_path, support, timestamp_col_name):
757    df = pd.DataFrame(df, columns=features_names)
758    encoder.decode(df)
759    df.insert(loc=0, column='Case ID', value=np.arange(len(df)) + 1)  # case ids 1..n
760    df.insert(loc=1, column='label', value=1)
761    long_data = pd.wide_to_long(df, stubnames=['prefix'], i='Case ID',
762                                j='order', sep='_', suffix=r'\w+')
763    timestamps = pd.date_range('1/1/2011', periods=len(long_data), freq='H')
764    long_data_sorted = long_data.sort_values(['Case ID', 'order']).reset_index(drop=False)
765    long_data_sorted[timestamp_col_name] = timestamps
766    long_data_sorted['label'] = long_data_sorted['label'].replace({1: 'regular'})
767    long_data_sorted.drop(columns=['order'], inplace=True)
768    columns_to_rename = {'Case ID': 'case:concept:name'}
769    columns_to_rename.update({'prefix': 'concept:name'})
770    long_data_sorted.rename(columns=columns_to_rename, inplace=True)
771    long_data_sorted['label'] = long_data_sorted['label'].replace({'regular': 'false', 'deviant': 'true'})
772    long_data_sorted.replace(['0', 0, 0.0], 'other', inplace=True)  # padding zeros become 'other'
775    long_data_sorted['case:concept:name'] = long_data_sorted['case:concept:name'].astype(str)
776    event_log = convert_to_event_log(long_data_sorted)
777    d4py.load_xes_log(event_log)
778    d4py.compute_frequent_itemsets(min_support=support, len_itemset=2)
779    d4py.discovery(consider_vacuity=False, max_declare_cardinality=2)
780    discovered = d4py.filter_discovery(min_support=support, output_path=os.path.join(model_path, dataset+'_'+str(CONF['prefix_length'])+'.decl'))
def perform_model_analysis(model_path, dataset, CONF, encoder, full_df, support, log, dataset_confs):
784def perform_model_analysis(model_path, dataset, CONF, encoder, full_df, support, log,dataset_confs):
785    try:
786        if not os.path.exists(model_path):
787            os.makedirs(model_path)
788            print("Directory '%s' created successfully" % model_path)
789    except OSError as error:
790        print("Directory '%s' cannot be created" % model_path)
791
792    d4py = Declare4Py()
793
794    try:
795        decl_model_path = os.path.join(model_path, dataset + '_' + str(CONF['prefix_length']) + '.decl')  # join handles a missing trailing separator
796        if not os.path.exists(decl_model_path):
797            print('Do model discovery')
798            features_names = full_df.columns.values[:-1]
799            model_discovery(CONF, encoder, full_df.iloc[:, 1:], dataset, features_names,
800                            d4py, model_path, support, [*dataset_confs.timestamp_col.values()][0])
801    except OSError as error:
802        print("File '%s' cannot be created" % decl_model_path)
803
804    d4py.parse_decl_model(model_path=decl_model_path)
805
806    d4py.load_xes_log(log)
807    conformance_check = d4py.conformance_checking(consider_vacuity=False)
808
809    model_check_res = {
810        k: {
811            constraint: checker
812            for constraint, checker in v.items()
813            if checker.state != TraceState.VIOLATED
814        }
815        for k, v in conformance_check.items()
816    }
817
818    conformant_traces = [trace_id[1] for trace_id, results in model_check_res.items() if
819                         len(results) == len(d4py.model.constraints)]
820    number_of_constraints = len(d4py.model.constraints)
821    conformant_traces_ratio = len(conformant_traces) / len(log)
822
823    return conformant_traces, number_of_constraints, conformant_traces_ratio
columns = ['dataset', 'heuristic', 'model', 'method', 'optimization', 'prefix_length', 'idx',
           'desired_nr_of_cfs', 'generated_cfs', 'time_train', 'time_test', 'runtime',
           'distance_l2', 'distance_mad', 'distance_j', 'distance_h', 'distance_l1j', 'distance_l2j', 'distance_mh',
           'distance_l2_min', 'distance_mad_min', 'distance_j_min', 'distance_h_min', 'distance_l1j_min', 'distance_l2j_min', 'distance_mh_min',
           'distance_l2_max', 'distance_mad_max', 'distance_j_max', 'distance_h_max', 'distance_l1j_max', 'distance_l2j_max', 'distance_mh_max',
           'diversity_l2', 'diversity_mad', 'diversity_j', 'diversity_h', 'diversity_l2j', 'diversity_mh',
           'diversity_l2_min', 'diversity_mad_min', 'diversity_j_min', 'diversity_h_min', 'diversity_l2j_min', 'diversity_mh_min',
           'diversity_l2_max', 'diversity_mad_max', 'diversity_j_max', 'diversity_h_max', 'diversity_l2j_max', 'diversity_mh_max',
           'count_diversity_cont', 'count_diversity_cate', 'count_diversity_all',
           'avg_nbr_changes_per_cf', 'avg_nbr_changes',
           'implausibility_sum', 'implausibility_max_nbr_cf', 'implausibility_nbr_cf', 'sat_score']