nirdizati_light.explanation.wrappers.dice_wrapper
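"""DiCE wrapper for counterfactual explanations in predictive process monitoring.

Generates counterfactuals for prefix-encoded traces with dice_ml, scores them
(distance, diversity, plausibility, number of changes), and measures how well
they conform to Declare models discovered with Declare4Py.
"""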
import warnings
import os
from datetime import datetime
import dice_ml
import numpy as np
import pandas as pd
import pm4py
from scipy.spatial.distance import _validate_vector
from scipy.spatial.distance import cdist, pdist
from scipy.stats import median_abs_deviation
from pm4py import convert_to_event_log
from declare4py.declare4py import Declare4Py
from declare4py.enums import TraceState
from nirdizati_light.encoding.common import get_encoded_df, EncodingType

from nirdizati_light.predictive_model.common import ClassificationMethods

warnings.filterwarnings("ignore", category=UserWarning)


single_prefix = ['loreley', 'loreley_complex']


def dice_explain(CONF, predictive_model, encoder, df, query_instances, method, optimization, heuristic, support,
                 timestamp_col_name, model_path, random_seed=None, adapted=None, filtering=None):
    features_names = df.columns.values[:-1]
    feature_selection = CONF['feature_selection']
    dataset = CONF['data'].rpartition('/')[0].rpartition('/')[-1]

    if 'BPIC15' in dataset:
        dataset_created = dataset.replace('_f2', '')
    elif 'bpic2012' in dataset:
        dataset_created = dataset.replace('-COMPLETE', '').replace('bpic2012', 'BPIC12')
    elif 'sepsis' in dataset:
        dataset_created = dataset.replace('_cases', '')
    black_box = predictive_model.model_type
    categorical_features, continuous_features, cat_feature_index, cont_feature_index = split_features(
        df.iloc[:, :-1], encoder)
    if CONF['feature_selection'] == 'loreley':
        query_instances = query_instances[query_instances['prefix'] != 0]
    if CONF['feature_selection'] == 'frequency':
        ratio_cont = 1
    else:
        ratio_cont = len(continuous_features) / len(categorical_features)
    time_start = datetime.now()
    query_instances_for_cf = query_instances.iloc[:2, :-1]
    d = dice_ml.Data(dataframe=df, continuous_features=continuous_features, outcome_name='label')
    m = dice_model(predictive_model)
    dice_query_instance = dice_ml.Dice(d, m, method, encoder)
    time_train = (datetime.now() - time_start).total_seconds()
    index_test_instances = range(len(query_instances_for_cf))
    # model_path = model_path + '_' + str(support) + '/'
    extended_loss = False
    try:
        if not os.path.exists(model_path):
            os.makedirs(model_path)
            print("Directory '%s' created successfully" % model_path)
    except OSError:
        print("Directory '%s' can not be created" % model_path)

    d4py = Declare4Py()
    model_discovery(CONF, encoder, df, dataset, features_names, d4py, model_path, support, timestamp_col_name)

    cols = df.columns[:-1].values

    path_results = '../experiments/cf_results_supp_%s/%s/' % (support, 'single_objective_new')
    path_cf = path_results
    if adapted and (not filtering) and (method == 'multi_objective_genetic'):
        path_results = path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_new')
    elif adapted and filtering and (method == 'multi_objective_genetic'):
        path_results = path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_filtering_new')
    elif (not adapted) and (method == 'genetic_conformance'):
        path_results = path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'single_objective_new')
    elif adapted and (method == 'genetic_conformance') and (optimization == 'baseline'):
        path_results = path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_loss_no_conformance')
    elif adapted and (method == 'genetic_conformance') and (optimization != 'baseline'):
        path_results = path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_loss_conformance_large')
    elif method == 'genetic':
        path_results = path_cf = '../experiments/cf_results_supp_%s/%s/' % (support, 'single_objective_new')
    elif (not adapted) and (method == 'multi_objective_genetic') and extended_loss:
        path_results = path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'mixed_ga_5obj')
    elif (not adapted) and (method == 'multi_objective_genetic'):
        path_results = path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'baseline_new')
    for test_id, i in enumerate(index_test_instances):
        print(datetime.now(), dataset, black_box, test_id, len(index_test_instances),
              '%.2f' % ((test_id + 1) / len(index_test_instances)))
        cf_list_all = list()
        x_eval_list = list()
        desired_cfs_all = list()
        x = query_instances_for_cf.iloc[[i]]

        for k in [5]:
            time_start_i = datetime.now()
            if method in ('genetic_conformance', 'multi_objective_genetic'):
                dice_result = dice_query_instance.generate_counterfactuals(x, encoder=encoder,
                                                                           desired_class='opposite',
                                                                           verbose=False,
                                                                           posthoc_sparsity_algorithm='linear',
                                                                           total_CFs=k,
                                                                           dataset=dataset + '_' + str(CONF['prefix_length']),
                                                                           model_path=model_path,
                                                                           random_seed=random_seed,
                                                                           adapted=adapted)
            else:
                dice_result = dice_query_instance.generate_counterfactuals(x, encoder=encoder,
                                                                           desired_class='opposite',
                                                                           verbose=False,
                                                                           posthoc_sparsity_algorithm='linear',
                                                                           total_CFs=k,
                                                                           dataset=dataset + '_' + str(CONF['prefix_length']))
            # function to decode cf from train_df and show it decoded before adding to list
            generated_cfs = dice_result.cf_examples_list[0].final_cfs_df
            cf_list = np.array(generated_cfs).astype('float64')
            y_pred = predictive_model.model.predict(x.values.reshape(1, -1))[0]
            time_test = (datetime.now() - time_start_i).total_seconds()
            x_eval = evaluate_cf_list(cf_list, x.values.reshape(1, -1), cont_feature_index, cat_feature_index, df=df,
                                      nr_of_cfs=k, y_pred=y_pred, predictive_model=predictive_model,
                                      query_instances=query_instances, continuous_features=continuous_features,
                                      categorical_features=categorical_features, ratio_cont=ratio_cont)

            x_eval['dataset'] = dataset
            x_eval['idx'] = test_id + 1
            x_eval['model'] = predictive_model.model_type
            x_eval['desired_nr_of_cfs'] = k
            x_eval['time_train'] = time_train
            x_eval['time_test'] = time_test
            x_eval['runtime'] = time_train + time_test
            # x_eval['generated_cfs'] = x_eval['nbr_cf']
            x_eval['method'] = method
            x_eval['explainer'] = CONF['explanator']
            x_eval['prefix_length'] = CONF['prefix_length']
            x_eval['heuristic'] = heuristic
            x_eval['optimization'] = optimization
            x_eval_list.append(x_eval)
            if cf_list.size > 4:
                if method in ('random', 'genetic', 'genetic_conformance', 'multi_objective_genetic'):
                    cf_list = cf_list[:, :-1]  # drop the label column
                df_conf = pd.DataFrame(data=cf_list, columns=features_names)

                sat_score = conformance_score(CONF, encoder, df=df_conf, dataset=dataset,
                                              features_names=features_names,
                                              d4py=d4py, query_instance=x, model_path=model_path,
                                              timestamp_col_name=timestamp_col_name)
                x_eval['sat_score'] = sat_score
                cf_list_all.extend(cf_list[:5])
                desired_cfs = [float(k) * np.ones_like(cf_list[:5, 0])]

                desired_cfs_all.extend(*desired_cfs)
        try:
            if not os.path.exists(path_results + '_' + str(support) + '/'):
                os.makedirs(path_results + '_' + str(support) + '/')
                print("Directory '%s' created successfully" % (path_results + '_' + str(support) + '/'))
        except OSError:
            print("Directory '%s' can not be created" % path_results)
        filename_results = path_results + 'cfeval_%s_%s_dice_%s.csv' % (dataset, black_box, feature_selection)
        if len(cf_list_all) > 0:
            df_cf = pd.DataFrame(data=cf_list_all, columns=features_names)
            encoder.decode(df_cf)
            if CONF['feature_selection'] in single_prefix:
                if all(df_cf['prefix'] == '0'):
                    cols = ['prefix_' + str(i + 1) for i in range(CONF['prefix_length'])]
                    df_cf[cols] = 0
                else:
                    df_cf = pd.concat([df_cf, pd.DataFrame(
                        df_cf['prefix'].str.split(",", expand=True).fillna(value='0')).rename(
                        columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
                    df_cf = df_cf.replace(r'\[', '', regex=True)
                    df_cf = df_cf.replace(']', '', regex=True)
                df_cf = df_cf.drop(columns=['prefix'])
            df_cf['desired_cfs'] = desired_cfs_all
            df_cf['idx'] = [test_id + 1] * len(cf_list_all)
            df_cf['method'] = method
            df_cf['test_id'] = np.arange(0, len(cf_list_all))
            df_cf['dataset'] = [dataset] * len(cf_list_all)
            df_cf['black_box'] = [black_box] * len(cf_list_all)
            try:
                if not os.path.exists(path_cf):
                    os.makedirs(path_cf)
                    print("Directory '%s' created successfully" % path_cf)
            except OSError:
                print("Directory '%s' can not be created" % path_cf)
            if optimization != 'baseline':
                filename_cf = path_cf + 'cf_%s_%s_dice_%s_%s_%s_%s.csv' % (dataset, black_box, feature_selection,
                                                                           method, optimization, CONF['prefix_length'])
            else:
                filename_cf = path_cf + 'cf_%s_%s_dice_%s_%s_%s.csv' % (dataset, black_box, feature_selection,
                                                                        method, CONF['prefix_length'])
            if not os.path.isfile(filename_cf):
                df_cf.to_csv(filename_cf, index=False)
            else:
                df_cf.to_csv(filename_cf, mode='a', index=False, header=False)
        else:
            x_eval['sat_score'] = 0
        result_dataframe = pd.DataFrame(data=x_eval_list)
        result_dataframe = result_dataframe[columns]
        if not os.path.isfile(filename_results):
            result_dataframe.to_csv(filename_results, index=False)
        else:
            result_dataframe.to_csv(filename_results, mode='a', index=False, header=False)
    return dice_result
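# dice_explain writes two CSV artifacts per run (evaluation metrics under
# `path_results`, decoded counterfactuals under `path_cf`) and returns the
# last DiCE result object. A minimal call sketch, assuming CONF carries the
# keys read above ('data', 'feature_selection', 'prefix_length',
# 'explanator'); all concrete argument values here are hypothetical:
#
#   result = dice_explain(CONF, predictive_model, encoder, train_df, test_df,
#                         method='genetic', optimization='baseline',
#                         heuristic='heuristic_2', support=0.9,
#                         timestamp_col_name='time:timestamp',
#                         model_path='../experiments/models/',
#                         random_seed=42, adapted=False, filtering=False)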
def dice_model(predictive_model):
    # All scikit-learn-style classifiers share the 'sklearn' backend;
    # anything else is assumed to be a PyTorch model ('PYT').
    sklearn_methods = {
        ClassificationMethods.RANDOM_FOREST.value,
        ClassificationMethods.PERCEPTRON.value,
        ClassificationMethods.MLP.value,
        ClassificationMethods.XGBOOST.value,
        ClassificationMethods.SGDCLASSIFIER.value,
        ClassificationMethods.SVM.value,
        ClassificationMethods.KNN.value,
    }
    if predictive_model.model_type in sklearn_methods:
        return dice_ml.Model(model=predictive_model.model, backend='sklearn')
    return dice_ml.Model(model=predictive_model.model, backend='PYT')


def split_features(df, encoder):
    categorical_features = [col for col in df.columns if col in list(encoder._label_dict.keys())]
    cat_feature_index = [df.columns.get_loc(c) for c in categorical_features if c in df]
    continuous_features = [col for col in df.columns if col in list(encoder._numeric_encoder.keys())]
    cont_feature_index = [df.columns.get_loc(c) for c in continuous_features if c in df]
    return categorical_features, continuous_features, cat_feature_index, cont_feature_index
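# split_features relies on two private attributes of the nirdizati_light
# encoder: `_label_dict` (one entry per label-encoded, i.e. categorical,
# column) and `_numeric_encoder` (one entry per scaled numeric column).
# Illustrative shapes only, not the encoder's documented API:
#
#   encoder._label_dict      -> {'prefix_1': {...}, 'prefix_2': {...}}
#   encoder._numeric_encoder -> {'elapsed_time': <scaler>}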
def evaluate_cf_list(cf_list, query_instance, cont_feature_index, cat_feature_index, df, y_pred, nr_of_cfs,
                     predictive_model, query_instances, continuous_features, categorical_features, ratio_cont):
    nbr_features = query_instance.shape[1]
    if cf_list.size > 4:
        nbr_cf_ = len(cf_list)
        nbr_features = cf_list.shape[1]
        plausibility_sum = plausibility(query_instance, predictive_model, cf_list, nr_of_cfs, query_instances,
                                        y_pred, cont_feature_index, cat_feature_index, df, ratio_cont)
        plausibility_max_nbr_cf_ = plausibility_sum / nr_of_cfs
        plausibility_nbr_cf_ = plausibility_sum / nbr_cf_
        distance_l2_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df)
        distance_mad_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df)
        distance_j_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard')
        distance_h_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming')
        distance_l2j_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index)
        distance_l1j_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index)
        distance_mh_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, df)

        distance_l2_min_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean',
                                               X=df, agg='min')
        distance_mad_min_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df,
                                                agg='min')
        distance_j_min_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg='min')
        distance_h_min_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg='min')
        distance_l2j_min_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='min')
        distance_l1j_min_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='min')
        distance_mh_min_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, df, agg='min')

        distance_l2_max_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean',
                                               X=df, agg='max')
        distance_mad_max_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df,
                                                agg='max')
        distance_j_max_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg='max')
        distance_h_max_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg='max')
        distance_l2j_max_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='max')
        distance_l1j_max_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='max')
        distance_mh_max_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, X=df, agg='max')

        avg_nbr_changes_per_cf_ = avg_nbr_changes_per_cf(query_instance, cf_list, continuous_features)
        avg_nbr_changes_ = avg_nbr_changes(query_instance, cf_list, nbr_features, continuous_features)
        if len(cf_list) > 1:
            diversity_l2_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=df)
            diversity_mad_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df)
            diversity_j_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard')
            diversity_h_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming')
            diversity_l2j_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index)
            diversity_mh_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df)

            diversity_l2_min_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=df, agg='min')
            diversity_mad_min_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df, agg='min')
            diversity_j_min_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg='min')
            diversity_h_min_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg='min')
            diversity_l2j_min_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index, agg='min')
            diversity_mh_min_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df, agg='min')

            diversity_l2_max_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg='max')
            diversity_mad_max_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df, agg='max')
            diversity_j_max_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg='max')
            diversity_h_max_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg='max')
            diversity_l2j_max_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index, agg='max')
            diversity_mh_max_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df, agg='max')
        else:
            diversity_l2_ = 0.0
            diversity_mad_ = 0.0
            diversity_j_ = 0.0
            diversity_h_ = 0.0
            diversity_l2j_ = 0.0
            diversity_mh_ = 0.0

            diversity_l2_min_ = 0.0
            diversity_mad_min_ = 0.0
            diversity_j_min_ = 0.0
            diversity_h_min_ = 0.0
            diversity_l2j_min_ = 0.0
            diversity_mh_min_ = 0.0

            diversity_l2_max_ = 0.0
            diversity_mad_max_ = 0.0
            diversity_j_max_ = 0.0
            diversity_h_max_ = 0.0
            diversity_l2j_max_ = 0.0
            diversity_mh_max_ = 0.0
        count_diversity_cont_ = count_diversity(cf_list, cont_feature_index, nbr_features, cont_feature_index)
        count_diversity_cate_ = count_diversity(cf_list, cat_feature_index, nbr_features, cont_feature_index)
        count_diversity_all_ = count_diversity_all(cf_list, nbr_features, cont_feature_index)
        res = {'generated_cfs': nr_of_cfs,
               'implausibility_sum': plausibility_sum,
               'implausibility_max_nbr_cf': plausibility_max_nbr_cf_,
               'implausibility_nbr_cf': plausibility_nbr_cf_,
               'distance_l2': distance_l2_,
               'distance_mad': distance_mad_,
               'distance_j': distance_j_,
               'distance_h': distance_h_,
               'distance_l2j': distance_l2j_,
               'distance_l1j': distance_l1j_,
               'distance_mh': distance_mh_,

               'distance_l2_min': distance_l2_min_,
               'distance_mad_min': distance_mad_min_,
               'distance_j_min': distance_j_min_,
               'distance_h_min': distance_h_min_,
               'distance_l2j_min': distance_l2j_min_,
               'distance_l1j_min': distance_l1j_min_,
               'distance_mh_min': distance_mh_min_,

               'distance_l2_max': distance_l2_max_,
               'distance_mad_max': distance_mad_max_,
               'distance_j_max': distance_j_max_,
               'distance_h_max': distance_h_max_,
               'distance_l2j_max': distance_l2j_max_,
               'distance_l1j_max': distance_l1j_max_,
               'distance_mh_max': distance_mh_max_,

               'diversity_l2': diversity_l2_,
               'diversity_mad': diversity_mad_,
               'diversity_j': diversity_j_,
               'diversity_h': diversity_h_,
               'diversity_l2j': diversity_l2j_,
               'diversity_mh': diversity_mh_,

               'diversity_l2_min': diversity_l2_min_,
               'diversity_mad_min': diversity_mad_min_,
               'diversity_j_min': diversity_j_min_,
               'diversity_h_min': diversity_h_min_,
               'diversity_l2j_min': diversity_l2j_min_,
               'diversity_mh_min': diversity_mh_min_,

               'diversity_l2_max': diversity_l2_max_,
               'diversity_mad_max': diversity_mad_max_,
               'diversity_j_max': diversity_j_max_,
               'diversity_h_max': diversity_h_max_,
               'diversity_l2j_max': diversity_l2j_max_,
               'diversity_mh_max': diversity_mh_max_,

               'count_diversity_cont': count_diversity_cont_,
               'count_diversity_cate': count_diversity_cate_,
               'count_diversity_all': count_diversity_all_,
               'avg_nbr_changes_per_cf': avg_nbr_changes_per_cf_,
               'avg_nbr_changes': avg_nbr_changes_}
    else:
        res = {
            'generated_cfs': 0,
            'distance_l2': np.nan,
            'distance_mad': np.nan,
            'distance_j': np.nan,
            'distance_h': np.nan,
            'distance_l2j': np.nan,
            'distance_l1j': np.nan,
            'distance_mh': np.nan,
            'distance_l2_min': np.nan,
            'distance_mad_min': np.nan,
            'distance_j_min': np.nan,
            'distance_h_min': np.nan,
            'distance_l2j_min': np.nan,
            'distance_l1j_min': np.nan,
            'distance_mh_min': np.nan,
            'distance_l2_max': np.nan,
            'distance_mad_max': np.nan,
            'distance_j_max': np.nan,
            'distance_h_max': np.nan,
            'distance_l2j_max': np.nan,
            'distance_l1j_max': np.nan,
            'distance_mh_max': np.nan,
            'avg_nbr_changes_per_cf': np.nan,
            'avg_nbr_changes': np.nan,
            'diversity_l2': np.nan,
            'diversity_mad': np.nan,
            'diversity_j': np.nan,
            'diversity_h': np.nan,
            'diversity_l2j': np.nan,
            'diversity_mh': np.nan,
            'diversity_l2_min': np.nan,
            'diversity_mad_min': np.nan,
            'diversity_j_min': np.nan,
            'diversity_h_min': np.nan,
            'diversity_l2j_min': np.nan,
            'diversity_mh_min': np.nan,
            'diversity_l2_max': np.nan,
            'diversity_mad_max': np.nan,
            'diversity_j_max': np.nan,
            'diversity_h_max': np.nan,
            'diversity_l2j_max': np.nan,
            'diversity_mh_max': np.nan,
            'count_diversity_cont': np.nan,
            'count_diversity_cate': np.nan,
            'count_diversity_all': np.nan,

            'implausibility_sum': 0.0,
            'implausibility_max_nbr_cf': 0.0,
            'implausibility_nbr_cf': 0.0,
            'sat_score': 0.0
        }
    return res


def continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
    if metric == 'mad':
        mad = median_abs_deviation(X.iloc[:, cont_feature_index], axis=0)
        mad = np.array([v if v != 0 else 1.0 for v in mad])

        def _mad_cityblock(u, v):
            return mad_cityblock(u, v, mad)
        dist = pdist(cf_list[:, cont_feature_index], metric=_mad_cityblock)
    else:
        dist = pdist(cf_list[:, cont_feature_index], metric=metric)

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)


def categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg=None):
    dist = pdist(cf_list[:, cat_feature_index], metric=metric)

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)


def diversity_mh(cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=X, agg=agg)
    dist_cate = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist


def count_diversity(cf_list, features, nbr_features, cont_feature_index):
    nbr_cf = cf_list.shape[0]
    nbr_changes = 0
    for i in range(nbr_cf):
        for j in range(i + 1, nbr_cf):
            for k in features:
                if cf_list[i][k] != cf_list[j][k]:
                    # a continuous change counts 1, a categorical change 0.5
                    nbr_changes += 1 if k in cont_feature_index else 0.5
    return nbr_changes / (nbr_cf * nbr_cf * nbr_features)


# higher is better: counts the variety among the counterfactuals
def count_diversity_all(cf_list, nbr_features, cont_feature_index):
    return count_diversity(cf_list, range(cf_list.shape[1]), nbr_features, cont_feature_index)
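# count_diversity normalizes the weighted pairwise feature changes over all
# counterfactual pairs, as the loops above compute:
#
#   score = sum_{i<j} sum_{k in features} w_k * [cf_i[k] != cf_j[k]]
#           / (nbr_cf^2 * nbr_features)
#
# with w_k = 1 for continuous and 0.5 for categorical features.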
def continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
    if metric == 'mad':
        mad = median_abs_deviation(X.iloc[:, cont_feature_index], axis=0)
        mad = np.array([v if v != 0 else 1.0 for v in mad])

        def _mad_cityblock(u, v):
            return mad_cityblock(u, v, mad)
        dist = cdist(query_instance[:, cont_feature_index], cf_list[:, cont_feature_index], metric=_mad_cityblock)
    else:
        dist = cdist(query_instance[:, cont_feature_index], cf_list[:, cont_feature_index], metric=metric)

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)


def mad_cityblock(u, v, mad):
    u = _validate_vector(u)
    v = _validate_vector(v)
    l1_diff = abs(u - v)
    l1_diff_mad = l1_diff / mad
    return l1_diff_mad.sum()
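# mad_cityblock is an L1 (cityblock) distance with each coordinate scaled by
# the feature's median absolute deviation, e.g. for u = [1, 2], v = [3, 5]
# and mad = [2, 1]: |1 - 3| / 2 + |2 - 5| / 1 = 1 + 3 = 4.0.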
def categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=None):
    try:
        dist = cdist(query_instance.reshape(1, -1)[:, cat_feature_index], cf_list[:, cat_feature_index], metric=metric)
    except Exception:
        # the original code only logged here, leaving `dist` unbound;
        # return NaN instead of crashing on the aggregation below
        print('Problem with categorical distance')
        return np.nan

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)


def euclidean_jaccard(query_instance, A, cont_feature_index, cat_feature_index, ratio_cont=None):
    nbr_features = A.shape[1]
    dist_cont = cdist(query_instance.reshape(1, -1)[:, cont_feature_index], A[:, cont_feature_index],
                      metric='euclidean')
    dist_cate = cdist(query_instance.reshape(1, -1)[:, cat_feature_index], A[:, cat_feature_index], metric='jaccard')
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist


def distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=None, agg=agg)
    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist


def distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='cityblock', X=None, agg=agg)
    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist


def distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=X, agg=agg)
    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist
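# All the *_l2j/*_l1j/*_mh distances mix the two feature families with
# weights proportional to their counts (or an explicit ratio_cont). For
# example, with 2 continuous and 2 categorical features, dist_cont = 1.0 and
# dist_cate = 0.5: dist = 0.5 * 1.0 + 0.5 * 0.5 = 0.75.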
def diversity_l2j(cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg=agg)
    dist_cate = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist


def nbr_changes_per_cf(x, cf_list, continuous_features):
    x = x.ravel()
    nbr_features = cf_list.shape[1] - 1  # exclude the label column
    nbr_changes = np.zeros(len(cf_list))
    for i, cf in enumerate(cf_list):
        cf = cf[:-1]
        for j in range(nbr_features):
            if cf[j] != x[j]:
                nbr_changes[i] += 1 if j in continuous_features else 0.5
    return nbr_changes


def avg_nbr_changes_per_cf(x, cf_list, continuous_features):
    return np.mean(nbr_changes_per_cf(x, cf_list, continuous_features))


def avg_nbr_changes(x, cf_list, nbr_features, continuous_features):
    val = np.sum(nbr_changes_per_cf(x, cf_list, continuous_features))
    nbr_cf, _ = cf_list.shape
    return val / (nbr_cf * nbr_features)


def plausibility(query_instance, predictive_model, cf_list, nr_of_cfs, query_instances,
                 y_pred, cont_feature_index, cat_feature_index, df, ratio_cont):
    # note: the call site passes feature *indices*, so the parameters are
    # named accordingly here
    sum_dist = 0.0
    full_df = pd.concat([query_instances, df], ignore_index=False)
    for cf in cf_list:
        # X_y = full_df[full_df['label'] == y_label]
        X_y = full_df
        neigh_dist = distance_mh(query_instance.reshape(1, -1), X_y.to_numpy(), cont_feature_index,
                                 cat_feature_index, df, ratio_cont)
        idx_neigh = np.argsort(neigh_dist)[0]
        # closest_idx = idx_neigh[0]
        closest = X_y.to_numpy()[idx_neigh]
        d = distance_mh(cf.reshape(1, -1), closest.reshape(1, -1), cont_feature_index,
                        cat_feature_index, df, ratio_cont)
        sum_dist += d
    return sum_dist
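# plausibility sums, over the generated counterfactuals, the mixed MAD/Hamming
# distance (distance_mh) between each counterfactual and a reference trace
# drawn from the real data (query_instances + df): lower totals mean the
# counterfactuals sit closer to observed traces, hence are more plausible.
# The result is reported as 'implausibility_*' in evaluate_cf_list.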
def conformance_score(CONF, encoder, df, dataset, features_names, d4py, query_instance, model_path,
                      timestamp_col_name):
    d4py.parse_decl_model(model_path=os.path.join(model_path, dataset + '_' + str(CONF['prefix_length']) + '.decl'))

    df = pd.DataFrame(df, columns=features_names)
    try:
        query_instance_to_decode = pd.DataFrame(np.array(query_instance, dtype=float), columns=features_names)
    except Exception:
        query_instance_to_decode = pd.DataFrame(np.array(query_instance, dtype=str), columns=features_names)
    encoder.decode(query_instance_to_decode)
    encoder.decode(df)
    df.insert(loc=0, column='Case ID', value=np.divmod(np.arange(len(df)), 1)[0] + 1)
    df.insert(loc=1, column='label', value=1)
    query_instance_to_decode.insert(loc=0, column='Case ID',
                                    value=np.divmod(np.arange(len(query_instance_to_decode)), 1)[0] + 1)
    query_instance_to_decode.insert(loc=1, column='label', value=1)
    if CONF['feature_selection'] in single_prefix:
        if all(df['prefix'] == '0'):
            cols = ['prefix_' + str(i + 1) for i in range(CONF['prefix_length'])]
            df[cols] = 0
            query_instance_to_decode[cols] = 0
        else:
            df = pd.concat([df, pd.DataFrame(
                df['prefix'].str.split(",", expand=True).fillna(value='0')).rename(
                columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
            df = df.replace(r'\[', '', regex=True)
            df = df.replace(']', '', regex=True)
            query_instance_to_decode = pd.concat([query_instance_to_decode, pd.DataFrame(
                query_instance_to_decode['prefix'].str.split(",", expand=True).fillna(value='0')).rename(
                columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
            query_instance_to_decode = query_instance_to_decode.replace(r'\[', '', regex=True)
            query_instance_to_decode = query_instance_to_decode.replace(']', '', regex=True)
        df = df.drop(columns=['prefix'])
        query_instance_to_decode = query_instance_to_decode.drop(columns=['prefix'])
    long_data = pd.wide_to_long(df, stubnames=['prefix'], i='Case ID', j='order', sep='_', suffix=r'\w+')
    long_query_instance = pd.wide_to_long(query_instance_to_decode, stubnames=['prefix'], i='Case ID',
                                          j='order', sep='_', suffix=r'\w+')
    long_query_instance_sorted = long_query_instance.sort_values(['Case ID', 'order']).reset_index(drop=False)
    timestamps = pd.date_range('1/1/2011', periods=len(long_data), freq='H')
    long_data_sorted = long_data.sort_values(['Case ID', 'order']).reset_index(drop=False)
    long_data_sorted[timestamp_col_name] = timestamps
    long_data_sorted['label'].replace({1: 'regular'}, inplace=True)
    long_data_sorted.drop(columns=['order'], inplace=True)
    columns_to_rename = {'Case ID': 'case:concept:name', 'prefix': 'concept:name'}
    long_data_sorted.rename(columns=columns_to_rename, inplace=True)
    long_data_sorted['label'].replace({'regular': 'false', 'deviant': 'true'}, inplace=True)
    long_data_sorted.replace('0', 'other', inplace=True)
    timestamps_query = pd.date_range('1/1/2011', periods=len(long_query_instance), freq='H')
    long_query_instance_sorted[timestamp_col_name] = timestamps_query
    long_query_instance_sorted.rename(columns=columns_to_rename, inplace=True)
    long_query_instance_sorted['label'].replace({'regular': 'false', 'deviant': 'true'}, inplace=True)
    long_query_instance_sorted.replace('0', 'other', inplace=True)
    long_query_instance_sorted['case:concept:name'] = long_query_instance_sorted['case:concept:name'].astype(str)
    long_data_sorted['case:concept:name'] = long_data_sorted['case:concept:name'].astype(str)
    event_log = convert_to_event_log(long_data_sorted)
    query_log = convert_to_event_log(long_query_instance_sorted)
    d4py.load_xes_log(event_log)
    model_check_res = d4py.conformance_checking(consider_vacuity=False)
    d4py.load_xes_log(query_log)
    model_check_query = d4py.conformance_checking(consider_vacuity=False)
    query_patterns = {
        constraint
        for trace, patts in model_check_query.items()
        for constraint, checker in patts.items()
        if checker.state == TraceState.SATISFIED
    }

    model_check_res = {
        k: {
            constraint: checker
            for constraint, checker in v.items()
            if checker.state != TraceState.VIOLATED and constraint in query_patterns
        }
        for k, v in model_check_res.items()
    }

    scores = [len(v) / len(query_patterns) for v in model_check_res.values()]
    avg_conformance = np.mean(scores)
    print('Average conformance score', avg_conformance)
    return avg_conformance


def model_discovery(CONF, encoder, df, dataset, features_names, d4py, model_path, support, timestamp_col_name):
    df = pd.DataFrame(df, columns=features_names)
    encoder.decode(df)
    df.insert(loc=0, column='Case ID', value=np.divmod(np.arange(len(df)), 1)[0] + 1)
    df.insert(loc=1, column='label', value=1)
    long_data = pd.wide_to_long(df, stubnames=['prefix'], i='Case ID', j='order', sep='_', suffix=r'\w+')
    timestamps = pd.date_range('1/1/2011', periods=len(long_data), freq='H')
    long_data_sorted = long_data.sort_values(['Case ID', 'order']).reset_index(drop=False)
    long_data_sorted[timestamp_col_name] = timestamps
    long_data_sorted['label'].replace({1: 'regular'}, inplace=True)
    long_data_sorted.drop(columns=['order'], inplace=True)
    columns_to_rename = {'Case ID': 'case:concept:name', 'prefix': 'concept:name'}
    long_data_sorted.rename(columns=columns_to_rename, inplace=True)
    long_data_sorted['label'].replace({'regular': 'false', 'deviant': 'true'}, inplace=True)
    long_data_sorted.replace('0', 'other', inplace=True)
    long_data_sorted.replace(0.0, 'other', inplace=True)
    long_data_sorted.replace(0, 'other', inplace=True)
    long_data_sorted['case:concept:name'] = long_data_sorted['case:concept:name'].astype(str)
    event_log = convert_to_event_log(long_data_sorted)
    d4py.load_xes_log(event_log)
    d4py.compute_frequent_itemsets(min_support=support, len_itemset=2)
    d4py.discovery(consider_vacuity=False, max_declare_cardinality=2)
    d4py.filter_discovery(min_support=support,
                          output_path=os.path.join(model_path, dataset + '_' + str(CONF['prefix_length']) + '.decl'))

    # pm4py.filter_trace_attribute_values()
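# sat_score semantics, as computed above: query_patterns is the set of Declare
# constraints the query trace satisfies; each counterfactual scores the
# fraction of those constraints it does not violate, and conformance_score
# returns the average over all counterfactuals.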
def perform_model_analysis(model_path, dataset, CONF, encoder, full_df, support, log, dataset_confs):
    try:
        if not os.path.exists(model_path):
            os.makedirs(model_path)
            print("Directory '%s' created successfully" % model_path)
    except OSError:
        print("Directory '%s' can not be created" % model_path)

    d4py = Declare4Py()

    decl_model_path = model_path + dataset + '_' + str(CONF['prefix_length']) + '.decl'
    try:
        if not os.path.exists(decl_model_path):
            print('Do model discovery')
            features_names = full_df.columns.values[:-1]
            model_discovery(CONF, encoder, full_df.iloc[:, 1:], dataset, features_names,
                            d4py, model_path, support, [*dataset_confs.timestamp_col.values()][0])
    except OSError:
        print("File '%s' can not be created" % decl_model_path)

    d4py.parse_decl_model(model_path=decl_model_path)

    d4py.load_xes_log(log)
    conformance_check = d4py.conformance_checking(consider_vacuity=False)

    model_check_res = {
        k: {
            constraint: checker
            for constraint, checker in v.items()
            if checker.state != TraceState.VIOLATED
        }
        for k, v in conformance_check.items()
    }

    conformant_traces = [trace_id[1] for trace_id, results in model_check_res.items()
                         if len(results) == len(d4py.model.constraints)]
    number_of_constraints = len(d4py.model.constraints)
    conformant_traces_ratio = len(conformant_traces) / len(log)

    return conformant_traces, number_of_constraints, conformant_traces_ratio


columns = ['dataset', 'heuristic', 'model', 'method', 'optimization', 'prefix_length', 'idx', 'desired_nr_of_cfs',
           'generated_cfs', 'time_train', 'time_test', 'runtime', 'distance_l2', 'distance_mad', 'distance_j',
           'distance_h', 'distance_l1j', 'distance_l2j', 'distance_mh', 'distance_l2_min', 'distance_mad_min',
           'distance_j_min', 'distance_h_min', 'distance_l1j_min', 'distance_l2j_min', 'distance_mh_min',
           'distance_l2_max', 'distance_mad_max', 'distance_j_max', 'distance_h_max', 'distance_l1j_max',
           'distance_l2j_max', 'distance_mh_max', 'diversity_l2', 'diversity_mad', 'diversity_j', 'diversity_h',
           'diversity_l2j', 'diversity_mh', 'diversity_l2_min', 'diversity_mad_min', 'diversity_j_min',
           'diversity_h_min', 'diversity_l2j_min', 'diversity_mh_min', 'diversity_l2_max', 'diversity_mad_max',
           'diversity_j_max', 'diversity_h_max', 'diversity_l2j_max', 'diversity_mh_max', 'count_diversity_cont',
           'count_diversity_cate', 'count_diversity_all', 'avg_nbr_changes_per_cf', 'avg_nbr_changes',
           'implausibility_sum', 'implausibility_max_nbr_cf', 'implausibility_nbr_cf', 'sat_score']
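# A minimal call sketch for perform_model_analysis; `log` is a pm4py event
# log and `dataset_confs` is the nirdizati_light dataset-confs object whose
# timestamp_col is read above. All concrete values are hypothetical:
#
#   conformant_traces, n_constraints, ratio = perform_model_analysis(
#       '../experiments/models/', 'sepsis', CONF, encoder, full_df,
#       support=0.9, log=event_log, dataset_confs=dataset_confs)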
single_prefix =
['loreley', 'loreley_complex']
def
dice_explain( CONF, predictive_model, encoder, df, query_instances, method, optimization, heuristic, support, timestamp_col_name, model_path, random_seed=None, adapted=None, filtering=None):
26def dice_explain(CONF, predictive_model, encoder, df, query_instances, method, optimization, heuristic, support, 27 timestamp_col_name,model_path,random_seed=None,adapted=None,filtering=None 28 ): 29 features_names = df.columns.values[:-1] 30 feature_selection = CONF['feature_selection'] 31 dataset = CONF['data'].rpartition('/')[0].rpartition('/')[-1] 32 33 if 'BPIC15' in dataset: 34 dataset_created = dataset.replace('_f2','') 35 elif 'bpic2012' in dataset: 36 dataset_created = dataset.replace('-COMPLETE','').replace('bpic2012','BPIC12') 37 elif 'sepsis' in dataset: 38 dataset_created = dataset.replace('_cases','') 39 black_box = predictive_model.model_type 40 categorical_features,continuous_features,cat_feature_index,cont_feature_index = split_features(df.iloc[:,:-1], encoder) 41 if CONF['feature_selection'] == 'loreley': 42 query_instances = query_instances[query_instances['prefix'] != 0] 43 if CONF['feature_selection'] == 'frequency': 44 ratio_cont = 1 45 else: 46 ratio_cont = len(continuous_features)/len(categorical_features) 47 time_start = datetime.now() 48 query_instances_for_cf = query_instances.iloc[:2,:-1] 49 d = dice_ml.Data(dataframe=df, continuous_features=continuous_features, outcome_name='label') 50 m = dice_model(predictive_model) 51 dice_query_instance = dice_ml.Dice(d, m, method, encoder) 52 time_train = (datetime.now() - time_start).total_seconds() 53 index_test_instances = range(len(query_instances_for_cf)) 54 #model_path = model_path +'_' + str(support) + '/' 55 extended_loss = False 56 try: 57 if not os.path.exists(model_path): 58 os.makedirs(model_path) 59 print("Directory '%s' created successfully" % model_path) 60 except OSError as error: 61 print("Directory '%s' can not be created" % model_path) 62 63 d4py = Declare4Py() 64 model_discovery(CONF, encoder, df, dataset, features_names, d4py, model_path, support, timestamp_col_name) 65 66 cols = df.columns[:-1].values 67 68 path_results = '../experiments/cf_results_supp_%s/%s/' % (support, 'single_objective_new') 69 if adapted & (not filtering) & (method == 'multi_objective_genetic'): 70 path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_new') 71 path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_new') 72 elif adapted & filtering & (method == 'multi_objective_genetic'): 73 path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_filtering_new') 74 path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support, method, 'adapted_filtering_new') 75 elif (not adapted) & (method == 'genetic_conformance'): 76 path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'single_objective_new') 77 path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'single_objective_new') 78 elif (adapted) & (method == 'genetic_conformance') & (optimization == 'baseline'): 79 path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_loss_no_conformance') 80 path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_loss_no_conformance') 81 elif (adapted) & (method == 'genetic_conformance') & (optimization != 'baseline'): 82 path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_loss_conformance_large') 83 path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'adapted_loss_conformance_large') 84 elif method =='genetic': 85 path_results = '../experiments/cf_results_supp_%s/%s/' % (support,'single_objective_new') 86 path_cf 
= '../experiments/cf_results_supp_%s/%s/' % (support,'single_objective_new') 87 elif (not adapted) & (method == 'multi_objective_genetic') & (extended_loss): 88 path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'mixed_ga_5obj') 89 path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'mixed_ga_5_ob') 90 elif (not adapted) & (method == 'multi_objective_genetic'): 91 path_results = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'baseline_new') 92 path_cf = '../experiments/cf_results_supp_%s/%s_%s/' % (support,method,'baseline_new') 93 94 for test_id,i in enumerate(index_test_instances): 95 print(datetime.now(), dataset, black_box, test_id, len(index_test_instances), 96 '%.2f' % (test_id+1 / len(index_test_instances))) 97 cf_list_all = list() 98 x_eval_list = list() 99 desired_cfs_all = list() 100 x = query_instances_for_cf.iloc[[i]] 101 102 for k in [5]: 103 time_start_i = datetime.now() 104 if method == 'genetic_conformance': 105 dice_result = dice_query_instance.generate_counterfactuals(x,encoder=encoder, desired_class='opposite', 106 verbose=False, 107 posthoc_sparsity_algorithm='linear', 108 total_CFs=k, dataset=dataset+'_'+str(CONF['prefix_length']), 109 model_path=model_path,random_seed=random_seed,adapted=adapted) 110 elif method == 'multi_objective_genetic': 111 dice_result = dice_query_instance.generate_counterfactuals(x,encoder=encoder, desired_class='opposite', 112 verbose=False, 113 posthoc_sparsity_algorithm='linear', 114 total_CFs=k, dataset=dataset+'_'+str(CONF['prefix_length']), 115 model_path=model_path,random_seed=random_seed,adapted=adapted) 116 else: 117 dice_result = dice_query_instance.generate_counterfactuals(x,encoder=encoder, desired_class='opposite', 118 verbose=False, 119 posthoc_sparsity_algorithm='linear', 120 total_CFs=k,dataset=dataset+'_'+str(CONF['prefix_length']), 121 ) 122 # function to decode cf from train_df and show it decoded before adding to list 123 generated_cfs = dice_result.cf_examples_list[0].final_cfs_df 124 cf_list = np.array(generated_cfs).astype('float64') 125 y_pred = predictive_model.model.predict(x.values.reshape(1, -1))[0] 126 time_test = (datetime.now() - time_start_i).total_seconds() 127 x_eval = evaluate_cf_list(cf_list, x.values.reshape(1,-1), cont_feature_index, cat_feature_index, df=df, 128 nr_of_cfs=k,y_pred=y_pred,predictive_model=predictive_model, 129 query_instances=query_instances,continuous_features=continuous_features, 130 categorical_features=categorical_features,ratio_cont=ratio_cont 131 ) 132 133 x_eval['dataset'] = dataset 134 x_eval['idx'] = test_id+1 135 x_eval['model'] = predictive_model.model_type 136 x_eval['desired_nr_of_cfs'] = k 137 x_eval['time_train'] = time_train 138 x_eval['time_test'] = time_test 139 x_eval['runtime'] = time_train + time_test 140 # x_eval['generated_cfs'] = x_eval['nbr_cf'] 141 x_eval['method'] = method 142 x_eval['explainer'] = CONF['explanator'] 143 x_eval['prefix_length'] = CONF['prefix_length'] 144 x_eval['heuristic'] = heuristic 145 x_eval['optimization'] = optimization 146 x_eval_list.append(x_eval) 147 if cf_list.size > 4: 148 if method == 'random': 149 cf_list = cf_list[:, :-1] 150 elif method == 'genetic': 151 cf_list = cf_list[:, :-1] 152 elif method == 'genetic_conformance': 153 cf_list = cf_list[:, :-1] 154 elif method == 'multi_objective_genetic': 155 cf_list = cf_list[:, :-1] 156 df_conf = pd.DataFrame(data=cf_list, columns=features_names) 157 158 sat_score = conformance_score(CONF, encoder, df=df_conf, dataset=dataset, 
features_names=features_names, 159 d4py=d4py, query_instance=x, model_path=model_path, 160 timestamp_col_name=timestamp_col_name) 161 x_eval['sat_score'] = sat_score 162 cf_list_all.extend(cf_list[:5]) 163 desired_cfs = [float(k) * np.ones_like(cf_list[:5, 0])] 164 165 desired_cfs_all.extend(*desired_cfs) 166 try: 167 if not os.path.exists(path_results+'_'+str(support)+'/'): 168 os.makedirs(path_results+'_'+str(support)+'/') 169 print("Directory '%s' created successfully" % path_results+'_'+str(support)+'/') 170 except OSError as error: 171 print("Directory '%s' can not be created" % path_results) 172 filename_results = path_results + 'cfeval_%s_%s_dice_%s.csv' % (dataset, black_box,feature_selection) 173 if len(cf_list_all) > 0: 174 df_cf = pd.DataFrame(data=cf_list_all, columns=features_names) 175 encoder.decode(df_cf) 176 if CONF['feature_selection'] in single_prefix: 177 if all(df_cf['prefix'] == '0'): 178 cols = ['prefix_' + str(i+1) for i in range(CONF['prefix_length'])] 179 df_cf[cols] = 0 180 else: 181 df_cf = pd.concat([df_cf, pd.DataFrame( 182 df_cf['prefix'].str.split(",", expand=True).fillna(value='0')).rename( 183 columns=lambda x: f"prefix_{int(x) + 1}")], axis=1) 184 df_cf = df_cf.replace('\[', '',regex=True) 185 df_cf = df_cf.replace(']', '', regex=True) 186 df_cf = df_cf.drop(columns=['prefix']) 187 df_cf['desired_cfs'] = desired_cfs_all 188 df_cf['idx'] = test_id+1 * len(cf_list_all) 189 df_cf['method']= method 190 df_cf['test_id'] = np.arange(0, len(cf_list_all)) 191 df_cf['dataset'] = [dataset] * len(cf_list_all) 192 df_cf['black_box'] = [black_box] * len(cf_list_all) 193 try: 194 if not os.path.exists(path_cf): 195 os.makedirs(path_cf) 196 print("Directory '%s' created successfully" % path_cf) 197 except OSError as error: 198 print("Directory '%s' can not be created" % path_cf) 199 if optimization != 'baseline': 200 filename_cf = path_cf + 'cf_%s_%s_dice_%s_%s_%s_%s.csv' % (dataset, black_box, feature_selection, method, optimization, 201 CONF['prefix_length']) 202 else: 203 filename_cf = path_cf + 'cf_%s_%s_dice_%s_%s_%s.csv' % (dataset, black_box,feature_selection,method, 204 CONF['prefix_length']) 205 if not os.path.isfile(filename_cf): 206 df_cf.to_csv(filename_cf, index=False) 207 else: 208 df_cf.to_csv(filename_cf, mode='a', index=False, header=False) 209 else: 210 x_eval['sat_score'] = 0 211 result_dataframe = pd.DataFrame(data=x_eval_list) 212 result_dataframe = result_dataframe[columns] 213 if not os.path.isfile(filename_results): 214 result_dataframe.to_csv(filename_results, index=False) 215 else: 216 result_dataframe.to_csv(filename_results, mode='a', index=False, header=False) 217 return dice_result
def
dice_model(predictive_model):
218def dice_model(predictive_model): 219 if predictive_model.model_type is ClassificationMethods.RANDOM_FOREST.value: 220 m = dice_ml.Model(model=predictive_model.model, backend='sklearn') 221 elif predictive_model.model_type is ClassificationMethods.PERCEPTRON.value: 222 m = dice_ml.Model(model=predictive_model.model, backend='sklearn') 223 elif predictive_model.model_type is ClassificationMethods.MLP.value: 224 m = dice_ml.Model(model=predictive_model.model, backend='sklearn') 225 elif predictive_model.model_type is ClassificationMethods.XGBOOST.value: 226 m = dice_ml.Model(model=predictive_model.model, backend='sklearn') 227 elif predictive_model.model_type is ClassificationMethods.SGDCLASSIFIER.value: 228 m = dice_ml.Model(model=predictive_model.model, backend='sklearn') 229 elif predictive_model.model_type is ClassificationMethods.SVM.value: 230 m = dice_ml.Model(model=predictive_model.model, backend='sklearn') 231 elif predictive_model.model_type is ClassificationMethods.KNN.value: 232 m = dice_ml.Model(model=predictive_model.model, backend='sklearn') 233 else: 234 m = dice_ml.Model(model=predictive_model.model, backend='PYT') 235 return m
def
split_features(df, encoder):
237def split_features(df, encoder): 238 categorical_features = [col for col in df.columns if col in list(encoder._label_dict.keys())] 239 cat_feature_index = [df.columns.get_loc(c) for c in categorical_features if c in df] 240 continuous_features = [col for col in df.columns if col in list(encoder._numeric_encoder.keys())] 241 cont_feature_index = [df.columns.get_loc(c) for c in continuous_features if c in df] 242 return categorical_features,continuous_features,cat_feature_index,cont_feature_index
def
evaluate_cf_list( cf_list, query_instance, cont_feature_index, cat_feature_index, df, y_pred, nr_of_cfs, predictive_model, query_instances, continuous_features, categorical_features, ratio_cont):
def evaluate_cf_list(cf_list, query_instance, cont_feature_index, cat_feature_index, df, y_pred, nr_of_cfs,
                     predictive_model, query_instances, continuous_features, categorical_features, ratio_cont):
    """Compute distance, diversity, sparsity and implausibility metrics for a list of counterfactuals."""
    nbr_features = query_instance.shape[1]
    if cf_list.size > 4:
        nbr_cf_ = len(cf_list)
        nbr_features = cf_list.shape[1]
        plausibility_sum = plausibility(query_instance, predictive_model, cf_list, nr_of_cfs, query_instances,
                                        y_pred, cont_feature_index, cat_feature_index, df, ratio_cont)
        plausibility_max_nbr_cf_ = plausibility_sum / nr_of_cfs
        plausibility_nbr_cf_ = plausibility_sum / nbr_cf_

        # Proximity metrics: mean (default), min and max aggregations.
        distance_l2_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df)
        distance_mad_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df)
        distance_j_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard')
        distance_h_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming')
        distance_l2j_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index)
        distance_l1j_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index)
        distance_mh_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, df)

        distance_l2_min_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df, agg='min')
        distance_mad_min_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df, agg='min')
        distance_j_min_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg='min')
        distance_h_min_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg='min')
        distance_l2j_min_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='min')
        distance_l1j_min_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='min')
        distance_mh_min_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, df, agg='min')

        distance_l2_max_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=df, agg='max')
        distance_mad_max_ = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=df, agg='max')
        distance_j_max_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg='max')
        distance_h_max_ = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg='max')
        distance_l2j_max_ = distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='max')
        distance_l1j_max_ = distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, agg='max')
        distance_mh_max_ = distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, X=df, agg='max')

        avg_nbr_changes_per_cf_ = avg_nbr_changes_per_cf(query_instance, cf_list, continuous_features)
        avg_nbr_changes_ = avg_nbr_changes(query_instance, cf_list, nbr_features, continuous_features)

        # Diversity metrics are only defined for more than one counterfactual.
        if len(cf_list) > 1:
            diversity_l2_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=df)
            diversity_mad_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df)
            diversity_j_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard')
            diversity_h_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming')
            diversity_l2j_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index)
            diversity_mh_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df)

            diversity_l2_min_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=df, agg='min')
            diversity_mad_min_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df, agg='min')
            diversity_j_min_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg='min')
            diversity_h_min_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg='min')
            diversity_l2j_min_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index, agg='min')
            diversity_mh_min_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df, agg='min')

            diversity_l2_max_ = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=df, agg='max')
            diversity_mad_max_ = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=df, agg='max')
            diversity_j_max_ = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg='max')
            diversity_h_max_ = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg='max')
            diversity_l2j_max_ = diversity_l2j(cf_list, cont_feature_index, cat_feature_index, agg='max')
            diversity_mh_max_ = diversity_mh(cf_list, cont_feature_index, cat_feature_index, df, agg='max')
        else:
            diversity_l2_ = 0.0
            diversity_mad_ = 0.0
            diversity_j_ = 0.0
            diversity_h_ = 0.0
            diversity_l2j_ = 0.0
            diversity_mh_ = 0.0

            diversity_l2_min_ = 0.0
            diversity_mad_min_ = 0.0
            diversity_j_min_ = 0.0
            diversity_h_min_ = 0.0
            diversity_l2j_min_ = 0.0
            diversity_mh_min_ = 0.0

            diversity_l2_max_ = 0.0
            diversity_mad_max_ = 0.0
            diversity_j_max_ = 0.0
            diversity_h_max_ = 0.0
            diversity_l2j_max_ = 0.0
            diversity_mh_max_ = 0.0

        count_diversity_cont_ = count_diversity(cf_list, cont_feature_index, nbr_features, cont_feature_index)
        count_diversity_cate_ = count_diversity(cf_list, cat_feature_index, nbr_features, cont_feature_index)
        count_diversity_all_ = count_diversity_all(cf_list, nbr_features, cont_feature_index)

        res = {
            'generated_cfs': nr_of_cfs,
            'implausibility_sum': plausibility_sum,
            'implausibility_max_nbr_cf': plausibility_max_nbr_cf_,
            'implausibility_nbr_cf': plausibility_nbr_cf_,
            'distance_l2': distance_l2_,
            'distance_mad': distance_mad_,
            'distance_j': distance_j_,
            'distance_h': distance_h_,
            'distance_l2j': distance_l2j_,
            'distance_l1j': distance_l1j_,
            'distance_mh': distance_mh_,

            'distance_l2_min': distance_l2_min_,
            'distance_mad_min': distance_mad_min_,
            'distance_j_min': distance_j_min_,
            'distance_h_min': distance_h_min_,
            'distance_l2j_min': distance_l2j_min_,
            'distance_l1j_min': distance_l1j_min_,
            'distance_mh_min': distance_mh_min_,

            'distance_l2_max': distance_l2_max_,
            'distance_mad_max': distance_mad_max_,
            'distance_j_max': distance_j_max_,
            'distance_h_max': distance_h_max_,
            'distance_l2j_max': distance_l2j_max_,
            'distance_l1j_max': distance_l1j_max_,
            'distance_mh_max': distance_mh_max_,

            'diversity_l2': diversity_l2_,
            'diversity_mad': diversity_mad_,
            'diversity_j': diversity_j_,
            'diversity_h': diversity_h_,
            'diversity_l2j': diversity_l2j_,
            'diversity_mh': diversity_mh_,

            'diversity_l2_min': diversity_l2_min_,
            'diversity_mad_min': diversity_mad_min_,
            'diversity_j_min': diversity_j_min_,
            'diversity_h_min': diversity_h_min_,
            'diversity_l2j_min': diversity_l2j_min_,
            'diversity_mh_min': diversity_mh_min_,

            'diversity_l2_max': diversity_l2_max_,
            'diversity_mad_max': diversity_mad_max_,
            'diversity_j_max': diversity_j_max_,
            'diversity_h_max': diversity_h_max_,
            'diversity_l2j_max': diversity_l2j_max_,
            'diversity_mh_max': diversity_mh_max_,

            'count_diversity_cont': count_diversity_cont_,
            'count_diversity_cate': count_diversity_cate_,
            'count_diversity_all': count_diversity_all_,
            'avg_nbr_changes_per_cf': avg_nbr_changes_per_cf_,
            'avg_nbr_changes': avg_nbr_changes_,
        }
    else:
        # No usable counterfactuals were generated: report NaN placeholders.
        res = {
            'generated_cfs': 0,
            'distance_l2': np.nan,
            'distance_mad': np.nan,
            'distance_j': np.nan,
            'distance_h': np.nan,
            'distance_l2j': np.nan,
            'distance_l1j': np.nan,
            'distance_mh': np.nan,
            'distance_l2_min': np.nan,
            'distance_mad_min': np.nan,
            'distance_j_min': np.nan,
            'distance_h_min': np.nan,
            'distance_l2j_min': np.nan,
            'distance_l1j_min': np.nan,
            'distance_mh_min': np.nan,
            'distance_l2_max': np.nan,
            'distance_mad_max': np.nan,
            'distance_j_max': np.nan,
            'distance_h_max': np.nan,
            'distance_l2j_max': np.nan,
            'distance_l1j_max': np.nan,
            'distance_mh_max': np.nan,
            'avg_nbr_changes_per_cf': np.nan,
            'avg_nbr_changes': np.nan,
            'diversity_l2': np.nan,
            'diversity_mad': np.nan,
            'diversity_j': np.nan,
            'diversity_h': np.nan,
            'diversity_l2j': np.nan,
            'diversity_mh': np.nan,
            'diversity_l2_min': np.nan,
            'diversity_mad_min': np.nan,
            'diversity_j_min': np.nan,
            'diversity_h_min': np.nan,
            'diversity_l2j_min': np.nan,
            'diversity_mh_min': np.nan,
            'diversity_l2_max': np.nan,
            'diversity_mad_max': np.nan,
            'diversity_j_max': np.nan,
            'diversity_h_max': np.nan,
            'diversity_l2j_max': np.nan,
            'diversity_mh_max': np.nan,
            'count_diversity_cont': np.nan,
            'count_diversity_cate': np.nan,
            'count_diversity_all': np.nan,

            'implausibility_sum': 0.0,
            'implausibility_max_nbr_cf': 0.0,
            'implausibility_nbr_cf': 0.0,
            'sat_score': 0.0,
        }
    return res

def continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
    if metric == 'mad':
        mad = median_abs_deviation(X.iloc[:, cont_feature_index], axis=0)
        mad = np.array([v if v != 0 else 1.0 for v in mad])

        def _mad_cityblock(u, v):
            return mad_cityblock(u, v, mad)
        dist = pdist(cf_list[:, cont_feature_index], metric=_mad_cityblock)
    else:
        dist = pdist(cf_list[:, cont_feature_index], metric=metric)

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)

def categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg=None):
    dist = pdist(cf_list[:, cat_feature_index], metric=metric)

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)

def diversity_mh(cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_diversity(cf_list, cont_feature_index, metric='mad', X=X, agg=agg)
    dist_cate = categorical_diversity(cf_list, cat_feature_index, metric='hamming', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def count_diversity(cf_list, features, nbr_features, cont_feature_index):
    nbr_cf = cf_list.shape[0]
    nbr_changes = 0
    for i in range(nbr_cf):
        for j in range(i + 1, nbr_cf):
            for k in features:
                if cf_list[i][k] != cf_list[j][k]:
                    # `k` indexes the feature; the original listing tested `j`
                    # (the counterfactual index), which never matched a feature.
                    nbr_changes += 1 if k in cont_feature_index else 0.5
    return nbr_changes / (nbr_cf * nbr_cf * nbr_features)

def count_diversity_all(cf_list, nbr_features, cont_feature_index):
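    # The body of this function was missing from the extracted listing.
    # Minimal reconstruction (an assumption, not the verified original):
    # apply count_diversity over every feature column.
    return count_diversity(cf_list, range(cf_list.shape[1]), nbr_features, cont_feature_index)
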
def continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=None, agg=None):
    if metric == 'mad':
        mad = median_abs_deviation(X.iloc[:, cont_feature_index], axis=0)
        mad = np.array([v if v != 0 else 1.0 for v in mad])

        def _mad_cityblock(u, v):
            return mad_cityblock(u, v, mad)
        dist = cdist(query_instance[:, cont_feature_index], cf_list[:, cont_feature_index], metric=_mad_cityblock)
    else:
        dist = cdist(query_instance[:, cont_feature_index], cf_list[:, cont_feature_index], metric=metric)

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)

def mad_cityblock(u, v, mad):
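    # The body of this function was missing from the extracted listing. Sketch
    # of the standard MAD-weighted cityblock distance (an assumption),
    # consistent with the scipy _validate_vector import at the top of the module.
    u = _validate_vector(u)
    v = _validate_vector(v)
    l1_diff = abs(u - v)
    l1_diff_mad = l1_diff / mad
    return l1_diff_mad.sum()
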
def categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=None):
    try:
        dist = cdist(query_instance.reshape(1, -1)[:, cat_feature_index], cf_list[:, cat_feature_index],
                     metric=metric)
    except Exception:
        # Re-raise instead of falling through with `dist` undefined.
        print('Problem with categorical distance')
        raise

    if agg is None or agg == 'mean':
        return np.mean(dist)

    if agg == 'max':
        return np.max(dist)

    if agg == 'min':
        return np.min(dist)

def euclidean_jaccard(query_instance, A, cont_feature_index, cat_feature_index, ratio_cont=None):
    nbr_features = A.shape[1]
    dist_cont = cdist(query_instance.reshape(1, -1)[:, cont_feature_index], A[:, cont_feature_index],
                      metric='euclidean')
    dist_cate = cdist(query_instance.reshape(1, -1)[:, cat_feature_index], A[:, cat_feature_index],
                      metric='jaccard')
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def distance_l2j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='euclidean', X=None, agg=agg)
    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def distance_l1j(query_instance, cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='cityblock', X=None, agg=agg)
    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='jaccard', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def distance_mh(query_instance, cf_list, cont_feature_index, cat_feature_index, X, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_distance(query_instance, cf_list, cont_feature_index, metric='mad', X=X, agg=agg)
    dist_cate = categorical_distance(query_instance, cf_list, cat_feature_index, metric='hamming', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def diversity_l2j(cf_list, cont_feature_index, cat_feature_index, ratio_cont=None, agg=None):
    nbr_features = cf_list.shape[1]
    dist_cont = continuous_diversity(cf_list, cont_feature_index, metric='euclidean', X=None, agg=agg)
    dist_cate = categorical_diversity(cf_list, cat_feature_index, metric='jaccard', agg=agg)
    if ratio_cont is None:
        ratio_continuous = len(cont_feature_index) / nbr_features
        ratio_categorical = len(cat_feature_index) / nbr_features
    else:
        ratio_continuous = ratio_cont
        ratio_categorical = 1.0 - ratio_cont
    dist = ratio_continuous * dist_cont + ratio_categorical * dist_cate
    return dist

def nbr_changes_per_cf(x, cf_list, continuous_features):
    x = x.ravel()
    nbr_features = cf_list.shape[1] - 1  # exclude the label column
    nbr_changes = np.zeros(len(cf_list))
    for i, cf in enumerate(cf_list):
        cf = cf[:-1]
        for j in range(nbr_features):
            if cf[j] != x[j]:
                nbr_changes[i] += 1 if j in continuous_features else 0.5
    return nbr_changes

def avg_nbr_changes_per_cf(x, cf_list, continuous_features):
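    # The body of this function was missing from the extracted listing.
    # Minimal reconstruction (an assumption): the mean of the
    # per-counterfactual change counts computed above.
    return np.mean(nbr_changes_per_cf(x, cf_list, continuous_features))
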
def avg_nbr_changes(x, cf_list, nbr_features, continuous_features):
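    # The body of this function was missing from the extracted listing.
    # Reconstruction (an assumption): total number of changes normalised by
    # the number of counterfactuals times the number of features.
    nbr_cf = cf_list.shape[0]
    return np.sum(nbr_changes_per_cf(x, cf_list, continuous_features)) / (nbr_cf * nbr_features)
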
def plausibility(query_instance, predictive_model, cf_list, nr_of_cfs, query_instances,
                 y_pred, continuous_features, categorical_features, df, ratio_cont):
    sum_dist = 0.0
    full_df = pd.concat([query_instances, df], ignore_index=False)
    for cf in cf_list:
        # X_y = full_df[full_df['label'] == y_label]
        X_y = full_df
        neigh_dist = distance_mh(query_instance.reshape(1, -1), X_y.to_numpy(), continuous_features,
                                 categorical_features, df, ratio_cont)
        idx_neigh = np.argsort(neigh_dist)[0]
        closest = X_y.to_numpy()[idx_neigh]
        d = distance_mh(cf.reshape(1, -1), closest.reshape(1, -1), continuous_features,
                        categorical_features, df, ratio_cont)
        sum_dist += d
    return sum_dist

def conformance_score(CONF, encoder, df, dataset, features_names, d4py, query_instance, model_path, timestamp_col_name):
    d4py.parse_decl_model(model_path=os.path.join(model_path, dataset + '_' + str(CONF['prefix_length']) + '.decl'))

    df = pd.DataFrame(df, columns=features_names)
    try:
        query_instance_to_decode = pd.DataFrame(np.array(query_instance, dtype=float), columns=features_names)
    except ValueError:
        query_instance_to_decode = pd.DataFrame(np.array(query_instance, dtype=str), columns=features_names)
    encoder.decode(query_instance_to_decode)
    encoder.decode(df)
    df.insert(loc=0, column='Case ID', value=np.arange(1, len(df) + 1))  # case ids 1..n
    df.insert(loc=1, column='label', value=1)
    query_instance_to_decode.insert(loc=0, column='Case ID',
                                    value=np.arange(1, len(query_instance_to_decode) + 1))
    query_instance_to_decode.insert(loc=1, column='label', value=1)
    if CONF['feature_selection'] in single_prefix:
        if all(df['prefix'] == '0'):
            cols = ['prefix_' + str(i + 1) for i in range(CONF['prefix_length'])]
            df[cols] = 0
            query_instance_to_decode[cols] = 0
        else:
            df = pd.concat([df, df['prefix'].str.split(',', expand=True).fillna(value='0')
                           .rename(columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
            df = df.replace(r'\[', '', regex=True)
            df = df.replace(r'\]', '', regex=True)
            query_instance_to_decode = pd.concat(
                [query_instance_to_decode,
                 query_instance_to_decode['prefix'].str.split(',', expand=True).fillna(value='0')
                 .rename(columns=lambda x: f"prefix_{int(x) + 1}")], axis=1)
            query_instance_to_decode = query_instance_to_decode.replace(r'\[', '', regex=True)
            query_instance_to_decode = query_instance_to_decode.replace(r'\]', '', regex=True)
        df = df.drop(columns=['prefix'])
        query_instance_to_decode = query_instance_to_decode.drop(columns=['prefix'])
    long_data = pd.wide_to_long(df, stubnames=['prefix'], i='Case ID', j='order', sep='_', suffix=r'\w+')
    long_query_instance = pd.wide_to_long(query_instance_to_decode, stubnames=['prefix'], i='Case ID',
                                          j='order', sep='_', suffix=r'\w+')
    long_query_instance_sorted = long_query_instance.sort_values(['Case ID', 'order']).reset_index(drop=False)
    timestamps = pd.date_range('1/1/2011', periods=len(long_data), freq='H')
    long_data_sorted = long_data.sort_values(['Case ID', 'order']).reset_index(drop=False)
    long_data_sorted[timestamp_col_name] = timestamps
    long_data_sorted['label'].replace({1: 'regular'}, inplace=True)
    long_data_sorted.drop(columns=['order'], inplace=True)
    columns_to_rename = {'Case ID': 'case:concept:name', 'prefix': 'concept:name'}
    long_data_sorted.rename(columns=columns_to_rename, inplace=True)
    long_data_sorted['label'].replace({'regular': 'false', 'deviant': 'true'}, inplace=True)
    long_data_sorted.replace('0', 'other', inplace=True)
    timestamps_query = pd.date_range('1/1/2011', periods=len(long_query_instance), freq='H')
    long_query_instance_sorted[timestamp_col_name] = timestamps_query
    long_query_instance_sorted.rename(columns=columns_to_rename, inplace=True)
    long_query_instance_sorted['label'].replace({'regular': 'false', 'deviant': 'true'}, inplace=True)
    long_query_instance_sorted.replace('0', 'other', inplace=True)
    long_query_instance_sorted['case:concept:name'] = long_query_instance_sorted['case:concept:name'].astype(str)
    long_data_sorted['case:concept:name'] = long_data_sorted['case:concept:name'].astype(str)
    event_log = convert_to_event_log(long_data_sorted)
    query_log = convert_to_event_log(long_query_instance_sorted)
    d4py.load_xes_log(event_log)
    model_check_res = d4py.conformance_checking(consider_vacuity=False)
    d4py.load_xes_log(query_log)
    model_check_query = d4py.conformance_checking(consider_vacuity=False)

    # Constraints satisfied by the query instance.
    query_patterns = {
        constraint
        for trace, patts in model_check_query.items()
        for constraint, checker in patts.items()
        if checker.state == TraceState.SATISFIED
    }

    # Keep, per counterfactual trace, only the non-violated constraints that
    # the query instance also satisfies.
    model_check_res = {
        k: {
            constraint: checker
            for constraint, checker in v.items()
            if checker.state != TraceState.VIOLATED and constraint in query_patterns
        }
        for k, v in model_check_res.items()
    }

    conformance_scores = [len(v) / len(query_patterns) for v in model_check_res.values()]
    avg_conformance = np.mean(conformance_scores)
    print('Average conformance score', avg_conformance)
    return avg_conformance

def model_discovery(CONF, encoder, df, dataset, features_names, d4py, model_path, support, timestamp_col_name):
    df = pd.DataFrame(df, columns=features_names)
    encoder.decode(df)
    df.insert(loc=0, column='Case ID', value=np.arange(1, len(df) + 1))  # case ids 1..n
    df.insert(loc=1, column='label', value=1)
    long_data = pd.wide_to_long(df, stubnames=['prefix'], i='Case ID', j='order', sep='_', suffix=r'\w+')
    timestamps = pd.date_range('1/1/2011', periods=len(long_data), freq='H')
    long_data_sorted = long_data.sort_values(['Case ID', 'order']).reset_index(drop=False)
    long_data_sorted[timestamp_col_name] = timestamps
    long_data_sorted['label'].replace({1: 'regular'}, inplace=True)
    long_data_sorted.drop(columns=['order'], inplace=True)
    columns_to_rename = {'Case ID': 'case:concept:name', 'prefix': 'concept:name'}
    long_data_sorted.rename(columns=columns_to_rename, inplace=True)
    long_data_sorted['label'].replace({'regular': 'false', 'deviant': 'true'}, inplace=True)
    long_data_sorted.replace('0', 'other', inplace=True)
    long_data_sorted.replace(0.0, 'other', inplace=True)
    long_data_sorted.replace(0, 'other', inplace=True)
    long_data_sorted['case:concept:name'] = long_data_sorted['case:concept:name'].astype(str)
    event_log = convert_to_event_log(long_data_sorted)
    d4py.load_xes_log(event_log)
    d4py.compute_frequent_itemsets(min_support=support, len_itemset=2)
    d4py.discovery(consider_vacuity=False, max_declare_cardinality=2)
    d4py.filter_discovery(min_support=support,
                          output_path=os.path.join(model_path, dataset + '_' + str(CONF['prefix_length']) + '.decl'))

def perform_model_analysis(model_path, dataset, CONF, encoder, full_df, support, log, dataset_confs):
    try:
        if not os.path.exists(model_path):
            os.makedirs(model_path)
            print("Directory '%s' created successfully" % model_path)
    except OSError:
        print("Directory '%s' cannot be created" % model_path)

    d4py = Declare4Py()

    # Assigned outside the try block so the error message below can always
    # reference the path.
    decl_model_path = model_path + dataset + '_' + str(CONF['prefix_length']) + '.decl'
    try:
        if not os.path.exists(decl_model_path):
            print('Do model discovery')
            features_names = full_df.columns.values[:-1]
            model_discovery(CONF, encoder, full_df.iloc[:, 1:], dataset, features_names,
                            d4py, model_path, support, [*dataset_confs.timestamp_col.values()][0])
    except OSError:
        print("File '%s' cannot be created" % decl_model_path)

    d4py.parse_decl_model(model_path=decl_model_path)

    d4py.load_xes_log(log)
    conformance_check = d4py.conformance_checking(consider_vacuity=False)

    # Keep, per trace, only the constraints that are not violated.
    model_check_res = {
        k: {
            constraint: checker
            for constraint, checker in v.items()
            if checker.state != TraceState.VIOLATED
        }
        for k, v in conformance_check.items()
    }

    # A trace is conformant when none of the discovered constraints is violated.
    conformant_traces = [trace_id[1] for trace_id, results in model_check_res.items()
                         if len(results) == len(d4py.model.constraints)]
    number_of_constraints = len(d4py.model.constraints)
    conformant_traces_ratio = len(conformant_traces) / len(log)

    return conformant_traces, number_of_constraints, conformant_traces_ratio

columns = ['dataset', 'heuristic', 'model', 'method', 'optimization', 'prefix_length', 'idx',
           'desired_nr_of_cfs', 'generated_cfs', 'time_train', 'time_test', 'runtime',
           'distance_l2', 'distance_mad', 'distance_j', 'distance_h', 'distance_l1j', 'distance_l2j',
           'distance_mh',
           'distance_l2_min', 'distance_mad_min', 'distance_j_min', 'distance_h_min', 'distance_l1j_min',
           'distance_l2j_min', 'distance_mh_min',
           'distance_l2_max', 'distance_mad_max', 'distance_j_max', 'distance_h_max', 'distance_l1j_max',
           'distance_l2j_max', 'distance_mh_max',
           'diversity_l2', 'diversity_mad', 'diversity_j', 'diversity_h', 'diversity_l2j', 'diversity_mh',
           'diversity_l2_min', 'diversity_mad_min', 'diversity_j_min', 'diversity_h_min', 'diversity_l2j_min',
           'diversity_mh_min',
           'diversity_l2_max', 'diversity_mad_max', 'diversity_j_max', 'diversity_h_max', 'diversity_l2j_max',
           'diversity_mh_max',
           'count_diversity_cont', 'count_diversity_cate', 'count_diversity_all',
           'avg_nbr_changes_per_cf', 'avg_nbr_changes',
           'implausibility_sum', 'implausibility_max_nbr_cf', 'implausibility_nbr_cf', 'sat_score']