nirdizati_light.encoding.data_encoder
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas.core.dtypes.common import is_numeric_dtype
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

# Code returned for values that have no entry in a column's mapping.
PADDING_VALUE = 0
# onehot and minmaxscaler not fully done


class Encoder:
    """Fit per-column encoders on a DataFrame and encode/decode data with them.

    Non-numeric columns are mapped through a ``LabelEncoder`` or a
    ``OneHotEncoder`` (selected by ``attribute_encoding``); numeric columns
    are scaled with a ``MinMaxScaler``. The ``trace_id`` column is never
    fitted, encoded or decoded.
    """

    def __init__(self, df: DataFrame = None, attribute_encoding=None, prefix_length=None):
        """Fit one encoder per column of ``df``.

        Args:
            df: Data to fit on. ``None`` creates an encoder with nothing
                fitted instead of failing on iteration.
            attribute_encoding: ``'label'`` or ``'onehot'``. With any other
                value, non-numeric columns are not registered at all.
            prefix_length: Stored on the instance; not used by this class.
        """
        self.attribute_encoding = attribute_encoding
        self.prefix_length = prefix_length
        self._label_encoder = {}
        self._numeric_encoder = {}
        self._label_dict = {}
        self._label_dict_decoder = {}
        self._scaled_values = {}
        self._unscaled_values = {}

        if df is None:
            return

        for column in df:
            if column == 'trace_id':
                continue
            values = df[column]

            if is_numeric_dtype(values.dtype):
                as_matrix = values.values.reshape(-1, 1)
                scaler = MinMaxScaler().fit(as_matrix)
                self._numeric_encoder[column] = scaler
                self._unscaled_values[column] = values.values
                self._scaled_values[column] = scaler.transform(as_matrix).flatten()
                print('column:', column, 'considered number, top 5 values are:', list(values[:5]))
                continue

            if attribute_encoding not in ('label', 'onehot'):
                continue

            if attribute_encoding == 'label' or column == 'label':
                # The target column 'label' is always label-encoded, even in
                # onehot mode, and never gets the padding class.
                as_text = values.apply(str)
                if column != 'label':
                    as_text = pd.concat([pd.Series([str(PADDING_VALUE)]), as_text])
                encoder = LabelEncoder().fit(sorted(as_text))
                classes = encoder.classes_
                codes = encoder.transform(classes)
            else:
                encoder = OneHotEncoder(
                    drop='if_binary', sparse_output=False, handle_unknown='ignore'
                ).fit(values.astype(str).values.reshape(-1, 1))
                categories = encoder.categories_[0].reshape(-1, 1)
                # Tuples are hashable, so they can key the decoder mapping.
                codes = [tuple(row) for row in encoder.transform(categories)]
                classes = list(categories.flatten())

            self._label_encoder[column] = encoder
            self._label_dict[column] = dict(zip(classes, codes))
            self._label_dict_decoder[column] = dict(zip(codes, classes))

    def encode(self, df: DataFrame) -> None:
        """Encode the columns of ``df`` in place.

        Categorical values missing from the fitted mapping become
        ``PADDING_VALUE``. Encoding is best-effort: a column that cannot be
        encoded is reported on stdout and left unchanged.
        """
        for column in df:
            if column == 'trace_id':
                continue
            if column in self._label_encoder:
                mapping = self._label_dict[column]
                try:
                    df[column] = df[column].apply(lambda x: mapping.get(str(x), PADDING_VALUE))
                except Exception as exc:  # best-effort, but not KeyboardInterrupt/SystemExit
                    print(f'Error encoding column {column!r}: {exc}')
            elif column in self._numeric_encoder:
                try:
                    df[column] = self._numeric_encoder[column].transform(
                        df[column].values.reshape(-1, 1)).flatten()
                except Exception as exc:  # best-effort, but not KeyboardInterrupt/SystemExit
                    print(f'Error encoding column {column!r}: {exc}')
            else:
                print(f'Error: no encoder fitted for column {column!r}')

    def decode(self, df: DataFrame) -> None:
        """Decode the columns of ``df`` in place.

        Codes missing from the decoder mapping become ``PADDING_VALUE``.
        Columns without a fitted encoder are left unchanged, matching
        ``decode_row``.
        """
        for column in df:
            if column == 'trace_id':
                continue
            if column in self._label_encoder:
                reverse = self._label_dict_decoder[column]
                df[column] = df[column].apply(lambda x: reverse.get(x, PADDING_VALUE))
            elif column in self._numeric_encoder:
                df[column] = self._numeric_encoder[column].inverse_transform(
                    df[column].values.reshape(-1, 1)).flatten()

    def decode_row(self, row) -> np.ndarray:
        """Return the decoded values of ``row`` (anything with ``.items()``).

        ``trace_id`` is omitted from the result; values of columns without a
        fitted encoder are passed through unchanged.
        """
        decoded_row = []
        for column, value in row.items():
            if column == 'trace_id':
                continue
            if column in self._label_encoder:
                decoded_row.append(self._label_dict_decoder[column].get(value, PADDING_VALUE))
            elif column in self._numeric_encoder:
                restored = self._numeric_encoder[column].inverse_transform(
                    np.array(value).reshape(-1, 1))
                decoded_row.append(restored[0][0])
            else:
                decoded_row.append(value)
        return np.array(decoded_row)

    def decode_column(self, column, column_name) -> np.ndarray:
        """Return the decoded values of one column.

        Args:
            column: Iterable of encoded values.
            column_name: Name the column was fitted under.

        Returns:
            Decoded values; an empty array for ``'trace_id'``; the values
            unchanged when no encoder was fitted for ``column_name``.
        """
        if column_name == 'trace_id':
            return np.array([])
        if column_name in self._label_encoder:
            reverse = self._label_dict_decoder[column_name]
            return np.array([reverse.get(x, PADDING_VALUE) for x in column])
        if column_name in self._numeric_encoder:
            scaled = np.asarray(column).reshape(-1, 1)
            return self._numeric_encoder[column_name].inverse_transform(scaled).flatten()
        return np.array(list(column))

    def get_values(self, column_name):
        """Return ``(original values, encoded values)`` of a categorical column.

        Both elements are dict key views. Returns ``None`` when
        ``column_name`` has no categorical mapping.
        """
        if column_name in self._label_dict:
            return (self._label_dict[column_name].keys(),
                    self._label_dict_decoder[column_name].keys())
        return None
PADDING_VALUE = 0
class Encoder:
class Encoder:
    """Fit per-column encoders on a DataFrame and encode/decode data with them.

    Non-numeric columns are mapped through a ``LabelEncoder`` or a
    ``OneHotEncoder`` (selected by ``attribute_encoding``); numeric columns
    are scaled with a ``MinMaxScaler``. The ``trace_id`` column is never
    fitted, encoded or decoded.
    """

    def __init__(self, df: DataFrame = None, attribute_encoding=None, prefix_length=None):
        """Fit one encoder per column of ``df``.

        Args:
            df: Data to fit on. ``None`` creates an encoder with nothing
                fitted instead of failing on iteration.
            attribute_encoding: ``'label'`` or ``'onehot'``. With any other
                value, non-numeric columns are not registered at all.
            prefix_length: Stored on the instance; not used by this class.
        """
        self.attribute_encoding = attribute_encoding
        self.prefix_length = prefix_length
        self._label_encoder = {}
        self._numeric_encoder = {}
        self._label_dict = {}
        self._label_dict_decoder = {}
        self._scaled_values = {}
        self._unscaled_values = {}

        if df is None:
            return

        for column in df:
            if column == 'trace_id':
                continue
            values = df[column]

            if is_numeric_dtype(values.dtype):
                as_matrix = values.values.reshape(-1, 1)
                scaler = MinMaxScaler().fit(as_matrix)
                self._numeric_encoder[column] = scaler
                self._unscaled_values[column] = values.values
                self._scaled_values[column] = scaler.transform(as_matrix).flatten()
                print('column:', column, 'considered number, top 5 values are:', list(values[:5]))
                continue

            if attribute_encoding not in ('label', 'onehot'):
                continue

            if attribute_encoding == 'label' or column == 'label':
                # The target column 'label' is always label-encoded, even in
                # onehot mode, and never gets the padding class.
                as_text = values.apply(str)
                if column != 'label':
                    as_text = pd.concat([pd.Series([str(PADDING_VALUE)]), as_text])
                encoder = LabelEncoder().fit(sorted(as_text))
                classes = encoder.classes_
                codes = encoder.transform(classes)
            else:
                encoder = OneHotEncoder(
                    drop='if_binary', sparse_output=False, handle_unknown='ignore'
                ).fit(values.astype(str).values.reshape(-1, 1))
                categories = encoder.categories_[0].reshape(-1, 1)
                # Tuples are hashable, so they can key the decoder mapping.
                codes = [tuple(row) for row in encoder.transform(categories)]
                classes = list(categories.flatten())

            self._label_encoder[column] = encoder
            self._label_dict[column] = dict(zip(classes, codes))
            self._label_dict_decoder[column] = dict(zip(codes, classes))

    def encode(self, df: DataFrame) -> None:
        """Encode the columns of ``df`` in place.

        Categorical values missing from the fitted mapping become
        ``PADDING_VALUE``. Encoding is best-effort: a column that cannot be
        encoded is reported on stdout and left unchanged.
        """
        for column in df:
            if column == 'trace_id':
                continue
            if column in self._label_encoder:
                mapping = self._label_dict[column]
                try:
                    df[column] = df[column].apply(lambda x: mapping.get(str(x), PADDING_VALUE))
                except Exception as exc:  # best-effort, but not KeyboardInterrupt/SystemExit
                    print(f'Error encoding column {column!r}: {exc}')
            elif column in self._numeric_encoder:
                try:
                    df[column] = self._numeric_encoder[column].transform(
                        df[column].values.reshape(-1, 1)).flatten()
                except Exception as exc:  # best-effort, but not KeyboardInterrupt/SystemExit
                    print(f'Error encoding column {column!r}: {exc}')
            else:
                print(f'Error: no encoder fitted for column {column!r}')

    def decode(self, df: DataFrame) -> None:
        """Decode the columns of ``df`` in place.

        Codes missing from the decoder mapping become ``PADDING_VALUE``.
        Columns without a fitted encoder are left unchanged, matching
        ``decode_row``.
        """
        for column in df:
            if column == 'trace_id':
                continue
            if column in self._label_encoder:
                reverse = self._label_dict_decoder[column]
                df[column] = df[column].apply(lambda x: reverse.get(x, PADDING_VALUE))
            elif column in self._numeric_encoder:
                df[column] = self._numeric_encoder[column].inverse_transform(
                    df[column].values.reshape(-1, 1)).flatten()

    def decode_row(self, row) -> np.ndarray:
        """Return the decoded values of ``row`` (anything with ``.items()``).

        ``trace_id`` is omitted from the result; values of columns without a
        fitted encoder are passed through unchanged.
        """
        decoded_row = []
        for column, value in row.items():
            if column == 'trace_id':
                continue
            if column in self._label_encoder:
                decoded_row.append(self._label_dict_decoder[column].get(value, PADDING_VALUE))
            elif column in self._numeric_encoder:
                restored = self._numeric_encoder[column].inverse_transform(
                    np.array(value).reshape(-1, 1))
                decoded_row.append(restored[0][0])
            else:
                decoded_row.append(value)
        return np.array(decoded_row)

    def decode_column(self, column, column_name) -> np.ndarray:
        """Return the decoded values of one column.

        Args:
            column: Iterable of encoded values.
            column_name: Name the column was fitted under.

        Returns:
            Decoded values; an empty array for ``'trace_id'``; the values
            unchanged when no encoder was fitted for ``column_name``.
        """
        if column_name == 'trace_id':
            return np.array([])
        if column_name in self._label_encoder:
            reverse = self._label_dict_decoder[column_name]
            return np.array([reverse.get(x, PADDING_VALUE) for x in column])
        if column_name in self._numeric_encoder:
            scaled = np.asarray(column).reshape(-1, 1)
            return self._numeric_encoder[column_name].inverse_transform(scaled).flatten()
        return np.array(list(column))

    def get_values(self, column_name):
        """Return ``(original values, encoded values)`` of a categorical column.

        Both elements are dict key views. Returns ``None`` when
        ``column_name`` has no categorical mapping.
        """
        if column_name in self._label_dict:
            return (self._label_dict[column_name].keys(),
                    self._label_dict_decoder[column_name].keys())
        return None
Encoder(df: pandas.core.frame.DataFrame = None, attribute_encoding=None, prefix_length=None)
def __init__(self, df: DataFrame = None, attribute_encoding=None, prefix_length=None):
    """Fit one encoder per column of ``df``.

    Numeric columns get a ``MinMaxScaler``; non-numeric columns get a
    ``LabelEncoder`` or ``OneHotEncoder`` depending on
    ``attribute_encoding``. The ``trace_id`` column is skipped.

    Args:
        df: Data to fit on. ``None`` creates an encoder with nothing fitted
            instead of failing on iteration.
        attribute_encoding: ``'label'`` or ``'onehot'``. With any other
            value, non-numeric columns are not registered at all.
        prefix_length: Stored on the instance; not used in this method.
    """
    self.attribute_encoding = attribute_encoding
    self.prefix_length = prefix_length
    self._label_encoder = {}
    self._numeric_encoder = {}
    self._label_dict = {}
    self._label_dict_decoder = {}
    self._scaled_values = {}
    self._unscaled_values = {}

    if df is None:
        return

    for column in df:
        if column == 'trace_id':
            continue
        values = df[column]

        if is_numeric_dtype(values.dtype):
            as_matrix = values.values.reshape(-1, 1)
            scaler = MinMaxScaler().fit(as_matrix)
            self._numeric_encoder[column] = scaler
            self._unscaled_values[column] = values.values
            self._scaled_values[column] = scaler.transform(as_matrix).flatten()
            print('column:', column, 'considered number, top 5 values are:', list(values[:5]))
            continue

        if attribute_encoding not in ('label', 'onehot'):
            continue

        if attribute_encoding == 'label' or column == 'label':
            # The target column 'label' is always label-encoded, even in
            # onehot mode, and never gets the padding class.
            as_text = values.apply(str)
            if column != 'label':
                as_text = pd.concat([pd.Series([str(PADDING_VALUE)]), as_text])
            encoder = LabelEncoder().fit(sorted(as_text))
            classes = encoder.classes_
            codes = encoder.transform(classes)
        else:
            encoder = OneHotEncoder(
                drop='if_binary', sparse_output=False, handle_unknown='ignore'
            ).fit(values.astype(str).values.reshape(-1, 1))
            categories = encoder.categories_[0].reshape(-1, 1)
            # Tuples are hashable, so they can key the decoder mapping.
            codes = [tuple(row) for row in encoder.transform(categories)]
            classes = list(categories.flatten())

        self._label_encoder[column] = encoder
        self._label_dict[column] = dict(zip(classes, codes))
        self._label_dict_decoder[column] = dict(zip(codes, classes))
def encode(self, df: pandas.core.frame.DataFrame) -> None:
def encode(self, df: DataFrame) -> None:
    """Encode the columns of ``df`` in place with the fitted encoders.

    Categorical values missing from the fitted mapping become
    ``PADDING_VALUE``. Encoding is best-effort: a column that cannot be
    encoded is reported on stdout and left unchanged. ``trace_id`` is
    never touched.
    """
    for column in df:
        if column == 'trace_id':
            continue
        if column in self._label_encoder:
            mapping = self._label_dict[column]
            try:
                df[column] = df[column].apply(lambda x: mapping.get(str(x), PADDING_VALUE))
            except Exception as exc:  # best-effort, but not KeyboardInterrupt/SystemExit
                print(f'Error encoding column {column!r}: {exc}')
        elif column in self._numeric_encoder:
            try:
                df[column] = self._numeric_encoder[column].transform(
                    df[column].values.reshape(-1, 1)).flatten()
            except Exception as exc:  # best-effort, but not KeyboardInterrupt/SystemExit
                print(f'Error encoding column {column!r}: {exc}')
        else:
            # Previously signalled only through a swallowed KeyError.
            print(f'Error: no encoder fitted for column {column!r}')
def decode(self, df: pandas.core.frame.DataFrame) -> None:
def decode(self, df: DataFrame) -> None:
    """Decode the columns of ``df`` in place with the fitted encoders.

    Codes missing from a categorical decoder mapping become
    ``PADDING_VALUE``. Columns without a fitted encoder are left unchanged
    (the same passthrough ``decode_row`` applies) rather than raising
    ``KeyError``. ``trace_id`` is never touched.
    """
    for column in df:
        if column == 'trace_id':
            continue
        if column in self._label_encoder:
            reverse = self._label_dict_decoder[column]
            df[column] = df[column].apply(lambda x: reverse.get(x, PADDING_VALUE))
        elif column in self._numeric_encoder:
            df[column] = self._numeric_encoder[column].inverse_transform(
                df[column].values.reshape(-1, 1)).flatten()
def decode_row(self, row) -> <built-in function array>:
def decode_row(self, row) -> np.ndarray:
    """Return the decoded values of ``row`` as an array.

    Args:
        row: Mapping-like object exposing ``.items()`` that yields
            ``(column, value)`` pairs.

    Returns:
        Decoded values in iteration order. ``trace_id`` is omitted;
        categorical codes missing from the decoder mapping become
        ``PADDING_VALUE``; values of columns without a fitted encoder are
        passed through unchanged.
    """
    decoded_row = []
    for column, value in row.items():
        if column == 'trace_id':
            continue
        if column in self._label_encoder:
            decoded_row.append(self._label_dict_decoder[column].get(value, PADDING_VALUE))
        elif column in self._numeric_encoder:
            # The scaler works on 2-D input, so wrap the scalar as a 1x1 matrix.
            restored = self._numeric_encoder[column].inverse_transform(
                np.array(value).reshape(-1, 1))
            decoded_row.append(restored[0][0])
        else:
            decoded_row.append(value)
    return np.array(decoded_row)
def decode_column(self, column, column_name) -> <built-in function array>:
def decode_column(self, column, column_name) -> np.ndarray:
    """Return the decoded values of one column.

    Args:
        column: Iterable of encoded values.
        column_name: Name the column was fitted under; selects the decoder.

    Returns:
        Decoded values. An empty array for ``'trace_id'``; categorical codes
        missing from the decoder mapping become ``PADDING_VALUE``; values
        are returned unchanged when no encoder was fitted for
        ``column_name``.
    """
    # Dispatch on the column *name*; the values themselves carry no type info.
    if column_name == 'trace_id':
        return np.array([])
    if column_name in self._label_encoder:
        reverse = self._label_dict_decoder[column_name]
        return np.array([reverse.get(x, PADDING_VALUE) for x in column])
    if column_name in self._numeric_encoder:
        scaled = np.asarray(column).reshape(-1, 1)
        return self._numeric_encoder[column_name].inverse_transform(scaled).flatten()
    return np.array(list(column))