nirdizati_light.encoding.data_encoder

  1import numpy as np
  2import pandas as pd
  3from pandas import DataFrame
  4from pandas.core.dtypes.common import is_numeric_dtype
  5from sklearn.preprocessing import LabelEncoder,MinMaxScaler,OneHotEncoder
  6
  7PADDING_VALUE = 0
  8#onehot and minmaxscaler not fully done
  9
 10class Encoder:
 11    def __init__(self, df: DataFrame = None, attribute_encoding=None, prefix_length=None):
 12        self.attribute_encoding = attribute_encoding
 13        self.prefix_length = prefix_length
 14        self._label_encoder = {}
 15        self._numeric_encoder = {}
 16        self._label_dict = {}
 17        self._label_dict_decoder = {}
 18        self._scaled_values = {}
 19        self._unscaled_values = {}
 20        
 21        for column in df:
 22            if column != 'trace_id':
 23                if not is_numeric_dtype(df[column].dtype):#or (is_numeric_dtype(df[column].dtype) and np.any(df[column] < 0)):
 24                    if attribute_encoding == 'label':
 25                        if column == 'label':
 26                            self._label_encoder[column] = LabelEncoder().fit(
 27                                sorted(df[column].apply(lambda x: str(x))))
 28                            classes = self._label_encoder[column].classes_
 29                            transforms = self._label_encoder[column].transform(classes)
 30                            self._label_dict[column] = dict(zip(classes, transforms))
 31                            self._label_dict_decoder[column] = dict(zip(transforms, classes))
 32                        else:
 33                            self._label_encoder[column] = LabelEncoder().fit(
 34                                sorted(pd.concat([pd.Series([str(PADDING_VALUE)]), df[column].apply(lambda x: str(x))])))
 35                            classes = self._label_encoder[column].classes_
 36                            transforms = self._label_encoder[column].transform(classes)
 37                            self._label_dict[column] = dict(zip(classes, transforms))
 38                            self._label_dict_decoder[column] = dict(zip(transforms, classes))
 39                    elif attribute_encoding == "onehot":
 40                        if column == 'label':
 41                            self._label_encoder[column] = LabelEncoder().fit(
 42                                sorted(df[column].apply(lambda x: str(x))))
 43                            classes = self._label_encoder[column].classes_
 44                            transforms = self._label_encoder[column].transform(classes)
 45                            self._label_dict[column] = dict(zip(classes, transforms))
 46                            self._label_dict_decoder[column] = dict(zip(transforms, classes))
 47                        else:
 48                            self._label_encoder[column] = OneHotEncoder(drop='if_binary', sparse_output=False,
 49                                       handle_unknown='ignore').fit(df[column].astype(str).values.reshape(-1,1))
 50                            categories = self._label_encoder[column].categories_[0].reshape(-1, 1)
 51                            transforms = [tuple(enc) for enc in self._label_encoder[column].transform(categories)]
 52                            classes = list(categories.flatten())
 53                            self._label_dict[column] = dict(zip(classes, transforms))
 54                            self._label_dict_decoder[column] = dict(zip(transforms, classes))
 55
 56                else:
 57                    self._numeric_encoder[column] = MinMaxScaler().fit(
 58                        df[column].values.reshape(-1,1)
 59                    )
 60                    unscaled = df[column].values
 61                    scaled = self._numeric_encoder[column].transform(df[column].values.reshape(-1,1)).flatten()
 62                    self._scaled_values[column] = scaled
 63                    self._unscaled_values[column] = unscaled
 64                    print('column:', column, 'considered number, top 5 values are:', list(df[column][:5]))
 65
 66    def encode(self, df: DataFrame) -> None:
 67        for column in df:
 68            if column != 'trace_id':
 69                if column in self._label_encoder:
 70                    try:
 71                        df[column] = df[column].apply(lambda x: self._label_dict[column].get(str(x), PADDING_VALUE))
 72                    except:
 73                        print('Error')
 74                else:
 75                    try:
 76                        df[column] = self._numeric_encoder[column].transform(df[column].values.reshape(-1,1)).flatten()
 77                    except:
 78                        print('Error')
 79
 80
 81    def decode(self, df: DataFrame) -> None:
 82        for column in df:
 83                if column != 'trace_id':
 84                        if column in self._label_encoder:
 85                            df[column] = df[column].apply(lambda x: self._label_dict_decoder[column].get(x, PADDING_VALUE))
 86                        else:
 87                            df[column] = self._numeric_encoder[column].inverse_transform(df[column].values.reshape(-1,1)).flatten()
 88
 89    def decode_row(self, row) -> np.array:
 90        decoded_row = []
 91        for column, value in row.items():
 92            if column != 'trace_id':
 93                if column in self._label_encoder:
 94                     decoded_row += [self._label_dict_decoder[column].get(value, PADDING_VALUE)]
 95                elif column in self._numeric_encoder:
 96                    decoded_row += [self._numeric_encoder[column].inverse_transform(np.array(value).reshape(-1,1))[0][0]]
 97            else:
 98                decoded_row += [value]
 99        return np.array(decoded_row)
100
101    def decode_column(self, column, column_name) -> np.array:
102        decoded_column = []
103        if column != 'trace_id':
104            if column_name in self._encoder:
105                if not is_numeric_dtype(df[column].dtype):
106                    decoded_column += [self._label_dict_decoder[column_name].get(x, PADDING_VALUE) for x in column]
107                else:
108                    decoded_column += [self._unscaled_values[column_name].get(x) for x in column]
109        else:
110            decoded_column += list(column)
111        return np.array(decoded_column)
112
113    def get_values(self, column_name):
114        if not is_numeric_dtype(df[column].dtype):
115            return (self._label_dict[column_name].keys(), self._label_dict_decoder[column_name].keys())
PADDING_VALUE = 0


class Encoder:
    """Fit per-column encoders on a DataFrame and encode/decode data with them.

    Non-numeric columns are mapped through a ``LabelEncoder`` or a
    ``OneHotEncoder`` depending on ``attribute_encoding``; numeric columns are
    scaled with a ``MinMaxScaler``. The ``'trace_id'`` column is never encoded.
    """

    def __init__(self, df: DataFrame = None, attribute_encoding=None, prefix_length=None):
        """Fit one encoder per column of *df*.

        Args:
            df: Frame to fit on. When ``None`` no encoder is fitted and the
                instance starts out empty.
            attribute_encoding: ``'label'`` or ``'onehot'``. With any other
                value non-numeric columns are left without an encoder.
            prefix_length: Stored on the instance; not used while fitting.
        """
        self.attribute_encoding = attribute_encoding
        self.prefix_length = prefix_length
        self._label_encoder = {}
        self._numeric_encoder = {}
        self._label_dict = {}
        self._label_dict_decoder = {}
        self._scaled_values = {}
        self._unscaled_values = {}

        # The signature advertises df=None, so do not try to iterate over it.
        if df is None:
            return

        for column in df:
            if column == 'trace_id':
                continue

            if is_numeric_dtype(df[column].dtype):
                unscaled = df[column].values
                scaler = MinMaxScaler().fit(unscaled.reshape(-1, 1))
                self._numeric_encoder[column] = scaler
                self._scaled_values[column] = scaler.transform(unscaled.reshape(-1, 1)).flatten()
                self._unscaled_values[column] = unscaled
                print('column:', column, 'considered number, top 5 values are:', list(df[column][:5]))
                continue

            if attribute_encoding not in ('label', 'onehot'):
                continue

            if column == 'label' or attribute_encoding == 'label':
                # The 'label' column is always label-encoded and gets no
                # padding class; every other column also reserves a class
                # for the padding value.
                values = df[column].apply(str)
                if column != 'label':
                    values = pd.concat([pd.Series([str(PADDING_VALUE)]), values])
                encoder = LabelEncoder().fit(sorted(values))
                classes = encoder.classes_
                transforms = encoder.transform(classes)
            else:
                encoder = OneHotEncoder(drop='if_binary', sparse_output=False,
                                        handle_unknown='ignore').fit(df[column].astype(str).values.reshape(-1, 1))
                categories = encoder.categories_[0].reshape(-1, 1)
                # Tuples are hashable, so the one-hot rows can be used as
                # keys of the decoder dictionary.
                transforms = [tuple(enc) for enc in encoder.transform(categories)]
                classes = list(categories.flatten())

            self._label_encoder[column] = encoder
            self._label_dict[column] = dict(zip(classes, transforms))
            self._label_dict_decoder[column] = dict(zip(transforms, classes))

    def encode(self, df: DataFrame) -> None:
        """Encode the columns of *df* in place.

        Columns with a fitted categorical encoder are mapped through the
        stored dictionary (values not seen while fitting become
        ``PADDING_VALUE``); all other columns except ``'trace_id'`` go through
        the fitted scaler. A column that cannot be encoded is reported on
        stdout and left unchanged.
        """
        for column in df:
            if column == 'trace_id':
                continue
            try:
                if column in self._label_encoder:
                    mapping = self._label_dict[column]
                    df[column] = df[column].apply(lambda x: mapping.get(str(x), PADDING_VALUE))
                else:
                    scaler = self._numeric_encoder[column]
                    df[column] = scaler.transform(df[column].values.reshape(-1, 1)).flatten()
            except Exception as exc:
                # Best effort: keep going, but say which column failed and why.
                print(f'Error encoding column {column!r}: {exc!r}')

    def decode(self, df: DataFrame) -> None:
        """Decode the columns of *df* in place (inverse of :meth:`encode`).

        Raises:
            KeyError: if a column other than ``'trace_id'`` has no fitted
                encoder.
        """
        for column in df:
            if column == 'trace_id':
                continue
            if column in self._label_encoder:
                decoder = self._label_dict_decoder[column]
                df[column] = df[column].apply(lambda x: decoder.get(x, PADDING_VALUE))
            else:
                scaler = self._numeric_encoder[column]
                df[column] = scaler.inverse_transform(df[column].values.reshape(-1, 1)).flatten()

    def decode_row(self, row) -> np.ndarray:
        """Decode one row given as a mapping of column name to encoded value.

        ``'trace_id'`` is copied through unchanged. Columns without a fitted
        encoder are omitted from the result.
        """
        decoded_row = []
        for column, value in row.items():
            if column == 'trace_id':
                decoded_row.append(value)
            elif column in self._label_encoder:
                decoded_row.append(self._label_dict_decoder[column].get(value, PADDING_VALUE))
            elif column in self._numeric_encoder:
                scaler = self._numeric_encoder[column]
                decoded_row.append(scaler.inverse_transform(np.array(value).reshape(-1, 1))[0][0])
        return np.array(decoded_row)

    def decode_column(self, column, column_name) -> np.ndarray:
        """Decode the values of a single column.

        Args:
            column: Iterable of encoded values.
            column_name: Name of the column the values belong to; selects the
                encoder.

        Returns:
            The decoded values. ``'trace_id'`` values are returned unchanged
            and a column without a fitted encoder yields an empty array.
        """
        if column_name == 'trace_id':
            return np.array(list(column))
        if column_name in self._label_encoder:
            decoder = self._label_dict_decoder[column_name]
            return np.array([decoder.get(x, PADDING_VALUE) for x in column])
        if column_name in self._numeric_encoder:
            values = np.asarray(list(column), dtype=float)
            if values.size == 0:
                return values
            scaler = self._numeric_encoder[column_name]
            return scaler.inverse_transform(values.reshape(-1, 1)).flatten()
        return np.array([])

    def get_values(self, column_name):
        """Return ``(original values, encoded values)`` of a categorical column.

        Both elements are dictionary key views. ``None`` is returned when no
        categorical mapping was fitted for *column_name*.
        """
        if column_name in self._label_dict:
            return (self._label_dict[column_name].keys(), self._label_dict_decoder[column_name].keys())
        return None
Encoder( df: pandas.core.frame.DataFrame = None, attribute_encoding=None, prefix_length=None)
12    def __init__(self, df: DataFrame = None, attribute_encoding=None, prefix_length=None):
13        self.attribute_encoding = attribute_encoding
14        self.prefix_length = prefix_length
15        self._label_encoder = {}
16        self._numeric_encoder = {}
17        self._label_dict = {}
18        self._label_dict_decoder = {}
19        self._scaled_values = {}
20        self._unscaled_values = {}
21        
22        for column in df:
23            if column != 'trace_id':
24                if not is_numeric_dtype(df[column].dtype):#or (is_numeric_dtype(df[column].dtype) and np.any(df[column] < 0)):
25                    if attribute_encoding == 'label':
26                        if column == 'label':
27                            self._label_encoder[column] = LabelEncoder().fit(
28                                sorted(df[column].apply(lambda x: str(x))))
29                            classes = self._label_encoder[column].classes_
30                            transforms = self._label_encoder[column].transform(classes)
31                            self._label_dict[column] = dict(zip(classes, transforms))
32                            self._label_dict_decoder[column] = dict(zip(transforms, classes))
33                        else:
34                            self._label_encoder[column] = LabelEncoder().fit(
35                                sorted(pd.concat([pd.Series([str(PADDING_VALUE)]), df[column].apply(lambda x: str(x))])))
36                            classes = self._label_encoder[column].classes_
37                            transforms = self._label_encoder[column].transform(classes)
38                            self._label_dict[column] = dict(zip(classes, transforms))
39                            self._label_dict_decoder[column] = dict(zip(transforms, classes))
40                    elif attribute_encoding == "onehot":
41                        if column == 'label':
42                            self._label_encoder[column] = LabelEncoder().fit(
43                                sorted(df[column].apply(lambda x: str(x))))
44                            classes = self._label_encoder[column].classes_
45                            transforms = self._label_encoder[column].transform(classes)
46                            self._label_dict[column] = dict(zip(classes, transforms))
47                            self._label_dict_decoder[column] = dict(zip(transforms, classes))
48                        else:
49                            self._label_encoder[column] = OneHotEncoder(drop='if_binary', sparse_output=False,
50                                       handle_unknown='ignore').fit(df[column].astype(str).values.reshape(-1,1))
51                            categories = self._label_encoder[column].categories_[0].reshape(-1, 1)
52                            transforms = [tuple(enc) for enc in self._label_encoder[column].transform(categories)]
53                            classes = list(categories.flatten())
54                            self._label_dict[column] = dict(zip(classes, transforms))
55                            self._label_dict_decoder[column] = dict(zip(transforms, classes))
56
57                else:
58                    self._numeric_encoder[column] = MinMaxScaler().fit(
59                        df[column].values.reshape(-1,1)
60                    )
61                    unscaled = df[column].values
62                    scaled = self._numeric_encoder[column].transform(df[column].values.reshape(-1,1)).flatten()
63                    self._scaled_values[column] = scaled
64                    self._unscaled_values[column] = unscaled
65                    print('column:', column, 'considered number, top 5 values are:', list(df[column][:5]))
attribute_encoding
prefix_length
def encode(self, df: pandas.core.frame.DataFrame) -> None:
67    def encode(self, df: DataFrame) -> None:
68        for column in df:
69            if column != 'trace_id':
70                if column in self._label_encoder:
71                    try:
72                        df[column] = df[column].apply(lambda x: self._label_dict[column].get(str(x), PADDING_VALUE))
73                    except:
74                        print('Error')
75                else:
76                    try:
77                        df[column] = self._numeric_encoder[column].transform(df[column].values.reshape(-1,1)).flatten()
78                    except:
79                        print('Error')
def decode(self, df: pandas.core.frame.DataFrame) -> None:
82    def decode(self, df: DataFrame) -> None:
83        for column in df:
84                if column != 'trace_id':
85                        if column in self._label_encoder:
86                            df[column] = df[column].apply(lambda x: self._label_dict_decoder[column].get(x, PADDING_VALUE))
87                        else:
88                            df[column] = self._numeric_encoder[column].inverse_transform(df[column].values.reshape(-1,1)).flatten()
def decode_row(self, row) -> <built-in function array>:
 90    def decode_row(self, row) -> np.array:
 91        decoded_row = []
 92        for column, value in row.items():
 93            if column != 'trace_id':
 94                if column in self._label_encoder:
 95                     decoded_row += [self._label_dict_decoder[column].get(value, PADDING_VALUE)]
 96                elif column in self._numeric_encoder:
 97                    decoded_row += [self._numeric_encoder[column].inverse_transform(np.array(value).reshape(-1,1))[0][0]]
 98            else:
 99                decoded_row += [value]
100        return np.array(decoded_row)
def decode_column(self, column, column_name) -> <built-in function array>:
102    def decode_column(self, column, column_name) -> np.array:
103        decoded_column = []
104        if column != 'trace_id':
105            if column_name in self._encoder:
106                if not is_numeric_dtype(df[column].dtype):
107                    decoded_column += [self._label_dict_decoder[column_name].get(x, PADDING_VALUE) for x in column]
108                else:
109                    decoded_column += [self._unscaled_values[column_name].get(x) for x in column]
110        else:
111            decoded_column += list(column)
112        return np.array(decoded_column)
def get_values(self, column_name):
114    def get_values(self, column_name):
115        if not is_numeric_dtype(df[column].dtype):
116            return (self._label_dict[column_name].keys(), self._label_dict_decoder[column_name].keys())