nirdizati_light.encoding.time_encoding
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from enum import Enum

import dateparser
import holidays
import pandas as pd
from dateutil.parser import parse
# NOTE: these wildcard imports shadow builtins such as ``all`` and ``any`` with
# the numpy versions, which treat a generator as a single truthy object. The
# code below deliberately avoids calling ``all``/``any`` on generators.
from numpy import *
from pandas import *


class TimeType(Enum):
    """Kinds of content a column can be classified as."""

    DATE = 'date'
    DURATION = 'duration'
    NONE = 'none'


class TimeEncodingType(Enum):
    """Encoding strategies accepted by ``time_encoding``."""

    DATE = 'date'
    DURATION = 'duration'
    DATE_AND_DURATION = 'date_and_duration'
    NONE = 'none'


def _has_known_value(values) -> bool:
    """Return True when at least one element of *values* is not None."""
    # Explicit loop on purpose: see the note on the wildcard imports above.
    for value in values:
        if value is not None:
            return True
    return False


def _add_columns(target: DataFrame, extra: DataFrame) -> None:
    """Copy every column of *extra* into *target*, matching rows by position."""
    for name in extra.columns:
        # .values drops the index of ``extra`` so the rows line up with
        # ``target`` even when the source frame has non-default row labels.
        target[name] = extra[name].values


def time_encoding(df: DataFrame, encoding_type) -> DataFrame:
    """
    Encodes the columns of the given DataFrame that hold dates or durations.

    :param pandas.Dataframe df: the dataframe to encode
    :param str encoding_type: the type of encoding to perform, one of the
        ``TimeEncodingType`` values
    :return pandas.DataFrame: a new frame with the same row labels as ``df``
    """
    last_time = [None] * len(df)
    df_output = DataFrame(index=df.index)

    for column_name in df.keys():
        current_time = df[column_name]
        column_type = is_time_or_duration(current_time)

        if column_type == TimeType.DATE.value and encoding_type == TimeEncodingType.NONE.value:
            df_output[column_name] = convert_datetime_in_UTC(current_time)

        if column_type == TimeType.DATE.value and encoding_type in [
            TimeEncodingType.DATE.value, TimeEncodingType.DATE_AND_DURATION.value
        ]:
            # The encoded columns used to be passed to DataFrame.append, whose
            # result was discarded, so they never reached the output.
            _add_columns(df_output, parse_date(current_time, column_name))

        if column_type == TimeType.NONE.value or encoding_type == TimeEncodingType.DURATION.value:
            df_output[column_name] = current_time

        # NOTE(review): parse_duration subtracts the previous values from the
        # current ones, so this branch presumably targets date-like values even
        # though it is gated on the DURATION column type -- confirm the intent.
        if column_type == TimeType.DURATION.value and encoding_type in [
            TimeEncodingType.DURATION.value, TimeEncodingType.DATE_AND_DURATION.value
        ]:
            if _has_known_value(last_time) and _has_known_value(current_time):
                _add_columns(df_output, parse_duration(current_time, column_name, last_time))
            # Remember, row by row, the most recent non-None value seen so far.
            last_time = [
                old_time if new_time is None else new_time
                for new_time, old_time in zip(current_time, last_time)
            ]

    return df_output


def convert_datetime_in_UTC(column: list):
    """Converts every value of the column into a POSIX timestamp.

    Missing values (None, '' and 'None') and strings that dateparser cannot
    interpret are mapped to None instead of raising.

    :param column: datetime objects or date strings
    :return: list of float timestamps (or None), one per input value
    """
    timestamps = []
    for value in column:
        if value is None or value == '' or value == 'None':
            timestamps.append(None)
            continue

        moment = value if isinstance(value, datetime) else dateparser.parse(value)
        if moment is None:  # dateparser returns None when it cannot parse
            timestamps.append(None)
            continue

        # NOTE(review): replace() relabels the wall-clock time as UTC; for
        # timezone-aware inputs this does not convert the instant -- confirm
        # that this is the intended behaviour.
        timestamps.append(moment.replace(tzinfo=timezone.utc).timestamp())
    return timestamps


def is_time_or_duration(column: list):
    """Returns whether the column contains dates, durations, or otherwise

    The duration check runs first, so a column accepted by both checks (for
    example one holding only None) is reported as a duration.

    :param column: values to classify
    :return: one of the ``TimeType`` values
    """
    if is_duration(column):
        return TimeType.DURATION.value
    if is_date(column):
        return TimeType.DATE.value
    return TimeType.NONE.value


def is_date(column: list) -> bool:
    """Returns whether all strings can be interpreted as dates.

    Accepts empty strings, the string 'None', None and datetime objects.
    Function taken from https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
    :param column: list of str, strings to check for date
    :return: True if all strings of column are dates
    """
    for value in column:
        if value is None or isinstance(value, datetime):
            continue
        if not isinstance(value, str):
            return False
        if value == "" or value == 'None':
            continue

        try:
            float(value)
        except ValueError:
            pass
        else:
            return False  # plain numbers are not treated as dates

        try:
            parse(value)
        except (ValueError, OverflowError):
            # dateutil raises OverflowError for out-of-range numeric fields
            return False

    return True


def is_duration(column: list) -> bool:
    """Returns whether all strings can be interpreted as durations.

    Accepts empty strings, the string 'None' and None.
    :param column: list of str, strings to check for periods of time
    :return: True if all strings of column are periods of time
    """
    for value in column:
        if value is None:
            continue
        if not isinstance(value, str):
            return False
        if value == "" or value == 'None':
            continue

        try:
            float(value)
        except ValueError:
            pass
        else:
            return False  # plain numbers are not treated as durations

        for group in format_string_duration_parse(value):
            if len(group) != 2 or not group[0].isnumeric() or group[1] not in duration_allowed_word:
                return False

    return True


duration_allowed_word = ['d', 'days', 'h', 'hours', 'm', 'minutes', 's', 'seconds']


def format_string_duration_parse(string: str) -> list:
    """Returns the given string split into tuples of digit and non-digit runs

    Spaces are removed first. Example: '18d5h' gives [('18', 'd'), ('5', 'h')].
    An empty or whitespace-only string gives an empty list.

    :param string: the text to split
    :return: list of tuples
    """
    string = string.replace(" ", "")
    if not string:
        return []

    chars = [string[0]]
    for char in string[1:]:
        previous_is_number = chars[-1].isnumeric()
        if char.isnumeric() and not previous_is_number:
            chars.append('|')  # a new number starts a new group
        elif previous_is_number and not char.isnumeric():
            chars.append('_')  # separates the number from its keyword
        chars.append(char)
    # From 18d5h38m36s this builds 18_d|5_h|38_m|36_s

    return [tuple(group.split('_')) for group in "".join(chars).split('|')]


_HOLIDAY_COUNTRIES = (
    'AR', 'AU', 'AT', 'BY', 'BE', 'BR', 'BG', 'CA', 'CL', 'CO', 'HR', 'CZ', 'DK', 'EG', 'EE', 'FI', 'FR',
    'DE', 'GR', 'HU', 'IS', 'IN', 'IE', 'IL', 'IT', 'JM', 'JP', 'LT', 'MX', 'MA', 'NL', 'NZ', 'PL', 'PT',
    'RO', 'RU', 'SA', 'RS', 'SK', 'SI', 'ZA', 'ES', 'SE', 'CH', 'TR', 'UA', 'AE', 'GB', 'US',
)

# Calendars are built once per country and reused for every date checked.
_HOLIDAY_CALENDARS = {}


def is_special_occasion(date):
    """Returns whether the date is a holiday in at least one listed country.

    :param date: object with ``strftime`` (for example a datetime)
    :return: True if any country calendar contains the date
    """
    date_key = date.strftime("%m-%d-%Y")
    for country in _HOLIDAY_COUNTRIES:
        calendar = _HOLIDAY_CALENDARS.get(country)
        if calendar is None:  # identity check: an empty calendar is falsy
            calendar = holidays.country_holidays(country)
            _HOLIDAY_CALENDARS[country] = calendar
        if date_key in calendar:
            return True
    return False


def encode_date(value):
    """Encodes one date into its calendar components.

    :param value: datetime object or date string
    :return: [week day, day, month, year, hour, minute, second, is holiday];
        all None when a string cannot be parsed
    """
    date = value if isinstance(value, datetime) else dateparser.parse(value)
    if date is None:  # dateparser returns None when it cannot parse
        return [None] * 8
    return [date.isoweekday(), date.day, date.month, date.year, date.hour, date.minute, date.second,
            is_special_occasion(date)]


def parse_date(column: list, column_name: str) -> DataFrame:
    """Encodes the dates of the column and returns them as a DataFrame

    :param column: list of datetime objects or date strings; None, '' and
        'None' produce a row of None
    :param column_name: prefix used for the generated column names
    :return: DataFrame with one row per value and eight ``<column_name>_date_*``
        columns
    """
    suffixes = ('_date_week_day', '_date_day', '_date_month', '_date_year',
                '_date_hours', '_date_minutes', '_date_seconds', '_date_special_occasion')
    columns = [column_name + suffix for suffix in suffixes]

    encoded_dates = []
    for value in column:
        if value is None or value == '' or value == 'None':
            encoded_dates.append([None] * len(columns))
        else:
            encoded_dates.append(encode_date(value))

    results_df = DataFrame(data=encoded_dates, columns=columns)
    # Replace NaN/NaT left by mixed rows with None.
    return results_df.where(pd.notnull(results_df), None)


def encode_duration(value):
    """Splits an elapsed time into days, hours, minutes and seconds.

    :param value: timedelta-like object with ``days`` and ``seconds``, or None
    :return: [days, hours, minutes, seconds]; all None when value is None
    """
    if value is None:
        return [None, None, None, None]
    # timedelta only stores days and seconds; derive the other fields.
    hours, remainder = divmod(value.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return [value.days, hours, minutes, seconds]


def encode_dates_for_duration(date: datetime, last_date: datetime):
    """Returns the whole-second time elapsed between last_date and date.

    :param date: the later moment, or None
    :param last_date: the earlier moment, or None
    :return: timedelta, or None when an input is None or the elapsed time is
        not positive
    """
    if date is None or last_date is None:
        return None

    tot_seconds = int((date - last_date).total_seconds())
    if tot_seconds <= 0:
        return None
    return timedelta(seconds=tot_seconds)


def parse_duration(current_time: list, column_name: str, last_time: list) -> DataFrame:
    """Encodes the time elapsed between paired values and returns a DataFrame

    Each value of current_time is paired by position with a value of last_time.

    :param current_time: the later moments (None allowed)
    :param column_name: prefix used for the generated column names
    :param last_time: the earlier moments (None allowed)
    :return: DataFrame with one row per pair and four
        ``<column_name>_elapsed_*`` columns; rows without a positive elapsed
        time hold None
    """
    suffixes = ('_elapsed_days', '_elapsed_hours', '_elapsed_minutes', '_elapsed_seconds')
    columns = [column_name + suffix for suffix in suffixes]

    encoded_durations = [
        encode_duration(encode_dates_for_duration(new_date, old_date))
        for new_date, old_date in zip(current_time, last_time)
    ]

    results_df = DataFrame(data=encoded_durations, columns=columns)
    return results_df.where(pd.notnull(results_df), None)


if __name__ == '__main__':
    time_test = [
        '1990-12-1',
        '',
        None,
        'None',
        '01/19/1990',
        '01/19/90',
        'Jan 1990',
        'January1990',
        '2005/3',
        'Monday at 12:01am',
        'January 1, 2047 at 8:21:00AM',
    ]

    duration_test = [
        '2d9h32m46s',
        '2d 9h',
        '',
        None,
        'None',
        '2days9hours37minutes46seconds',
        '2days 9hours 37minutes 46seconds',
    ]

    print(is_time_or_duration(time_test))
    print(is_time_or_duration(duration_test))

    parsed_dates = parse_date(time_test, 't1')
    print(parsed_dates.head())

    # parse_duration needs paired moments; the previous demo call omitted the
    # last_time argument and could not run.
    elapsed_current = [datetime(1990, 12, 3, 9, 32, 46), None]
    elapsed_previous = [datetime(1990, 12, 1), datetime(1990, 12, 1)]
    print(parse_duration(elapsed_current, 't2', elapsed_previous).head())
Create a collection of name/value pairs.
Example enumeration:
>>> class Color(Enum):
... RED = 1
... BLUE = 2
... GREEN = 3
Access them by:
- attribute access:
>>> Color.RED
<Color.RED: 1>
- value lookup:
>>> Color(1)
<Color.RED: 1>
- name lookup:
>>> Color['RED']
<Color.RED: 1>
Enumerations can be iterated over, and know how many members they have:
>>> len(Color)
3
>>> list(Color)
[<Color.RED: 1>, <Color.BLUE: 2>, <Color.GREEN: 3>]
Methods can be added to enumerations, and members can have their own attributes -- see the documentation for details.
Inherited Members
- enum.Enum
- name
- value
class TimeEncodingType(Enum):
    """Encoding strategies that ``time_encoding`` accepts as ``encoding_type``.

    Callers compare against the string stored in ``.value``.
    """

    # Encode date columns.
    DATE = 'date'
    # Encode durations.
    DURATION = 'duration'
    # Encode both dates and durations.
    DATE_AND_DURATION = 'date_and_duration'
    # No date/duration expansion requested.
    NONE = 'none'
Create a collection of name/value pairs.
Example enumeration:
>>> class Color(Enum):
... RED = 1
... BLUE = 2
... GREEN = 3
Access them by:
- attribute access:
>>> Color.RED
<Color.RED: 1>
- value lookup:
>>> Color(1)
<Color.RED: 1>
- name lookup:
>>> Color['RED']
<Color.RED: 1>
Enumerations can be iterated over, and know how many members they have:
>>> len(Color)
3
>>> list(Color)
[<Color.RED: 1>, <Color.BLUE: 2>, <Color.GREEN: 3>]
Methods can be added to enumerations, and members can have their own attributes -- see the documentation for details.
Inherited Members
- enum.Enum
- name
- value
27def time_encoding(df: DataFrame, encoding_type) -> DataFrame: 28 """ 29 Encodes the columns of string of the given DataFrame if they are date or duration. 30 31 :param pandas.Dataframe df: the dataframe to encode 32 :param str encoding_type: the type of encoding to perform 33 :return pandas.DataFrame: 34 """ 35 36 last_time = [None] * len(df) 37 df_output = DataFrame() 38 39 for column_name in df.keys(): 40 current_time = df[column_name] 41 column_type = is_time_or_duration(current_time) 42 43 if column_type == TimeType.DATE.value and encoding_type == TimeEncodingType.NONE.value: 44 df_output[column_name] = convert_datetime_in_UTC(current_time) 45 46 if column_type == TimeType.DATE.value and encoding_type in [TimeEncodingType.DATE.value, TimeEncodingType.DATE_AND_DURATION.value]: 47 result_df = parse_date(current_time, column_name) 48 df_output.append(result_df) 49 50 if column_type == TimeType.NONE.value or encoding_type == TimeEncodingType.DURATION.value: 51 df_output[column_name] = current_time 52 53 if column_type == TimeType.DURATION.value and encoding_type in [TimeEncodingType.DURATION.value, TimeEncodingType.DATE_AND_DURATION.value]: 54 if not all(val is None for val in last_time) and not all(val is None for val in current_time): 55 df_output.append(parse_duration(current_time, column_name, last_time)) 56 last_time = [ 57 old_time if new_time is None else new_time 58 for new_time, old_time in zip(current_time, last_time) 59 ] 60 61 return df_output
Encodes the string columns of the given DataFrame that contain dates or durations.
Parameters
- pandas.Dataframe df: the dataframe to encode
- str encoding_type: the type of encoding to perform
Returns
def is_time_or_duration(column: list):
    """Classify the column as holding durations, dates, or neither.

    The duration check runs first, so a column accepted by both checks is
    reported as a duration.

    :param column: values to classify
    :return: one of the ``TimeType`` values
    """
    if is_duration(column):
        return TimeType.DURATION.value
    if is_date(column):
        return TimeType.DATE.value
    return TimeType.NONE.value
Returns whether the column contains dates, durations, or otherwise
Parameters
- column:
Returns
def is_date(column: list) -> bool:
    """Returns whether all strings can be interpreted as dates.

    Accepts empty strings, the string 'None', None and datetime objects.
    Function taken from https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
    :param column: list of str, strings to check for date
    :return: True if all strings of column are dates
    """
    for value in column:
        if value is None or isinstance(value, datetime):
            continue
        if not isinstance(value, str):
            return False
        if value == "" or value == 'None':
            continue

        try:
            float(value)
        except ValueError:
            pass
        else:
            return False  # plain numbers are not treated as dates

        try:
            parse(value)
        except (ValueError, OverflowError):
            # dateutil raises OverflowError for out-of-range numeric fields;
            # such a string is not a date either.
            return False

    return True
Returns whether all strings can be interpreted as dates.
Accepts empty strings and Python None objects. Function taken from https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
Parameters
- column: list of str, strings to check for date
Returns
True if all strings of the column are dates
def is_duration(column: list) -> bool:
    """Tell whether every string of the column reads as a duration.

    None, empty strings and the string 'None' are accepted.
    :param column: list of str, strings to check for periods of time
    :return: True if all strings of column are periods of time
    """
    for entry in column:
        if entry is None:
            continue
        if not isinstance(entry, str):
            return False
        if entry == "" or entry == 'None':
            continue

        try:
            float(entry)
        except ValueError:
            pass
        else:
            return False  # plain numbers are not treated as durations

        # Every piece must be a (number, allowed keyword) pair.
        for piece in format_string_duration_parse(entry):
            if len(piece) != 2:
                return False
            if not piece[0].isnumeric():
                return False
            if piece[1] not in duration_allowed_word:
                return False

    return True
Returns whether all strings can be interpreted as durations.
Accepts empty strings and Python None objects.
Parameters
- column: list of str, strings to check for periods of time
Returns
True if all strings of the column are periods of time
def format_string_duration_parse(string: str) -> list:
    """Returns the given string split into tuples of digit and non-digit runs

    Spaces are removed first. Example: '18d5h' gives [('18', 'd'), ('5', 'h')].
    An empty or whitespace-only string gives an empty list instead of raising
    IndexError.

    :param string: the text to split
    :return: list of tuples
    """
    string = string.replace(" ", "")
    if not string:
        return []

    chars = [string[0]]
    for char in string[1:]:
        previous_is_number = chars[-1].isnumeric()
        if char.isnumeric() and not previous_is_number:
            chars.append('|')  # a new number starts a new group
        elif previous_is_number and not char.isnumeric():
            chars.append('_')  # separates the number from its keyword
        chars.append(char)
    # From 18d5h38m36s this builds 18_d|5_h|38_m|36_s

    # Split into groups first, then split each group into its tuple.
    return [tuple(group.split('_')) for group in "".join(chars).split('|')]
Returns the given string split into a list of (number, keyword) tuples
Parameters
- string:
Returns
_HOLIDAY_COUNTRIES = (
    'AR', 'AU', 'AT', 'BY', 'BE', 'BR', 'BG', 'CA', 'CL', 'CO', 'HR', 'CZ', 'DK', 'EG', 'EE', 'FI', 'FR',
    'DE', 'GR', 'HU', 'IS', 'IN', 'IE', 'IL', 'IT', 'JM', 'JP', 'LT', 'MX', 'MA', 'NL', 'NZ', 'PL', 'PT',
    'RO', 'RU', 'SA', 'RS', 'SK', 'SI', 'ZA', 'ES', 'SE', 'CH', 'TR', 'UA', 'AE', 'GB', 'US',
)

# Calendars are built once per country and reused for every date checked;
# rebuilding all of them for each call dominated the cost of this function.
_HOLIDAY_CALENDARS = {}


def is_special_occasion(date):
    """Returns whether the date is a holiday in at least one listed country.

    :param date: object with ``strftime`` (for example a datetime)
    :return: True if any country calendar contains the date
    """
    date_key = date.strftime("%m-%d-%Y")  # same for every country, so built once
    for country in _HOLIDAY_COUNTRIES:
        calendar = _HOLIDAY_CALENDARS.get(country)
        if calendar is None:  # identity check: an empty calendar is falsy
            calendar = holidays.country_holidays(country)
            _HOLIDAY_CALENDARS[country] = calendar
        if date_key in calendar:
            return True
    return False
def parse_date(column: list, column_name: str) -> DataFrame:
    """Encodes the dates of the column and returns them as a DataFrame

    :param column: list of datetime objects or date strings; None, '' and
        'None' produce a row of None
    :param column_name: prefix used for the generated column names
    :return: DataFrame with one row per value and eight ``<column_name>_date_*``
        columns
    """
    suffixes = ('_date_week_day', '_date_day', '_date_month', '_date_year',
                '_date_hours', '_date_minutes', '_date_seconds', '_date_special_occasion')
    columns = [column_name + suffix for suffix in suffixes]

    encoded_dates = []
    for value in column:
        if value is None or value == '' or value == 'None':
            encoded_dates.append([None] * len(columns))
        else:
            encoded_dates.append(encode_date(value))

    results_df = DataFrame(data=encoded_dates, columns=columns)
    # Replace NaN/NaT left by mixed rows with None.
    return results_df.where(pd.notnull(results_df), None)
Parses strings of column into datetime objects and returns a DataFrame
Parameters
- column: list of str, strings to parse into date
- column_name:
Returns
def encode_dates_for_duration(date: datetime, last_date: datetime):
    """Returns the whole-second time elapsed between last_date and date.

    :param date: the later moment, or None
    :param last_date: the earlier moment, or None
    :return: timedelta, or None when an input is None or the elapsed time is
        not positive
    """
    # Local import: the module only imports the ``datetime`` class, which has
    # no ``timedelta`` attribute, so ``datetime.timedelta`` raised
    # AttributeError.
    from datetime import timedelta

    if date is None or last_date is None:
        return None

    tot_seconds = int((date - last_date).total_seconds())
    if tot_seconds <= 0:
        return None
    # timedelta normalises the seconds into days/seconds itself.
    return timedelta(seconds=tot_seconds)
def parse_duration(current_time: list, column_name: str, last_time: list) -> DataFrame:
    """Encodes the time elapsed between paired values and returns a DataFrame

    Each value of current_time is paired by position with a value of last_time
    and the elapsed time between them is computed by encode_dates_for_duration.

    :param current_time: the later moments (None allowed)
    :param column_name: prefix used for the generated column names
    :param last_time: the earlier moments (None allowed)
    :return: DataFrame with one row per pair and four
        ``<column_name>_elapsed_*`` columns; pairs without an elapsed time
        produce a row of None
    """
    suffixes = ('_elapsed_days', '_elapsed_hours', '_elapsed_minutes', '_elapsed_seconds')
    columns = [column_name + suffix for suffix in suffixes]

    encoded_durations = []
    for new_date, old_date in zip(current_time, last_time):
        elapsed = encode_dates_for_duration(new_date, old_date)
        if elapsed is None:
            # No elapsed time available: keep the row, but empty, instead of
            # handing None to encode_duration.
            encoded_durations.append([None] * len(columns))
        else:
            encoded_durations.append(encode_duration(elapsed))

    results_df = DataFrame(data=encoded_durations, columns=columns)
    # Replace NaN left by mixed rows with None.
    return results_df.where(pd.notnull(results_df), None)
Parses strings of column into datetime objects and returns a DataFrame
I assume that I receive the duration in one of the following formats
- number (milliseconds)
- number d number h number m number s
- number days number hours number minutes number seconds
- number days
All spaces will be removed
Parameters
- current_time:
- column_name:
- last_time: