nirdizati_light.encoding.time_encoding

  1from datetime import datetime
  2from datetime import timezone
  3from enum import Enum
  4
  5import dateparser
  6import holidays
  7import pandas as pd
  8from dateutil.parser import parse
  9from numpy import *
 10from pandas import *
 11
 12
class TimeType(Enum):
    """Classification of a column's content, as returned (by value) from is_time_or_duration."""
    # Every non-empty entry of the column is a date (see is_date).
    DATE = 'date'
    # Every non-empty entry of the column is a duration string (see is_duration).
    DURATION = 'duration'
    # The column is neither a date column nor a duration column.
    NONE = 'none'
 17
 18
class TimeEncodingType(Enum):
    """Encoding modes whose string values are compared against ``encoding_type`` in time_encoding."""
    # Date columns are expanded into calendar features by parse_date.
    DATE = 'date'
    # Columns are copied unchanged; duration columns are additionally passed to parse_duration.
    DURATION = 'duration'
    # Both of the above: date columns go to parse_date, duration columns to parse_duration.
    DATE_AND_DURATION = 'date_and_duration'
    # Date columns are converted to timestamps by convert_datetime_in_UTC.
    NONE = 'none'
 24
 25
 26def time_encoding(df: DataFrame, encoding_type) -> DataFrame:
 27    """
 28    Encodes the columns of string of the given DataFrame if they are date or duration.
 29
 30    :param pandas.Dataframe df: the dataframe to encode
 31    :param str encoding_type: the type of encoding to perform
 32    :return pandas.DataFrame:
 33    """
 34    
 35    last_time = [None] * len(df)
 36    df_output = DataFrame()
 37
 38    for column_name in df.keys():
 39        current_time = df[column_name]
 40        column_type = is_time_or_duration(current_time)
 41
 42        if column_type == TimeType.DATE.value and encoding_type == TimeEncodingType.NONE.value:
 43            df_output[column_name] = convert_datetime_in_UTC(current_time)
 44
 45        if column_type == TimeType.DATE.value and encoding_type in [TimeEncodingType.DATE.value, TimeEncodingType.DATE_AND_DURATION.value]:
 46            result_df = parse_date(current_time, column_name)
 47            df_output.append(result_df)
 48
 49        if column_type == TimeType.NONE.value or encoding_type == TimeEncodingType.DURATION.value:
 50            df_output[column_name] = current_time
 51
 52        if column_type == TimeType.DURATION.value and encoding_type in [TimeEncodingType.DURATION.value, TimeEncodingType.DATE_AND_DURATION.value]:
 53            if not all(val is None for val in last_time) and not all(val is None for val in current_time):
 54                df_output.append(parse_duration(current_time, column_name, last_time))
 55            last_time = [
 56                old_time if new_time is None else new_time
 57                for new_time, old_time in zip(current_time, last_time)
 58            ]
 59
 60    return df_output
 61
 62
 63def convert_datetime_in_UTC(column: list):
 64    return [
 65        value.replace(tzinfo=timezone.utc).timestamp()
 66        if isinstance(value, datetime)
 67        else dateparser.parse(value).replace(tzinfo=timezone.utc).timestamp()
 68        for value in column
 69    ]
 70
 71
 72def is_time_or_duration(column: list):
 73    """Returns whether the column contains dates, durations, or otherwise
 74
 75    :param column:
 76    :return:
 77    """
 78    column_type = TimeType.NONE.value
 79
 80    if is_duration(column):
 81        column_type = TimeType.DURATION.value
 82    elif is_date(column):
 83        column_type = TimeType.DATE.value
 84
 85    return column_type
 86
 87
 88def is_date(column: list) -> bool:
 89    """Returns whether all string can be interpreted as a date.
 90
 91    Accepts empty string and None Object in python
 92    Function take from https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
 93    :param column: list of str, strings to check for date
 94    :return: True if all string of column are dates
 95    """
 96    for value in column:
 97        if isinstance(value, str):
 98            if value != "" and value != 'None':
 99                try:
100                    float(value)
101                    return False
102                except ValueError:
103                    try:
104                        parse(value)
105                    except ValueError:
106                        return False
107        elif isinstance(value, datetime) or value is None:
108            pass
109        else:
110            return False
111
112    return True
113
114
def is_duration(column: list) -> bool:
    """Returns whether every entry of the column can be interpreted as a duration.

    ``None``, the empty string and the string ``'None'`` are accepted as
    missing values. A string that converts to ``float`` is rejected. Every
    other string must split into (number, unit) pairs whose unit is listed
    in ``duration_allowed_word``.

    :param column: list of str, strings to check for periods of time
    :return: True if all entries of column are periods of time (or missing)
    """
    for value in column:
        if value is None:
            continue
        if not isinstance(value, str):
            return False
        if value in ("", 'None'):
            continue

        try:
            float(value)
        except ValueError:
            pass
        else:
            return False

        for group in format_string_duration_parse(value):
            well_formed = (
                len(group) == 2
                and group[0].isnumeric()
                and group[1] in duration_allowed_word
            )
            if not well_formed:
                return False

    return True
141
142
# Unit words that is_duration accepts after each numeric group of a duration string.
duration_allowed_word = ['d', 'days', 'h', 'hours', 'm', 'minutes', 's', 'seconds']
144
145
def format_string_duration_parse(string: str) -> list:
    """Splits a duration string into groups at every digit/non-digit boundary.

    Spaces are removed first. A new group starts wherever a digit follows a
    non-digit, and inside a group the number is separated from the text that
    follows it, e.g. ``'18d5h38m36s'`` becomes
    ``[('18', 'd'), ('5', 'h'), ('38', 'm'), ('36', 's')]``.

    An empty (or space-only) string yields ``[('',)]`` instead of raising
    ``IndexError``.

    :param string: the text to split
    :return: list of tuples of str, one tuple per group
    """
    compact = string.replace(" ", "")

    pieces = []
    previous = None
    for char in compact:
        if previous is not None:
            if not previous.isnumeric() and char.isnumeric():
                # text -> digit: a new group begins
                pieces.append('|')
            elif previous.isnumeric() and not char.isnumeric():
                # digit -> text: separate the number from its keyword
                pieces.append('_')
        pieces.append(char)
        previous = char
    # From 18d5h38m36s this builds 18_d|5_h|38_m|36_s

    # Rebuild the string, split it into number_keyword groups, then into tuples.
    return [tuple(group.split('_')) for group in "".join(pieces).split('|')]
170
171
def is_special_occasion(date):
    """Returns whether the given date is a holiday in at least one listed country.

    The date is formatted as ``MM-DD-YYYY`` and looked up in the
    ``holidays`` calendar of each country code in turn.

    :param date: object providing ``strftime`` (e.g. a datetime)
    :return: True on the first calendar containing the date, otherwise False
    """
    country_codes = (
        'AR', 'AU', 'AT', 'BY', 'BE', 'BR', 'BG', 'CA', 'CL', 'CO', 'HR', 'CZ', 'DK', 'EG', 'EE', 'FI', 'FR',
        'DE', 'GR', 'HU', 'IS', 'IN', 'IE', 'IL', 'IT', 'JM', 'JP', 'LT', 'MX', 'MA', 'NL', 'NZ', 'PL', 'PT',
        'RO', 'RU', 'SA', 'RS', 'SK', 'SI', 'ZA', 'ES', 'SE', 'CH', 'TR', 'UA', 'AE', 'GB', 'US',
    )
    for code in country_codes:
        calendar = holidays.country_holidays(code)
        formatted = date.strftime("%m-%d-%Y")
        if formatted in calendar:
            return True
    return False
181
182
def encode_date(value):
    """Breaks a date into its calendar components.

    :param value: a datetime, or anything ``dateparser.parse`` accepts
    :return: list of ISO week day, day, month, year, hour, minute, second
        and the is_special_occasion flag, in that order
    """
    date = value if isinstance(value, datetime) else dateparser.parse(value)
    features = [
        date.isoweekday(),
        date.day,
        date.month,
        date.year,
        date.hour,
        date.minute,
        date.second,
    ]
    features.append(is_special_occasion(date))
    return features
190
191
def parse_date(column: list, column_name: str) -> DataFrame:
    """Encodes a column of dates into one DataFrame column per calendar feature.

    Missing entries (``None``, ``''`` and ``'None'``) produce a row of
    ``None``; every other entry is encoded with encode_date.

    :param column: list of str or datetime, the dates to encode
    :param column_name: prefix used for the names of the generated columns
    :return: DataFrame with the columns ``<column_name>_date_week_day``,
        ``_date_day``, ``_date_month``, ``_date_year``, ``_date_hours``,
        ``_date_minutes``, ``_date_seconds`` and ``_date_special_occasion``
    """
    suffixes = ('week_day', 'day', 'month', 'year', 'hours', 'minutes', 'seconds', 'special_occasion')
    columns = [column_name + '_date_' + suffix for suffix in suffixes]

    encoded_dates = []
    for value in column:
        if value is None or value == '' or value == 'None':
            encoded_dates.append([None] * len(columns))
        else:
            encoded_dates.append(encode_date(value))

    results_df = DataFrame(data=encoded_dates, columns=columns)
    # Replace the NaN that pandas uses for missing cells with None.
    results_df = results_df.where(pd.notnull(results_df), None)

    return results_df
214
215
def encode_duration(value):
    """Splits a timedelta into days, hours, minutes and seconds.

    ``timedelta`` only stores ``days`` and ``seconds`` (plus microseconds),
    so hours and minutes are derived from the ``seconds`` field.

    :param value: a ``datetime.timedelta``, or None when no duration is available
    :return: ``[days, hours, minutes, seconds]``; four ``None`` when value is None
    """
    if value is None:
        return [None, None, None, None]

    hours, remainder = divmod(value.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return [value.days, hours, minutes, seconds]
218
219
def encode_dates_for_duration(date: datetime, last_date: datetime):
    """Returns the time elapsed from ``last_date`` to ``date``.

    The difference is truncated to whole seconds.

    :param date: the later moment, or None
    :param last_date: the earlier moment, or None
    :return: a ``datetime.timedelta``, or None when either argument is None
        or when the elapsed time is not strictly positive
    """
    # Imported locally: at module level `datetime` names the class, which
    # has no `timedelta` attribute.
    from datetime import timedelta

    if date is None or last_date is None:
        return None

    tot_seconds = int((date - last_date).total_seconds())
    if tot_seconds <= 0:
        return None
    return timedelta(seconds=tot_seconds)
234
235
def parse_duration(current_time: list, column_name: str, last_time: list) -> DataFrame:
    """Encodes the time elapsed between two aligned columns into a DataFrame.

    Entries of ``current_time`` and ``last_time`` are paired by position.
    The elapsed time of each pair is computed with encode_dates_for_duration
    and split with encode_duration. A pair for which no elapsed time is
    available produces a row of ``None``.

    NOTE(review): encode_dates_for_duration subtracts the paired values, so
    they are presumably datetime objects rather than duration strings -
    confirm against the callers.

    :param current_time: the more recent value of every row
    :param column_name: prefix used for the names of the generated columns
    :param last_time: the previous value of every row
    :return: DataFrame with the columns ``<column_name>_elapsed_days``,
        ``_elapsed_hours``, ``_elapsed_minutes`` and ``_elapsed_seconds``
    """
    columns = [
        column_name + '_elapsed_' + unit
        for unit in ('days', 'hours', 'minutes', 'seconds')
    ]

    encoded_durations = []
    for new_date, old_date in zip(current_time, last_time):
        elapsed = encode_dates_for_duration(new_date, old_date)
        if elapsed is None:
            # No usable pair for this row: emit an empty row here rather
            # than handing None to encode_duration.
            encoded_durations.append([None] * len(columns))
        else:
            encoded_durations.append(encode_duration(elapsed))

    results_df = DataFrame(data=encoded_durations, columns=columns)
    # Replace the NaN that pandas uses for missing cells with None.
    results_df = results_df.where(pd.notnull(results_df), None)

    return results_df
265
266
if __name__ == '__main__':
    # Small manual demo of the helpers in this module.
    time_test = [
        '1990-12-1',
        '',
        None,
        'None',
        '01/19/1990',
        '01/19/90',
        'Jan 1990',
        'January1990',
        '2005/3',
        'Monday at 12:01am',
        'January 1, 2047 at 8:21:00AM',
    ]

    duration_test = [
        '2d9h32m46s',
        '2d 9h',
        '',
        None,
        'None',
        '2days9hours37minutes46seconds',
        '2days 9hours 37minutes 46seconds',
    ]

    print(is_time_or_duration(time_test))
    print(is_time_or_duration(duration_test))

    parsed_dates = parse_date(time_test, 't1')
    print(parsed_dates.head())

    # parse_duration requires the previous values as third argument and
    # subtracts the paired entries, so it is demonstrated with datetimes.
    previous_times = [datetime(2020, 1, 1, 8, 0, 0), None, datetime(2020, 1, 2)]
    current_times = [datetime(2020, 1, 3, 17, 32, 46), datetime(2020, 1, 1), None]
    print(parse_duration(current_times, 't2', previous_times).head())
class TimeType(enum.Enum):
14class TimeType(Enum):
15    DATE = 'date'
16    DURATION = 'duration'
17    NONE = 'none'

Create a collection of name/value pairs.

Example enumeration:

>>> class Color(Enum):
...     RED = 1
...     BLUE = 2
...     GREEN = 3

Access them by:

  • attribute access::
>>> Color.RED
<Color.RED: 1>
  • value lookup:
>>> Color(1)
<Color.RED: 1>
  • name lookup:
>>> Color['RED']
<Color.RED: 1>

Enumerations can be iterated over, and know how many members they have:

>>> len(Color)
3
>>> list(Color)
[<Color.RED: 1>, <Color.BLUE: 2>, <Color.GREEN: 3>]

Methods can be added to enumerations, and members can have their own attributes -- see the documentation for details.

DATE = <TimeType.DATE: 'date'>
DURATION = <TimeType.DURATION: 'duration'>
NONE = <TimeType.NONE: 'none'>
Inherited Members
enum.Enum
name
value
class TimeEncodingType(enum.Enum):
20class TimeEncodingType(Enum):
21    DATE = 'date'
22    DURATION = 'duration'
23    DATE_AND_DURATION = 'date_and_duration'
24    NONE = 'none'

Create a collection of name/value pairs.

Example enumeration:

>>> class Color(Enum):
...     RED = 1
...     BLUE = 2
...     GREEN = 3

Access them by:

  • attribute access::
>>> Color.RED
<Color.RED: 1>
  • value lookup:
>>> Color(1)
<Color.RED: 1>
  • name lookup:
>>> Color['RED']
<Color.RED: 1>

Enumerations can be iterated over, and know how many members they have:

>>> len(Color)
3
>>> list(Color)
[<Color.RED: 1>, <Color.BLUE: 2>, <Color.GREEN: 3>]

Methods can be added to enumerations, and members can have their own attributes -- see the documentation for details.

DATE = <TimeEncodingType.DATE: 'date'>
DURATION = <TimeEncodingType.DURATION: 'duration'>
DATE_AND_DURATION = <TimeEncodingType.DATE_AND_DURATION: 'date_and_duration'>
NONE = <TimeEncodingType.NONE: 'none'>
Inherited Members
enum.Enum
name
value
def time_encoding( df: pandas.core.frame.DataFrame, encoding_type) -> pandas.core.frame.DataFrame:
27def time_encoding(df: DataFrame, encoding_type) -> DataFrame:
28    """
29    Encodes the columns of string of the given DataFrame if they are date or duration.
30
31    :param pandas.Dataframe df: the dataframe to encode
32    :param str encoding_type: the type of encoding to perform
33    :return pandas.DataFrame:
34    """
35    
36    last_time = [None] * len(df)
37    df_output = DataFrame()
38
39    for column_name in df.keys():
40        current_time = df[column_name]
41        column_type = is_time_or_duration(current_time)
42
43        if column_type == TimeType.DATE.value and encoding_type == TimeEncodingType.NONE.value:
44            df_output[column_name] = convert_datetime_in_UTC(current_time)
45
46        if column_type == TimeType.DATE.value and encoding_type in [TimeEncodingType.DATE.value, TimeEncodingType.DATE_AND_DURATION.value]:
47            result_df = parse_date(current_time, column_name)
48            df_output.append(result_df)
49
50        if column_type == TimeType.NONE.value or encoding_type == TimeEncodingType.DURATION.value:
51            df_output[column_name] = current_time
52
53        if column_type == TimeType.DURATION.value and encoding_type in [TimeEncodingType.DURATION.value, TimeEncodingType.DATE_AND_DURATION.value]:
54            if not all(val is None for val in last_time) and not all(val is None for val in current_time):
55                df_output.append(parse_duration(current_time, column_name, last_time))
56            last_time = [
57                old_time if new_time is None else new_time
58                for new_time, old_time in zip(current_time, last_time)
59            ]
60
61    return df_output

Encodes the string columns of the given DataFrame when they contain dates or durations.

Parameters
  • pandas.Dataframe df: the dataframe to encode
  • str encoding_type: the type of encoding to perform
Returns
def convert_datetime_in_UTC(column: list):
64def convert_datetime_in_UTC(column: list):
65    return [
66        value.replace(tzinfo=timezone.utc).timestamp()
67        if isinstance(value, datetime)
68        else dateparser.parse(value).replace(tzinfo=timezone.utc).timestamp()
69        for value in column
70    ]
def is_time_or_duration(column: list):
73def is_time_or_duration(column: list):
74    """Returns whether the column contains dates, durations, or otherwise
75
76    :param column:
77    :return:
78    """
79    column_type = TimeType.NONE.value
80
81    if is_duration(column):
82        column_type = TimeType.DURATION.value
83    elif is_date(column):
84        column_type = TimeType.DATE.value
85
86    return column_type

Returns whether the column contains dates, durations, or otherwise

Parameters
  • column:
Returns
def is_date(column: list) -> bool:
 89def is_date(column: list) -> bool:
 90    """Returns whether all string can be interpreted as a date.
 91
 92    Accepts empty string and None Object in python
 93    Function take from https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format
 94    :param column: list of str, strings to check for date
 95    :return: True if all string of column are dates
 96    """
 97    for value in column:
 98        if isinstance(value, str):
 99            if value != "" and value != 'None':
100                try:
101                    float(value)
102                    return False
103                except ValueError:
104                    try:
105                        parse(value)
106                    except ValueError:
107                        return False
108        elif isinstance(value, datetime) or value is None:
109            pass
110        else:
111            return False
112
113    return True

Returns whether all string can be interpreted as a date.

Accepts empty strings and Python None objects. Function taken from https://stackoverflow.com/questions/25341945/check-if-string-has-date-any-format

Parameters
  • column: list of str, strings to check for date
Returns

True if all strings in the column are dates

def is_duration(column: list) -> bool:
116def is_duration(column: list) -> bool:
117    """Returns whether all string can be interpreted as a duration.
118
119    Accepts empty string and None Object in python
120    :param column: list of str, strings to check for periods of time
121    :return: True if all string of column are periods of time
122    """
123    for value in column:
124        if isinstance(value, str):
125            if value != "" and value != 'None':
126                try:
127                    float(value)
128                    return False
129                except ValueError:
130                    groups = format_string_duration_parse(value)
131                    if not all([
132                        (len(group) == 2 and group[0].isnumeric() and group[1] in duration_allowed_word)
133                        for group in groups
134                    ]):
135                        return False
136        elif value is None:
137            pass
138        else:
139            return False
140
141    return True

Returns whether all string can be interpreted as a duration.

Accepts empty string and None Object in python

Parameters
  • column: list of str, strings to check for periods of time
Returns

True if all strings in the column are periods of time

duration_allowed_word = ['d', 'days', 'h', 'hours', 'm', 'minutes', 's', 'seconds']
def format_string_duration_parse(string: str) -> list:
147def format_string_duration_parse(string: str) -> list:
148    """Returns a list containing the given string split
149
150    :param string:
151    :return:
152    """
153    string = string.replace(" ", "")
154
155    chars = [string[0]]
156    for char in string[1:]:
157        if not chars[-1].isnumeric() and char.isnumeric():
158            chars += ['|']
159            chars += [char]
160        elif chars[-1].isnumeric() and not char.isnumeric():
161            chars += ['_']
162            chars += [char]
163        else:
164            chars += [char]
165    # From 18d5h38m36s, I want have for example 18_d|5_h|38_m|36_s
166
167    formatted_string = [tuple(group.split('_')) for group in "".join(chars).split('|')]
168    # recreates the string, then splits it first to have the number_keyword and then create the tuples
169
170    return formatted_string

Returns a list containing the given string split

Parameters
  • string:
Returns
def is_special_occasion(date):
173def is_special_occasion(date):
174    countries = ['AR', 'AU', 'AT', 'BY', 'BE', 'BR', 'BG', 'CA', 'CL', 'CO', 'HR', 'CZ', 'DK', 'EG', 'EE', 'FI', 'FR',
175               'DE', 'GR', 'HU', 'IS', 'IN', 'IE', 'IL', 'IT', 'JM', 'JP', 'LT', 'MX', 'MA', 'NL', 'NZ', 'PL', 'PT',
176               'RO', 'RU', 'SA', 'RS', 'SK', 'SI', 'ZA', 'ES', 'SE', 'CH', 'TR', 'UA', 'AE', 'GB', 'US']
177    for country in countries:
178        holiday = holidays.country_holidays(country)
179        if date.strftime("%m-%d-%Y") in holiday:
180            return True
181    return False
def encode_date(value):
184def encode_date(value):
185    if isinstance(value, datetime):
186        date = value
187    else:
188        date = dateparser.parse(value)  # Returns a datetime type
189    return [date.isoweekday(), date.day, date.month, date.year, date.hour, date.minute, date.second,
190            is_special_occasion(date)]
def parse_date( column: list, column_name: str) -> (<class 'pandas.core.frame.DataFrame'>, <class 'list'>):
193def parse_date(column: list, column_name: str) -> (DataFrame, list):
194    """Parses strings of column into datetime objects and returns a DataFrame
195
196    :param column: list of str, strings to parse into date
197    :param column_name:
198    :return:
199    """
200    columns = [(column_name+'_date_week_day'), (column_name+'_date_day'), (column_name+'_date_month'),
201               (column_name+'_date_year'), (column_name+'_date_hours'), (column_name+'_date_minutes'),
202               (column_name+'_date_seconds'), (column_name+'_date_special_occasion')]
203
204    encoded_dates = [
205        [None for _ in columns]
206        if (value is None or value == '' or value == 'None')
207        else encode_date(value)
208        for value in column
209    ]
210
211    results_df = DataFrame(data=encoded_dates, columns=columns)
212    results_df = results_df.where(pd.notnull(results_df), None)
213
214    return results_df

Parses strings of column into datetime objects and returns a DataFrame

Parameters
  • column: list of str, strings to parse into date
  • column_name:
Returns
def encode_duration(value):
217def encode_duration(value):
218    return [value.days, value.hours, value.minutes, value.seconds]
def encode_dates_for_duration(date: datetime.datetime, last_date: datetime.datetime):
221def encode_dates_for_duration(date: datetime, last_date: datetime):
222    if date is None or last_date is None:
223        return None
224    else:
225        tot_seconds = int((date - last_date).total_seconds())
226
227        if tot_seconds > 0:
228            tot_minutes = int(tot_seconds / 60)
229            tot_hours = int(tot_minutes / 60)
230            days = int(tot_hours / 24)
231            return datetime.timedelta(days=days, hours=(tot_hours % 24), minutes=(tot_minutes % 60),
232                                      seconds=(tot_seconds % 60))
233        else:
234            return None
def parse_duration( current_time: list, column_name: str, last_time: list) -> pandas.core.frame.DataFrame:
237def parse_duration(current_time: list, column_name: str, last_time: list) -> DataFrame:
238    """Parses strings of column into datetime objects and returns a DataFrame
239
240    I assume that I receive the duration in one of the following format
241    - number (milliseconds)
242    - number d number h number m number
243    - number days number hours number minutes number seconds
244    - number days
245
246    All space will be removed
247    :param current_time:
248    :param column_name:
249    :param last_time:
250    :return:
251    """
252    columns = [(column_name+'_elapsed_days'), (column_name+'_elapsed_hours'), (column_name+'_elapsed_minutes'),
253               (column_name+'_elapsed_seconds')]
254
255    encoded_durations = [
256        encode_duration(
257            encode_dates_for_duration(new_date, old_date)
258        )
259        for new_date, old_date in zip(current_time, last_time)
260    ]
261
262    results_df = DataFrame(data=encoded_durations, columns=columns)
263    results_df = results_df.where(pd.notnull(results_df), None)
264
265    return results_df

Computes the time elapsed between the paired entries of current_time and last_time and returns it as a DataFrame

I assume that I receive the duration in one of the following format

  • number (milliseconds)
  • number d number h number m number
  • number days number hours number minutes number seconds
  • number days

All space will be removed

Parameters
  • current_time:
  • column_name:
  • last_time:
Returns