BaseEncoder Module API Reference

`BaseEncoder`

Bases: ABC

Source code in src/enc4ppm/base_encoder.py

class BaseEncoder(ABC):
    # Column that carries, for every encoded row, the index of the corresponding row in the original log.
    # It is used to look values up in original_df and to restore the input ordering.
    ORIGINAL_INDEX_KEY = 'OriginalIndex'
    # Names of the optional time-feature columns added by _after_encode (both expressed in seconds).
    TIME_SINCE_CS_KEY = 'TimeSinceCaseStart'
    TIME_SINCE_PE_KEY = 'TimeSincePreviousEvent'
    # Column-name prefixes that are not referenced by the BaseEncoder code itself;
    # presumably consumed by the concrete encoders - verify against the subclasses.
    EVENT_COL_PREFIX_NAME = 'event'
    TIMESTAMP_COL_PREFIX_NAME = 'Timestamp'
    # Suffix appended to the attribute name by _include_latest_payload.
    LATEST_PAYLOAD_COL_SUFFIX_NAME = 'latest'
    # Name of the label column created by _label_log.
    LABEL_KEY = 'label'
    # Placeholder used for missing categorical values and for values that are not in the learned vocabularies.
    UNKNOWN_VAL = 'UNKNOWN'
    # Extra entry appended to every categorical vocabulary; presumably marks padded positions - verify against the subclasses.
    PADDING_CAT_VAL = 'PADDING'
    # Not referenced by the BaseEncoder code itself; presumably the numerical counterpart of PADDING_CAT_VAL.
    PADDING_NUM_VAL = 0.0

    def __init__(
        self,
        labeling_type: LabelingType = LabelingType.NEXT_ACTIVITY,
        attributes: list[str] | str | None = None,
        categorical_encoding: CategoricalEncoding = CategoricalEncoding.STRING,
        numerical_scaling: NumericalScaling = NumericalScaling.NONE,
        prefix_length: int | None = None,
        prefix_strategy: PrefixStrategy = PrefixStrategy.UP_TO_SPECIFIED,
        add_time_features: bool = False,
        timestamp_format: str | None = None,
        case_id_key: str = 'case:concept:name',
        activity_key: str = 'concept:name',
        timestamp_key: str = 'time:timestamp',
        outcome_key: str = 'outcome',
    ) -> None:
        """
        Store the encoder configuration and initialise the state that is learned from the log later on.

        Args:
            labeling_type (LabelingType): Kind of label that _label_log attaches to the encoded log.
            attributes (list[str] | str | None): Names of the log columns to consider as attributes, or the string 'all'
                to consider every column except case id, activity and timestamp. None (the default) means no attributes.
            categorical_encoding (CategoricalEncoding): Stored on the instance; BaseEncoder itself only reports it in summary().
            numerical_scaling (NumericalScaling): When set to STANDARDIZATION, _postprocess_log standardizes time features,
                the remaining time label and the numerical attributes.
            prefix_length (int | None): Prefix length used by _apply_prefix_strategy. If None, it is set to the length of the
                longest case of the log the first time a (not frozen) encoder is used.
            prefix_strategy (PrefixStrategy): How prefix_length is used to filter events (see _apply_prefix_strategy).
            add_time_features (bool): Whether to add the TimeSinceCaseStart and TimeSincePreviousEvent columns.
            timestamp_format (str | None): Format forwarded to pd.to_datetime when casting the timestamp column.
            case_id_key (str): Name of the case id column.
            activity_key (str): Name of the activity column.
            timestamp_key (str): Name of the timestamp column.
            outcome_key (str): Name of the outcome column (required only when labeling_type is OUTCOME).
        """
        self.labeling_type = labeling_type
        # A fresh list per instance: a literal [] default would be one object shared by every encoder
        self.attributes = [] if attributes is None else attributes
        self.categorical_encoding = categorical_encoding
        self.numerical_scaling = numerical_scaling
        self.prefix_length = prefix_length
        self.prefix_strategy = prefix_strategy
        self.add_time_features = add_time_features
        self.timestamp_format = timestamp_format
        self.case_id_key = case_id_key
        self.activity_key = activity_key
        self.timestamp_key = timestamp_key
        self.outcome_key = outcome_key

        # Instance variables
        self.is_frozen: bool = False  # once True, vocabularies and statistics are no longer re-learned
        self.was_frozen: bool = False  # frozen state at the beginning of the current encoding run
        self.original_df: pd.DataFrame = pd.DataFrame()
        self.log_activities: list[str] = []
        self.log_attributes: dict[str, dict[str, str | list | dict]] = {}
        self.numerical_scaling_info = {}
        self.remaining_time_num_bins = 10


    @abstractmethod
    def _encode(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """
        Encoder-specific hook: every concrete encoder must implement its own encoding logic here.

        The implementation has to create the columns required by its encoding and must also add the ORIGINAL_INDEX_KEY column.
        It must return all the rows (events) it receives without filtering any of them: the filtering is done afterwards
        by the BaseEncoder through _apply_prefix_strategy.
        """
        ...


    def _encode_template(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """
        Template method that runs the whole encoding pipeline.

        The steps shared among all encoders (_check_log, _check_parameters, _preprocess_log, _extract_log_data, _after_encode,
        _label_log, _apply_prefix_strategy and _postprocess_log) are executed here, while the specific encoding is delegated
        to the _encode method. Passing freeze=True as keyword argument freezes the encoder.
        """
        self.original_df = df
        # Remember whether the encoder was already frozen when this run started:
        # the later steps learn their statistics only if it was not
        self.was_frozen = self.is_frozen

        self._check_log(df)
        self._check_parameters(df)
        preprocessed_df = self._preprocess_log(df)

        # Vocabularies and prefix length are learned only while the encoder is not frozen
        if not self.is_frozen:
            self._extract_log_data(preprocessed_df)

        if kwargs.get('freeze'):
            self.is_frozen = True

        result_df = self._encode(preprocessed_df)

        # Common steps, applied in this exact order to the encoded log
        common_steps = (
            self._after_encode,
            self._label_log,
            self._apply_prefix_strategy,
            self._postprocess_log,
        )
        for step in common_steps:
            result_df = step(result_df)

        return result_df


    def _check_log(self, df: pd.DataFrame) -> None:
        """
        Validate the input log: it must be a non-empty pandas DataFrame containing the case id, activity and timestamp columns.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("df must be a pandas DataFrame")
        if df.empty:
            raise ValueError("df cannot be empty")

        required_columns = (self.case_id_key, self.activity_key, self.timestamp_key)
        missing_columns = [name for name in required_columns if name not in df.columns]

        # Only the first missing column is reported
        if missing_columns:
            raise ValueError(f"df must contain column '{missing_columns[0]}'")


    def _check_parameters(self, df: pd.DataFrame) -> None:
        """
        Checks and validations on encoder parameters.

        Args:
            df (pd.DataFrame): The log being encoded; column-related parameters are validated against it.

        Raises:
            TypeError: If labeling_type or prefix_strategy is not a member of the corresponding enum.
            ValueError: If the outcome column, the attributes or the prefix length are not valid.
        """
        # Labeling type
        if not isinstance(self.labeling_type, LabelingType):
            raise TypeError(f'labeling_type must be a valid LabelingType: {[e.name for e in LabelingType]}')

        if self.labeling_type == LabelingType.OUTCOME and (self.outcome_key is None or self.outcome_key not in df.columns):
            raise ValueError("If labeling_type is set to OUTCOME, then you must specify the outcome_key parameter and it must be present in the DataFrame")

        # Attributes
        if not isinstance(self.attributes, (str, list)):
            raise ValueError('attributes must be either a list of strings or the string "all"')

        if isinstance(self.attributes, str):
            if self.attributes != 'all':
                raise ValueError("Since attributes is set to a string, then it must be set to the value 'all'. Otherwise, set it to a list of strings indicating the attributes you want to consider.")
        else:
            for attribute in self.attributes:
                if not isinstance(attribute, str):
                    raise ValueError('Since attributes is a list, it must contain only string elements')

                # Validate against the log received as argument, so the check does not depend on self.original_df being set
                if attribute not in df.columns:
                    raise ValueError(f"attributes contains value '{attribute}', which cannot be found in the log")

        # Prefix length and strategy
        if self.prefix_length is not None:
            # bool is a subclass of int, hence it has to be rejected explicitly
            is_positive_int = (
                isinstance(self.prefix_length, int)
                and not isinstance(self.prefix_length, bool)
                and self.prefix_length > 0
            )
            if not is_positive_int:
                raise ValueError(f'prefix_length must be either None or a positive integer ({self.prefix_length} has been provided instead)')

        if not isinstance(self.prefix_strategy, PrefixStrategy):
            raise TypeError(f'prefix_strategy must be a valid PrefixStrategy: {[e.name for e in PrefixStrategy]}')

        if self.prefix_length is None and self.prefix_strategy == PrefixStrategy.ONLY_SPECIFIED:
            raise ValueError('If prefix strategy is set to ONLY_SPECIFIED, then you must specify the prefix_length parameter')


    def _preprocess_log(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocessing shared by all encoders: works on a copy of the log, casts the case id column to string and the timestamp
        column to datetime, then replaces null values (UNKNOWN_VAL for object/category columns, 0 for numeric columns).
        """
        processed_df = df.copy()

        processed_df[self.case_id_key] = processed_df[self.case_id_key].astype(str)
        processed_df[self.timestamp_key] = pd.to_datetime(
            processed_df[self.timestamp_key],
            format=self.timestamp_format,
        )

        # The replacement for null values depends on the column type
        categorical_columns = processed_df.select_dtypes(include=['object', 'category']).columns
        numerical_columns = processed_df.select_dtypes(include=['number']).columns
        replacements = {
            **dict.fromkeys(categorical_columns, self.UNKNOWN_VAL),
            **dict.fromkeys(numerical_columns, 0),
        }

        return processed_df.fillna(replacements).infer_objects(copy=False)


    def _extract_log_data(self, df: pd.DataFrame) -> None:
        """
        From log data, create necessary variables for later use: determine the prefix length (if not specified) and build
        the activity, outcome and attribute vocabularies.

        Args:
            df (pd.DataFrame): The preprocessed log.
        """
        # Set prefix length (length of the longest case) only if the user did not specify it
        if self.prefix_length is None:
            self.prefix_length = df.groupby(self.case_id_key).size().max().item()

        # Build activity vocab. UNKNOWN_VAL may already be in the log (it replaces null values during preprocessing):
        # remove it before appending it, otherwise it would appear twice in the vocabulary
        observed_activities = [
            activity for activity in df[self.activity_key].unique().tolist() if activity != self.UNKNOWN_VAL
        ]
        self.log_activities = observed_activities + [self.UNKNOWN_VAL, self.PADDING_CAT_VAL]

        # Build outcome vocab
        if self.labeling_type == LabelingType.OUTCOME:
            self.log_outcomes = df[self.outcome_key].unique().tolist()

        # Build attribute vocabs
        if self.attributes == 'all':
            # NOTE(review): the outcome column, if present, is included as well - confirm this is intended when labeling_type is OUTCOME
            reserved_columns = [self.case_id_key, self.activity_key, self.timestamp_key]
            self.attributes = [a for a in df.columns.tolist() if a not in reserved_columns]

        for attribute_name in self.attributes:
            attribute_values = df[attribute_name].unique()

            is_numeric = is_numeric_dtype(attribute_values)
            # An attribute is considered a trace attribute when it takes a single value inside every case
            is_static = df.groupby(self.case_id_key)[attribute_name].nunique().eq(1).all()

            attribute_dict = {
                'type': 'numerical' if is_numeric else 'categorical',
                'scope': 'trace' if is_static else 'event',
            }

            if is_numeric:
                # NOTE(review): these statistics are computed on the distinct values of the column, not on all of its rows
                attribute_dict['values'] = {
                    'min': attribute_values.min().item(),
                    'max': attribute_values.max().item(),
                    'mean': attribute_values.mean().item(),
                    'std': attribute_values.std().item() if len(attribute_values) > 1 else 0.0,
                }
            else:
                attribute_values = attribute_values[attribute_values != self.UNKNOWN_VAL] # remove UNKNOWN_VAL if present, because it'll be added anyway
                attribute_dict['values'] = attribute_values.tolist() + [self.UNKNOWN_VAL, self.PADDING_CAT_VAL]

            self.log_attributes[attribute_name] = attribute_dict


    def _after_encode(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Steps executed right after the specific encoding: check that the ORIGINAL_INDEX_KEY column is present, sort the events
        by case and timestamp and, if requested, add the TimeSinceCaseStart and TimeSincePreviousEvent columns (in seconds).
        """
        if self.ORIGINAL_INDEX_KEY not in df.columns:
            raise ValueError(f'You must include {self.ORIGINAL_INDEX_KEY} column when implementing your own custom encoder!')

        sort_columns = [self.case_id_key, self.timestamp_key]
        sorted_df = df.sort_values(sort_columns, ascending=[True, True]).reset_index(drop=True)

        if not self.add_time_features:
            return sorted_df

        # Time elapsed since the first event of the case
        case_start = sorted_df.groupby(self.case_id_key)[self.timestamp_key].transform('min')
        sorted_df[self.TIME_SINCE_CS_KEY] = (sorted_df[self.timestamp_key] - case_start).dt.total_seconds()

        # Time elapsed since the previous event of the same case (0 for the first event of each case)
        since_previous = sorted_df.groupby(self.case_id_key)[self.timestamp_key].diff().dt.total_seconds()
        sorted_df[self.TIME_SINCE_PE_KEY] = since_previous
        sorted_df[self.TIME_SINCE_PE_KEY] = sorted_df[self.TIME_SINCE_PE_KEY].fillna(0)

        return sorted_df


    def _label_log(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Labeling shared by all encoders: adds the LABEL_KEY column to the provided log according to the configured LabelingType.
        If the labeling type is none of the handled ones, the log is returned without label.
        """
        label_kind = self.labeling_type

        if label_kind == LabelingType.NEXT_ACTIVITY:
            def activity_at(original_index):
                # The last event of a case has no following event, thus no label
                if pd.isna(original_index):
                    return None
                return self._get_activity_value(self.original_df.at[original_index, self.activity_key])

            # Helper column: ORIGINAL_INDEX_KEY of the following event of the same case
            helper_column = 'next_index'
            df[helper_column] = df.groupby(self.case_id_key)[self.ORIGINAL_INDEX_KEY].shift(-1)

            # The label is the activity found in original_df at that index
            df[self.LABEL_KEY] = df[helper_column].map(activity_at)

            df = df.drop(columns=[helper_column])

        elif label_kind == LabelingType.REMAINING_TIME or label_kind == LabelingType.REMAINING_TIME_CLASSIFICATION:
            # Remaining time (in hours) is the distance between the event and the last event of its case
            case_end = df.groupby(self.case_id_key)[self.timestamp_key].transform('max')
            df[self.LABEL_KEY] = (case_end - df[self.timestamp_key]).dt.total_seconds() / 60 / 60

            if label_kind == LabelingType.REMAINING_TIME:
                # Mean and std are learned only if the encoder was not frozen before this run
                if not self.was_frozen:
                    remaining_hours = df[self.LABEL_KEY]
                    self.numerical_scaling_info[self.LABEL_KEY] = {
                        'mean': remaining_hours.mean(),
                        'std': remaining_hours.std(ddof=0),
                    }

            if label_kind == LabelingType.REMAINING_TIME_CLASSIFICATION:
                if self.was_frozen:
                    # Reuse the bin edges learned before freezing; values outside of them become UNKNOWN_VAL
                    edges = self.remaining_time_bins
                    binned = pd.cut(
                        df[self.LABEL_KEY],
                        bins=edges,
                        include_lowest=True,
                        right=False,
                        labels=[f'Bin_{i+1}' for i in range(len(edges)-1)]
                    )
                    binned = binned.cat.add_categories([self.UNKNOWN_VAL])
                    binned = binned.fillna(self.UNKNOWN_VAL)
                    df[self.LABEL_KEY] = binned.astype(str)
                else:
                    # Learn the bin edges from this log and keep them for later use
                    num_bins = self.remaining_time_num_bins
                    binned, edges = pd.cut(
                        df[self.LABEL_KEY],
                        bins=num_bins,
                        retbins=True,
                        include_lowest=True,
                        right=False,
                        labels=[f'Bin_{i+1}' for i in range(num_bins)]
                    )
                    df[self.LABEL_KEY] = binned.astype(str)
                    self.remaining_time_bins = edges

        elif label_kind == LabelingType.OUTCOME:
            # The outcome is read from original_df through the original index of each event
            outcome_by_index = self.original_df[self.outcome_key]
            df[self.LABEL_KEY] = df[self.ORIGINAL_INDEX_KEY].map(outcome_by_index)

        return df


    def _apply_prefix_strategy(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Filtering shared by all encoders: keeps the events whose position inside their case is compatible with prefix_length
        (up to prefix_length for UP_TO_SPECIFIED, exactly prefix_length for ONLY_SPECIFIED, every event otherwise).
        """
        position_column = 'event_num_in_case'

        # Position of each event inside its case, starting from 1
        ordered_df = df.sort_values([self.case_id_key, self.timestamp_key], ascending=[True, True]).reset_index(drop=True)
        ordered_df[position_column] = ordered_df.groupby(self.case_id_key).cumcount() + 1

        if self.prefix_strategy == PrefixStrategy.UP_TO_SPECIFIED:
            keep_mask = ordered_df[position_column] <= self.prefix_length
        elif self.prefix_strategy == PrefixStrategy.ONLY_SPECIFIED:
            keep_mask = ordered_df[position_column] == self.prefix_length
        else:
            keep_mask = None

        selected_df = ordered_df if keep_mask is None else ordered_df[keep_mask]

        # The position column was only needed for filtering
        return selected_df.drop(columns=[position_column])


    def _postprocess_log(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Common postprocessing logic shared by all encoders. The method learns the statistics of the time features (if the
        encoder was not frozen), applies numerical scaling, restores original ordering and drops unnecessary data.

        Args:
            df (pd.DataFrame): The encoded, labeled and prefix-filtered log.

        Returns:
            df (pd.DataFrame): The final encoded log, without timestamp and ORIGINAL_INDEX_KEY columns and without unlabeled rows.
        """
        def standardize(values: pd.Series, mean: float, std: float) -> pd.Series:
            # A constant feature has std equal to 0: dividing by it would fill the column with NaN/inf, so it is only centered
            return (values - mean) / (std if std != 0 else 1.0)

        time_feature_keys = (self.TIME_SINCE_CS_KEY, self.TIME_SINCE_PE_KEY)

        # Save mean and std of time features for later use
        if self.add_time_features and not self.was_frozen:
            for key in time_feature_keys:
                self.numerical_scaling_info[key] = {
                    'mean': df[key].mean(),
                    'std': df[key].std(ddof=0),
                }

        if self.numerical_scaling == NumericalScaling.STANDARDIZATION:
            # Scale time features
            if self.add_time_features:
                for key in time_feature_keys:
                    stats = self.numerical_scaling_info[key]
                    df[key] = standardize(df[key], stats['mean'], stats['std'])

            # Scale label if it is remaining time
            if self.labeling_type == LabelingType.REMAINING_TIME:
                stats = self.numerical_scaling_info[self.LABEL_KEY]
                df[self.LABEL_KEY] = standardize(df[self.LABEL_KEY], stats['mean'], stats['std'])

            # Scale numerical attributes
            for attribute_name, attribute_info in self.log_attributes.items():
                if attribute_info['type'] != 'numerical':
                    continue

                stats = attribute_info['values']
                for col in df.columns:
                    # NOTE(review): substring matching - a column is also scaled when its name merely contains the name
                    # of another numerical attribute. Confirm against the column naming of the concrete encoders.
                    if attribute_name in col:
                        df[col] = standardize(df[col], stats['mean'], stats['std'])

        # Restore original ordering
        df = df.sort_values(by=self.ORIGINAL_INDEX_KEY).reset_index(drop=True)

        # Drop unnecessary data
        df = df.drop(columns=[self.timestamp_key, self.ORIGINAL_INDEX_KEY])
        if self.labeling_type != LabelingType.NONE:
            # Rows without label (e.g. the last event of each case for next activity labeling) are removed
            df = df.dropna(subset=[self.LABEL_KEY]).reset_index(drop=True)

        return df


    def _include_latest_payload(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add latest payload attributes to encoded DataFrame: for each attribute, a column named '<attribute>_latest' is added
        with the value that the attribute has in original_df at the original index of the row.

        Args:
            df (pd.DataFrame): The encoded DataFrame; it must contain the ORIGINAL_INDEX_KEY column.

        Returns:
            df (pd.DataFrame): The same DataFrame with one additional column per attribute.

        Raises:
            ValueError: If the ORIGINAL_INDEX_KEY column is missing.
        """
        if not self.attributes:
            return df

        if self.ORIGINAL_INDEX_KEY not in df.columns:
            raise ValueError(f'You must include {self.ORIGINAL_INDEX_KEY} column into df before calling _include_latest_payload')

        # Only the original index of each row is needed: read that single column once instead of
        # materializing every row with iterrows for every attribute
        original_indices = df[self.ORIGINAL_INDEX_KEY].tolist()

        # Add latest payload of specified attributes to the dataframe
        for attribute_name in self.attributes:
            df[f'{attribute_name}_{self.LATEST_PAYLOAD_COL_SUFFIX_NAME}'] = [
                self._get_attribute_value(attribute_name, self.original_df.loc[original_index, attribute_name])
                for original_index in original_indices
            ]

        return df


    def _get_activity_value(self, activity_value: str) -> str:
        """
        Map an activity to the activity vocabulary: the value itself when it belongs to self.log_activities, UNKNOWN_VAL otherwise.
        """
        is_known = activity_value in self.log_activities
        return activity_value if is_known else self.UNKNOWN_VAL


    def _get_attribute_value(self, attribute_name: str, attribute_value: str) -> str:
        """
        Map an attribute value to the vocabulary of attribute_name. Numerical values are returned unchanged; categorical values
        are returned unchanged only when they belong to the vocabulary, otherwise UNKNOWN_VAL is returned.
        A ValueError is raised if attribute_name is not among the log attributes.
        """
        if attribute_name not in self.log_attributes:
            raise ValueError(f'Attribute {attribute_name} not found in log attributes {list(self.log_attributes.keys())}')

        attribute_info = self.log_attributes[attribute_name]

        # The vocabulary lookup is evaluated only for categorical attributes
        keep_value = attribute_info['type'] == 'numerical' or attribute_value in attribute_info['values']

        return attribute_value if keep_value else self.UNKNOWN_VAL


    def summary(self) -> None:
        """
        Print the configuration and the learned parameters of the encoder. A RuntimeError is raised if the encoder is not frozen.
        """
        if not self.is_frozen:
            raise RuntimeError("Encoder must be frozen before summarizing.")

        # Collect the report lines first, then print them one by one
        report = [
            "Encoder Summary:",
            f" - Encoder Type: {self.__class__.__name__}",
            f" - Labeling Type: {self.labeling_type}",
            f" - Categorical Encoding: {self.categorical_encoding}",
            f" - Numerical Scaling Info: {self.numerical_scaling_info}",
        ]

        # The number of bins is meaningful only for remaining time classification
        if self.labeling_type == LabelingType.REMAINING_TIME_CLASSIFICATION:
            report.append(f" - Remaining Time Num Bins: {self.remaining_time_num_bins}")

        report.extend([
            f" - Prefix Length: {self.prefix_length}",
            f" - Prefix Strategy: {self.prefix_strategy}",
            f" - Timestamp Format: {self.timestamp_format}",
            f" - Case ID Key: {self.case_id_key}",
            f" - Activity Key: {self.activity_key}",
            f" - Timestamp Key: {self.timestamp_key}",
            f" - Log Activities ({len(self.log_activities)}): {self.log_activities}",
            f" - Log Attributes ({len(self.log_attributes)}):",
        ])

        for line in report:
            print(line)

        pprint.pprint(self.log_attributes)


    def save(self, filepath: str) -> None:
        """
        Save the encoder instance to a pickle file. Only works if the encoder has been frozen.
        The original log (original_df) is not written to the file, but it is left untouched on the encoder itself.

        Args:
            filepath (str): Path to the pickle file where the encoder will be saved.

        Raises:
            RuntimeError: If the encoder has not been frozen.
        """
        if not self.is_frozen:
            raise RuntimeError("Encoder must be frozen before saving. Call with freeze=True during encoding.")

        # Do not save original_df: detach it only while pickling and put it back afterwards (even if pickling fails),
        # so that saving does not alter the encoder
        original_df = getattr(self, 'original_df', None)
        self.original_df = None

        try:
            with open(filepath, 'wb') as f:
                pickle.dump(self, f)
        finally:
            self.original_df = original_df


    @classmethod
    def load(cls, filepath: str):
        """
        Restore an encoder instance previously written to a pickle file.

        Warning: the file is read with pickle, which can execute arbitrary code while loading. Only load files you trust.

        Args:
            filepath (str): Path to the pickle file to load.

        Returns:
            encoder (BaseEncoder): The loaded encoder instance.

        Raises:
            FileNotFoundError: If filepath does not exist.
            TypeError: If the unpickled object is not an instance of the class load is called on.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"File '{filepath}' does not exist.")

        with open(filepath, 'rb') as pickle_file:
            loaded = pickle.load(pickle_file)

        # Accept only instances of the class (or of one of its subclasses) load has been called on
        if isinstance(loaded, cls):
            return loaded

        raise TypeError(f"Loaded object is not an instance of {cls.__name__}")


    def unscale_numerical_feature(self, df: pd.DataFrame | pd.Series, feature_name: str) -> pd.DataFrame | pd.Series:
        """
        Reverts the scaling transformation applied to a numerical feature in a DataFrame or Series.
        The input is never modified: a new object is returned.

        Args:
            df (pd.DataFrame | pd.Series): The input data containing the scaled feature(s) to be unscaled.
            feature_name (str): The name of the numerical feature to unscale.

        Returns:
            df (pd.DataFrame | pd.Series): The DataFrame or Series with the specified feature unscaled.

        Raises:
            ValueError: If no scaling info is available for feature_name, or if feature_name is not a column of the DataFrame.
            TypeError: If df is neither a DataFrame nor a Series.
        """
        if self.numerical_scaling_info is None or feature_name not in self.numerical_scaling_info:
            raise ValueError(f'Feature {feature_name} has no scaling info available. Available scaling info: {self.numerical_scaling_info}')

        scaling = self.numerical_scaling_info[feature_name]

        if isinstance(df, pd.Series):
            # The arithmetic already produces a new Series
            return df * scaling['std'] + scaling['mean']

        if isinstance(df, pd.DataFrame):
            if feature_name not in df.columns:
                raise ValueError(f'Feature {feature_name} not found in provided DataFrame. Available columns: {df.columns.tolist()}')

            unscaled_df = df.copy()
            unscaled_df[feature_name] = unscaled_df[feature_name] * scaling['std'] + scaling['mean']
            return unscaled_df

        # Any other input used to be returned still scaled, without any notice
        raise TypeError(f'df must be a pandas DataFrame or Series ({type(df).__name__} has been provided instead)')


    def set_remaining_time_num_bins(self, num_bins: int) -> None:
        """
        Set the number of bins to use for remaining time classification. Only works if the encoder has not been frozen yet.

        Args:
            num_bins (int): Number of bins to use for remaining time classification.

        Raises:
            RuntimeError: If the encoder has already been frozen.
            ValueError: If num_bins is not a positive integer.
        """
        if self.is_frozen:
            raise RuntimeError("Cannot change remaining time bins after encoder has been frozen.")

        # bool is a subclass of int, hence True would otherwise be accepted as a number of bins
        is_positive_int = isinstance(num_bins, int) and not isinstance(num_bins, bool) and num_bins > 0
        if not is_positive_int:
            raise ValueError("num_bins must be a positive integer.")

        self.remaining_time_num_bins = num_bins

`load(filepath)` `classmethod`

Load a frozen encoder instance from a pickle file.

Parameters:

Name	Type	Description	Default
`filepath`	`str`	Path to the pickle file to load.	required

Returns:

Name	Type	Description
`encoder`	`BaseEncoder`	The loaded encoder instance.

Source code in src/enc4ppm/base_encoder.py

@classmethod
def load(cls, filepath: str):
    """
    Restore an encoder instance previously written to a pickle file.

    Warning: the file is read with pickle, which can execute arbitrary code while loading. Only load files you trust.

    Args:
        filepath (str): Path to the pickle file to load.

    Returns:
        encoder (BaseEncoder): The loaded encoder instance.

    Raises:
        FileNotFoundError: If filepath does not exist.
        TypeError: If the unpickled object is not an instance of the class load is called on.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File '{filepath}' does not exist.")

    with open(filepath, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)

    # Accept only instances of the class (or of one of its subclasses) load has been called on
    if isinstance(loaded, cls):
        return loaded

    raise TypeError(f"Loaded object is not an instance of {cls.__name__}")

`save(filepath)`

Save the encoder instance to a pickle file. Only works if the encoder has been frozen.

Parameters:

Name	Type	Description	Default
`filepath`	`str`	Path to the pickle file where the encoder will be saved.	required

Source code in src/enc4ppm/base_encoder.py

def save(self, filepath: str) -> None:
    """
    Save the encoder instance to a pickle file. Only works if the encoder has been frozen.
    The original log (original_df) is not written to the file, but it is left untouched on the encoder itself.

    Args:
        filepath (str): Path to the pickle file where the encoder will be saved.

    Raises:
        RuntimeError: If the encoder has not been frozen.
    """
    if not self.is_frozen:
        raise RuntimeError("Encoder must be frozen before saving. Call with freeze=True during encoding.")

    # Do not save original_df: detach it only while pickling and put it back afterwards (even if pickling fails),
    # so that saving does not alter the encoder
    original_df = getattr(self, 'original_df', None)
    self.original_df = None

    try:
        with open(filepath, 'wb') as f:
            pickle.dump(self, f)
    finally:
        self.original_df = original_df

`set_remaining_time_num_bins(num_bins)`

Set the number of bins to use for remaining time classification. Only works if the encoder has not been frozen yet.

Parameters:

Name	Type	Description	Default
`num_bins`	`int`	Number of bins to use for remaining time classification.	required

Source code in src/enc4ppm/base_encoder.py

def set_remaining_time_num_bins(self, num_bins: int) -> None:
    """
    Set the number of bins to use for remaining time classification. Only works if the encoder has not been frozen yet.

    Args:
        num_bins (int): Number of bins to use for remaining time classification.

    Raises:
        RuntimeError: If the encoder has already been frozen.
        ValueError: If num_bins is not a positive integer.
    """
    if self.is_frozen:
        raise RuntimeError("Cannot change remaining time bins after encoder has been frozen.")

    # bool is a subclass of int, hence True would otherwise be accepted as a number of bins
    is_positive_int = isinstance(num_bins, int) and not isinstance(num_bins, bool) and num_bins > 0
    if not is_positive_int:
        raise ValueError("num_bins must be a positive integer.")

    self.remaining_time_num_bins = num_bins

`summary()`

Print a summary of the encoder. Only works if the encoder has been frozen.

Source code in src/enc4ppm/base_encoder.py

def summary(self) -> None:
    """
    Print the configuration and the learned parameters of the encoder. A RuntimeError is raised if the encoder is not frozen.
    """
    if not self.is_frozen:
        raise RuntimeError("Encoder must be frozen before summarizing.")

    # Collect the report lines first, then print them one by one
    report = [
        "Encoder Summary:",
        f" - Encoder Type: {self.__class__.__name__}",
        f" - Labeling Type: {self.labeling_type}",
        f" - Categorical Encoding: {self.categorical_encoding}",
        f" - Numerical Scaling Info: {self.numerical_scaling_info}",
    ]

    # The number of bins is meaningful only for remaining time classification
    if self.labeling_type == LabelingType.REMAINING_TIME_CLASSIFICATION:
        report.append(f" - Remaining Time Num Bins: {self.remaining_time_num_bins}")

    report.extend([
        f" - Prefix Length: {self.prefix_length}",
        f" - Prefix Strategy: {self.prefix_strategy}",
        f" - Timestamp Format: {self.timestamp_format}",
        f" - Case ID Key: {self.case_id_key}",
        f" - Activity Key: {self.activity_key}",
        f" - Timestamp Key: {self.timestamp_key}",
        f" - Log Activities ({len(self.log_activities)}): {self.log_activities}",
        f" - Log Attributes ({len(self.log_attributes)}):",
    ])

    for line in report:
        print(line)

    pprint.pprint(self.log_attributes)

`unscale_numerical_feature(df, feature_name)`

Reverts the scaling transformation applied to a numerical feature in a DataFrame or Series.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame \| Series`	The input data containing the scaled feature(s) to be unscaled.	required
`feature_name`	`str`	The name of the numerical feature to unscale.	required

Returns:

Name	Type	Description
`df`	`DataFrame \| Series`	The DataFrame or Series with the specified feature unscaled.

Source code in src/enc4ppm/base_encoder.py

def unscale_numerical_feature(self, df: pd.DataFrame | pd.Series, feature_name: str) -> pd.DataFrame | pd.Series:
    """
    Reverts the scaling transformation applied to a numerical feature in a DataFrame or Series.

    Args:
        df (pd.DataFrame | pd.Series): The input data containing the scaled feature(s) to be unscaled.
        feature_name (str): The name of the numerical feature to unscale.

    Returns:
        df (pd.DataFrame | pd.Series): The DataFrame or Series with the specified feature unscaled.
    """
    if self.numerical_scaling_info is None or feature_name not in self.numerical_scaling_info:
        raise ValueError(f'Feature {feature_name} has no scaling info available. Available scaling info: {self.numerical_scaling_info}')

    df = df.copy()

    if isinstance(df, pd.Series):
        return df * self.numerical_scaling_info[feature_name]['std'] + self.numerical_scaling_info[feature_name]['mean']
    elif isinstance(df, pd.DataFrame):
        if feature_name not in df.columns:
            raise ValueError(f'Feature {feature_name} not found in provided DataFrame. Available columns: {df.columns.tolist()}')

        df[feature_name] = df[feature_name] * self.numerical_scaling_info[feature_name]['std'] + self.numerical_scaling_info[feature_name]['mean']

    return df

BaseEncoder Module API Reference

BaseEncoder

load(filepath) classmethod

save(filepath)

set_remaining_time_num_bins(num_bins)

summary()

unscale_numerical_feature(df, feature_name)

`BaseEncoder`

`load(filepath)` `classmethod`

`save(filepath)`

`set_remaining_time_num_bins(num_bins)`

`summary()`

`unscale_numerical_feature(df, feature_name)`