Source code for etna.transforms.encoders.categorical
from enum import Enum
from typing import Optional
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.utils._encode import _check_unknown
from sklearn.utils._encode import _encode
from etna.datasets import TSDataset
from etna.transforms.base import Transform
class ImputerMode(str, Enum):
    """Strategies for encoding categories that were not present during fitting.

    Every member is also a ``str``, so it compares equal to its plain value
    (for example ``ImputerMode.mean == "mean"``).
    """

    # Unseen categories are encoded as -1.
    new_value = "new_value"
    # Unseen categories are encoded as the mean code of the known values.
    mean = "mean"
    # Unseen categories are encoded as a missing value.
    none = "none"
class _LabelEncoder(preprocessing.LabelEncoder):
    """Label encoder whose ``transform`` tolerates values unseen during ``fit``."""

    def transform(self, y: pd.Series, strategy: str) -> np.ndarray:
        """
        Encode ``y`` with the fitted classes, imputing codes for unseen values.

        Parameters
        ----------
        y:
            Values to encode
        strategy:
            How to fill the codes of values that are absent from ``self.classes_``:

            - "new_value": fill with ``-1``
            - "mean": fill with the mean code of the known values in ``y``
            - "none": fill with NaN

        Returns
        -------
        :
            Float array with one code per element of ``y``

        Raises
        ------
        ValueError:
            If ``strategy`` is not one of the supported modes
        """
        unseen_values = _check_unknown(y, known_values=self.classes_)
        is_known = ~np.isin(y, unseen_values)

        # Encode only the known entries. For object/string input sklearn's
        # ``_encode`` maps values through a lookup table and raises on any value
        # missing from ``uniques`` even when ``check_unknown=False``, so passing
        # the whole of ``y`` would fail before the unseen values could be imputed.
        encoded = np.zeros(len(y), dtype=float)
        encoded[is_known] = _encode(y[is_known], uniques=self.classes_, check_unknown=False)

        if strategy == ImputerMode.none:
            filling_value = np.nan
        elif strategy == ImputerMode.new_value:
            filling_value = -1
        elif strategy == ImputerMode.mean:
            # With no known values there is nothing to average; use NaN directly
            # instead of letting ``np.mean`` warn about an empty slice.
            filling_value = np.mean(encoded[is_known]) if is_known.any() else np.nan
        else:
            raise ValueError(f"The strategy '{strategy}' doesn't exist")

        encoded[~is_known] = filling_value
        return encoded
class LabelEncoderTransform(Transform):
    """Encode categorical feature with value between 0 and n_classes-1."""

    def __init__(self, in_column: str, out_column: Optional[str] = None, strategy: str = ImputerMode.mean):
        """
        Create the transform and its underlying label encoder.

        Parameters
        ----------
        in_column:
            Name of the column whose values are encoded
        out_column:
            Name of the column that receives the codes. When it is not given,
            ``self.__repr__()`` is used as the name
        strategy:
            How to encode values that were not present during fitting:

            - "new_value": encode them as '-1'
            - "mean": encode them as the mean of the codes of the known values
            - "none": encode them as None
        """
        self.le = _LabelEncoder()
        self.strategy = strategy
        self.out_column = out_column
        self.in_column = in_column

    def fit(self, df: pd.DataFrame) -> "LabelEncoderTransform":
        """
        Fit the label encoder on ``in_column``.

        Parameters
        ----------
        df:
            Dataframe with data to fit the transform

        Returns
        -------
        :
            Fitted transform
        """
        flat_df = TSDataset.to_flatten(df)
        self.le.fit(y=flat_df[self.in_column])
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add a categorical column holding the label codes of ``in_column``.

        Parameters
        ----------
        df
            Dataframe with data to transform

        Returns
        -------
        :
            Dataframe extended with the encoded column
        """
        target_name = self._get_column_name()
        flat_df = TSDataset.to_flatten(df)
        codes = self.le.transform(flat_df[self.in_column], self.strategy)
        flat_df[target_name] = codes
        # The codes are stored with the pandas "category" dtype.
        flat_df[target_name] = flat_df[target_name].astype("category")
        return TSDataset.to_dataset(flat_df)

    def _get_column_name(self) -> str:
        """Return ``out_column`` when it is set, otherwise the transform's repr."""
        return self.out_column or self.__repr__()
class OneHotEncoderTransform(Transform):
    """Encode categorical feature as a one-hot numeric features.

    If unknown category is encountered during transform, the resulting one-hot
    encoded columns for this feature will be all zeros.
    """

    def __init__(self, in_column: str, out_column: Optional[str] = None):
        """
        Init OneHotEncoderTransform.

        Parameters
        ----------
        in_column:
            Name of column to be encoded
        out_column:
            Prefix of names of added columns. If not given, use ``self.__repr__()``
        """
        self.in_column = in_column
        self.out_column = out_column
        # scikit-learn renamed ``sparse`` to ``sparse_output`` in 1.2 and removed
        # the old keyword in 1.4. Try the current name first and fall back to the
        # legacy one so the transform can be constructed on either side of the rename.
        try:
            self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)

    def fit(self, df: pd.DataFrame) -> "OneHotEncoderTransform":
        """
        Fit One Hot encoder.

        Parameters
        ----------
        df:
            Dataframe with data to fit the transform

        Returns
        -------
        :
            Fitted transform
        """
        # The encoder expects a 2D input, so the single column is reshaped to (n, 1).
        x = TSDataset.to_flatten(df)[self.in_column].values.reshape(-1, 1)
        self.ohe.fit(X=x)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the ``in_column`` by fitted One Hot encoder.

        One column is added per category learned during ``fit``; the columns are
        named ``<out_column>_<i>`` where ``i`` is the position of the category.

        Parameters
        ----------
        df
            Dataframe with data to transform

        Returns
        -------
        :
            Dataframe with columns with encoded values
        """
        out_column = self._get_column_name()
        n_categories = len(self.ohe.categories_[0])
        out_columns = [f"{out_column}_{i}" for i in range(n_categories)]
        result_df = TSDataset.to_flatten(df)
        x = result_df[self.in_column].values.reshape(-1, 1)
        result_df[out_columns] = self.ohe.transform(X=x)
        # The indicator columns are stored with the pandas "category" dtype.
        result_df[out_columns] = result_df[out_columns].astype("category")
        result_df = TSDataset.to_dataset(result_df)
        return result_df

    def _get_column_name(self) -> str:
        """Get the ``out_column`` depending on the transform's parameters."""
        if self.out_column:
            return self.out_column
        return self.__repr__()