Source code for etna.transforms.missing_values.imputation
from enum import Enum
from typing import List
from typing import Optional
import numpy as np
import pandas as pd
from etna.transforms.base import PerSegmentWrapper
from etna.transforms.base import Transform
class ImputerMode(str, Enum):
    """Names of the supported strategies for imputing missing values.

    Each member is also a ``str``, so a member compares equal to its plain
    string value and can be built from it, e.g. ``ImputerMode("mean")``.
    """

    # Replace gaps with the constant 0.
    zero = "zero"
    # Replace gaps with the mean computed at fit time.
    mean = "mean"
    # Replace gaps with the mean of a window of preceding values.
    running_mean = "running_mean"
    # Replace gaps with the last existing value.
    forward_fill = "forward_fill"
    # Replace gaps with a moving average taken over seasonal lags.
    seasonal = "seasonal"
class _OneSegmentTimeSeriesImputerTransform(Transform):
    """One segment version of transform to fill NaNs in series of a given dataframe.

    - It is assumed that given series begins with first non NaN value.
    - This transform can't fill NaNs in the future, only on train data.
    - This transform can't fill NaNs if all values are NaNs. In this case exception is raised.
    """

    def __init__(self, in_column: str, strategy: str, window: int, seasonality: int, default_value: Optional[float]):
        """
        Create instance of _OneSegmentTimeSeriesImputerTransform.

        Parameters
        ----------
        in_column:
            name of processed column
        strategy:
            filling value in missing timestamps:

            - If "zero", then replace missing dates with zeros

            - If "mean", then replace missing dates using the mean in fit stage.

            - If "running_mean" then replace missing dates using mean of subset of data

            - If "forward_fill" then replace missing dates using last existing value

            - If "seasonal" then replace missing dates using seasonal moving average

        window:
            In case of moving average and seasonality.

            * If ``window=-1`` all previous dates are taken in account

            * Otherwise only window previous dates

        seasonality:
            the length of the seasonality
        default_value:
            value which will be used to impute the NaNs left after applying the imputer with the chosen strategy;
            ``None`` disables this final step, any other value (including ``0``) is used as is

        Raises
        ------
        ValueError:
            if incorrect strategy given
        """
        self.in_column = in_column
        # Enum lookup by value validates the strategy name and raises ValueError on unknown ones.
        self.strategy = ImputerMode(strategy)
        self.window = window
        self.seasonality = seasonality
        self.default_value = default_value
        # Constant used by the "zero" and "mean" strategies; set in ``fit``.
        self.fill_value: Optional[float] = None
        # Index labels that were NaN in the fitted series; ``None`` means "not fitted yet".
        self.nan_timestamps: Optional[pd.Index] = None

    def fit(self, df: pd.DataFrame) -> "_OneSegmentTimeSeriesImputerTransform":
        """
        Fit preprocess params.

        Parameters
        ----------
        df: pd.DataFrame
            dataframe with series to fit preprocess params with

        Returns
        -------
        self: _OneSegmentTimeSeriesImputerTransform
            fitted preprocess

        Raises
        ------
        ValueError:
            if the series contains no non-NaN values
        """
        raw_series = df[self.in_column]
        if np.all(raw_series.isna()):
            raise ValueError("Series hasn't non NaN values which means it is empty and can't be filled.")
        # Leading NaNs (before the first valid value) are not treated as gaps to fill.
        series = raw_series[raw_series.first_valid_index() :]
        self.nan_timestamps = series[series.isna()].index
        if self.strategy == ImputerMode.zero:
            self.fill_value = 0
        elif self.strategy == ImputerMode.mean:
            self.fill_value = series.mean()
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Transform given series.

        Parameters
        ----------
        df: pd.Dataframe
            transform ``in_column`` series of given dataframe

        Returns
        -------
        result: pd.DataFrame
            dataframe with in_column series with filled gaps

        Raises
        ------
        ValueError:
            if the transform has not been fitted
        """
        result_df = df.copy()
        cur_nans = result_df[result_df[self.in_column].isna()].index
        result_df[self.in_column] = self._fill(result_df[self.in_column])
        # restore nans not in self.nan_timestamps
        restore_nans = cur_nans.difference(self.nan_timestamps)
        result_df.loc[restore_nans, self.in_column] = np.nan
        return result_df

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Inverse transform dataframe.

        Parameters
        ----------
        df: pd.Dataframe
            inverse transform ``in_column`` series of given dataframe

        Returns
        -------
        result: pd.DataFrame
            dataframe with in_column series with initial values

        Raises
        ------
        ValueError:
            if the transform has not been fitted
        """
        if self.nan_timestamps is None:
            raise ValueError("Trying to apply the unfitted transform! First fit the transform.")
        result_df = df.copy()
        # Only the timestamps that are present in ``df`` can be reset to NaN.
        index = result_df.index.intersection(self.nan_timestamps)
        result_df.loc[index, self.in_column] = np.nan
        return result_df

    def _fill(self, df: pd.Series) -> pd.Series:
        """
        Create new Series taking all previous dates and adding missing dates.

        Fills missed values for new dates according to ``self.strategy``.
        The given series is not modified.

        Parameters
        ----------
        df: pd.Series
            series to fill

        Returns
        -------
        result: pd.Series

        Raises
        ------
        ValueError:
            if the transform has not been fitted
        """
        if self.nan_timestamps is None:
            raise ValueError("Trying to apply the unfitted transform! First fit the transform.")

        if self.strategy in (ImputerMode.zero, ImputerMode.mean):
            df = df.fillna(value=self.fill_value)
        elif self.strategy == ImputerMode.forward_fill:
            # ``fillna(method="ffill")`` is deprecated in recent pandas; ``ffill`` is the direct equivalent.
            df = df.ffill()
        elif self.strategy in (ImputerMode.running_mean, ImputerMode.seasonal):
            # Values are written by position below, so work on a copy to leave the caller's series intact.
            df = df.copy()
            history = self.seasonality * self.window if self.window != -1 else len(df)
            # Map every label to its first position once instead of scanning a list per gap.
            first_position = {}
            for position, timestamp in enumerate(df.index):
                first_position.setdefault(timestamp, position)
            for timestamp in self.nan_timestamps:
                i = first_position.get(timestamp)
                if i is None:
                    # Gap seen during fit is absent from this series: nothing to fill here.
                    continue
                # Positions of the previous values at seasonal lags, newest first, limited by ``history``.
                indexes = np.arange(i - self.seasonality, i - self.seasonality - history, -self.seasonality)
                indexes = indexes[indexes >= 0]
                # Gaps are processed in index order, so earlier filled values take part in later means.
                df.iloc[i] = np.nanmean(df.iloc[indexes])

        # Compare with None explicitly so that a default value of 0 is still applied.
        if self.default_value is not None:
            df = df.fillna(value=self.default_value)
        return df
class TimeSeriesImputerTransform(PerSegmentWrapper):
    """Transform to fill NaNs in series of a given dataframe.

    - It is assumed that given series begins with first non NaN value.
    - This transform can't fill NaNs in the future, only on train data.
    - This transform can't fill NaNs if all values are NaNs. In this case exception is raised.

    Warning
    -------
    This transform can suffer from look-ahead bias in 'mean' mode. For transforming data at some timestamp
    it uses information from the whole train part.
    """

    def __init__(
        self,
        in_column: str = "target",
        strategy: str = ImputerMode.zero,
        window: int = -1,
        seasonality: int = 1,
        default_value: Optional[float] = None,
    ):
        """
        Create instance of TimeSeriesImputerTransform.

        Parameters
        ----------
        in_column:
            name of processed column
        strategy:
            filling value in missing timestamps:

            - If "zero", then replace missing dates with zeros

            - If "mean", then replace missing dates using the mean in fit stage.

            - If "running_mean" then replace missing dates using mean of subset of data

            - If "forward_fill" then replace missing dates using last existing value

            - If "seasonal" then replace missing dates using seasonal moving average

        window:
            In case of moving average and seasonality.

            * If ``window=-1`` all previous dates are taken in account

            * Otherwise only window previous dates

        seasonality:
            the length of the seasonality
        default_value:
            value which will be used to impute the NaNs left after applying the imputer with the chosen strategy

        Raises
        ------
        ValueError:
            if incorrect strategy given
        """
        # Keep the constructor arguments on the instance before delegating to the wrapper.
        self.in_column = in_column
        self.strategy = strategy
        self.window = window
        self.seasonality = seasonality
        self.default_value = default_value

        # The one-segment transform carries the same configuration and is handed to the wrapper.
        one_segment_transform = _OneSegmentTimeSeriesImputerTransform(
            in_column=self.in_column,
            strategy=self.strategy,
            window=self.window,
            seasonality=self.seasonality,
            default_value=self.default_value,
        )
        super().__init__(transform=one_segment_transform)
# Names exported by a star-import of this module: only TimeSeriesImputerTransform is listed.
__all__ = ["TimeSeriesImputerTransform"]