Source code for etna.transforms.missing_values.resample
import warnings
from typing import List
from typing import Optional
import pandas as pd
from etna.transforms.base import PerSegmentWrapper
from etna.transforms.base import Transform
[docs]class _OneSegmentResampleWithDistributionTransform(Transform):
"""_OneSegmentResampleWithDistributionTransform resamples the given column using the distribution of the other column."""
def __init__(self, in_column: str, distribution_column: str, inplace: bool, out_column: Optional[str]):
"""
Init _OneSegmentResampleWithDistributionTransform.
Parameters
----------
in_column:
name of column to be resampled
distribution_column:
name of column to obtain the distribution from
inplace:
* if True, apply resampling inplace to in_column,
* if False, add transformed column to dataset
out_column:
name of added column. If not given, use ``self.__repr__()``
"""
self.in_column = in_column
self.distribution_column = distribution_column
self.inplace = inplace
self.out_column = out_column
self.distribution: pd.DataFrame = None
def _get_folds(self, df: pd.DataFrame) -> List[int]:
"""
Generate fold number for each timestamp of the dataframe.
Here the ``in_column`` frequency gap is divided into the folds with the size of dataset frequency gap.
"""
in_column_index = df[self.in_column].dropna().index
if len(in_column_index) <= 1 or (len(in_column_index) >= 3 and not pd.infer_freq(in_column_index)):
raise ValueError(
"Can not infer in_column frequency!"
"Check that in_column frequency is compatible with dataset frequency."
)
in_column_freq = in_column_index[1] - in_column_index[0]
dataset_freq = df.index[1] - df.index[0]
n_folds_per_gap = in_column_freq // dataset_freq
n_periods = len(df) // n_folds_per_gap + 2
in_column_start_index = in_column_index[0]
left_tie_len = len(df[:in_column_start_index]) - 1
right_tie_len = len(df[in_column_start_index:])
folds_for_left_tie = [fold for fold in range(n_folds_per_gap - left_tie_len, n_folds_per_gap)]
folds_for_right_tie = [fold for _ in range(n_periods) for fold in range(n_folds_per_gap)][:right_tie_len]
return folds_for_left_tie + folds_for_right_tie
[docs] def fit(self, df: pd.DataFrame) -> "_OneSegmentResampleWithDistributionTransform":
"""
Obtain the resampling frequency and distribution from ``distribution_column``.
Parameters
----------
df:
dataframe with data to fit the transform.
Returns
-------
:
"""
df = df[[self.in_column, self.distribution_column]]
df["fold"] = self._get_folds(df=df)
self.distribution = df[["fold", self.distribution_column]].groupby("fold").sum().reset_index()
self.distribution[self.distribution_column] /= self.distribution[self.distribution_column].sum()
self.distribution.rename(columns={self.distribution_column: "distribution"}, inplace=True)
self.distribution.columns.name = None
return self
[docs] def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Resample the `in_column` using the distribution of `distribution_column`.
Parameters
----------
df
dataframe with data to transform.
Returns
-------
:
result dataframe
"""
df["fold"] = self._get_folds(df)
df = df.reset_index().merge(self.distribution, on="fold").set_index("timestamp").sort_index()
df[self.out_column] = df[self.in_column].ffill() * df["distribution"]
df = df.drop(["fold", "distribution"], axis=1)
return df
[docs]class ResampleWithDistributionTransform(PerSegmentWrapper):
"""ResampleWithDistributionTransform resamples the given column using the distribution of the other column.
Warning
-------
This transform can suffer from look-ahead bias. For transforming data at some timestamp
it uses information from the whole train part.
"""
def __init__(
self, in_column: str, distribution_column: str, inplace: bool = True, out_column: Optional[str] = None
):
"""
Init ResampleWithDistributionTransform.
Parameters
----------
in_column:
name of column to be resampled
distribution_column:
name of column to obtain the distribution from
inplace:
* if True, apply resampling inplace to in_column,
* if False, add transformed column to dataset
out_column:
name of added column. If not given, use ``self.__repr__()``
"""
self.in_column = in_column
self.distribution_column = distribution_column
self.inplace = inplace
self.out_column = self._get_out_column(out_column)
super().__init__(
transform=_OneSegmentResampleWithDistributionTransform(
in_column=in_column,
distribution_column=distribution_column,
inplace=inplace,
out_column=self.out_column,
)
)
def _get_out_column(self, out_column: Optional[str]) -> str:
"""Get the `out_column` depending on the transform's parameters."""
if self.inplace and out_column:
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
if self.inplace:
return self.in_column
if out_column:
return out_column
return self.__repr__()