Source code for etna.transforms.outliers.base

from abc import ABC
from abc import abstractmethod
from typing import Dict
from typing import List
from typing import Optional

import numpy as np
import pandas as pd

from etna.datasets import TSDataset
from etna.transforms.base import Transform


class OutliersTransform(Transform, ABC):
    """Finds outliers in specific columns of DataFrame and replaces it with NaNs.

    Subclasses implement :meth:`detect_outliers`; this base class stores the
    detected timestamps and the original values at those timestamps so that
    they can be masked in ``transform`` and restored in ``inverse_transform``.
    """

    def __init__(self, in_column: str):
        """
        Create instance of OutliersTransform.

        Parameters
        ----------
        in_column:
            name of processed column
        """
        self.in_column = in_column
        # {segment: [outlier timestamps]}, filled in by ``fit``.
        self.outliers_timestamps: Optional[Dict[str, List[pd.Timestamp]]] = None
        # {segment: series of ``in_column`` values at the outlier timestamps}, filled in by ``fit``.
        self.original_values: Optional[Dict[str, pd.Series]] = None

    def _save_original_values(self, ts: TSDataset):
        """
        Save values to be replaced with NaNs.

        Parameters
        ----------
        ts:
            original TSDataset

        Raises
        ------
        ValueError:
            if outliers have not been detected yet
        """
        if self.outliers_timestamps is None:
            raise ValueError("Something went wrong during outliers detection stage! Check the transform parameters.")
        self.original_values = dict()
        for segment, timestamps in self.outliers_timestamps.items():
            segment_ts = ts[:, segment, :]
            # Keep only the rows flagged as outliers and drop the "segment" column level
            # so that ``in_column`` can be selected by its plain name.
            segment_values = segment_ts[segment_ts.index.isin(timestamps)].droplevel("segment", axis=1)[self.in_column]
            self.original_values[segment] = segment_values

    def fit(self, df: pd.DataFrame) -> "OutliersTransform":
        """
        Find outliers using detection method.

        Parameters
        ----------
        df:
            dataframe with series to find outliers

        Returns
        -------
        result: OutliersTransform
            instance with saved outliers
        """
        ts = TSDataset(df, freq=pd.infer_freq(df.index))
        self.outliers_timestamps = self.detect_outliers(ts)
        self._save_original_values(ts)
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Replace found outliers with NaNs.

        Parameters
        ----------
        df:
            dataframe whose ``in_column`` series should be processed

        Returns
        -------
        result: pd.DataFrame
            copy of ``df`` with ``in_column`` set to NaN at the outlier timestamps

        Raises
        ------
        ValueError:
            if the transform is not fitted
        """
        if self.outliers_timestamps is None:
            raise ValueError("Transform is not fitted! Fit the Transform before calling transform method.")
        result_df = df.copy()
        for segment in df.columns.get_level_values("segment").unique():
            timestamps = self.outliers_timestamps.get(segment)
            if timestamps is None:
                # No outliers were recorded for this segment: nothing to mask.
                continue
            # A boolean mask (instead of label lookup) tolerates stored timestamps
            # that are not present in this dataframe's index.
            outliers_mask = result_df.index.isin(timestamps)
            result_df.loc[outliers_mask, pd.IndexSlice[segment, self.in_column]] = np.nan
        return result_df

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Inverse transformation. Returns back deleted values.

        Parameters
        ----------
        df:
            data to transform

        Returns
        -------
        result: pd.DataFrame
            copy of ``df`` with the original values restored at the outlier timestamps

        Raises
        ------
        ValueError:
            if the transform is not fitted
        """
        if self.original_values is None or self.outliers_timestamps is None:
            raise ValueError("Transform is not fitted! Fit the Transform before calling inverse_transform method.")
        result = df.copy()
        present_segments = set(df.columns.get_level_values("segment"))
        for segment, saved_values in self.original_values.items():
            if segment not in present_segments:
                # The segment was seen during fit but is absent here: nothing to restore.
                continue
            outliers_mask = result.index.isin(self.outliers_timestamps[segment])
            restored_index = result.index[outliers_mask]
            # Write through ``.loc`` on the frame itself: assigning into a column
            # extracted from the frame is chained assignment and is not guaranteed
            # to modify ``result``.
            result.loc[outliers_mask, pd.IndexSlice[segment, self.in_column]] = saved_values.reindex(
                restored_index
            ).to_numpy()
        return result

    @abstractmethod
    def detect_outliers(self, ts: TSDataset) -> Dict[str, List[pd.Timestamp]]:
        """Call function for detection outliers with self parameters.

        Parameters
        ----------
        ts:
            dataset to process

        Returns
        -------
        :
            dict of outliers in format {segment: [outliers_timestamps]}
        """
        pass