Source code for etna.ensembles.stacking_ensemble

import warnings
from copy import deepcopy
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Set
from typing import Tuple
from typing import Union

import numpy as np
import pandas as pd
from joblib import Parallel
from joblib import delayed
from sklearn.base import RegressorMixin
from sklearn.linear_model import LinearRegression
from typing_extensions import Literal

from etna.datasets import TSDataset
from etna.ensembles import EnsembleMixin
from etna.loggers import tslogger
from etna.metrics import MAE
from etna.pipeline.base import BasePipeline


[docs]class StackingEnsemble(BasePipeline, EnsembleMixin): """StackingEnsemble is a pipeline that forecast future using the metamodel to combine the forecasts of the base models. Examples -------- >>> from etna.datasets import generate_ar_df >>> from etna.datasets import TSDataset >>> from etna.ensembles import VotingEnsemble >>> from etna.models import NaiveModel >>> from etna.models import MovingAverageModel >>> from etna.pipeline import Pipeline >>> import pandas as pd >>> pd.options.display.float_format = '{:,.2f}'.format >>> df = generate_ar_df(periods=100, start_time="2021-06-01", ar_coef=[0.8], n_segments=3) >>> df_ts_format = TSDataset.to_dataset(df) >>> ts = TSDataset(df_ts_format, "D") >>> ma_pipeline = Pipeline(model=MovingAverageModel(window=5), transforms=[], horizon=7) >>> naive_pipeline = Pipeline(model=NaiveModel(lag=10), transforms=[], horizon=7) >>> ensemble = StackingEnsemble(pipelines=[ma_pipeline, naive_pipeline]) >>> _ = ensemble.fit(ts=ts) >>> forecast = ensemble.forecast() >>> forecast[:,:,"target"] segment segment_0 segment_1 segment_2 feature target target target timestamp 2021-09-09 0.70 1.47 0.20 2021-09-10 0.62 1.53 0.26 2021-09-11 0.50 1.78 0.36 2021-09-12 0.37 1.88 0.21 2021-09-13 0.46 1.87 0.25 2021-09-14 0.44 1.49 0.21 2021-09-15 0.36 1.56 0.30 """ def __init__( self, pipelines: List[BasePipeline], final_model: RegressorMixin = LinearRegression(), n_folds: int = 3, features_to_use: Union[None, Literal["all"], List[str]] = None, n_jobs: int = 1, joblib_params: Optional[Dict[str, Any]] = None, ): """Init StackingEnsemble. Parameters ---------- pipelines: List of pipelines that should be used in ensemble. final_model: Regression model with fit/predict interface which will be used to combine the base estimators. n_folds: Number of folds to use in the backtest. Backtest is not used for model evaluation but for prediction. features_to_use: Features except the forecasts of the base models to use in the ``final_model``. n_jobs: Number of jobs to run in parallel. joblib_params: Additional parameters for :py:class:`joblib.Parallel`. Raises ------ ValueError: If the number of the pipelines is less than 2 or pipelines have different horizons. """ self._validate_pipeline_number(pipelines=pipelines) self.pipelines = pipelines self.final_model = final_model self._validate_backtest_n_folds(n_folds) self.n_folds = n_folds self.features_to_use = features_to_use self.filtered_features_for_final_model: Union[None, Set[str]] = None self.n_jobs = n_jobs if joblib_params is None: self.joblib_params = dict(verbose=11, backend="multiprocessing", mmap_mode="c") else: self.joblib_params = joblib_params super().__init__(horizon=self._get_horizon(pipelines=pipelines)) def _filter_features_to_use(self, forecasts: List[TSDataset]) -> Union[None, Set[str]]: """Return all the features from ``features_to_use`` which can be obtained from base models' forecasts.""" features_df = pd.concat([forecast.df for forecast in forecasts], axis=1) available_features = set(features_df.columns.get_level_values("feature")) - {"fold_number"} features_to_use = self.features_to_use if features_to_use is None: return None elif features_to_use == "all": return available_features - {"target"} elif isinstance(features_to_use, list): features_to_use_unique = set(features_to_use) if len(features_to_use_unique) == 0: return None elif features_to_use_unique.issubset(available_features): return features_to_use_unique else: unavailable_features = features_to_use_unique - available_features warnings.warn(f"Features {unavailable_features} are not found and will be dropped!") return features_to_use_unique.intersection(available_features) else: warnings.warn( "Feature list is passed in the wrong format." "Only the base models' forecasts will be used for the final forecast." ) return None def _backtest_pipeline(self, pipeline: BasePipeline, ts: TSDataset) -> TSDataset: """Get forecasts from backtest for given pipeline.""" with tslogger.disable(): _, forecasts, _ = pipeline.backtest(ts=ts, metrics=[MAE()], n_folds=self.n_folds) forecasts = TSDataset(df=forecasts, freq=ts.freq) return forecasts
[docs] def fit(self, ts: TSDataset) -> "StackingEnsemble": """Fit the ensemble. Parameters ---------- ts: TSDataset to fit ensemble. Returns ------- self: Fitted ensemble. """ self.ts = ts # Get forecasts from base models on backtest to fit the final model on forecasts = Parallel(n_jobs=self.n_jobs, **self.joblib_params)( delayed(self._backtest_pipeline)(pipeline=pipeline, ts=deepcopy(ts)) for pipeline in self.pipelines ) # Fit the final model self.filtered_features_for_final_model = self._filter_features_to_use(forecasts) x, y = self._make_features(forecasts=forecasts, train=True) self.final_model.fit(x, y) # Fit the base models self.pipelines = Parallel(n_jobs=self.n_jobs, **self.joblib_params)( delayed(self._fit_pipeline)(pipeline=pipeline, ts=deepcopy(ts)) for pipeline in self.pipelines ) return self
def _make_features( self, forecasts: List[TSDataset], train: bool = False ) -> Tuple[pd.DataFrame, Optional[pd.Series]]: """Prepare features for the ``final_model``.""" if self.ts is None: raise ValueError("StackingEnsemble is not fitted! Fit the StackingEnsemble before calling forecast method.") # Stack targets from the forecasts targets = [ forecast[:, :, "target"].rename({"target": f"regressor_target_{i}"}, axis=1) for i, forecast in enumerate(forecasts) ] targets = pd.concat(targets, axis=1) # Get features from filtered_features_for_final_model features = pd.DataFrame() if self.filtered_features_for_final_model is not None: features_in_forecasts = [ list( set(forecast.columns.get_level_values("feature")).intersection( self.filtered_features_for_final_model ) ) for forecast in forecasts ] features = pd.concat( [forecast[:, :, features_in_forecasts[i]] for i, forecast in enumerate(forecasts)], axis=1 ) features = features.loc[:, ~features.columns.duplicated()] features_df = pd.concat([features, targets], axis=1) # Flatten the features to fit the sklearn interface x = pd.concat([features_df.loc[:, segment] for segment in self.ts.segments], axis=0) if train: y = pd.concat( [ self.ts[forecasts[0].index.min() : forecasts[0].index.max(), segment, "target"] for segment in self.ts.segments ], axis=0, ) return x, y else: return x, None def _forecast(self) -> TSDataset: """Make predictions. Compute the combination of pipelines' forecasts using ``final_model`` """ if self.ts is None: raise ValueError("Something went wrong, ts is None!") # Get forecast forecasts = Parallel(n_jobs=self.n_jobs, **self.joblib_params)( delayed(self._forecast_pipeline)(pipeline=pipeline) for pipeline in self.pipelines ) x, _ = self._make_features(forecasts=forecasts, train=False) y = self.final_model.predict(x).reshape(-1, self.horizon).T # Format the forecast into TSDataset segment_col = [segment for segment in self.ts.segments for _ in range(self.horizon)] x.loc[:, "segment"] = segment_col x.loc[:, "timestamp"] = x.index.values df_exog = TSDataset.to_dataset(x) df = forecasts[0][:, :, "target"].copy() df.loc[pd.IndexSlice[:], pd.IndexSlice[:, "target"]] = np.NAN forecast = TSDataset(df=df, freq=self.ts.freq, df_exog=df_exog) forecast.loc[pd.IndexSlice[:], pd.IndexSlice[:, "target"]] = y return forecast