Source code for etna.datasets.datasets_generation

from typing import List
from typing import Optional

import numpy as np
import pandas as pd
from numpy.random import RandomState
from statsmodels.tsa.arima_process import arma_generate_sample


def generate_ar_df(
    periods: int,
    start_time: str,
    ar_coef: Optional[list] = None,
    sigma: float = 1,
    n_segments: int = 1,
    freq: str = "1D",
    random_seed: int = 1,
) -> pd.DataFrame:
    """
    Build a long-format DataFrame whose segments are samples of an AR process.

    Parameters
    ----------
    periods:
        number of timestamps generated for every segment
    start_time:
        first timestamp of the generated range
    ar_coef:
        AR coefficients; ``[1]`` is used when nothing is passed
    sigma:
        scale of the noise passed to the AR sample generator
    n_segments:
        number of segments
    freq:
        pandas frequency string for :py:func:`pandas.date_range` that is used to generate timestamp
    random_seed:
        seed of the random generator that produces the noise

    Returns
    -------
    :
        DataFrame with columns ``timestamp``, ``segment`` and ``target``;
        segments are named ``segment_0``, ``segment_1``, ...
    """
    coefficients = [1] if ar_coef is None else ar_coef
    # The generator is given a leading 1 followed by the negated coefficients.
    ar_polynomial = np.r_[1, -np.array(coefficients)]
    noise_sampler = RandomState(seed=random_seed).normal
    simulated = arma_generate_sample(
        ar=ar_polynomial,
        ma=[1],
        nsample=(n_segments, periods),
        axis=1,
        distrvs=noise_sampler,
        scale=sigma,
    )
    # One column per segment, then reshape from wide to long format.
    segment_names = [f"segment_{i}" for i in range(n_segments)]
    wide = pd.DataFrame(data=simulated.T, columns=segment_names)
    wide["timestamp"] = pd.date_range(start=start_time, freq=freq, periods=periods)
    return wide.melt(id_vars=["timestamp"], value_name="target", var_name="segment")
def generate_periodic_df(
    periods: int,
    start_time: str,
    scale: float = 10,
    period: int = 1,
    n_segments: int = 1,
    freq: str = "1D",
    add_noise: bool = False,
    sigma: float = 1,
    random_seed: int = 1,
) -> pd.DataFrame:
    """
    Build a long-format DataFrame in which every segment repeats a random pattern.

    Parameters
    ----------
    periods:
        number of timestamps generated for every segment
    start_time:
        first timestamp of the generated range
    scale:
        upper bound of the pattern values: they are integers drawn
        with ``randint(int(scale))``
    period:
        length of the repeated pattern -- x[i+period] = x[i]
    n_segments:
        number of segments
    freq:
        pandas frequency string for :py:func:`pandas.date_range` that is used to generate timestamp
    add_noise:
        if True we add noise to final samples
    sigma:
        scale of added noise
    random_seed:
        random seed; used to draw the patterns and also forwarded
        to :py:func:`generate_from_patterns_df`

    Returns
    -------
    :
        DataFrame produced by :py:func:`generate_from_patterns_df`
    """
    generator = RandomState(seed=random_seed)
    # One row of `period` random integers per segment.
    drawn = generator.randint(int(scale), size=(n_segments, period))
    segment_patterns = list(map(list, drawn))
    return generate_from_patterns_df(
        periods=periods,
        start_time=start_time,
        patterns=segment_patterns,
        sigma=sigma,
        random_seed=random_seed,
        freq=freq,
        add_noise=add_noise,
    )
def generate_const_df(
    periods: int,
    start_time: str,
    scale: float,
    n_segments: int = 1,
    freq: str = "1D",
    add_noise: bool = False,
    sigma: float = 1,
    random_seed: int = 1,
) -> pd.DataFrame:
    """
    Build a long-format DataFrame in which every segment holds one constant value.

    Parameters
    ----------
    periods:
        number of timestamps generated for every segment
    start_time:
        first timestamp of the generated range
    scale:
        const value to fill
    n_segments:
        number of segments
    freq:
        pandas frequency string for :py:func:`pandas.date_range` that is used to generate timestamp
    add_noise:
        if True we add noise to final samples
    sigma:
        scale of added noise
    random_seed:
        random seed

    Returns
    -------
    :
        DataFrame produced by :py:func:`generate_from_patterns_df`
    """
    # A constant series is a pattern of length one, repeated for every segment.
    constant_patterns = [[scale] for _segment in range(n_segments)]
    return generate_from_patterns_df(
        periods=periods,
        start_time=start_time,
        patterns=constant_patterns,
        sigma=sigma,
        random_seed=random_seed,
        freq=freq,
        add_noise=add_noise,
    )
def generate_from_patterns_df(
    periods: int,
    start_time: str,
    patterns: List[List[float]],
    freq: str = "1D",
    add_noise: bool = False,
    sigma: float = 1,
    random_seed: int = 1,
) -> pd.DataFrame:
    """
    Create DataFrame from patterns.

    Every pattern is repeated cyclically (and truncated) to ``periods`` values
    and becomes one segment of the result.

    Parameters
    ----------
    periods:
        number of timestamps
    start_time:
        start timestamp
    patterns:
        list of patterns to be repeated, one per segment; each pattern is a
        non-empty one-dimensional sequence of numbers (list, tuple or numpy array)
    freq:
        pandas frequency string for :py:func:`pandas.date_range` that is used to generate timestamp
    add_noise:
        if True we add noise to final samples
    sigma:
        scale of added noise
    random_seed:
        random seed

    Returns
    -------
    :
        DataFrame with columns ``timestamp``, ``segment`` and ``target``;
        segments are named ``segment_0``, ``segment_1``, ...

    Raises
    ------
    ValueError:
        if some pattern is empty or is not one-dimensional
    """
    n_segments = len(patterns)
    if add_noise:
        samples = RandomState(seed=random_seed).normal(scale=sigma, size=(n_segments, periods))
    else:
        samples = np.zeros(shape=(n_segments, periods))

    for idx, pattern in enumerate(patterns):
        values = np.asarray(pattern, dtype=float)
        if values.ndim != 1 or values.size == 0:
            raise ValueError(
                f"Pattern for segment_{idx} must be a non-empty one-dimensional sequence of numbers"
            )
        # np.resize cycles through the values until `periods` elements are filled.
        # Unlike `pattern * k` it also repeats numpy arrays instead of scaling them.
        samples[idx, :] += np.resize(values, periods)

    df = pd.DataFrame(data=samples.T, columns=[f"segment_{i}" for i in range(n_segments)])
    df["timestamp"] = pd.date_range(start=start_time, freq=freq, periods=periods)
    df = df.melt(id_vars=["timestamp"], value_name="target", var_name="segment")
    return df