Source code for openstef_models.transforms.general.scaler
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
#
# SPDX-License-Identifier: MPL-2.0
"""Transform for scaling features in time series data.
This module provides data scaling functionality using various scaling methods
from scikit-learn to normalize and standardize features in time series datasets
for improved machine learning model performance.
"""
from typing import Any, Literal, override
from pydantic import Field, PrivateAttr
from openstef_core.base_model import BaseConfig
from openstef_core.datasets import TimeSeriesDataset
from openstef_core.exceptions import MissingExtraError, NotFittedError
from openstef_core.transforms import TimeSeriesTransform
from openstef_models.utils.feature_selection import FeatureSelection
try:
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler
except ImportError as e:
raise MissingExtraError("sklearn", package="openstef-models") from e
type ScalingMethod = Literal["min-max", "max-abs", "standard", "robust"]
[docs]
class Scaler(BaseConfig, TimeSeriesTransform):
"""Transform that scales time series data using various scikit-learn scaling methods.
Available methods include:
- MinMaxScaler: Scales features based on min/max of training set (between 0 and 1).
- MaxAbs: Scales features by their maximum absolute value (between -1 and 1).
- Standard: Standardizes features by removing the mean and scaling to unit variance (0 mean, 1 std).
- Robust: Scales features using statistics that are robust to outliers: median and IQR.
Example:
>>> import pandas as pd
>>> from datetime import timedelta
>>> from openstef_core.datasets import TimeSeriesDataset
>>> from openstef_models.transforms.general import Scaler
>>>
>>> # Create sample data
>>> data = pd.DataFrame({
... 'load': [100, 200, 300],
... 'temperature': [20, 25, 30]
... }, index=pd.date_range('2025-01-01', periods=3, freq='h'))
>>> dataset = TimeSeriesDataset(data, timedelta(hours=1))
>>>
>>> # Initialize and apply transform
>>> scaler = Scaler(method="standard")
>>> scaler.fit(dataset)
>>> transformed_dataset = scaler.transform(dataset)
>>> abs(float(transformed_dataset.data['load'].mean().round(6)))
0.0
>>> # use ddof=0 to get population std (as used by StandardScaler)
>>> float(transformed_dataset.data['load'].std(ddof=0).round(6))
1.0
>>> abs(float(transformed_dataset.data['temperature'].mean().round(6)))
0.0
>>> float(transformed_dataset.data['temperature'].std(ddof=0).round(6))
1.0
"""
method: ScalingMethod = Field(default="standard", description="Scaling method to use.")
selection: FeatureSelection = Field(
default=FeatureSelection.ALL,
description="Features to scale.",
)
_scaler: MinMaxScaler | MaxAbsScaler | StandardScaler | RobustScaler = PrivateAttr()
_is_fitted: bool = PrivateAttr(default=False)
@property
@override
def is_fitted(self) -> bool:
return self._is_fitted
[docs]
@override
def model_post_init(self, context: Any) -> None:
match self.method:
case "min-max":
self._scaler = MinMaxScaler()
case "max-abs":
self._scaler = MaxAbsScaler()
case "standard":
self._scaler = StandardScaler()
case "robust":
self._scaler = RobustScaler()
[docs]
@override
def fit(self, data: TimeSeriesDataset) -> None:
"""Fit the scaler to the input time series data.
Args:
data: Time series dataset.
"""
features = self.selection.resolve(data.feature_names)
self._scaler.fit(data.data[features]) # type: ignore[reportUnknownMemberType]
self._is_fitted = True
[docs]
@override
def features_added(self) -> list[str]:
return []