Source code for openstef_models.models.component_splitting.linear_component_splitter
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
#
# SPDX-License-Identifier: MPL-2.0
"""Linear component splitter for energy component analysis.
Provides a linear model based component splitter that splits
energy data into predefined components.
The splitter applies a pre-trained model from OpenSTEF V3.4.24 to divide total energy consumption
into three predefined components. Training is currently not supported.
"""
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Protocol, override
import joblib
import pandas as pd
from pydantic import Field
from openstef_core.datasets import EnergyComponentDataset, TimeSeriesDataset
from openstef_core.types import EnergyComponentType
from openstef_models.models.component_splitting.component_splitter import ComponentSplitter, ComponentSplitterConfig
if TYPE_CHECKING:
import numpy as np
import numpy.typing as npt
_logger = logging.getLogger(__name__)
[docs]
class LinearComponentSplitterModel(Protocol):
"""Protocol for linear component splitter model interface.
Defines the expected interface for the linear component splitter model loaded from joblib.
"""
[docs]
def predict(self, x: "pd.DataFrame | npt.NDArray[np.float64]") -> "npt.NDArray[np.float64]":
"""Predict energy components from input features.
Args:
x: Input features as dataframe or numpy array.
Returns:
Predicted components as numpy array.
"""
...
[docs]
class LinearComponentSplitterConfig(ComponentSplitterConfig):
"""Configuration for linear component splitter."""
linear_model_path: Path = Field(
default=Path(__file__).parent / "linear_component_splitter_model" / "linear_component_splitter_model.z",
description="Path to the pre-trained linear model file.",
)
radiation_column: str = Field(
default="radiation",
description="Column name in the input dataset representing radiation.",
)
windspeed_100m_column: str = Field(
default="windspeed_100m",
description="Column name in the input dataset representing windspeed at 100m.",
)
[docs]
class LinearComponentSplitter(ComponentSplitter):
"""Linear component splitter for energy data.
Provides a linear component splitter that uses a simple linear model to split
energy data into predefined components. The predefined components are:
- Wind on shore
- Solar
- Other
The splitter applies a pre-trained model from OpenSTEF V3.4.24 to divide total energy consumption
into three predefined components. Training is currently not supported.
Example:
Basic usage
>>> from openstef_core.types import EnergyComponentType
>>> config = LinearComponentSplitterConfig(
... source_column="total_load",
... components=[EnergyComponentType.SOLAR, EnergyComponentType.WIND, EnergyComponentType.OTHER],
... )
>>> splitter = LinearComponentSplitter(config)
>>> components = splitter.predict(time_series_data) # doctest: +SKIP
"""
_config: LinearComponentSplitterConfig
_model: LinearComponentSplitterModel | None
[docs]
def __init__(self, config: LinearComponentSplitterConfig) -> None:
"""Initialize the linear component splitter.
Args:
config: Configuration with model path and column names.
"""
super().__init__()
self._config = config
self._model = joblib.load(self.config.linear_model_path) # type: ignore[reportUnknownMemberType]
@property
@override
def config(self) -> LinearComponentSplitterConfig:
"""Get the splitter configuration.
Returns:
Current configuration with component ratios and settings.
"""
return self._config
@property
@override
def is_fitted(self) -> bool:
return True
def _create_input_features(self, data: TimeSeriesDataset) -> pd.DataFrame:
"""Create input features required by the linear model.
Args:
data: Input time series dataset with required columns.
Returns:
DataFrame with the 3 features needed for linear model prediction:
radiation, windspeed_100m, and total_load.
Raises:
ValueError: If required columns are missing.
"""
df = data.data
source_col = self.config.source_column
radiation_col = self.config.radiation_column
wind_col = self.config.windspeed_100m_column
# Create feature dataframe with the expected column names
input_df = pd.DataFrame(
{
"radiation": df[radiation_col],
"windspeed_100m": df[wind_col],
"total_load": df[source_col],
},
index=df.index,
)
# Drop rows with NaN values
input_df = input_df.dropna() # pyright: ignore[reportUnknownMemberType]
if input_df.empty:
error_msg = "No valid data available for component splitting after dropping NaNs"
raise ValueError(error_msg)
return input_df
[docs]
@override
def fit(self, data: TimeSeriesDataset, data_val: TimeSeriesDataset | None = None) -> None:
"""No training supported currently for linear component splitter.
The linear model is pre-trained and loaded from a file.
"""
[docs]
@override
def predict(self, data: TimeSeriesDataset) -> EnergyComponentDataset:
"""Predict energy components using the linear model.
Args:
data: Input time series dataset containing total load, radiation, and windspeed_100m.
Returns:
Energy component dataset with wind, solar, and other components.
Raises:
ValueError: If required columns are missing or model not loaded.
"""
if self._model is None:
raise ValueError("Linear model not loaded")
input_df = self._create_input_features(data)
predictions = self._model.predict(input_df)
# Create component dataframe
forecasts = pd.DataFrame(
predictions,
columns=[EnergyComponentType.WIND, EnergyComponentType.SOLAR],
index=input_df.index,
)
# Clip wind and solar components to be strictly negative
forecasts[EnergyComponentType.SOLAR] = forecasts[EnergyComponentType.SOLAR].clip(upper=0.0)
forecasts[EnergyComponentType.WIND] = forecasts[EnergyComponentType.WIND].clip(upper=0.0)
# Calculate "other" component as residual
forecasts[EnergyComponentType.OTHER] = (
input_df["total_load"] - forecasts[EnergyComponentType.SOLAR] - forecasts[EnergyComponentType.WIND]
)
# Reindex to match original input, fill missing with 0
components_df = forecasts.reindex(index=data.data.index, fill_value=0.0)
# Only return requested components
requested_components = self.config.components
components_df = components_df[[col for col in requested_components if col in components_df.columns]]
return EnergyComponentDataset(
data=components_df,
sample_interval=data.sample_interval,
)