Source code for openstef_models.models.forecasting.lgbm_forecaster

# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
#
# SPDX-License-Identifier: MPL-2.0

"""LightGBM-based forecasting models for probabilistic energy forecasting.

Provides gradient boosting tree models using LightGBM for multi-quantile energy
forecasting. Optimized for time series data with specialized loss functions and
comprehensive hyperparameter control for production forecasting workflows.
"""

from typing import TYPE_CHECKING, ClassVar, Literal, override

import numpy as np
import pandas as pd
from pydantic import Field, PrivateAttr

from openstef_core.datasets import ForecastDataset, ForecastInputDataset, TimeSeriesDataset
from openstef_core.exceptions import (
    MissingExtraError,
    NotFittedError,
)
from openstef_core.mixins import HyperParams
from openstef_core.utils.pandas import normalize_to_unit_sum
from openstef_models.explainability.mixins import ContributionsMixin, ExplainableForecaster
from openstef_models.models.forecasting.forecaster import Forecaster
from openstef_models.utils.multi_quantile_regressor import MultiQuantileRegressor

if TYPE_CHECKING:
    import numpy.typing as npt
    from lightgbm import LGBMRegressor


class LGBMHyperParams(HyperParams):
    """LightGBM hyperparameters for gradient boosting tree models.

    Every field maps one-to-one onto the LightGBM parameter of the same name;
    ``LGBMForecaster`` forwards them unchanged when it builds its regressors.

    Example:
        Creating custom hyperparameters for deep trees with regularization

        >>> hyperparams = LGBMHyperParams(
        ...     n_estimators=200,
        ...     max_depth=8,
        ...     learning_rate=0.1,
        ...     reg_alpha=0.1,
        ...     reg_lambda=1.0,
        ... )

    Note:
        These parameters are intended for probabilistic forecasting with quantile
        regression. ``LGBMForecaster`` trains with LightGBM's built-in ``"quantile"``
        objective.
    """

    # Core Tree Boosting Parameters
    n_estimators: int = Field(
        default=100,
        description="Number of boosting rounds/trees to fit. Higher values may improve performance but "
        "increase training time and risk overfitting.",
    )
    # NOTE(review): the "eta" alias means population by field name (as in the class
    # example above) relies on the HyperParams base allowing it — confirm.
    learning_rate: float = Field(
        default=0.49,  # 0.3
        alias="eta",
        description="Step size shrinkage used to prevent overfitting. Range: [0,1]. Lower values require "
        "more boosting rounds.",
    )
    max_depth: int = Field(
        default=2,  # 8,
        description="Maximum depth of trees. Higher values capture more complex patterns but risk "
        "overfitting. Range: [1,∞]",
    )
    min_child_weight: float = Field(
        default=1,
        description="Minimum sum of instance weight (hessian) needed in a child. Higher values prevent "
        "overfitting. Range: [0,∞]",
    )
    min_data_in_leaf: int = Field(
        default=10,
        description="Minimum number of data points in a leaf. Higher values prevent overfitting. Range: [1,∞]",
    )
    min_data_in_bin: int = Field(
        default=10,
        description="Minimum number of data points in a bin. Higher values prevent overfitting. Range: [1,∞]",
    )

    # Regularization
    reg_alpha: float = Field(
        default=0,
        description="L1 regularization on leaf weights. Higher values increase regularization. Range: [0,∞]",
    )
    reg_lambda: float = Field(
        default=1,
        description="L2 regularization on leaf weights. Higher values increase regularization. Range: [0,∞]",
    )

    # Tree Structure Control
    num_leaves: int = Field(
        default=100,  # 31
        description="Maximum number of leaves in one tree. Higher values capture more complex patterns but "
        "risk overfitting; the reachable number of leaves is also bounded by max_depth. Range: [2,131072]",
    )
    max_bin: int = Field(
        default=256,
        description="Maximum number of bins that feature values are bucketed into. Higher values may improve "
        "accuracy but increase memory use and training time. Range: [2,∞]",
    )

    # Subsampling Parameters
    colsample_bytree: float = Field(
        default=1.0,
        description="Fraction of features used when constructing each tree. Range: (0,1]",
    )

    @classmethod
    def forecaster_class(cls) -> "type[LGBMForecaster]":
        """Return the forecaster class that consumes these hyperparameters.

        Returns:
            The ``LGBMForecaster`` class itself (not an instance).
        """
        return LGBMForecaster
# Version tag for the model code in this module.
# NOTE(review): not referenced anywhere else in this file; presumably read by external
# tooling to detect incompatible model code — confirm before changing the value.
MODEL_CODE_VERSION = 1
class LGBMForecaster(Forecaster, ExplainableForecaster, ContributionsMixin):
    """LightGBM-based forecaster for probabilistic energy forecasting.

    Implements gradient boosting trees using LightGBM for multi-quantile forecasting.
    Training and prediction are delegated to a ``MultiQuantileRegressor`` built around
    ``LGBMRegressor``; its ``models`` sequence is indexed per configured quantile, in
    the same order as ``quantiles``.

    Invariants:
        - fit() must be called before predict() to train the model
        - Configuration quantiles determine the number of prediction outputs
        - Model state is preserved across predict() calls after fitting
        - Input features must match training data structure during prediction

    Example:
        Basic forecasting workflow

        >>> from datetime import timedelta
        >>> from openstef_core.types import LeadTime, Quantile
        >>> forecaster = LGBMForecaster(
        ...     quantiles=[Quantile(0.1), Quantile(0.5), Quantile(0.9)],
        ...     horizons=[LeadTime(timedelta(hours=1))],
        ...     hyperparams=LGBMHyperParams(n_estimators=100, max_depth=6),
        ... )
        >>> forecaster.fit(training_data)  # doctest: +SKIP
        >>> predictions = forecaster.predict(test_data)  # doctest: +SKIP

    Note:
        LightGBM dependency is optional and must be installed separately; constructing
        the forecaster without it raises ``MissingExtraError``. The underlying
        regressors are trained with LightGBM's built-in ``"quantile"`` objective.

    See Also:
        LGBMHyperParams: Detailed hyperparameter configuration options.
        Forecaster: Base interface for all forecasting models.
        GBLinearForecaster: Alternative linear model using XGBoost.
    """

    HyperParams: ClassVar[type[LGBMHyperParams]] = LGBMHyperParams

    hyperparams: LGBMHyperParams = Field(default_factory=LGBMHyperParams)

    device: str = Field(
        default="cpu",
        description="Device for LightGBM computation. Options: 'cpu', 'cuda', 'cuda:<ordinal>', 'gpu'",
    )
    n_jobs: int = Field(
        default=1,
        description="Number of parallel threads for tree construction. -1 uses all available cores.",
    )
    verbosity: Literal[-1, 0, 1, 2, 3] = Field(
        default=-1,
        description="Verbosity level. <0=fatal only, 0=errors and warnings, 1=info, >1=debug",
    )
    random_state: int | None = Field(
        default=None,
        alias="seed",
        description="Random seed for reproducibility.",
    )
    early_stopping_rounds: int | None = Field(
        default=None,
        description="Training stops if performance doesn't improve for this many rounds.",
    )

    # Built in model_post_init; holds the per-quantile LightGBM regressors.
    _lgbm_model: MultiQuantileRegressor = PrivateAttr()

    @property
    @override
    def hparams(self) -> LGBMHyperParams:
        """Hyperparameters this forecaster was configured with."""
        return self.hyperparams

    def _device_params(self) -> dict[str, str | int]:
        """Translate the ``device`` field into LightGBM parameters.

        LightGBM selects the backend through ``device_type`` and the accelerator
        through ``gpu_device_id``; it has no combined ``'cuda:<ordinal>'`` form, so an
        optional ``:<ordinal>`` suffix is split off and passed separately.

        Returns:
            Mapping with ``device_type`` and, when an ordinal was given, ``gpu_device_id``.

        Raises:
            ValueError: If the ordinal suffix is not an integer.
        """
        device_type, _, ordinal = self.device.partition(":")
        params: dict[str, str | int] = {"device_type": device_type}
        if ordinal:
            params["gpu_device_id"] = int(ordinal)
        return params

    def model_post_init(self, _context: object, /) -> None:
        """Initialize the underlying LightGBM model from configuration.

        Raises:
            MissingExtraError: If lightgbm is not installed.
        """
        # Imported lazily because lightgbm is an optional extra.
        try:
            from lightgbm import LGBMRegressor  # noqa: PLC0415
        except ImportError as e:
            raise MissingExtraError("lightgbm", "openstef-models") from e

        lgbm_params = {
            # Core parameters
            "linear_tree": False,
            "objective": "quantile",
            "n_estimators": self.hyperparams.n_estimators,
            "learning_rate": self.hyperparams.learning_rate,
            "max_depth": self.hyperparams.max_depth,
            "min_child_weight": self.hyperparams.min_child_weight,
            # Data binning
            "min_data_in_leaf": self.hyperparams.min_data_in_leaf,
            "min_data_in_bin": self.hyperparams.min_data_in_bin,
            # Regularization
            "reg_alpha": self.hyperparams.reg_alpha,
            "reg_lambda": self.hyperparams.reg_lambda,
            # Tree structure control
            "num_leaves": self.hyperparams.num_leaves,
            "max_bin": self.hyperparams.max_bin,
            # Subsampling
            "colsample_bytree": self.hyperparams.colsample_bytree,
            # General parameters
            "random_state": self.random_state,
            "early_stopping_rounds": self.early_stopping_rounds,
            "verbosity": self.verbosity,
            "n_jobs": self.n_jobs,
            # Forward the configured device; it was previously declared but never used.
            **self._device_params(),
        }

        self._lgbm_model = MultiQuantileRegressor(
            base_learner=LGBMRegressor,  # type: ignore
            quantile_param="alpha",
            hyperparams=lgbm_params,
            quantiles=[float(q) for q in self.quantiles],
        )

    @property
    @override
    def is_fitted(self) -> bool:
        """Whether the underlying multi-quantile regressor reports itself as fitted."""
        return self._lgbm_model.is_fitted

    @staticmethod
    def _prepare_fit_input(data: ForecastInputDataset) -> tuple[pd.DataFrame, np.ndarray, pd.Series]:
        """Split a dataset into the pieces passed to the regressor's ``fit``.

        Args:
            data: Dataset to extract features, target and weights from.

        Returns:
            Tuple of (feature frame, target values as an array, sample weights).
        """
        input_data: pd.DataFrame = data.input_data()
        target: np.ndarray = np.asarray(data.target_series.values)
        sample_weight: pd.Series = data.sample_weight_series
        return input_data, target, sample_weight

    @override
    def fit(self, data: ForecastInputDataset, data_val: ForecastInputDataset | None = None) -> None:
        """Train the per-quantile models.

        Args:
            data: Training dataset.
            data_val: Optional validation dataset; when given it is appended to the
                evaluation sets after the training data.
        """
        # Prepare training data
        input_data, target, sample_weight = self._prepare_fit_input(data)

        # Evaluation sets: the training data always comes first.
        eval_set = [(input_data, target)]
        sample_weight_eval_set = [sample_weight]

        if data_val is not None:
            input_data_val, target_val, sample_weight_val = self._prepare_fit_input(data_val)
            eval_set.append((input_data_val, target_val))
            sample_weight_eval_set.append(sample_weight_val)

        self._lgbm_model.fit(
            X=input_data,
            y=target,
            feature_name=input_data.columns.tolist(),
            sample_weight=sample_weight,
            eval_set=eval_set,
            eval_sample_weight=sample_weight_eval_set,
        )

    @override
    def predict(self, data: ForecastInputDataset) -> ForecastDataset:
        """Predict all configured quantiles from ``data.forecast_start`` onwards.

        Args:
            data: Input dataset holding the features to predict on.

        Returns:
            ForecastDataset with one column per quantile (named via ``quantile.format()``),
            indexed like the input features.

        Raises:
            NotFittedError: If the model has not been fitted.
        """
        if not self.is_fitted:
            raise NotFittedError(self.__class__.__name__)

        input_data: pd.DataFrame = data.input_data(start=data.forecast_start)
        prediction: npt.NDArray[np.floating] = self._lgbm_model.predict(X=input_data)

        return ForecastDataset(
            data=pd.DataFrame(
                data=prediction,
                index=input_data.index,
                columns=[quantile.format() for quantile in self.quantiles],
            ),
            sample_interval=data.sample_interval,
            target_column=data.target_column,
        )

    def predict_contributions(self, data: ForecastInputDataset) -> TimeSeriesDataset:
        """Compute SHAP feature contributions for the median quantile.

        The model of the configured quantile closest to 0.5 is used; on a tie the
        first such quantile wins.

        Args:
            data: Input dataset for which to compute feature contributions.

        Returns:
            TimeSeriesDataset with per-feature SHAP values plus a bias column.

        Raises:
            NotFittedError: If the model has not been fitted.
        """
        if not self.is_fitted:
            raise NotFittedError(self.__class__.__name__)

        input_data: pd.DataFrame = data.input_data(start=data.forecast_start)
        n_quantiles = len(self.quantiles)

        # Extract median quantile model
        median_idx = min(range(n_quantiles), key=lambda i: abs(float(self.quantiles[i]) - 0.5))
        model: LGBMRegressor = self._lgbm_model.models[median_idx]  # type: ignore

        # Get SHAP contributions from median quantile model (includes bias as last column)
        contribs: np.ndarray = model.predict(input_data, pred_contrib=True)  # type: ignore

        columns = [*input_data.columns, "bias"]
        contribs_df = pd.DataFrame(contribs, index=input_data.index, columns=columns)
        return TimeSeriesDataset(data=contribs_df, sample_interval=data.sample_interval)

    @property
    @override
    def feature_importances(self) -> pd.DataFrame:
        """Feature importances of every quantile model, passed through ``normalize_to_unit_sum``.

        Returns:
            DataFrame with features as rows (index name ``feature_name``) and quantiles
            as columns (columns name ``quantiles``).

        Raises:
            NotFittedError: If the model has not been fitted.
        """
        # Fail with the same error as predict() instead of whatever the unfitted
        # regressors would raise.
        if not self.is_fitted:
            raise NotFittedError(self.__class__.__name__)

        models: list[LGBMRegressor] = self._lgbm_model.models  # type: ignore
        weights_df = pd.DataFrame(
            [model.feature_importances_ for model in models],
            index=[quantile.format() for quantile in self.quantiles],
            columns=self._lgbm_model.model_feature_names if self._lgbm_model.has_feature_names else None,
        ).transpose()

        weights_df.index.name = "feature_name"
        weights_df.columns.name = "quantiles"
        return weights_df.pipe(normalize_to_unit_sum)
# Public API of this module: the names exported by a star-import.
__all__ = ["LGBMForecaster", "LGBMHyperParams"]