Source code for openstef_models.models.forecasting.xgboost_forecaster

# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
#
# SPDX-License-Identifier: MPL-2.0

"""XGBoost-based forecasting models for probabilistic energy forecasting.

Provides gradient boosting tree models using XGBoost for multi-quantile energy
forecasting. Optimized for time series data with specialized loss functions and
comprehensive hyperparameter control for production forecasting workflows.
"""

from typing import Annotated, ClassVar, Literal, override

import numpy as np
import pandas as pd
from pydantic import Field, PrivateAttr
from sklearn.preprocessing import StandardScaler

from openstef_core.datasets import ForecastDataset, ForecastInputDataset, TimeSeriesDataset
from openstef_core.exceptions import MissingExtraError, NotFittedError
from openstef_core.mixins.param_ranges import CategoricalRange, FloatRange, IntRange
from openstef_core.mixins.predictor import HyperParams
from openstef_core.utils.pandas import normalize_to_unit_sum
from openstef_models.explainability.mixins import ContributionsMixin, ExplainableForecaster
from openstef_models.models.forecasting.forecaster import Forecaster
from openstef_models.utils.evaluation_functions import EvaluationFunctionType, get_evaluation_function
from openstef_models.utils.loss_functions import (
    ObjectiveFunctionType,
    get_objective_function,
    xgb_prepare_target_for_objective,
)
from openstef_models.utils.xgboost import get_median_shap_contribs

try:
    import xgboost as xgb
except ImportError as e:
    raise MissingExtraError("xgboost", "openstef-models") from e


[docs] class XGBoostHyperParams(HyperParams): """XGBoost hyperparameters for gradient boosting tree models. Configures tree-specific parameters for XGBoost gbtree booster. Provides control over model complexity, regularization, and training behavior for energy forecasting tasks. These parameters control tree structure, learning rates, regularization, and sampling strategies. Proper tuning is essential for balancing model performance and overfitting prevention in time series forecasting. Example: Creating custom hyperparameters for deep trees with regularization >>> hyperparams = XGBoostHyperParams( ... n_estimators=200, ... max_depth=8, ... learning_rate=0.1, ... reg_alpha=0.1, ... reg_lambda=1.0, ... subsample=0.8, ... ) Note: These parameters are optimized for probabilistic forecasting with quantile regression. The default objective function is specialized for magnitude-weighted pinball loss. """ # Core Tree Boosting Parameters n_estimators: Annotated[int, IntRange(50, 500)] = Field( default=100, description="Number of boosting rounds/trees to fit. Higher values may improve performance but " "increase training time and risk overfitting.", ) learning_rate: Annotated[float, FloatRange(0.01, 0.5, log=True)] = Field( default=0.3, alias="eta", description="Step size shrinkage used to prevent overfitting. Range: [0,1]. Lower values require " "more boosting rounds.", ) max_depth: Annotated[int, IntRange(1, 15)] = Field( default=6, description="Maximum depth of trees. Higher values capture more complex patterns but risk " "overfitting. Range: [1,∞]", ) min_child_weight: Annotated[float, FloatRange(1.0, 10.0)] = Field( default=1, description="Minimum sum of instance weight (hessian) needed in a child. Higher values prevent " "overfitting. Range: [0,∞]", ) gamma: Annotated[float, FloatRange(0.0, 5.0)] = Field( default=0, alias="min_split_loss", description="Minimum loss reduction required to make a split. Higher values make algorithm more " "conservative. Range: [0,∞]", ) objective: ObjectiveFunctionType = Field( default="pinball_loss", description="Objective function for training. 'pinball_loss' is recommended for probabilistic forecasting.", ) evaluation_metric: EvaluationFunctionType = Field( default="mean_pinball_loss", description="Metric used for evaluation during training. Defaults to 'mean_pinball_loss' " "for quantile regression.", ) # Regularization reg_alpha: Annotated[float, FloatRange(1e-8, 10.0, log=True)] = Field( default=0, description="L1 regularization on leaf weights. Higher values increase regularization. Range: [0,∞]" ) reg_lambda: Annotated[float, FloatRange(1e-8, 10.0, log=True)] = Field( default=1, description="L2 regularization on leaf weights. Higher values increase regularization. Range: [0,∞]" ) max_delta_step: float = Field( default=0, description="Maximum delta step allowed for leaf weight estimation. Useful for logistic regression with " "imbalanced classes. Range: [0,∞]", ) # Tree Structure Control max_leaves: int = Field( default=0, description="Maximum number of leaves. 0 means no limit. Only relevant when grow_policy='lossguide'." ) grow_policy: Annotated[Literal["depthwise", "lossguide"], CategoricalRange(("depthwise", "lossguide"))] = Field( default="depthwise", description="Controls how new nodes are added. 'depthwise' grows level by level, 'lossguide' adds leaves " "with highest loss reduction.", ) max_bin: int = Field( default=256, description="Maximum number of discrete bins for continuous features. Higher values may improve accuracy but " "increase memory. Only for hist tree_method.", ) num_parallel_trees: int = Field( default=1, description="Number of trees to grow per round. Higher values increase model complexity and training time. " "Range: [1,∞]", ) # Subsampling Parameters subsample: Annotated[float, FloatRange(0.5, 1.0)] = Field( default=1.0, description="Fraction of training samples used for each tree. Lower values prevent overfitting. Range: (0,1]", ) colsample_bytree: Annotated[float, FloatRange(0.5, 1.0)] = Field( default=1.0, description="Fraction of features used when constructing each tree. Range: (0,1]" ) colsample_bylevel: float = Field( default=1.0, description="Fraction of features used for each level within a tree. Range: (0,1]" ) colsample_bynode: float = Field( default=1.0, description="Fraction of features used for each split/node. Range: (0,1]" ) # Tree Construction Method tree_method: Annotated[ Literal["auto", "exact", "hist", "approx", "gpu_hist"], CategoricalRange(("auto", "hist", "approx")), ] = Field( default="auto", description="Tree construction algorithm. 'hist' is fastest for large datasets, 'exact' for small " "datasets, 'approx' is deprecated.", ) # General Parameters random_state: int | None = Field( default=42, description="Random seed for reproducibility. Controls tree structure randomness." ) early_stopping_rounds: int | None = Field( default=None, description="Training will stop if performance doesn't improve for this many rounds. Requires validation data.", ) use_target_scaling: bool = Field( default=True, description="Whether to apply standard scaling to the target variable before training. Improves convergence.", )
[docs] @classmethod def forecaster_class(cls) -> "type[XGBoostForecaster]": """Get the forecaster class for these hyperparams. Returns: Forecaster class associated with this configuration. """ return XGBoostForecaster
MODEL_CODE_VERSION = 1
[docs] class XGBoostForecaster(Forecaster, ExplainableForecaster, ContributionsMixin): """XGBoost-based forecaster for probabilistic energy forecasting. Implements gradient boosting trees using XGBoost for multi-quantile forecasting. Optimized for time series prediction with specialized loss functions and hyperparameter control suitable for production energy forecasting. The forecaster uses a multi-output strategy where each quantile is predicted by separate trees within the same boosting ensemble. This approach provides well-calibrated uncertainty estimates while maintaining computational efficiency. Invariants: - fit() must be called before predict() to train the model - Configuration quantiles determine the number of prediction outputs - Model state is preserved across predict() calls after fitting - Input features must match training data structure during prediction Example: Basic forecasting workflow >>> from datetime import timedelta >>> from openstef_core.types import LeadTime, Quantile >>> forecaster = XGBoostForecaster( ... quantiles=[Quantile(0.1), Quantile(0.5), Quantile(0.9)], ... horizons=[LeadTime(timedelta(hours=1))], ... hyperparams=XGBoostHyperParams(n_estimators=100, max_depth=6), ... ) >>> forecaster.fit(training_data) # doctest: +SKIP >>> predictions = forecaster.predict(test_data) # doctest: +SKIP Note: XGBoost dependency is optional and must be installed separately. The model automatically handles multi-quantile output and uses magnitude-weighted pinball loss by default for better forecasting performance. See Also: XGBoostHyperParams: Detailed hyperparameter configuration options. Forecaster: Base interface for all forecasting models. GBLinearForecaster: Alternative linear model using XGBoost. """ HyperParams: ClassVar[type[XGBoostHyperParams]] = XGBoostHyperParams hyperparams: XGBoostHyperParams = Field(default_factory=XGBoostHyperParams) device: str = Field( default="cpu", description="Device for XGBoost computation. Options: 'cpu', 'cuda', 'cuda:<ordinal>', 'gpu'" ) n_jobs: int = Field( default=1, description="Number of parallel threads for tree construction. -1 uses all available cores." ) verbosity: Literal[0, 1, 2, 3, True] = Field( default=1, description="Verbosity level. 0=silent, 1=warning, 2=info, 3=debug" ) _xgboost_model: xgb.XGBRegressor = PrivateAttr() _target_scaler: StandardScaler | None = PrivateAttr() @property @override def hparams(self) -> XGBoostHyperParams: return self.hyperparams
[docs] def model_post_init(self, _context: object, /) -> None: """Initialize the underlying XGBoost model from configuration.""" self._xgboost_model = xgb.XGBRegressor( # Multi-output configuration multi_strategy="one_output_per_tree", # Core parameters for forecasting n_estimators=self.hyperparams.n_estimators, learning_rate=self.hyperparams.learning_rate, max_depth=self.hyperparams.max_depth, min_child_weight=self.hyperparams.min_child_weight, gamma=self.hyperparams.gamma, # Regularization parameters reg_alpha=self.hyperparams.reg_alpha, reg_lambda=self.hyperparams.reg_lambda, max_delta_step=self.hyperparams.max_delta_step, # Tree structure control max_leaves=self.hyperparams.max_leaves, grow_policy=self.hyperparams.grow_policy, max_bin=self.hyperparams.max_bin, num_parallel_trees=self.hyperparams.num_parallel_trees, # Subsampling parameters subsample=self.hyperparams.subsample, colsample_bytree=self.hyperparams.colsample_bytree, colsample_bylevel=self.hyperparams.colsample_bylevel, colsample_bynode=self.hyperparams.colsample_bynode, # Tree construction method tree_method=self.hyperparams.tree_method, # General parameters random_state=self.hyperparams.random_state, device=self.device, n_jobs=self.n_jobs, verbosity=self.verbosity, # Early stopping handled in fit method early_stopping_rounds=self.hyperparams.early_stopping_rounds, # Objective objective=get_objective_function(function_type=self.hyperparams.objective, quantiles=self.quantiles), eval_metric=get_evaluation_function( function_type=self.hyperparams.evaluation_metric, quantiles=self.quantiles ), disable_default_eval_metric=True, ) self._target_scaler = StandardScaler() if self.hyperparams.use_target_scaling else None
@property @override def is_fitted(self) -> bool: return self._xgboost_model.__sklearn_is_fitted__() def _prepare_fit_input(self, data: ForecastInputDataset) -> tuple[pd.DataFrame, np.ndarray, pd.Series]: input_data: pd.DataFrame = data.input_data() # Scale the target variable target: np.ndarray = np.asarray(data.target_series.values) if self._target_scaler is not None: target = self._target_scaler.transform(target.reshape(-1, 1)).flatten() # Reshape target for multi-quantile objectives target = xgb_prepare_target_for_objective( target=target, quantiles=self.quantiles, objective=self.hyperparams.objective, ) # Prepare sample weights sample_weight: pd.Series = data.sample_weight_series return input_data, target, sample_weight
[docs] @override def fit(self, data: ForecastInputDataset, data_val: ForecastInputDataset | None = None) -> None: # Fit the target scaler target: np.ndarray = np.asarray(data.target_series.values) if self._target_scaler is not None: self._target_scaler.fit(target.reshape(-1, 1)) # Prepare training data input_data, target, sample_weight = self._prepare_fit_input(data) # Evaluation sets eval_set = [(input_data, target)] sample_weight_eval_set = [sample_weight] if data_val is not None: input_data_val, target_val, sample_weight_val = self._prepare_fit_input(data_val) eval_set.append((input_data_val, target_val)) sample_weight_eval_set.append(sample_weight_val) self._xgboost_model.fit( X=input_data, y=target, sample_weight=sample_weight, eval_set=eval_set, sample_weight_eval_set=sample_weight_eval_set, verbose=self.verbosity, )
[docs] @override def predict(self, data: ForecastInputDataset) -> ForecastDataset: if not self.is_fitted: raise NotFittedError(self.__class__.__name__) # Get input features for prediction input_data: pd.DataFrame = data.input_data(start=data.forecast_start) # Generate predictions predictions_array: np.ndarray = self._xgboost_model.predict(input_data).reshape(-1, len(self.quantiles)) # Inverse transform the scaled predictions if self._target_scaler is not None and len(predictions_array) > 0: predictions_array = self._target_scaler.inverse_transform(predictions_array) return ForecastDataset.from_quantile_predictions( predictions_array, input_data.index, self.quantiles, data.sample_interval, target_column=data.target_column, )
[docs] def predict_contributions(self, data: ForecastInputDataset) -> TimeSeriesDataset: """Compute SHAP feature contributions for the median quantile. Args: data: Input dataset for which to compute feature contributions. Returns: TimeSeriesDataset with per-feature SHAP values plus a bias column. Raises: NotFittedError: If the model has not been fitted. """ if not self.is_fitted: raise NotFittedError(self.__class__.__name__) input_data: pd.DataFrame = data.input_data(start=data.forecast_start) contribs = get_median_shap_contribs(self._xgboost_model.get_booster(), input_data, self.quantiles) # Inverse transform for target scaling if ( self._target_scaler is not None and self._target_scaler.scale_ is not None and self._target_scaler.mean_ is not None ): scale = float(self._target_scaler.scale_[0]) mean = float(self._target_scaler.mean_[0]) contribs[:, :-1] *= scale contribs[:, -1] = contribs[:, -1] * scale + mean columns = [*input_data.columns, "bias"] contribs_df = pd.DataFrame(contribs, index=input_data.index, columns=columns) return TimeSeriesDataset(data=contribs_df, sample_interval=data.sample_interval)
@property @override def feature_importances(self) -> pd.DataFrame: booster = self._xgboost_model.get_booster() weights_df = pd.DataFrame( data=booster.get_score(importance_type="gain"), index=[quantile.format() for quantile in self.quantiles], ).transpose() weights_df.index.name = "feature_name" weights_df.columns.name = "quantiles" return weights_df.pipe(normalize_to_unit_sum)
__all__ = ["XGBoostForecaster", "XGBoostHyperParams"]