Source code for openstef_models.models.forecasting.xgboost_forecaster

# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
#
# SPDX-License-Identifier: MPL-2.0

"""XGBoost-based forecasting models for probabilistic energy forecasting.

Provides gradient boosting tree models using XGBoost for multi-quantile energy
forecasting. Optimized for time series data with specialized loss functions and
comprehensive hyperparameter control for production forecasting workflows.
"""

from typing import Annotated, ClassVar, Literal, override

import numpy as np
import pandas as pd
from pydantic import Field, PrivateAttr
from sklearn.preprocessing import StandardScaler

from openstef_core.datasets import ForecastDataset, ForecastInputDataset, TimeSeriesDataset
from openstef_core.exceptions import MissingExtraError, NotFittedError
from openstef_core.mixins.param_ranges import CategoricalRange, FloatRange, IntRange
from openstef_core.mixins.predictor import HyperParams
from openstef_core.utils.pandas import normalize_to_unit_sum
from openstef_models.explainability.mixins import ContributionsMixin, ExplainableForecaster
from openstef_models.models.forecasting.forecaster import Forecaster
from openstef_models.utils.evaluation_functions import EvaluationFunctionType, get_evaluation_function
from openstef_models.utils.loss_functions import (
    ObjectiveFunctionType,
    get_objective_function,
    xgb_prepare_target_for_objective,
)
from openstef_models.utils.xgboost import get_median_shap_contribs

try:
    import xgboost as xgb
except ImportError as e:
    raise MissingExtraError("xgboost", "openstef-models", install_extra="xgb-cpu") from e



[docs]
class XGBoostHyperParams(HyperParams):
    """XGBoost hyperparameters for gradient boosting tree models.

    Configures tree-specific parameters for XGBoost gbtree booster. Provides
    control over model complexity, regularization, and training
    behavior for energy forecasting tasks.

    These parameters control tree structure, learning rates, regularization,
    and sampling strategies. Proper tuning is essential for balancing model
    performance and overfitting prevention in time series forecasting.

    Example:
        Creating custom hyperparameters for deep trees with regularization

        >>> hyperparams = XGBoostHyperParams(
        ...     n_estimators=200,
        ...     max_depth=8,
        ...     learning_rate=0.1,
        ...     reg_alpha=0.1,
        ...     reg_lambda=1.0,
        ...     subsample=0.8,
        ... )

    Note:
        These parameters are optimized for probabilistic forecasting with
        quantile regression. The default objective function is specialized
        for magnitude-weighted pinball loss.
    """

    # Core Tree Boosting Parameters
    n_estimators: Annotated[int, IntRange(50, 500)] = Field(
        default=100,
        description="Number of boosting rounds/trees to fit. Higher values may improve performance but "
        "increase training time and risk overfitting.",
    )
    learning_rate: Annotated[float, FloatRange(0.01, 0.5, log=True)] = Field(
        default=0.3,
        alias="eta",
        description="Step size shrinkage used to prevent overfitting. Range: [0,1]. Lower values require "
        "more boosting rounds.",
    )
    max_depth: Annotated[int, IntRange(1, 15)] = Field(
        default=6,
        description="Maximum depth of trees. Higher values capture more complex patterns but risk "
        "overfitting. Range: [1,∞]",
    )
    min_child_weight: Annotated[float, FloatRange(1.0, 10.0)] = Field(
        default=1,
        description="Minimum sum of instance weight (hessian) needed in a child. Higher values prevent "
        "overfitting. Range: [0,∞]",
    )
    gamma: Annotated[float, FloatRange(0.0, 5.0)] = Field(
        default=0,
        alias="min_split_loss",
        description="Minimum loss reduction required to make a split. Higher values make algorithm more "
        "conservative. Range: [0,∞]",
    )
    objective: ObjectiveFunctionType = Field(
        default="pinball_loss",
        description="Objective function for training. 'pinball_loss' is recommended for probabilistic forecasting.",
    )
    evaluation_metric: EvaluationFunctionType = Field(
        default="mean_pinball_loss",
        description="Metric used for evaluation during training. Defaults to 'mean_pinball_loss' "
        "for quantile regression.",
    )

    # Regularization
    reg_alpha: Annotated[float, FloatRange(1e-8, 10.0, log=True)] = Field(
        default=0, description="L1 regularization on leaf weights. Higher values increase regularization. Range: [0,∞]"
    )
    reg_lambda: Annotated[float, FloatRange(1e-8, 10.0, log=True)] = Field(
        default=1, description="L2 regularization on leaf weights. Higher values increase regularization. Range: [0,∞]"
    )
    max_delta_step: float = Field(
        default=0,
        description="Maximum delta step allowed for leaf weight estimation. Useful for logistic regression with "
        "imbalanced classes. Range: [0,∞]",
    )

    # Tree Structure Control
    max_leaves: int = Field(
        default=0, description="Maximum number of leaves. 0 means no limit. Only relevant when grow_policy='lossguide'."
    )
    grow_policy: Annotated[Literal["depthwise", "lossguide"], CategoricalRange(("depthwise", "lossguide"))] = Field(
        default="depthwise",
        description="Controls how new nodes are added. 'depthwise' grows level by level, 'lossguide' adds leaves "
        "with highest loss reduction.",
    )
    max_bin: int = Field(
        default=256,
        description="Maximum number of discrete bins for continuous features. Higher values may improve accuracy but "
        "increase memory. Only for hist tree_method.",
    )
    num_parallel_trees: int = Field(
        default=1,
        description="Number of trees to grow per round. Higher values increase model complexity and training time. "
        "Range: [1,∞]",
    )

    # Subsampling Parameters
    subsample: Annotated[float, FloatRange(0.5, 1.0)] = Field(
        default=1.0,
        description="Fraction of training samples used for each tree. Lower values prevent overfitting. Range: (0,1]",
    )
    colsample_bytree: Annotated[float, FloatRange(0.5, 1.0)] = Field(
        default=1.0, description="Fraction of features used when constructing each tree. Range: (0,1]"
    )
    colsample_bylevel: float = Field(
        default=1.0, description="Fraction of features used for each level within a tree. Range: (0,1]"
    )
    colsample_bynode: float = Field(
        default=1.0, description="Fraction of features used for each split/node. Range: (0,1]"
    )

    # Tree Construction Method
    tree_method: Annotated[
        Literal["auto", "exact", "hist", "approx", "gpu_hist"],
        CategoricalRange(("auto", "hist", "approx")),
    ] = Field(
        default="auto",
        description="Tree construction algorithm. 'hist' is fastest for large datasets, 'exact' for small "
        "datasets, 'approx' is deprecated.",
    )

    # General Parameters
    random_state: int | None = Field(
        default=42, description="Random seed for reproducibility. Controls tree structure randomness."
    )
    early_stopping_rounds: int | None = Field(
        default=None,
        description="Training will stop if performance doesn't improve for this many rounds. Requires validation data.",
    )
    use_target_scaling: bool = Field(
        default=True,
        description="Whether to apply standard scaling to the target variable before training. Improves convergence.",
    )


[docs]
    @classmethod
    def forecaster_class(cls) -> "type[XGBoostForecaster]":
        """Get the forecaster class for these hyperparams.

        Returns:
            Forecaster class associated with this configuration.
        """
        return XGBoostForecaster




MODEL_CODE_VERSION = 1



[docs]
class XGBoostForecaster(Forecaster, ExplainableForecaster, ContributionsMixin):
    """XGBoost-based forecaster for probabilistic energy forecasting.

    Implements gradient boosting trees using XGBoost for multi-quantile forecasting.
    Optimized for time series prediction with specialized loss functions and
    hyperparameter control suitable for production energy forecasting.

    The forecaster uses a multi-output strategy where each quantile is predicted
    by separate trees within the same boosting ensemble. This approach provides
    well-calibrated uncertainty estimates while maintaining computational efficiency.

    Invariants:
        - fit() must be called before predict() to train the model
        - Configuration quantiles determine the number of prediction outputs
        - Model state is preserved across predict() calls after fitting
        - Input features must match training data structure during prediction

    Example:
        Basic forecasting workflow

        >>> from datetime import timedelta
        >>> from openstef_core.types import LeadTime, Quantile
        >>> forecaster = XGBoostForecaster(
        ...     quantiles=[Quantile(0.1), Quantile(0.5), Quantile(0.9)],
        ...     horizons=[LeadTime(timedelta(hours=1))],
        ...     hyperparams=XGBoostHyperParams(n_estimators=100, max_depth=6),
        ... )
        >>> forecaster.fit(training_data)  # doctest: +SKIP
        >>> predictions = forecaster.predict(test_data)  # doctest: +SKIP

    Note:
        XGBoost dependency is optional and must be installed separately.
        The model automatically handles multi-quantile output and uses
        magnitude-weighted pinball loss by default for better forecasting performance.

    See Also:
        XGBoostHyperParams: Detailed hyperparameter configuration options.
        Forecaster: Base interface for all forecasting models.
        GBLinearForecaster: Alternative linear model using XGBoost.
    """

    HyperParams: ClassVar[type[XGBoostHyperParams]] = XGBoostHyperParams

    hyperparams: XGBoostHyperParams = Field(default_factory=XGBoostHyperParams)
    device: str = Field(
        default="cpu", description="Device for XGBoost computation. Options: 'cpu', 'cuda', 'cuda:<ordinal>', 'gpu'"
    )
    n_jobs: int = Field(
        default=1, description="Number of parallel threads for tree construction. -1 uses all available cores."
    )
    verbosity: Literal[0, 1, 2, 3, True] = Field(
        default=1, description="Verbosity level. 0=silent, 1=warning, 2=info, 3=debug"
    )

    _xgboost_model: xgb.XGBRegressor = PrivateAttr()
    _target_scaler: StandardScaler | None = PrivateAttr()

    @property
    @override
    def hparams(self) -> XGBoostHyperParams:
        return self.hyperparams


[docs]
    @override
    def model_post_init(self, _context: object, /) -> None:
        """Initialize the underlying XGBoost model from configuration."""
        self._xgboost_model = xgb.XGBRegressor(
            # Multi-output configuration
            multi_strategy="one_output_per_tree",
            # Core parameters for forecasting
            n_estimators=self.hyperparams.n_estimators,
            learning_rate=self.hyperparams.learning_rate,
            max_depth=self.hyperparams.max_depth,
            min_child_weight=self.hyperparams.min_child_weight,
            gamma=self.hyperparams.gamma,
            # Regularization parameters
            reg_alpha=self.hyperparams.reg_alpha,
            reg_lambda=self.hyperparams.reg_lambda,
            max_delta_step=self.hyperparams.max_delta_step,
            # Tree structure control
            max_leaves=self.hyperparams.max_leaves,
            grow_policy=self.hyperparams.grow_policy,
            max_bin=self.hyperparams.max_bin,
            num_parallel_trees=self.hyperparams.num_parallel_trees,
            # Subsampling parameters
            subsample=self.hyperparams.subsample,
            colsample_bytree=self.hyperparams.colsample_bytree,
            colsample_bylevel=self.hyperparams.colsample_bylevel,
            colsample_bynode=self.hyperparams.colsample_bynode,
            # Tree construction method
            tree_method=self.hyperparams.tree_method,
            # General parameters
            random_state=self.hyperparams.random_state,
            device=self.device,
            n_jobs=self.n_jobs,
            verbosity=self.verbosity,
            # Early stopping handled in fit method
            early_stopping_rounds=self.hyperparams.early_stopping_rounds,
            # Objective
            objective=get_objective_function(function_type=self.hyperparams.objective, quantiles=self.quantiles),
            eval_metric=get_evaluation_function(
                function_type=self.hyperparams.evaluation_metric, quantiles=self.quantiles
            ),
            disable_default_eval_metric=True,
        )
        self._target_scaler = StandardScaler() if self.hyperparams.use_target_scaling else None


    @property
    @override
    def is_fitted(self) -> bool:
        return self._xgboost_model.__sklearn_is_fitted__()

    def _prepare_fit_input(self, data: ForecastInputDataset) -> tuple[pd.DataFrame, np.ndarray, pd.Series]:
        input_data: pd.DataFrame = data.input_data()
        # Scale the target variable
        target: np.ndarray = np.asarray(data.target_series.values)
        if self._target_scaler is not None:
            target = self._target_scaler.transform(target.reshape(-1, 1)).flatten()
        # Reshape target for multi-quantile objectives
        target = xgb_prepare_target_for_objective(
            target=target,
            quantiles=self.quantiles,
            objective=self.hyperparams.objective,
        )
        # Prepare sample weights
        sample_weight: pd.Series = data.sample_weight_series
        return input_data, target, sample_weight


[docs]
    @override
    def fit(self, data: ForecastInputDataset, data_val: ForecastInputDataset | None = None) -> None:
        # Fit the target scaler
        target: np.ndarray = np.asarray(data.target_series.values)
        if self._target_scaler is not None:
            self._target_scaler.fit(target.reshape(-1, 1))

        # Prepare training data
        input_data, target, sample_weight = self._prepare_fit_input(data)

        # Evaluation sets
        eval_set = [(input_data, target)]
        sample_weight_eval_set = [sample_weight]

        if data_val is not None:
            input_data_val, target_val, sample_weight_val = self._prepare_fit_input(data_val)
            eval_set.append((input_data_val, target_val))
            sample_weight_eval_set.append(sample_weight_val)

        self._xgboost_model.fit(
            X=input_data,
            y=target,
            sample_weight=sample_weight,
            eval_set=eval_set,
            sample_weight_eval_set=sample_weight_eval_set,
            verbose=self.verbosity,
        )



[docs]
    @override
    def predict(self, data: ForecastInputDataset) -> ForecastDataset:
        if not self.is_fitted:
            raise NotFittedError(self.__class__.__name__)

        # Get input features for prediction
        input_data: pd.DataFrame = data.input_data(start=data.forecast_start)
        # Generate predictions
        predictions_array: np.ndarray = self._xgboost_model.predict(input_data).reshape(-1, len(self.quantiles))

        # Inverse transform the scaled predictions
        if self._target_scaler is not None and len(predictions_array) > 0:
            predictions_array = self._target_scaler.inverse_transform(predictions_array)

        return ForecastDataset.from_quantile_predictions(
            predictions_array,
            input_data.index,
            self.quantiles,
            data.sample_interval,
            target_column=data.target_column,
        )



[docs]
    @override
    def predict_contributions(self, data: ForecastInputDataset) -> TimeSeriesDataset:
        """Compute SHAP feature contributions for the median quantile.

        Args:
            data: Input dataset for which to compute feature contributions.

        Returns:
            TimeSeriesDataset with per-feature SHAP values plus a bias column.

        Raises:
            NotFittedError: If the model has not been fitted.
        """
        if not self.is_fitted:
            raise NotFittedError(self.__class__.__name__)

        input_data: pd.DataFrame = data.input_data(start=data.forecast_start)
        contribs = get_median_shap_contribs(self._xgboost_model.get_booster(), input_data, self.quantiles)

        # Inverse transform for target scaling
        if (
            self._target_scaler is not None
            and self._target_scaler.scale_ is not None
            and self._target_scaler.mean_ is not None
        ):
            scale = float(self._target_scaler.scale_[0])
            mean = float(self._target_scaler.mean_[0])
            contribs[:, :-1] *= scale
            contribs[:, -1] = contribs[:, -1] * scale + mean

        columns = [*input_data.columns, "bias"]
        contribs_df = pd.DataFrame(contribs, index=input_data.index, columns=columns)
        return TimeSeriesDataset(data=contribs_df, sample_interval=data.sample_interval)


    @property
    @override
    def feature_importances(self) -> pd.DataFrame:
        booster = self._xgboost_model.get_booster()
        weights_df = pd.DataFrame(
            data=booster.get_score(importance_type="gain"),
            index=[quantile.format() for quantile in self.quantiles],
        ).transpose()
        weights_df.index.name = "feature_name"
        weights_df.columns.name = "quantiles"

        return weights_df.pipe(normalize_to_unit_sum)



__all__ = ["XGBoostForecaster", "XGBoostHyperParams"]