Source code for openstef_models.utils.loss_functions

# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
#
# SPDX-License-Identifier: MPL-2.0

"""Loss functions for quantile regression with XGBoost.

This module provides custom loss functions for multi-quantile regression with XGBoost,
including magnitude-weighted pinball loss, arctan-smoothed pinball loss, and standard
pinball loss. All functions support sample weighting for flexible training.
"""

from collections.abc import Callable
from functools import partial
from typing import Any, Literal, cast

import numpy as np
import numpy.typing as npt

from openstef_core.types import Quantile



[docs]
def arctan_loss_multi_objective(
    y_true: npt.NDArray[np.floating],
    y_pred: npt.NDArray[np.floating],
    quantiles: list[Quantile],
    sample_weight: npt.NDArray[np.floating] | None = None,
    s: float = 0.1,
) -> tuple[npt.NDArray[np.floating], npt.NDArray[np.floating]]:
    """Arctan-smoothed multi-quantile pinball loss objective function for XGBoost.

    Computes first and second derivatives of the arctan pinball loss, a smooth approximation
    of the standard pinball loss that provides non-zero second derivatives essential for
    XGBoost's second-order optimization. This formulation addresses the fundamental limitation
    of standard pinball loss having zero second derivatives.

    The arctan pinball loss maintains asymptotic unbiasedness while providing substantial
    second derivatives in the relevant domain (typically -10 to 10 for standardized targets),
    making it well-suited for gradient boosting algorithms that rely on second-order
    information for tree splitting decisions.

    Args:
        y_true: True target values, shape (n_samples, n_quantiles)
        y_pred: Predicted values, shape (n_samples, n_quantiles)
        quantiles: List of validated quantiles corresponding to predictions
        sample_weight: Optional sample weights, shape (n_samples,). If provided,
            gradients and hessians are scaled by these weights to give different importance
            to different samples in the loss computation.
        s: Smoothing parameter for the arctan function. Smaller values provide closer approximation
           to standard pinball loss but reduce second derivative magnitude.

    Returns:
        tuple: (gradient, hessian) arrays in 2D format for XGBoost multi-output

            - gradient: First derivative of arctan pinball loss, shape (n_samples, n_quantiles)
            - hessian: Second derivative of arctan pinball loss, shape (n_samples, n_quantiles)

            Both arrays are normalized by n_quantiles for consistent optimization dynamics.

    Mathematical Formulation:
        Loss function: L^(arctan)_τ,s(u) = (τ - 0.5 + arctan(u/s)/π) * u + s/π

        Where:

        - u = y_true - y_pred (prediction error)
        - τ = quantile level (0 < τ < 1)
        - s = smoothing parameter

        First derivative (gradient):
        ∂L/∂u = τ - 0.5 + arctan(u/s)/π + u/(π*s*(1+(u/s)²))

        Second derivative (hessian):
        ∂²L/∂u² = 2/(π*s) * (1+(u/s)²)^(-2)

    Key Properties:

        - Smooth approximation: Continuously differentiable everywhere
        - Asymptotic unbiasedness: Behaves like standard pinball loss for large ``|u|``
        - Non-zero hessian: Provides substantial second derivatives for XGBoost optimization
        - Scale sensitivity: Hessian magnitude inversely proportional to smoothing parameter s

    Gradient Normalization:
        Both gradient and hessian are divided by n_quantiles to ensure consistent optimization
        dynamics regardless of the number of quantiles being predicted. This normalization
        prevents gradient magnitude from scaling linearly with the number of outputs.

    References:
        - Arctan pinball loss: arXiv:2406.02293v1 "Composite Quantile Regression With XGBoost"
        - Second-order optimization: XGBoost requires non-zero hessian for tree splitting
        - Implementation: https://github.com/LaurensSluyterman/XGBoost_quantile_regression
    """
    # Resize the predictions and targets.
    n_quantiles = len(quantiles)
    y_pred = np.reshape(y_pred, (-1, n_quantiles))
    y_true = np.reshape(y_true, (-1, n_quantiles))
    sample_weight = sample_weight[:, np.newaxis] if sample_weight is not None else None

    # Calculate the differences
    u = y_true - y_pred

    # Extract quantile values into array for vectorized operations
    quantile_values = np.array(quantiles)  # shape: (n_quantiles,)

    # Calculate the scaled differences
    z = u / s  # shape: (n_samples, n_quantiles)

    # Vectorized computation using broadcasting
    # quantile_values broadcasts from (n_quantiles,) to (n_samples, n_quantiles)
    x = 1 + z**2  # shape: (n_samples, n_quantiles)

    # Compute errors for proper sign handling
    errors = y_pred - y_true  # shape: (n_samples, n_quantiles)

    # Compute gradients for all quantiles simultaneously
    # Scale gradient by error magnitude to handle unscaled data, similar to magnitude-weighted approach
    # The arctan term provides smoothness, while error scaling ensures effective learning with large-scale data
    grad = (quantile_values - 0.5 + (1 / np.pi) * np.arctan(z) + z / (np.pi * x)) * np.abs(errors)

    # Compute hessians for all quantiles simultaneously
    # Use constant hessian for numerical stability (arctan's natural hessian decays too quickly)
    hess = np.ones_like(errors)

    # Apply sample weights if provided
    if sample_weight is not None:
        grad *= sample_weight
        hess *= sample_weight

    # Shape: (n_samples, n_quantiles) for proper multi-output format
    # Negate gradient to get correct sign (arctan formula uses u = y_true - y_pred convention)
    return -grad / n_quantiles, hess / n_quantiles




[docs]
def pinball_loss_multi_objective(
    y_true: npt.NDArray[np.floating],
    y_pred: npt.NDArray[np.floating],
    quantiles: list[Quantile],
    sample_weight: npt.NDArray[np.floating] | None = None,
) -> tuple[npt.NDArray[np.floating], npt.NDArray[np.floating]]:
    """Multi-quantile pinball loss objective function for XGBoost.

    Computes first-order derivatives of pinball loss for multiple quantiles
    and non-degenerate substitutes for second-order derivatives. This implementation
    scales gradients by error magnitude, making it effective with both scaled and
    unscaled data.

    The pinball loss is the standard loss function for quantile regression, where
    underestimation errors are penalized by the quantile level and overestimation
    errors by (1 - quantile). This ensures unbiased quantile estimates when the
    model converges.

    Gradient scaling by error magnitude addresses convergence issues with large-scale
    unscaled data (e.g., values in millions) by making the objective function responsive
    to error scale.

    Non-zero second-order derivatives are returned instead of zeros because XGBoost requires
    non-degenerate hessian values for proper convergence. See XGBoost issue #1825 for details
    on why this approximation is mathematically valid. Ensure that the hyperparameter
    `max_delta_step` satisfies: 0.5 * max_delta_step <= min(quantile, 1 - quantile) for all
    quantiles.

    Args:
        y_true: True target values, shape (n_samples, n_quantiles)
        y_pred: Predicted values, shape (n_samples, n_quantiles)
        quantiles: List of validated quantiles corresponding to predictions
        sample_weight: Optional sample weights, shape (n_samples,). If provided,
            gradients and hessians are scaled by these weights to give different importance
            to different samples in the loss computation.

    Returns:
        tuple: (gradient, hessian) arrays in 2D format for XGBoost multi-output

            - gradient: First derivative scaled by error magnitude, shape (n_samples, n_quantiles)
            - hessian: Constant positive values for numerical stability, shape (n_samples, n_quantiles)

            Both arrays are normalized by n_quantiles for consistent loss values.

    Mathematical Formulation:
        Magnitude-aware pinball loss gradient: ∇L = (τ * I(error < 0) + (1 - τ) * I(error >= 0)) * error

        Where τ is the quantile level, I(·) is the indicator function, and error = y_pred - y_true.
        The gradient is scaled by error magnitude to handle unscaled data effectively.

    References:
        - Original pinball loss: Koenker & Bassett (1978), "Regression Quantiles"
        - XGBoost hessian approximation: https://github.com/dmlc/xgboost/issues/1825
        - Implementation reference: https://gist.github.com/Nikolay-Lysenko/06769d701c1d9c9acb9a66f2f9d7a6c7
    """
    # Resize the predictions and targets.
    n_quantiles = len(quantiles)
    y_pred = np.reshape(y_pred, (-1, n_quantiles))
    y_true = np.reshape(y_true, (-1, n_quantiles))
    sample_weight = sample_weight[:, np.newaxis] if sample_weight is not None else None

    # Extract quantile values into array for vectorized operations
    quantile_values = np.array(quantiles)  # shape: (n_quantiles,)

    # Compute errors for all quantiles at once
    errors = y_pred - y_true  # shape: (n_samples, n_quantiles)

    # Compute masks for all quantiles simultaneously
    left_mask = errors < 0  # underprediction, shape: (n_samples, n_quantiles)
    right_mask = errors >= 0  # overprediction, shape: (n_samples, n_quantiles)

    # Vectorized gradient computation using broadcasting
    # quantile_values broadcasts from (n_quantiles,) to (n_samples, n_quantiles)
    # Scale by error magnitude to handle unscaled data (same approach as magnitude-weighted version)
    gradient = (quantile_values * left_mask + (1 - quantile_values) * right_mask) * errors

    # Non-degenerate hessian for XGBoost numerical stability
    hessian = np.ones_like(y_pred)

    # Apply sample weights if provided
    if sample_weight is not None:
        gradient *= sample_weight
        hessian *= sample_weight

    # Shape: (n_samples, n_quantiles) for proper multi-output format
    return gradient / n_quantiles, hessian / n_quantiles



type ObjectiveFunctionType = Literal["pinball_loss", "arctan_loss"]

OBJECTIVE_MAP = {
    "pinball_loss": pinball_loss_multi_objective,
    "arctan_loss": arctan_loss_multi_objective,
}


def xgb_prepare_target_for_objective(
    target: np.ndarray,
    objective: ObjectiveFunctionType | Literal["reg:quantileerror"],
    quantiles: list[Quantile],
) -> np.ndarray:
    if objective == "reg:quantileerror":
        return target

    return np.repeat(target[:, np.newaxis], repeats=len(quantiles), axis=1)


def get_objective_function(
    function_type: ObjectiveFunctionType,
    quantiles: list[Quantile],
    **kwargs: Any,
) -> Callable[
    [npt.NDArray[np.floating], npt.NDArray[np.floating]], tuple[npt.NDArray[np.floating], npt.NDArray[np.floating]]
]:
    fn = partial(OBJECTIVE_MAP[function_type], quantiles=quantiles, **kwargs)
    cast(Any, fn).__name__ = function_type
    return fn


__all__ = [
    "OBJECTIVE_MAP",
    "ObjectiveFunctionType",
    "arctan_loss_multi_objective",
    "pinball_loss_multi_objective",
]