# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
#
# SPDX-License-Identifier: MPL-2.0

"""Metrics for forecasts that predict probability distributions instead of single values.

Unlike deterministic forecasts that predict one value (e.g., "load will be 100 MW"),
probabilistic forecasts predict a range of possible outcomes with their likelihoods
(e.g., "80% chance load will be between 90-110 MW"). These metrics evaluate both
how accurate these probability estimates are and how well-calibrated they are.

Key concepts:
    - Calibration: Do 90% prediction intervals actually contain the true value 90% of the time?
    - Sharpness: How narrow are the prediction intervals (more precise is better)?
    - Proper scoring: Metrics that reward honest probability estimates over gaming the system.
"""

import numpy as np
import numpy.typing as npt

from openstef_core.exceptions import MissingExtraError
from openstef_core.types import Quantile


def crps(
    y_true: npt.NDArray[np.floating],
    y_pred: npt.NDArray[np.floating],
    quantiles: npt.NDArray[np.floating],
    sample_weights: npt.NDArray[np.floating] | None = None,
) -> float:
    """Calculate the Continuous Ranked Probability Score (CRPS) for probabilistic forecasts.

    CRPS is a proper scoring rule that measures the quality of probabilistic forecasts.
    It generalizes the absolute error to distributional forecasts and is expressed in
    the same units as the forecast variable.

    Args:
        y_true: Observed values with shape (num_samples,).
        y_pred: Predicted quantiles with shape (num_samples, num_quantiles). Each row
            contains quantile predictions for the corresponding observation.
        quantiles: Quantile levels with shape (num_quantiles,). Must be sorted in
            ascending order and contain values in [0, 1].
        sample_weights: Optional weights for each sample with shape (num_samples,).
            If None, all samples are weighted equally.

    Returns:
        The weighted average CRPS across all samples. Lower values indicate better
        forecast quality.

    Example:
        Evaluate quantile forecasts for energy load:

        >>> import numpy as np
        >>> y_true = np.array([100, 120, 110])
        >>> quantiles = np.array([0.1, 0.5, 0.9])
        >>> y_pred = np.array([[95, 100, 105],    # Quantiles for first observation
        ...                    [115, 120, 125],   # Quantiles for second observation
        ...                    [105, 110, 115]])  # Quantiles for third observation
        >>> score = crps(y_true, y_pred, quantiles)
        >>> isinstance(score, float)
        True

    Note:
        CRPS reduces to the absolute error when evaluating point forecasts (a single
        quantile). For a well-calibrated forecast, the expected CRPS is half the
        expected absolute error of a single random draw from the predictive
        distribution.

    Raises:
        MissingExtraError: If the `scoringrules` package is not installed.
    """
    try:
        import scoringrules as sr  # noqa: PLC0415 - import is quite slow, so we delay it until this function is called
    except ImportError as e:
        raise MissingExtraError("scoringrules", package="openstef-beam") from e

    return float(np.average(sr.crps_quantile(y_true, y_pred, quantiles), weights=sample_weights))
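
# Illustrative sketch (not part of the module API): per the note in the docstring
# above, the quantile-based CRPS collapses to the absolute error when only the
# median is predicted. The exact value assumes the usual quantile (pinball)
# decomposition implemented by `scoringrules`:
#
#     >>> import numpy as np
#     >>> y_true = np.array([100.0, 120.0])
#     >>> y_pred = np.array([[102.0], [118.0]])  # median-only point forecasts
#     >>> crps(y_true, y_pred, np.array([0.5]))  # expected to match the MAE, 2.0
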
def rcrps(
    y_true: npt.NDArray[np.floating],
    y_pred: npt.NDArray[np.floating],
    quantiles: npt.NDArray[np.floating],
    lower_quantile: float = 0.05,
    upper_quantile: float = 0.95,
    sample_weights: npt.NDArray[np.floating] | None = None,
) -> float:
    """Calculate the relative Continuous Ranked Probability Score (rCRPS).

    The rCRPS normalizes the CRPS by the range of observed values, making it
    scale-invariant and suitable for comparing forecast quality across different
    datasets or time periods with varying magnitudes.

    Args:
        y_true: Observed values with shape (num_samples,).
        y_pred: Predicted quantiles with shape (num_samples, num_quantiles).
        quantiles: Quantile levels with shape (num_quantiles,). Must be sorted in
            ascending order and contain values in [0, 1].
        lower_quantile: Lower quantile for range calculation. Must be in [0, 1].
        upper_quantile: Upper quantile for range calculation. Must be in [0, 1]
            and greater than lower_quantile.
        sample_weights: Optional weights for each sample with shape (num_samples,).

    Returns:
        The relative CRPS as a float. Returns NaN if the range between quantiles
        is zero.

    Example:
        Compare forecast quality across different scales:

        >>> import numpy as np
        >>> # High load period
        >>> y_true_high = np.array([1000, 1200, 1100])
        >>> quantiles = np.array([0.1, 0.5, 0.9])
        >>> y_pred_high = np.array([[950, 1000, 1050],
        ...                         [1150, 1200, 1250],
        ...                         [1050, 1100, 1150]])
        >>> rcrps_high = rcrps(y_true_high, y_pred_high, quantiles)
        >>> isinstance(rcrps_high, float)
        True

    Note:
        rCRPS allows fair comparison of forecast quality between periods with
        different load levels, such as summer vs. winter energy demand.
    """
    y_range = np.quantile(y_true, q=upper_quantile) - np.quantile(y_true, q=lower_quantile)
    if y_range == 0:
        return float("NaN")

    return float(crps(y_true, y_pred, quantiles, sample_weights) / y_range)
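
# Illustrative sketch: because the CRPS and the observation range both scale
# linearly with the data, rCRPS is invariant under rescaling. Scoring a period
# with ten times the load should therefore leave the value unchanged (up to
# floating-point rounding):
#
#     >>> import numpy as np
#     >>> y_true = np.array([100.0, 120.0, 110.0])
#     >>> quantiles = np.array([0.1, 0.5, 0.9])
#     >>> y_pred = np.array([[95.0, 100.0, 105.0],
#     ...                    [115.0, 120.0, 125.0],
#     ...                    [105.0, 110.0, 115.0]])
#     >>> bool(np.isclose(rcrps(y_true, y_pred, quantiles),
#     ...                 rcrps(10 * y_true, 10 * y_pred, quantiles)))
#     True
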
def observed_probability(
    y_true: npt.NDArray[np.floating],
    y_pred: npt.NDArray[np.floating],
) -> float:
    """Calculate the observed probability (empirical quantile) of predicted values.

    This function determines what quantile the predicted values correspond to based
    on the observed outcomes. For well-calibrated forecasts, a prediction at the
    p-th quantile should have approximately a fraction p of observations below it.

    Args:
        y_true: Observed values with shape (num_samples,).
        y_pred: Predicted values with shape (num_samples,). These are typically
            predictions from a specific quantile level.

    Returns:
        The empirical quantile level as a float in [0, 1]. This represents the
        fraction of observations that fall below the predicted values.

    Example:
        Check calibration of median forecasts:

        >>> import numpy as np
        >>> y_true = np.array([95, 105, 100, 110, 90])
        >>> y_pred = np.array([100, 100, 100, 100, 100])  # Median predictions
        >>> obs_prob = observed_probability(y_true, y_pred)
        >>> round(obs_prob, 1)  # Should be close to 0.5 for well-calibrated median
        0.4

    Note:
        This metric is fundamental for evaluating forecast calibration. Systematic
        deviations from expected quantile levels indicate overconfident or
        underconfident uncertainty estimates.
    """
    probability = np.mean(y_true < y_pred)
    # The mean of an empty comparison is NaN; fall back to 0.0 in that case.
    return float(probability) if not np.isnan(probability) else 0.0
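
# Illustrative sketch: observed_probability doubles as a quick bias diagnostic
# for a single quantile column. A value near 1.0 means the model predicts above
# nearly all observations (over-prediction); near 0.0 means it predicts below
# them (under-prediction):
#
#     >>> import numpy as np
#     >>> y_true = np.array([95.0, 105.0, 100.0])
#     >>> observed_probability(y_true, np.array([200.0, 200.0, 200.0]))
#     1.0
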
def mean_absolute_calibration_error(
    y_true: npt.NDArray[np.floating],
    y_pred: npt.NDArray[np.floating],
    quantiles: npt.NDArray[np.floating],
) -> float:
    """Calculate the Mean Absolute Calibration Error (MACE) for probabilistic forecasts.

    MACE measures how well the predicted quantiles match their nominal levels by
    comparing observed probabilities to expected quantile levels. Perfect calibration
    yields MACE = 0.

    Args:
        y_true: Observed values with shape (num_samples,).
        y_pred: Predicted quantiles with shape (num_samples, num_quantiles). Each
            column represents predictions for a specific quantile level.
        quantiles: Nominal quantile levels with shape (num_quantiles,). Must be
            sorted in ascending order and contain values in [0, 1].

    Returns:
        The mean absolute calibration error. For symmetric quantile sets the value
        lies in [0, 0.5]; values closer to 0 indicate better calibration.

    Example:
        Evaluate calibration of quantile forecasts:

        >>> import numpy as np
        >>> y_true = np.array([95, 105, 100, 110, 90, 115, 85, 120])
        >>> quantiles = np.array([0.1, 0.5, 0.9])
        >>> y_pred = np.array([[90, 95, 100],  # 10%, 50%, 90% quantiles
        ...                    [100, 105, 110],
        ...                    [95, 100, 105],
        ...                    [105, 110, 115],
        ...                    [85, 90, 95],
        ...                    [110, 115, 120],
        ...                    [80, 85, 90],
        ...                    [115, 120, 125]])
        >>> mace = mean_absolute_calibration_error(y_true, y_pred, quantiles)
        >>> round(mace, 2)
        0.23

    Note:
        MACE is a key diagnostic for probabilistic forecasts. High MACE values
        indicate that the forecast confidence intervals are either too wide
        (underconfident) or too narrow (overconfident).
    """
    # Empirical coverage of each quantile column.
    observed_probs = np.array([observed_probability(y_true, y_pred[:, i]) for i in range(len(quantiles))])

    return float(np.mean(np.abs(observed_probs - quantiles)))
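
# Illustrative sketch: for a symmetric quantile set, maximal miscalibration gives
# MACE = 0.5. Here every prediction sits above every observation, so the observed
# probability is 1.0 for each quantile column:
#
#     >>> import numpy as np
#     >>> y_true = np.array([1.0, 2.0, 3.0])
#     >>> quantiles = np.array([0.1, 0.5, 0.9])
#     >>> y_pred = np.full((3, 3), 100.0)
#     >>> round(mean_absolute_calibration_error(y_true, y_pred, quantiles), 2)
#     0.5
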
def mean_pinball_loss(
    y_true: npt.NDArray[np.floating],
    y_pred: npt.NDArray[np.floating],
    quantiles: list[Quantile],
    sample_weight: npt.NDArray[np.floating] | None = None,
) -> float:
    """Calculate the Mean Pinball Loss for quantile forecasts.

    The pinball loss is a proper scoring rule for evaluating quantile forecasts.
    It penalizes under- and over-predictions asymmetrically, with weights
    determined by the quantile level.

    Args:
        y_true: Observed values with shape (num_samples,) or (num_samples, num_quantiles).
        y_pred: Predicted quantiles with shape (num_samples, num_quantiles). Each
            column corresponds to predictions for a specific quantile level.
        quantiles: Quantile levels of length num_quantiles. Must be sorted in
            ascending order and contain values in [0, 1].
        sample_weight: Optional weights for each sample with shape (num_samples,).

    Returns:
        The weighted average pinball loss across all samples and quantiles.
        Lower values indicate better forecast quality.
    """
    # Resize the predictions and targets.
    y_pred = np.reshape(y_pred, [-1, len(quantiles)])
    n_rows = y_pred.shape[0]
    y_true = np.reshape(y_true, [n_rows, -1])

    # Extract quantile values into an array for vectorized operations.
    quantile_values = np.array(quantiles)  # shape: (num_quantiles,)

    # Compute errors for all quantiles at once.
    errors = y_true - y_pred  # shape: (num_samples, num_quantiles)

    # Compute masks for all quantiles simultaneously.
    underpredict_mask = errors >= 0  # y_true >= y_pred, shape: (num_samples, num_quantiles)
    overpredict_mask = errors < 0  # y_true < y_pred, shape: (num_samples, num_quantiles)

    # Vectorized pinball loss computation using broadcasting:
    # quantile_values broadcasts from (num_quantiles,) to (num_samples, num_quantiles).
    loss = quantile_values * underpredict_mask * errors - (1 - quantile_values) * overpredict_mask * errors

    # Apply sample weights if provided.
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight).reshape(n_rows, 1)  # shape: (num_samples, 1)
        loss *= sample_weight
        total_weight = sample_weight.sum() * len(quantiles)
    else:
        total_weight = loss.size

    # Return the mean loss across all samples and quantiles.
    return float(loss.sum() / total_weight)
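
# Illustrative sketch: a single observation scored against a symmetric quantile
# forecast. The median is exact; the 0.1 quantile under-predicts by 10 and the
# 0.9 quantile over-predicts by 10, each contributing a pinball loss of 1.0, so
# the mean over the three quantiles is 2/3. (Plain floats stand in for Quantile
# values here.)
#
#     >>> import numpy as np
#     >>> y_true = np.array([100.0])
#     >>> y_pred = np.array([[90.0, 100.0, 110.0]])
#     >>> round(mean_pinball_loss(y_true, y_pred, [0.1, 0.5, 0.9]), 3)
#     0.667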