# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
#
# SPDX-License-Identifier: MPL-2.0
"""Metrics for forecasts that predict single values instead of probability distributions.
Deterministic forecasts predict one specific value (e.g., "load will be 100 MW").
These metrics measure how close predicted values are to actual values, with special
attention to peak load events that are critical for energy system operations.
Key focus areas:
- Scale-invariant errors: Compare accuracy across different load levels
- Peak detection: Identify when load will exceed operational thresholds
- Operational effectiveness: Ensure predictions support actionable decisions
"""
from typing import NamedTuple
import numpy as np
import numpy.typing as npt
from sklearn.metrics import r2_score
def rmae(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
*,
lower_quantile: float = 0.05,
upper_quantile: float = 0.95,
sample_weights: npt.NDArray[np.floating] | None = None,
) -> float:
"""Calculate the relative Mean Absolute Error (rMAE) using percentiles for range calculation.
The rMAE normalizes the Mean Absolute Error by the range of true values,
making it scale-invariant and suitable for comparing errors across different
datasets or time periods.
Args:
y_true: Ground truth values with shape (num_samples,).
y_pred: Predicted values with shape (num_samples,).
lower_quantile: Lower quantile for range calculation. Must be in [0, 1].
upper_quantile: Upper quantile for range calculation. Must be in [0, 1]
and greater than lower_quantile.
sample_weights: Optional weights for each sample with shape (num_samples,).
If None, all samples are weighted equally.
Returns:
The relative Mean Absolute Error as a float. Returns NaN if the range
between quantiles is zero.
Example:
Basic usage with energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred = np.array([98, 122, 108, 135, 107])
>>> error = rmae(y_true, y_pred)
>>> round(error, 3)
0.096
With custom quantiles and weights:
>>> weights = np.array([1, 2, 1, 2, 1])
>>> error = rmae(y_true, y_pred, lower_quantile=0.1,
... upper_quantile=0.9, sample_weights=weights)
>>> isinstance(error, float)
True
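Edge case (illustrative): a constant series has zero quantile range, so NaN is returned:
>>> rmae(np.ones(5), np.ones(5))
nan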
"""
# Ensure inputs are numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
if y_true.size == 0 or y_pred.size == 0:
return float("NaN")
# Calculate MAE
mae = np.average(np.abs(y_true - y_pred), weights=sample_weights)
# Calculate range using quantile
y_range = np.quantile(y_true, q=upper_quantile) - np.quantile(y_true, q=lower_quantile)
# Avoid division by zero if range is zero
if y_range == 0:
return float("NaN")
# Calculate rMAE
rmae = mae / y_range
return float(rmae)
def mape(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
) -> float:
"""Calculate the Mean Absolute Percentage Error (MAPE).
MAPE measures the average magnitude of errors in percentage terms,
making it scale-independent and easily interpretable. However, it
can be undefined or inflated when true values are near zero.
Args:
y_true: Ground truth values with shape (num_samples,). Should not contain
values close to zero to avoid division issues.
y_pred: Predicted values with shape (num_samples,).
Returns:
The Mean Absolute Percentage Error as a float. May return inf or
extremely large values if y_true contains values close to zero.
Example:
Basic usage with energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred = np.array([98, 122, 108, 135, 107])
>>> error = mape(y_true, y_pred)
>>> round(error, 4)
0.0225
With perfect predictions:
>>> perfect_pred = np.array([100, 120, 110, 130, 105])
>>> mape(y_true, perfect_pred)
0.0
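Illustrative caveat: a small true value dominates the score, because each error is divided by its own true value:
>>> mape(np.array([2.0, 100.0]), np.array([4.0, 100.0]))
0.5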
"""
# Ensure inputs are numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
if y_true.size == 0 or y_pred.size == 0:
return float("NaN")
# Calculate MAPE
mape_value = np.mean(np.abs((y_true - y_pred) / y_true))
return float(mape_value)
class ConfusionMatrix(NamedTuple):
"""Confusion matrix components for peak detection in energy forecasting.
This class represents the results of classifying energy load peaks
versus non-peaks, with additional effectiveness metrics to account for
the direction and magnitude of prediction errors.
Attributes:
true_positives: Boolean array indicating correctly predicted peaks.
true_negatives: Boolean array indicating correctly predicted non-peaks.
false_positives: Boolean array indicating incorrectly predicted peaks.
false_negatives: Boolean array indicating missed peaks.
effective_true_positives: Boolean array indicating true positives that
are effective (peak correctly predicted with appropriate magnitude/direction).
ineffective_true_positives: Boolean array indicating true positives that
are ineffective (peak correctly predicted but with wrong magnitude/direction).
Note:
All arrays have shape (num_samples,) and correspond to the same time points
in the original forecast evaluation.
"""
true_positives: npt.NDArray[np.bool_]
true_negatives: npt.NDArray[np.bool_]
false_positives: npt.NDArray[np.bool_]
false_negatives: npt.NDArray[np.bool_]
effective_true_positives: npt.NDArray[np.bool_]
ineffective_true_positives: npt.NDArray[np.bool_]
def confusion_matrix(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
*,
limit_pos: float,
limit_neg: float,
) -> ConfusionMatrix:
"""Calculate confusion matrix for peak detection in energy load.
A peak is defined as a value that exceeds the positive limit or falls below
the negative limit. This function evaluates both the accuracy of peak detection
and the effectiveness of predictions based on error direction.
Args:
y_true: Ground truth energy load values with shape (num_samples,).
y_pred: Predicted energy load values with shape (num_samples,).
limit_pos: Positive threshold defining high load peaks.
Values >= limit_pos are considered positive peaks.
limit_neg: Negative threshold defining low load peaks.
Values <= limit_neg are considered negative peaks.
Returns:
ConfusionMatrix containing boolean arrays for all classification outcomes
and effectiveness metrics.
Example:
Peak detection for energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 150, 80, 200, 90])  # 150 and 200 are high peaks; 80 is a low peak
>>> y_pred = np.array([105, 145, 85, 195, 95])
>>> cm = confusion_matrix(y_true, y_pred, limit_pos=120, limit_neg=85)
>>> int(cm.true_positives.sum()) # Successfully detected peaks
3
>>> int(cm.false_positives.sum()) # Incorrectly predicted peaks
0
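Effectiveness of the detected peaks (illustrative; here every detected peak is ineffective, because the high peaks are under-predicted and the low peak is over-predicted):
>>> int(cm.effective_true_positives.sum())
0
>>> int(cm.ineffective_true_positives.sum())
3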
Note:
Effective true positives require that high peaks are predicted even higher
(positive error) and low peaks are predicted even lower (negative error).
This captures whether the forecast provides actionable information for
energy system operators.
"""
true_peaks_y: npt.NDArray[np.bool_] = (y_true >= limit_pos) | (y_true <= limit_neg)
pred_peaks_y: npt.NDArray[np.bool_] = (y_pred >= limit_pos) | (y_pred <= limit_neg)
error = y_pred - y_true
effective_true_positives = ((y_true >= limit_pos) & (y_pred >= limit_pos) & (error > 0)) | (
(y_true <= limit_neg) & (y_pred <= limit_neg) & (error < 0)
)
ineffective_true_positives = true_peaks_y & pred_peaks_y & ~effective_true_positives
return ConfusionMatrix(
true_positives=true_peaks_y & pred_peaks_y,
true_negatives=~true_peaks_y & ~pred_peaks_y,
false_positives=~true_peaks_y & pred_peaks_y,
false_negatives=true_peaks_y & ~pred_peaks_y,
effective_true_positives=effective_true_positives,
ineffective_true_positives=ineffective_true_positives,
)
class PrecisionRecall(NamedTuple):
"""Container for precision and recall metrics.
This class holds the fundamental classification metrics used to evaluate
peak detection performance in energy forecasting applications.
Attributes:
precision: The fraction of predicted peaks that were actual peaks.
Range [0, 1] where 1.0 is perfect precision.
recall: The fraction of actual peaks that were correctly predicted.
Range [0, 1] where 1.0 is perfect recall.
Note:
High precision means few false alarms, while high recall means
few missed peaks. There is often a trade-off between these metrics.
"""
precision: float
recall: float
def precision_recall(
cm: ConfusionMatrix,
*,
effective: bool = False,
) -> PrecisionRecall:
"""Calculate precision and recall metrics from a confusion matrix.
These metrics evaluate the quality of peak detection by measuring
how many predicted peaks were correct (precision) and how many
actual peaks were detected (recall).
Args:
cm: Confusion matrix from the confusion_matrix function containing
all classification outcomes.
effective: If True, uses effective true positives which account for
prediction direction and magnitude. If False, uses standard
true positives for calculation.
Returns:
PrecisionRecall containing precision and recall values, each in range [0, 1].
Example:
Calculate standard precision and recall:
>>> import numpy as np
>>> y_true = np.array([100, 150, 80, 200, 90])
>>> y_pred = np.array([105, 145, 85, 195, 95])
>>> cm = confusion_matrix(y_true, y_pred, limit_pos=120, limit_neg=85)
>>> pr = precision_recall(cm)
>>> float(pr.precision)
1.0
>>> float(pr.recall)
1.0
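Effective precision and recall for the same data (illustrative; none of the detected peaks have the error direction required by confusion_matrix):
>>> pr_eff = precision_recall(cm, effective=True)
>>> float(pr_eff.precision), float(pr_eff.recall)
(0.0, 0.0)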
Note:
When effective=True, the metrics focus on predictions that provide
actionable information (correct magnitude and direction) rather than
just correct classification.
"""
relevant = np.sum(cm.true_positives) + np.sum(cm.false_negatives)
retrieved = np.sum(cm.true_positives) + np.sum(cm.false_positives)
if effective:
precision = float(np.sum(cm.effective_true_positives) / retrieved) if retrieved > 0 else 0.0
recall = float(np.sum(cm.effective_true_positives) / relevant) if relevant > 0 else 0.0
else:
precision = float(np.sum(cm.true_positives) / retrieved) if retrieved > 0 else 0.0
recall = float(np.sum(cm.true_positives) / relevant) if relevant > 0 else 0.0
return PrecisionRecall(
precision=precision,
recall=recall,
)
def fbeta(
precision_recall: PrecisionRecall,
beta: float = 2.0,
) -> float:
"""Calculate the F-beta score from precision and recall metrics.
The F-beta score is a weighted harmonic mean of precision and recall,
allowing for different emphasis on recall versus precision based on
the beta parameter.
Args:
precision_recall: Container with precision and recall values from
the precision_recall function.
beta: Weight parameter controlling the relative importance of recall.
Values > 1 favor recall, values < 1 favor precision.
Common values: 0.5 (favor precision), 1.0 (balanced F1), 2.0 (favor recall).
Returns:
The F-beta score as a float in range [0, 1]. Returns 0.0 if both
precision and recall are zero.
Example:
Calculate F2 score (favoring recall):
>>> pr = PrecisionRecall(precision=0.8, recall=0.9)
>>> score = fbeta(pr, beta=2.0)
>>> round(score, 3)
0.878
Calculate F1 score (balanced):
>>> score = fbeta(pr, beta=1.0)
>>> round(score, 3)
0.847
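Degenerate case: if both precision and recall are zero, the score is 0.0:
>>> fbeta(PrecisionRecall(precision=0.0, recall=0.0))
0.0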
Note:
F-beta scores are particularly useful for imbalanced datasets where
the cost of false positives and false negatives differs significantly.
In energy forecasting, beta > 1 is often preferred to minimize missed peaks.
"""
precision = precision_recall.precision
recall = precision_recall.recall
if precision + recall == 0:
return 0.0
return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
def riqd(
y_true: npt.NDArray[np.floating],
y_pred_lower_q: npt.NDArray[np.floating],
y_pred_upper_q: npt.NDArray[np.floating],
*,
measurement_range_lower_q: float = 0.05,
measurement_range_upper_q: float = 0.95,
) -> float:
"""Calculate the relative Inter Quantile Distance (rIQD).
rIQD measures the average distance between the predicted lower and upper quantiles, normalized by the measurement range of the true values.
Args:
y_true: Ground truth values with shape (num_samples,).
y_pred_lower_q: Predicted values of lower quantile with shape (num_samples,).
y_pred_upper_q: Predicted values of upper quantile with shape (num_samples,).
measurement_range_lower_q: Lower quantile for range calculation. Must be in [0, 1].
measurement_range_upper_q: Upper quantile for range calculation. Must be in [0, 1]
and greater than measurement_range_lower_q.
Returns:
The relative Inter Quantile Distance (rIQD) as a float. Returns NaN if the measurement
range is zero.
Example:
Basic usage with energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred_lower_q = np.array([90, 100, 105, 95, 85])
>>> y_pred_upper_q = np.array([110, 125, 140, 135, 90])
>>> score = riqd(y_true, y_pred_lower_q, y_pred_upper_q)
>>> round(score, 4)
0.9259
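Illustrative: rIQD only measures the sharpness (width) of the predicted interval, not whether it covers the true values:
>>> round(riqd(y_true, y_true - 5, y_true + 5), 4)
0.3704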
"""
# Ensure inputs are numpy arrays
y_true = np.array(y_true)
y_pred_lower_q = np.array(y_pred_lower_q)
y_pred_upper_q = np.array(y_pred_upper_q)
if y_true.size == 0 or y_pred_lower_q.size == 0 or y_pred_upper_q.size == 0:
return float("NaN")
y_range = np.quantile(y_true, q=measurement_range_upper_q) - np.quantile(y_true, q=measurement_range_lower_q)
# Calculate IQD
iqd = np.mean(y_pred_upper_q - y_pred_lower_q)
# Avoid division by zero if range is zero
if y_range == 0:
return float("NaN")
# Calculate rIQD
riqd = iqd / y_range
return float(riqd)
def r2(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
*,
sample_weights: npt.NDArray[np.floating] | None = None,
) -> float:
"""Calculate the R² (coefficient of determination) score.
R² represents the proportion of variance in the dependent variable that is
predictable from the independent variable(s). It provides a measure of how
well observed outcomes are replicated by the model, based on the proportion
of total variation of outcomes explained by the model.
Args:
y_true: Ground truth values with shape (num_samples,).
y_pred: Predicted values with shape (num_samples,).
sample_weights: Optional weights for each sample with shape (num_samples,).
If None, all samples are weighted equally.
Returns:
The R² score as a float. Best possible score is 1.0, and it can be negative
(because the model can be arbitrarily worse). A constant model that always
predicts the mean of y_true would get an R² score of 0.0.
Example:
Basic usage with energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred = np.array([98, 122, 108, 135, 107])
>>> score = r2(y_true, y_pred)
>>> round(score, 3)
0.929
Perfect predictions give R² = 1.0:
>>> perfect_pred = np.array([100, 120, 110, 130, 105])
>>> r2(y_true, perfect_pred)
1.0
With sample weights:
>>> weights = np.array([1, 2, 1, 2, 1])
>>> score = r2(y_true, y_pred, sample_weights=weights)
>>> isinstance(score, float)
True
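A constant forecast equal to the mean of y_true gives R² = 0.0 (illustrative):
>>> r2(y_true, np.full(5, y_true.mean()))
0.0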
"""
if len(y_true) == 0 or len(y_pred) == 0:
return float("NaN")
return float(r2_score(y_true, y_pred, sample_weight=sample_weights))
def relative_pinball_loss(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
*,
quantile: float,
measurement_range_lower_q: float = 0.05,
measurement_range_upper_q: float = 0.95,
sample_weights: npt.NDArray[np.floating] | None = None,
) -> float:
"""Calculate the relative Pinball Loss (also known as relative Quantile Loss).
The relative pinball loss normalizes the pinball loss by the range of true values,
making it scale-invariant and suitable for comparing quantile prediction errors
across different datasets or time periods. The pinball loss can be used to quantify
the accuracy of a single quantile.
Args:
y_true: Ground truth values with shape (num_samples,).
y_pred: Predicted quantile values with shape (num_samples,).
quantile: The quantile level being predicted (e.g., 0.1, 0.5, 0.9).
Must be in [0, 1].
measurement_range_lower_q: Lower quantile for range calculation. Must be in [0, 1].
measurement_range_upper_q: Upper quantile for range calculation. Must be in [0, 1]
and greater than measurement_range_lower_q.
sample_weights: Optional weights for each sample with shape (num_samples,).
If None, all samples are weighted equally.
Returns:
The relative Pinball Loss as a float. Returns NaN if the measurement range
is zero.
Example:
Basic usage for 10th percentile predictions:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred = np.array([95, 115, 105, 125, 100]) # 10th percentile predictions
>>> rpbl = relative_pinball_loss(y_true, y_pred, quantile=0.1, measurement_range_lower_q=0.0,
... measurement_range_upper_q=1.0)
>>> round(rpbl, 4)
0.0167
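Illustrative asymmetry: with a high quantile, the same under-predictions are penalized more heavily:
>>> rpbl_high = relative_pinball_loss(y_true, y_pred, quantile=0.9, measurement_range_lower_q=0.0,
... measurement_range_upper_q=1.0)
>>> round(rpbl_high, 4)
0.15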
"""
# Ensure inputs are numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
if y_true.size == 0 or y_pred.size == 0:
return float("NaN")
# Calculate pinball loss for each sample
errors = y_true - y_pred
pinball_losses = np.where(
errors >= 0,
quantile * errors, # Under-prediction
(quantile - 1) * errors, # Over-prediction
)
# Calculate mean pinball loss (weighted if weights provided)
mean_pinball_loss = np.average(pinball_losses, weights=sample_weights)
# Calculate measurement range for normalization
y_range = np.quantile(y_true, q=measurement_range_upper_q) - np.quantile(y_true, q=measurement_range_lower_q)
# Avoid division by zero if range is zero
if y_range == 0:
return float("NaN")
# Calculate relative pinball loss
relative_pinball_loss = mean_pinball_loss / y_range
return float(relative_pinball_loss)