# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
#
# SPDX-License-Identifier: MPL-2.0
"""Metrics for forecasts that predict single values instead of probability distributions.
Deterministic forecasts predict one specific value (e.g., "load will be 100 MW").
These metrics measure how close predicted values are to actual values, with special
attention to peak load events that are critical for energy system operations.
Key focus areas:
- Scale-invariant errors: Compare accuracy across different load levels
- Peak detection: Identify when load will exceed operational thresholds
- Operational effectiveness: Ensure predictions support actionable decisions
"""
from typing import NamedTuple
import numpy as np
import numpy.typing as npt
from sklearn.metrics import r2_score
def rmae(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
*,
lower_quantile: float = 0.05,
upper_quantile: float = 0.95,
sample_weights: npt.NDArray[np.floating] | None = None,
) -> float:
"""Calculate the relative Mean Absolute Error (rMAE) using percentiles for range calculation.
The rMAE normalizes the Mean Absolute Error by the range of true values,
making it scale-invariant and suitable for comparing errors across different
datasets or time periods.
Args:
y_true: Ground truth values with shape (num_samples,).
y_pred: Predicted values with shape (num_samples,).
lower_quantile: Lower quantile for range calculation. Must be in [0, 1].
upper_quantile: Upper quantile for range calculation. Must be in [0, 1]
and greater than lower_quantile.
sample_weights: Optional weights for each sample with shape (num_samples,).
If None, all samples are weighted equally.
Returns:
The relative Mean Absolute Error as a float. Returns NaN if the range
between quantiles is zero.
Example:
Basic usage with energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred = np.array([98, 122, 108, 135, 107])
>>> error = rmae(y_true, y_pred)
>>> round(error, 3)
0.096
With custom quantiles and weights:
>>> weights = np.array([1, 2, 1, 2, 1])
>>> error = rmae(y_true, y_pred, lower_quantile=0.1,
... upper_quantile=0.9, sample_weights=weights)
>>> isinstance(error, float)
True
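Edge case (illustrative): a constant series has zero quantile range, so NaN is returned:
>>> rmae(np.ones(5), np.ones(5))
nan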
"""
# Ensure inputs are numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
if y_true.size == 0 or y_pred.size == 0:
return float("NaN")
# Calculate MAE
mae = np.average(np.abs(y_true - y_pred), weights=sample_weights)
# Calculate range using quantile
y_range = np.quantile(y_true, q=upper_quantile) - np.quantile(y_true, q=lower_quantile)
# Avoid division by zero if range is zero
if y_range == 0:
return float("NaN")
# Calculate rMAE
rmae = mae / y_range
return float(rmae)
def mape(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
) -> float:
"""Calculate the Mean Absolute Percentage Error (MAPE).
MAPE measures the average magnitude of errors in percentage terms,
making it scale-independent and easily interpretable. However, it
can be undefined or inflated when true values are near zero.
Args:
y_true: Ground truth values with shape (num_samples,). Should not contain
values close to zero to avoid division issues.
y_pred: Predicted values with shape (num_samples,).
Returns:
The Mean Absolute Percentage Error as a float. May return inf or
extremely large values if y_true contains values close to zero.
Example:
Basic usage with energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred = np.array([98, 122, 108, 135, 107])
>>> error = mape(y_true, y_pred)
>>> round(error, 4)
0.0225
With perfect predictions:
>>> perfect_pred = np.array([100, 120, 110, 130, 105])
>>> mape(y_true, perfect_pred)
0.0
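Illustrative caveat: a small true value dominates the score, because each error is divided by its own true value:
>>> mape(np.array([2.0, 100.0]), np.array([4.0, 100.0]))
0.5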
"""
# Ensure inputs are numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
if y_true.size == 0 or y_pred.size == 0:
return float("NaN")
# Calculate MAPE
mape_value = np.mean(np.abs((y_true - y_pred) / y_true))
return float(mape_value)
class ConfusionMatrix(NamedTuple):
"""Confusion matrix components for peak detection in energy forecasting.
This class represents the results of classifying energy load peaks
versus non-peaks, with additional effectiveness metrics to account for
the direction and magnitude of prediction errors.
Attributes:
true_positives: Boolean array indicating correctly predicted peaks.
true_negatives: Boolean array indicating correctly predicted non-peaks.
false_positives: Boolean array indicating incorrectly predicted peaks.
false_negatives: Boolean array indicating missed peaks.
effective_true_positives: Boolean array indicating true positives that
are effective (peak correctly predicted with appropriate magnitude/direction).
ineffective_true_positives: Boolean array indicating true positives that
are ineffective (peak correctly predicted but with wrong magnitude/direction).
Note:
All arrays have shape (num_samples,) and correspond to the same time points
in the original forecast evaluation.
"""
true_positives: npt.NDArray[np.bool_]
true_negatives: npt.NDArray[np.bool_]
false_positives: npt.NDArray[np.bool_]
false_negatives: npt.NDArray[np.bool_]
effective_true_positives: npt.NDArray[np.bool_]
ineffective_true_positives: npt.NDArray[np.bool_]
def confusion_matrix(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
*,
limit_pos: float,
limit_neg: float,
) -> ConfusionMatrix:
"""Calculate confusion matrix for peak detection in energy load.
A peak is defined as a value that exceeds the positive limit or falls below
the negative limit. This function evaluates both the accuracy of peak detection
and the effectiveness of predictions based on error direction.
Args:
y_true: Ground truth energy load values with shape (num_samples,).
y_pred: Predicted energy load values with shape (num_samples,).
limit_pos: Positive threshold defining high load peaks.
Values >= limit_pos are considered positive peaks.
limit_neg: Negative threshold defining low load peaks.
Values <= limit_neg are considered negative peaks.
Returns:
ConfusionMatrix containing boolean arrays for all classification outcomes
and effectiveness metrics.
Example:
Peak detection for energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 150, 80, 200, 90])  # 150 and 200 are high peaks; 80 is a low peak
>>> y_pred = np.array([105, 145, 85, 195, 95])
>>> cm = confusion_matrix(y_true, y_pred, limit_pos=120, limit_neg=85)
>>> int(cm.true_positives.sum()) # Successfully detected peaks
3
>>> int(cm.false_positives.sum()) # Incorrectly predicted peaks
0
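Effectiveness of the detected peaks (illustrative; here every detected peak is ineffective, because the high peaks are under-predicted and the low peak is over-predicted):
>>> int(cm.effective_true_positives.sum())
0
>>> int(cm.ineffective_true_positives.sum())
3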
Note:
Effective true positives require that high peaks are predicted even higher
(positive error) and low peaks are predicted even lower (negative error).
This captures whether the forecast provides actionable information for
energy system operators.
"""
true_peaks_y: npt.NDArray[np.bool_] = (y_true >= limit_pos) | (y_true <= limit_neg)
pred_peaks_y: npt.NDArray[np.bool_] = (y_pred >= limit_pos) | (y_pred <= limit_neg)
error = y_pred - y_true
effective_true_positives = ((y_true >= limit_pos) & (y_pred >= limit_pos) & (error > 0)) | (
(y_true <= limit_neg) & (y_pred <= limit_neg) & (error < 0)
)
ineffective_true_positives = true_peaks_y & pred_peaks_y & ~effective_true_positives
return ConfusionMatrix(
true_positives=true_peaks_y & pred_peaks_y,
true_negatives=~true_peaks_y & ~pred_peaks_y,
false_positives=~true_peaks_y & pred_peaks_y,
false_negatives=true_peaks_y & ~pred_peaks_y,
effective_true_positives=effective_true_positives,
ineffective_true_positives=ineffective_true_positives,
)
class PrecisionRecall(NamedTuple):
"""Container for precision and recall metrics.
This class holds the fundamental classification metrics used to evaluate
peak detection performance in energy forecasting applications.
Attributes:
precision: The fraction of predicted peaks that were actual peaks.
Range [0, 1] where 1.0 is perfect precision.
recall: The fraction of actual peaks that were correctly predicted.
Range [0, 1] where 1.0 is perfect recall.
Note:
High precision means few false alarms, while high recall means
few missed peaks. There is often a trade-off between these metrics.
"""
precision: float
recall: float
def precision_recall(
cm: ConfusionMatrix,
*,
effective: bool = False,
) -> PrecisionRecall:
"""Calculate precision and recall metrics from a confusion matrix.
These metrics evaluate the quality of peak detection by measuring
how many predicted peaks were correct (precision) and how many
actual peaks were detected (recall).
Args:
cm: Confusion matrix from the confusion_matrix function containing
all classification outcomes.
effective: If True, uses effective true positives which account for
prediction direction and magnitude. If False, uses standard
true positives for calculation.
Returns:
PrecisionRecall containing precision and recall values, each in range [0, 1].
Example:
Calculate standard precision and recall:
>>> import numpy as np
>>> y_true = np.array([100, 150, 80, 200, 90])
>>> y_pred = np.array([105, 145, 85, 195, 95])
>>> cm = confusion_matrix(y_true, y_pred, limit_pos=120, limit_neg=85)
>>> pr = precision_recall(cm)
>>> float(pr.precision)
1.0
>>> float(pr.recall)
1.0
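Effective precision and recall for the same data (illustrative; none of the detected peaks have the error direction required by confusion_matrix):
>>> pr_eff = precision_recall(cm, effective=True)
>>> float(pr_eff.precision), float(pr_eff.recall)
(0.0, 0.0)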
Note:
When effective=True, the metrics focus on predictions that provide
actionable information (correct magnitude and direction) rather than
just correct classification.
"""
relevant = np.sum(cm.true_positives) + np.sum(cm.false_negatives)
retrieved = np.sum(cm.true_positives) + np.sum(cm.false_positives)
if effective:
precision = float(np.sum(cm.effective_true_positives) / retrieved) if retrieved > 0 else 0.0
recall = float(np.sum(cm.effective_true_positives) / relevant) if relevant > 0 else 0.0
else:
precision = float(np.sum(cm.true_positives) / retrieved) if retrieved > 0 else 0.0
recall = float(np.sum(cm.true_positives) / relevant) if relevant > 0 else 0.0
return PrecisionRecall(
precision=precision,
recall=recall,
)
def fbeta(
precision_recall: PrecisionRecall,
beta: float = 2.0,
) -> float:
"""Calculate the F-beta score from precision and recall metrics.
The F-beta score is a weighted harmonic mean of precision and recall,
allowing for different emphasis on recall versus precision based on
the beta parameter.
Args:
precision_recall: Container with precision and recall values from
the precision_recall function.
beta: Weight parameter controlling the relative importance of recall.
Values > 1 favor recall, values < 1 favor precision.
Common values: 0.5 (favor precision), 1.0 (balanced F1), 2.0 (favor recall).
Returns:
The F-beta score as a float in range [0, 1]. Returns 0.0 if both
precision and recall are zero.
Example:
Calculate F2 score (favoring recall):
>>> pr = PrecisionRecall(precision=0.8, recall=0.9)
>>> score = fbeta(pr, beta=2.0)
>>> round(score, 3)
0.878
Calculate F1 score (balanced):
>>> score = fbeta(pr, beta=1.0)
>>> round(score, 3)
0.847
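Degenerate case: if both precision and recall are zero, the score is 0.0:
>>> fbeta(PrecisionRecall(precision=0.0, recall=0.0))
0.0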
Note:
F-beta scores are particularly useful for imbalanced datasets where
the cost of false positives and false negatives differs significantly.
In energy forecasting, beta > 1 is often preferred to minimize missed peaks.
"""
precision = precision_recall.precision
recall = precision_recall.recall
if precision + recall == 0:
return 0.0
return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
def riqd(
y_true: npt.NDArray[np.floating],
y_pred_lower_q: npt.NDArray[np.floating],
y_pred_upper_q: npt.NDArray[np.floating],
*,
measurement_range_lower_q: float = 0.05,
measurement_range_upper_q: float = 0.95,
) -> float:
"""Calculate the relative Inter Quantile Distance (rIQD).
rIQD measures the average distance between the predicted lower and upper quantiles, normalized by the measurement range of the true values.
Args:
y_true: Ground truth values with shape (num_samples,).
y_pred_lower_q: Predicted values of lower quantile with shape (num_samples,).
y_pred_upper_q: Predicted values of upper quantile with shape (num_samples,).
measurement_range_lower_q: Lower quantile for range calculation. Must be in [0, 1].
measurement_range_upper_q: Upper quantile for range calculation. Must be in [0, 1]
and greater than measurement_range_lower_q.
Returns:
The relative Inter Quantile Distance (rIQD) as a float. Returns NaN if the measurement
range is zero.
Example:
Basic usage with energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred_lower_q = np.array([90, 100, 105, 95, 85])
>>> y_pred_upper_q = np.array([110, 125, 140, 135, 90])
>>> score = riqd(y_true, y_pred_lower_q, y_pred_upper_q)
>>> round(score, 4)
0.9259
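Illustrative: rIQD only measures the sharpness (width) of the predicted interval, not whether it covers the true values:
>>> round(riqd(y_true, y_true - 5, y_true + 5), 4)
0.3704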
"""
# Ensure inputs are numpy arrays
y_true = np.array(y_true)
y_pred_lower_q = np.array(y_pred_lower_q)
y_pred_upper_q = np.array(y_pred_upper_q)
if y_true.size == 0 or y_pred_lower_q.size == 0 or y_pred_upper_q.size == 0:
return float("NaN")
y_range = np.quantile(y_true, q=measurement_range_upper_q) - np.quantile(y_true, q=measurement_range_lower_q)
# Calculate IQD
iqd = np.mean(y_pred_upper_q - y_pred_lower_q)
# Avoid division by zero if range is zero
if y_range == 0:
return float("NaN")
# Calculate rIQD
riqd = iqd / y_range
return float(riqd)
def r2(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
*,
sample_weights: npt.NDArray[np.floating] | None = None,
) -> float:
"""Calculate the R² (coefficient of determination) score.
R² represents the proportion of variance in the dependent variable that is
predictable from the independent variable(s). It provides a measure of how
well observed outcomes are replicated by the model, based on the proportion
of total variation of outcomes explained by the model.
Args:
y_true: Ground truth values with shape (num_samples,).
y_pred: Predicted values with shape (num_samples,).
sample_weights: Optional weights for each sample with shape (num_samples,).
If None, all samples are weighted equally.
Returns:
The R² score as a float. Best possible score is 1.0, and it can be negative
(because the model can be arbitrarily worse). A constant model that always
predicts the mean of y_true would get an R² score of 0.0.
Example:
Basic usage with energy load data:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred = np.array([98, 122, 108, 135, 107])
>>> score = r2(y_true, y_pred)
>>> round(score, 3)
0.929
Perfect predictions give R² = 1.0:
>>> perfect_pred = np.array([100, 120, 110, 130, 105])
>>> r2(y_true, perfect_pred)
1.0
With sample weights:
>>> weights = np.array([1, 2, 1, 2, 1])
>>> score = r2(y_true, y_pred, sample_weights=weights)
>>> isinstance(score, float)
True
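A constant forecast equal to the mean of y_true gives R² = 0.0 (illustrative):
>>> r2(y_true, np.full(5, y_true.mean()))
0.0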
"""
if len(y_true) == 0 or len(y_pred) == 0:
return float("NaN")
return float(r2_score(y_true, y_pred, sample_weight=sample_weights))
def relative_pinball_loss(
y_true: npt.NDArray[np.floating],
y_pred: npt.NDArray[np.floating],
*,
quantile: float,
measurement_range_lower_q: float = 0.05,
measurement_range_upper_q: float = 0.95,
sample_weights: npt.NDArray[np.floating] | None = None,
) -> float:
"""Calculate the relative Pinball Loss (also known as relative Quantile Loss).
The relative pinball loss normalizes the pinball loss by the range of true values,
making it scale-invariant and suitable for comparing quantile prediction errors
across different datasets or time periods. The pinball loss can be used to quantify
the accuracy of a single quantile.
Args:
y_true: Ground truth values with shape (num_samples,).
y_pred: Predicted quantile values with shape (num_samples,).
quantile: The quantile level being predicted (e.g., 0.1, 0.5, 0.9).
Must be in [0, 1].
measurement_range_lower_q: Lower quantile for range calculation. Must be in [0, 1].
measurement_range_upper_q: Upper quantile for range calculation. Must be in [0, 1]
and greater than measurement_range_lower_q.
sample_weights: Optional weights for each sample with shape (num_samples,).
If None, all samples are weighted equally.
Returns:
The relative Pinball Loss as a float. Returns NaN if the measurement range
is zero.
Example:
Basic usage for 10th percentile predictions:
>>> import numpy as np
>>> y_true = np.array([100, 120, 110, 130, 105])
>>> y_pred = np.array([95, 115, 105, 125, 100]) # 10th percentile predictions
>>> rpbl = relative_pinball_loss(y_true, y_pred, quantile=0.1, measurement_range_lower_q=0.0,
... measurement_range_upper_q=1.0)
>>> round(rpbl, 4)
0.0167
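Illustrative asymmetry: with a high quantile, the same under-predictions are penalized more heavily:
>>> rpbl_high = relative_pinball_loss(y_true, y_pred, quantile=0.9, measurement_range_lower_q=0.0,
... measurement_range_upper_q=1.0)
>>> round(rpbl_high, 4)
0.15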
"""
# Ensure inputs are numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)
if y_true.size == 0 or y_pred.size == 0:
return float("NaN")
# Calculate pinball loss for each sample
errors = y_true - y_pred
pinball_losses = np.where(
errors >= 0,
quantile * errors, # Under-prediction
(quantile - 1) * errors, # Over-prediction
)
# Calculate mean pinball loss (weighted if weights provided)
mean_pinball_loss = np.average(pinball_losses, weights=sample_weights)
# Calculate measurement range for normalization
y_range = np.quantile(y_true, q=measurement_range_upper_q) - np.quantile(y_true, q=measurement_range_lower_q)
# Avoid division by zero if range is zero
if y_range == 0:
return float("NaN")
# Calculate relative pinball loss
relative_pinball_loss = mean_pinball_loss / y_range
return float(relative_pinball_loss)