Custom Benchmark Configuration

Defines a complete benchmark: where your data lives, which metrics to compute, and how to assemble the pipeline.

User story: “I want to benchmark on my own data.”

Copy this file and modify MyTargetProvider to point at your dataset. The pipeline configuration (create_custom_benchmark_runner) shows all the knobs: backtest schedule, evaluation windows, analysis visualizations.

See also:

TargetProvider — abstract interface
SimpleTargetProvider — file-based implementation (what we extend here)
BenchmarkPipeline — the orchestrator
EvaluationConfig — how predictions are sliced and scored
Custom Forecaster template — implement your model here

from datetime import timedelta
from pathlib import Path
from typing import Literal, override

from huggingface_hub import snapshot_download
from pydantic import Field

from openstef_beam.analysis import AnalysisConfig
from openstef_beam.analysis.visualizations import WindowedMetricVisualization
from openstef_beam.analysis.visualizations.grouped_target_metric_visualization import GroupedTargetMetricVisualization
from openstef_beam.analysis.visualizations.quantile_probability_visualization import QuantileProbabilityVisualization
from openstef_beam.analysis.visualizations.summary_table_visualization import SummaryTableVisualization
from openstef_beam.analysis.visualizations.timeseries_visualization import TimeSeriesVisualization
from openstef_beam.backtesting import BacktestConfig
from openstef_beam.benchmarking import BenchmarkPipeline, BenchmarkTarget, StrictExecutionCallback
from openstef_beam.benchmarking.storage.base import BenchmarkStorage
from openstef_beam.benchmarking.target_provider import SimpleTargetProvider
from openstef_beam.evaluation import EvaluationConfig, Window
from openstef_beam.evaluation.metric_providers import MetricProvider, RCRPSProvider, RMAEProvider
from openstef_core.types import AvailableAt, LeadTime, Quantile

# Target categories accepted as filter arguments by MyTargetProvider.get_targets.
# Each value is compared against a target's group_name, so the literals here
# must match the group_name entries in targets.yaml. Replace them with your own.
type MyCategory = Literal["solar_park", "wind_park"]

Target Provider

The TargetProvider tells BEAM where your data lives and which metrics to compute. Here we extend SimpleTargetProvider which handles file-based datasets with a targets YAML + parquet files.

CUSTOMIZE HERE: Change path templates, category types, and metric selection.

class MyTargetProvider(SimpleTargetProvider[BenchmarkTarget, list[MyCategory]]):
    """Target provider template for benchmarking on your own dataset.

    Builds on SimpleTargetProvider: the fields below configure file locations
    and data flags, while the overridden methods decide which targets are
    returned, which metrics are computed, and where each target's parquet
    files are looked up.
    """

    # File locations -- change these to mirror your own layout.
    # The {name} placeholder stands for target.name as listed in targets.yaml.
    targets_file_path: str = Field(default="liander2024_targets.yaml", init=False)
    measurements_path_template: str = Field(default="{name}.parquet", init=False)
    weather_path_template: str = Field(default="{name}.parquet", init=False)

    # Shared profile and price data are switched off, leaving only per-target features.
    # Flip to True when shared files (profiles.parquet, prices.parquet) are available.
    use_profiles: bool = False
    use_prices: bool = False

    @override
    def get_targets(self, filter_args: list[MyCategory] | None = None) -> list[BenchmarkTarget]:
        """Return the benchmark targets, narrowed to the requested categories.

        Args:
            filter_args: Categories to keep. When None, no category filtering
                is applied on top of the parent implementation's result.

        Returns:
            Targets whose group_name is one of the requested categories, or
            every loaded target when no filter is given.
        """
        # Loading itself is delegated to the parent class.
        loaded = super().get_targets(filter_args)
        if filter_args is None:
            return loaded
        # Retain a target only when its group_name is among the requested categories.
        return [target for target in loaded if target.group_name in filter_args]

    @override
    def get_metrics_for_target(self, target: BenchmarkTarget) -> list[MetricProvider]:
        """Select the metrics evaluated for a target.

        The same two metrics are returned regardless of which target is passed.

        Args:
            target: Target the metrics are requested for (not inspected here).

        Returns:
            The rMAE provider followed by the rCRPS provider.
        """
        # rMAE scores deterministic accuracy at the median quantile (lower is better).
        rmae = RMAEProvider(quantiles=[Quantile(0.5)], lower_quantile=Quantile(0.01), upper_quantile=Quantile(0.99))
        # rCRPS scores probabilistic accuracy across all quantiles (lower is better).
        rcrps = RCRPSProvider(lower_quantile=Quantile(0.01), upper_quantile=Quantile(0.99))
        return [rmae, rcrps]

    @override
    def _get_measurements_path_for_target(self, target: BenchmarkTarget) -> Path:
        """Locate the load-measurement parquet file of a target.

        Layout used by Liander 2024: data_dir/load_measurements/<group>/<name>.parquet
        Adjust this when your directories are organised differently.

        Args:
            target: Target whose group_name and name form the path.

        Returns:
            Path to the measurement parquet file.
        """
        group_dir = self.data_dir / "load_measurements" / target.group_name
        return group_dir / f"{target.name}.parquet"

    @override
    def _get_weather_path_for_target(self, target: BenchmarkTarget) -> Path:
        """Locate the features parquet file (weather, etc.) of a target.

        Layout used by Liander 2024: data_dir/weather_forecasts_versioned/<group>/<name>.parquet
        Adjust this when your directories are organised differently.

        Args:
            target: Target whose group_name and name form the path.

        Returns:
            Path to the features parquet file.
        """
        group_dir = self.data_dir / "weather_forecasts_versioned" / target.group_name
        return group_dir / f"{target.name}.parquet"

Analysis Configuration

Choose which visualizations and summary tables BEAM generates after evaluation. Add or remove providers to customize the output report.

# --- Analysis config: which plots and tables to generate after evaluation ---
# Each entry in visualization_providers is one named output; add or remove
# entries to change what the analysis step produces.
# NOTE(review): the 7-day and 30-day zero-lag windows used here are the same
# ones listed in the EvaluationConfig built by create_custom_benchmark_runner;
# presumably a windowed visualization needs its window to be evaluated there --
# confirm before changing one without the other.
ANALYSIS_CONFIG = AnalysisConfig(
    visualization_providers=[
        # Time series output, named "time_series".
        TimeSeriesVisualization(name="time_series"),
        # rMAE at the median quantile (0.5) over a 7-day window with zero lag.
        WindowedMetricVisualization(
            name="rMAE_7D",
            metric=("rMAE", Quantile(0.5)),
            window=Window(lag=timedelta(hours=0), size=timedelta(days=7)),
        ),
        # rCRPS over a 30-day window with zero lag (no quantile is given for this metric).
        WindowedMetricVisualization(
            name="rCRPS_30D",
            metric="rCRPS",
            window=Window(lag=timedelta(hours=0), size=timedelta(days=30)),
        ),
        # Grouped per-target views of the same two metrics: rMAE at quantile 0.5, and rCRPS.
        GroupedTargetMetricVisualization(name="rMAE_grouped", metric="rMAE", quantile=Quantile(0.5)),
        GroupedTargetMetricVisualization(name="rCRPS_grouped", metric="rCRPS"),
        # Summary table output, named "summary".
        SummaryTableVisualization(name="summary"),
        # Quantile probability output, named "quantile_probability".
        QuantileProbabilityVisualization(name="quantile_probability"),
    ],
)

Pipeline Assembly

Wire everything together: backtest schedule, evaluation config, analysis, and target provider. See BacktestConfig and EvaluationConfig for all available options.

CUSTOMIZE HERE: Adjust predict_interval, train_interval, evaluation windows, and lead times.

def create_custom_benchmark_runner(
    storage: BenchmarkStorage,
    data_dir: Path | None = None,
) -> BenchmarkPipeline[BenchmarkTarget, list[MyCategory]]:
    """Build a benchmark pipeline around MyTargetProvider.

    Args:
        storage: Storage backend handed to the pipeline for its results.
        data_dir: Root directory of the dataset. When None, the Liander 2024
            benchmark dataset is fetched from HuggingFace and used instead.

    Returns:
        The configured benchmark pipeline.
    """
    if data_dir is None:
        snapshot_dir = snapshot_download(repo_id="OpenSTEF/liander2024-stef-benchmark", repo_type="dataset")
        data_dir = Path(snapshot_dir)

    # Backtest: how history is replayed.
    backtest_config = BacktestConfig(
        prediction_sample_interval=timedelta(minutes=15),  # Resolution of the data
        predict_interval=timedelta(hours=6),  # A new forecast every 6 hours
        train_interval=timedelta(days=7),  # The model is retrained every 7 days
    )

    # Evaluation: how the results are sliced and scored.
    evaluation_config = EvaluationConfig(
        # Day-ahead forecast, available at 06:00.
        available_ats=[AvailableAt.from_string("D-1T06:00")],
        # A single lead time is evaluated: 1 day ahead. Append more entries to score others.
        lead_times=[LeadTime.from_string("P1D")],
        # Rolling metric windows of 7 and 30 days, both with zero lag.
        windows=[Window(lag=timedelta(hours=0), size=timedelta(days=span)) for span in (7, 30)],
    )

    return BenchmarkPipeline[BenchmarkTarget, list[MyCategory]](
        backtest_config=backtest_config,
        evaluation_config=evaluation_config,
        analysis_config=ANALYSIS_CONFIG,
        target_provider=MyTargetProvider(data_dir=data_dir),
        storage=storage,
        callbacks=[StrictExecutionCallback()],  # Fail fast on errors
    )