Compare Benchmark Runs

Generate side-by-side comparison plots from multiple benchmark runs. Uses BenchmarkComparisonPipeline to produce global, per-group, and per-target HTML visualizations.

Prerequisites: Run at least two models first (e.g. via run_liander2024_benchmark.py).

Setup

Point at the result directories from your benchmark runs.


from pathlib import Path
from typing import cast

from openstef_beam.analysis.models import RunName
from openstef_beam.benchmarking import BenchmarkComparisonPipeline, LocalBenchmarkStorage
from openstef_beam.benchmarking.benchmarks import create_liander2024_benchmark_runner
from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG
from openstef_beam.benchmarking.storage import BenchmarkStorage

# Each run gets its own storage. The dictionary key is the human-readable
# label that the comparison plots display for that run.
run_storages: dict[RunName, BenchmarkStorage] = {}
run_storages["ExampleBaseline"] = LocalBenchmarkStorage(
    base_path=Path("./benchmark_results/ExampleBaseline"),
)
run_storages["GBLinear"] = LocalBenchmarkStorage(
    base_path=Path("./benchmark_results/GBLinear"),
)

# Fail fast, on the first run whose result directory is absent.
for name, storage in run_storages.items():
    # `cast` only informs the type checker; every storage above is local.
    base_path = cast(LocalBenchmarkStorage, storage).base_path
    if base_path.exists():
        continue
    msg = f"Benchmark directory not found for '{name}': {base_path}. Run the benchmarks first."
    raise FileNotFoundError(msg)

Run comparison

The pipeline loads predictions from each run, re-evaluates them, and produces comparison visualizations.

# Base directory of the storage handed to both the runner and the pipeline.
OUTPUT_PATH = Path("./benchmark_results_comparison/liander2024")

# A Liander 2024 benchmark runner is built solely to borrow its target provider.
liander_runner = create_liander2024_benchmark_runner(
    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),
)
target_provider = liander_runner.target_provider

# Assemble the comparison pipeline with the Liander 2024 analysis settings,
# then run it over all configured runs — generates global, group, and
# per-target HTML plots.
comparison = BenchmarkComparisonPipeline(
    target_provider=target_provider,
    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),
    analysis_config=LIANDER2024_ANALYSIS_CONFIG,
)
comparison.run(run_data=run_storages)