Ensemble Model Benchmark
Run an ensemble of multiple base models (e.g. LightGBM + GBLinear) with a learned weight combiner on the Liander 2024 STEF benchmark.
What this does:
Downloads the Liander 2024 dataset from HuggingFace (automatic)
Trains multiple base models and a combiner that learns optimal weights
Produces probabilistic forecasts (7 quantiles) for a 36-hour horizon
Saves results locally for comparison
No code changes needed. To benchmark your own model, see Implement a Custom Forecaster.
Ensemble types
Change ensemble_type below to try different strategies:
- "learned_weights" — a combiner model learns per-quantile weights
- "stacking" — base model outputs become features for a meta-model
- "rules" — fixed rule-based combination
Setup
import logging
import os
import time
from datetime import timedelta
from pathlib import Path

from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig
from openstef_beam.benchmarking.baselines.openstef4 import (
    create_openstef4_preset_backtest_forecaster,
)
from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner
from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback
from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage
from openstef_core.types import LeadTime, Q
from openstef_meta.presets import (
    EnsembleForecastingWorkflowConfig,
)
from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage
from openstef_models.transforms.general import SampleWeightConfig
# Configure root logging: INFO level, each record prefixed with timestamp and level.
logging.basicConfig(
    format="[%(asctime)s][%(levelname)s] %(message)s",
    level=logging.INFO,
)
Ensemble configuration
Choose which base models to combine and how. The ensemble_type controls the
combination strategy; base_models lists which individual models to train.
# Root directory under which results, cache and MLflow data are written.
OUTPUT_PATH = Path("./benchmark_results")
# Process count handed to the benchmark runner; read from OPENSTEF_N_PROCESSES, default 1.
N_PROCESSES = int(os.environ.get("OPENSTEF_N_PROCESSES", "1"))

# Combination strategy: "stacking", "learned_weights" or "rules".
ensemble_type = "learned_weights"
# Base models to train: any combination of "lgbm", "gblinear", "xgboost" and "lgbm_linear".
base_models = ["lgbm", "gblinear"]
# Combiner: "lgbm", "xgboost", "rf" or "logistic" for learned weights; "gblinear" for stacking.
combiner_model = "lgbm"

# Run name built from the choices above, e.g. "Ensemble_lgbm_gblinear_learned_weights_lgbm".
model = f"Ensemble_{'_'.join(base_models)}_{ensemble_type}_{combiner_model}"

# Forecast 36 hours ahead, producing 7 quantile bands
FORECAST_HORIZONS = [LeadTime.from_string("PT36H")]
PREDICTION_QUANTILES = [Q(level) for level in (0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95)]

# Set to a list of categories to run only a subset (e.g. [Liander2024Category.SOLAR])
BENCHMARK_FILTER: list[Liander2024Category] | None = None
Workflow configuration
EnsembleForecastingWorkflowConfig extends the standard config with ensemble-specific
settings: which base models to use, the combiner strategy, and per-model sample weights.
# MLflow storage is enabled unless OPENSTEF_MLFLOW_STORAGE is set to something
# other than "true" (compared case-insensitively).
USE_MLFLOW_STORAGE = os.environ.get("OPENSTEF_MLFLOW_STORAGE", "true").lower() == "true"

if USE_MLFLOW_STORAGE:
    # Both MLflow locations live under OUTPUT_PATH.
    storage = MLFlowStorage(
        tracking_uri=str(OUTPUT_PATH / "mlflow_artifacts"),
        local_artifacts_path=OUTPUT_PATH / "mlflow_tracking_artifacts",
    )
else:
    storage = None

# Ensemble workflow settings: base models, combiner strategy, forecast targets,
# weather/price feature columns and per-model sample weighting.
workflow_config = EnsembleForecastingWorkflowConfig(
    model_id="common_model_",
    ensemble_type=ensemble_type,
    base_models=base_models,  # type: ignore
    combiner_model=combiner_model,
    horizons=FORECAST_HORIZONS,
    quantiles=PREDICTION_QUANTILES,
    model_reuse_enable=False,
    # Pass the storage selected above. It was previously hard-coded to None,
    # which made the OPENSTEF_MLFLOW_STORAGE switch a no-op.
    mlflow_storage=storage,
    radiation_column="shortwave_radiation",
    rolling_aggregate_features=["mean", "median", "max", "min"],
    wind_speed_column="wind_speed_80m",
    pressure_column="surface_pressure",
    temperature_column="temperature_2m",
    relative_humidity_column="relative_humidity_2m",
    energy_price_column="EPEX_NL",
    # One sample-weight configuration per base-model name.
    forecaster_sample_weights={
        "gblinear": SampleWeightConfig(method="exponential", weight_exponent=1.0),
        "lgbm": SampleWeightConfig(weight_exponent=0.0),
        "xgboost": SampleWeightConfig(weight_exponent=0.0),
        "lgbm_linear": SampleWeightConfig(weight_exponent=0.0),
    },
)
Backtest schedule
The BacktestForecasterConfig controls how BEAM schedules training and prediction
windows. Ensemble models typically need more context than single models.
# Backtest schedule for the benchmark: training and prediction window settings.
backtest_config = BacktestForecasterConfig(
    requires_training=True,
    # Training window: three months of data, at least half of it present.
    training_context_length=timedelta(days=90),
    training_context_min_coverage=0.5,
    # Prediction context: two weeks of history (needed for lag features),
    # at least half of it present.
    predict_context_length=timedelta(days=14),
    predict_context_min_coverage=0.5,
    # Prediction window: up to 7 days, at least 15 minutes, sampled every 15 minutes.
    predict_length=timedelta(days=7),
    predict_min_length=timedelta(minutes=15),
    predict_sample_interval=timedelta(minutes=15),
)
Run the benchmark
if __name__ == "__main__":
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long benchmark run.
    start_time = time.perf_counter()

    # Results are stored locally in a directory named after the run.
    runner = create_liander2024_benchmark_runner(
        storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / model),
        data_dir=None,  # Path("../data/liander2024-energy-forecasting-benchmark"),
        callbacks=[StrictExecutionCallback()],
    )

    # Factory that builds the backtest forecaster from the ensemble workflow config.
    forecaster_factory = create_openstef4_preset_backtest_forecaster(
        workflow_config=workflow_config,
        cache_dir=OUTPUT_PATH / "cache",
    )

    runner.run(
        forecaster_factory=forecaster_factory,
        run_name=model,
        n_processes=N_PROCESSES,
        filter_args=BENCHMARK_FILTER,
    )

    elapsed = time.perf_counter() - start_time
    print(f"Benchmark completed in {elapsed:.2f} seconds.")