Quick Start

from qubots import BenchmarkSuite, AutoProblem, AutoOptimizer

# Create benchmark suite
suite = BenchmarkSuite("Algorithm Comparison")

# Add problem and optimizers
problem = AutoProblem.from_repo("examples/tsp")
genetic_opt = AutoOptimizer.from_repo("examples/genetic_tsp")
sa_opt = AutoOptimizer.from_repo("examples/simulated_annealing")

suite.add_problem("TSP-Berlin52", problem, config={"n_cities": 52})
suite.add_optimizer("Genetic Algorithm", genetic_opt, config={"population_size": 100})
suite.add_optimizer("Simulated Annealing", sa_opt, config={"initial_temp": 1000})

# Run benchmark (30 independent runs)
genetic_result = suite.run_benchmark("TSP-Berlin52", "Genetic Algorithm", num_runs=30)
sa_result = suite.run_benchmark("TSP-Berlin52", "Simulated Annealing", num_runs=30)

# Compare results
print(f"Genetic: {genetic_result.metrics.mean_value:.4f} ± {genetic_result.metrics.std_value:.4f}")
print(f"SA: {sa_result.metrics.mean_value:.4f} ± {sa_result.metrics.std_value:.4f}")

Key Classes

BenchmarkSuite

Organize problems and optimizers for systematic evaluation:

from qubots import BenchmarkSuite, AutoProblem, AutoOptimizer

suite = BenchmarkSuite("My Benchmark Study")

# Add multiple problems
tsp_problem = AutoProblem.from_repo("examples/tsp")
maxcut_problem = AutoProblem.from_repo("examples/maxcut")
suite.add_problem("TSP-50", tsp_problem, config={"n_cities": 50})
suite.add_problem("MaxCut-20", maxcut_problem, config={"n_vertices": 20})

# Add multiple optimizers
genetic = AutoOptimizer.from_repo("examples/genetic_tsp")
sa = AutoOptimizer.from_repo("examples/simulated_annealing")
suite.add_optimizer("GA", genetic, config={"population_size": 100})
suite.add_optimizer("SA", sa, config={"cooling_rate": 0.95})

# Run all combinations
all_results = suite.run_full_benchmark(num_runs=20)

BenchmarkResult

Access detailed statistics from benchmark runs:

result = suite.run_benchmark("TSP-50", "GA", num_runs=30)

# Key metrics
metrics = result.metrics
print(f"Best: {metrics.best_value:.4f}")
print(f"Mean: {metrics.mean_value:.4f} ± {metrics.std_value:.4f}")
print(f"Success rate: {metrics.success_rate:.1%}")
print(f"Avg runtime: {metrics.mean_runtime_seconds:.2f}s")

# Additional metrics
print(f"Convergence rate: {metrics.convergence_rate:.1%}")
print(f"Coefficient of variation: {metrics.coefficient_of_variation:.3f}")
print(f"Evaluations/sec: {metrics.evaluations_per_second:.0f}")

# Individual run details
for i, run in enumerate(result.individual_runs[:5]):  # First 5 runs
    print(f"Run {i+1}: {run.best_value:.4f} ({run.runtime_seconds:.2f}s)")

Benchmark Types

Standard Performance Benchmarking

The default benchmark type, used for head-to-head comparison of solution quality across algorithms:

from qubots import BenchmarkType

# Standard performance benchmark
result = suite.run_benchmark(
    "TSP-50",
    "GA",
    num_runs=30,
    benchmark_type=BenchmarkType.PERFORMANCE
)

print(f"Mean quality: {result.metrics.mean_value:.4f}")
print(f"Best quality: {result.metrics.best_value:.4f}")
print(f"Success rate: {result.metrics.success_rate:.1%}")

Convergence Analysis

Track how algorithms converge over time:

# Convergence benchmark
result = suite.run_benchmark(
    "MaxCut-20",
    "SA",
    num_runs=20,
    benchmark_type=BenchmarkType.CONVERGENCE,
    convergence_threshold=1e-4,
    max_stagnation=100
)

# Analyze convergence
print(f"Convergence rate: {result.metrics.convergence_rate:.1%}")
print(f"Avg iterations to converge: {result.metrics.mean_iterations_to_convergence:.0f}")

Scalability Testing

Test performance across different problem sizes:

# Test scalability
sizes = [20, 50, 100, 200]
runtimes = []

for size in sizes:
    # Create problem with specific size
    problem = AutoProblem.from_repo("examples/tsp", override_params={"n_cities": size})

    temp_suite = BenchmarkSuite(f"Scale-{size}")
    temp_suite.add_problem(f"TSP-{size}", problem)
    temp_suite.add_optimizer("GA", genetic)

    result = temp_suite.run_benchmark(f"TSP-{size}", "GA", num_runs=10)
    runtimes.append((size, result.metrics.mean_runtime_seconds))

# Analyze scaling
for size, runtime in runtimes:
    print(f"Size {size}: {runtime:.2f}s average")

Statistical Analysis

Algorithm Comparison

Compare multiple algorithms statistically:

# Benchmark multiple algorithms
algorithms = {
    "GA": AutoOptimizer.from_repo("examples/genetic_tsp"),
    "SA": AutoOptimizer.from_repo("examples/simulated_annealing"),
    "TS": AutoOptimizer.from_repo("examples/tabu_search")
}

results = {}
for name, optimizer in algorithms.items():
    suite.add_optimizer(name, optimizer)
    results[name] = suite.run_benchmark("TSP-50", name, num_runs=30)

# Compare results
print("Algorithm Comparison:")
print("Algorithm | Mean ± Std | Best | Success Rate")
print("-" * 50)
for name, result in results.items():
    m = result.metrics
    print(f"{name:8} | {m.mean_value:.3f} ± {m.std_value:.3f} | {m.best_value:.3f} | {m.success_rate:.1%}")

Statistical Significance

Test if differences are statistically significant:

import numpy as np
from scipy import stats

# Get raw values from two algorithms
ga_values = [run.best_value for run in results["GA"].individual_runs]
sa_values = [run.best_value for run in results["SA"].individual_runs]

# Perform t-test
t_stat, p_value = stats.ttest_ind(ga_values, sa_values)
print(f"T-test p-value: {p_value:.4f}")
print(f"Significant difference: {p_value < 0.05}")

# Effect size (Cohen's d) using the pooled sample standard deviation
n_ga, n_sa = len(ga_values), len(sa_values)
pooled_var = ((n_ga - 1) * np.var(ga_values, ddof=1) + (n_sa - 1) * np.var(sa_values, ddof=1)) / (n_ga + n_sa - 2)
cohens_d = (np.mean(ga_values) - np.mean(sa_values)) / np.sqrt(pooled_var)
print(f"Effect size (Cohen's d): {cohens_d:.3f}")

Platform Integration

Cloud Benchmarking

Run large-scale benchmarks in the cloud:

from qubots import execute_playground_optimization

# Cloud benchmark with multiple runs
cloud_results = []
for run in range(50):  # Large-scale benchmark
    result = execute_playground_optimization(
        problem_name="tsp_berlin52",
        problem_username="benchmarks",
        optimizer_name="genetic_tsp",
        optimizer_username="algorithms",
        optimizer_params={"population_size": 100, "generations": 1000}
    )
    if result["success"]:
        cloud_results.append(result["best_value"])

# Analyze cloud results
import numpy as np
print(f"Cloud benchmark results:")
print(f"Mean: {np.mean(cloud_results):.4f}")
print(f"Std:  {np.std(cloud_results):.4f}")
print(f"Best: {np.min(cloud_results):.4f}")

Leaderboard Submission

Submit your best results to the leaderboard:

from qubots import submit_to_leaderboard

# Get best run from benchmark
best_run = min(result.individual_runs, key=lambda r: r.best_value)

# Submit to leaderboard
submit_to_leaderboard(
    result=best_run,
    problem_id=1,
    solver_name="My Genetic Algorithm",
    solver_repository="username/my-ga-optimizer",
    solver_config=best_run.optimizer_config
)

Best Practices

Experimental Setup

  • Use ≥30 independent runs so statistical comparisons are meaningful
  • Set a different random seed for each run (see the sketch after this list)
  • Control for external factors (CPU load, etc.)
  • Include baseline algorithms for comparison
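
A minimal per-run seeding sketch, assuming the optimizer draws its randomness from Python's and NumPy's global RNGs (if your optimizer exposes a seed parameter in its config, prefer that instead):

import random
import numpy as np

# Assumption: the optimizer uses the global random/NumPy RNGs.
# Seeding them before each single-run benchmark makes every run distinct and reproducible.
per_run_values = []
for seed in range(30):
    random.seed(seed)
    np.random.seed(seed)
    run_result = suite.run_benchmark("TSP-50", "GA", num_runs=1)
    per_run_values.append(run_result.metrics.best_value)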

Statistical Analysis

  • Report confidence intervals, not just means (see the example after this list)
  • Test for statistical significance (p-values)
  • Consider effect size (practical significance)
  • Analyze failure cases and outliers
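
A short example of reporting a confidence interval and flagging outliers with SciPy and NumPy, using the individual runs collected earlier:

import numpy as np
from scipy import stats

values = [run.best_value for run in results["GA"].individual_runs]

# 95% confidence interval for the mean objective value (t-distribution).
mean = np.mean(values)
ci_low, ci_high = stats.t.interval(0.95, len(values) - 1, loc=mean, scale=stats.sem(values))
print(f"Mean: {mean:.4f}  95% CI: [{ci_low:.4f}, {ci_high:.4f}]")

# Flag runs more than three sample standard deviations from the mean.
std = np.std(values, ddof=1)
outliers = [v for v in values if abs(v - mean) > 3 * std]
print(f"Potential outlier runs: {len(outliers)}")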

Next Steps